diff --git a/.gitignore b/.gitignore
index 224bd2f3a9cf305cc4205f30d7742928de5f8b99..fd308878407aa8e0c6745b1a837a94e3fff0b3e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,6 +41,8 @@ cscope.out
 autoconf/aclocal.m4
 autoconf/autom4te.cache
 /compile_commands.json
+# Visual Studio built-in CMake configuration
+/CMakeSettings.json
 
 #==============================================================================#
 # Directories to ignore (do not add trailing '/'s, they skip symlinks).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e7a9dd8988f3990f048ad0cf3d2074df3bd7539..4ff0e6a90e571e23ab899b057a1e7141c24547d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,10 @@ if(POLICY CMP0068)
   set(CMAKE_BUILD_WITH_INSTALL_NAME_DIR ON)
 endif()
 
+if(POLICY CMP0075)
+  cmake_policy(SET CMP0075 NEW)
+endif()
+
 if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 8)
 endif()
@@ -855,7 +859,7 @@ if( LLVM_INCLUDE_UTILS )
 else()
   if ( LLVM_INCLUDE_TESTS )
     message(FATAL_ERROR "Including tests when not building utils will not work.
-    Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLDE_TESTS to Off.")
+    Either set LLVM_INCLUDE_UTILS to On, or set LLVM_INCLUDE_TESTS to Off.")
   endif()
 endif()
 
@@ -974,7 +978,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
   add_custom_target(llvm-headers DEPENDS intrinsics_gen)
   set_target_properties(llvm-headers PROPERTIES FOLDER "Misc")
 
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     add_llvm_install_targets(install-llvm-headers
                              DEPENDS llvm-headers
                              COMPONENT llvm-headers)
@@ -984,7 +988,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
   add_custom_target(llvm-libraries)
   set_target_properties(llvm-libraries PROPERTIES FOLDER "Misc")
 
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     add_llvm_install_targets(install-llvm-libraries
                              DEPENDS llvm-libraries
                              COMPONENT llvm-libraries)
@@ -995,7 +999,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
     list(REMOVE_DUPLICATES LLVM_LIBS)
     foreach(lib ${LLVM_LIBS})
       add_dependencies(llvm-libraries ${lib})
-      if (NOT CMAKE_CONFIGURATION_TYPES)
+      if (NOT LLVM_ENABLE_IDE)
         add_dependencies(install-llvm-libraries install-${lib})
       endif()
     endforeach()
@@ -1005,7 +1009,7 @@ endif()
 # This must be at the end of the LLVM root CMakeLists file because it must run
 # after all targets are created.
 if(LLVM_DISTRIBUTION_COMPONENTS)
-  if(CMAKE_CONFIGURATION_TYPES)
+  if(LLVM_ENABLE_IDE)
     message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)")
   endif()
 
diff --git a/CREDITS.TXT b/CREDITS.TXT
index 7108051d67ab80cf0f9a31a4e1a004214c21c7a7..e279701f57d90bb182b24dfb48d88c4b64443843 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -492,8 +492,8 @@ D: Thread Local Storage implementation
 N: Bill Wendling
 I: wendling
 E: isanbard@gmail.com
-D: Release manager, IR Linker, LTO
-D: Bunches of stuff
+D: Release manager, IR Linker, LTO.
+D: Bunches of stuff.
 
 N: Bob Wilson
 E: bob.wilson@acm.org
@@ -502,3 +502,11 @@ D: Advanced SIMD (NEON) support in the ARM backend.
 N: QingShan Zhang
 E: qshanz@cn.ibm.com
 D: PowerPC Backend Developer
+
+N: Li Jia He
+E: hljhehlj@cn.ibm.com
+D: PowerPC Backend Developer
+
+N: Zixuan Wu
+E: wuzish@cn.ibm.com
+D: PowerPC Backend Developer
diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
index 0f4877429cc11075a5f3d0a5a8a0e5d89f7f7a5a..1872a2ffe51092a31af4b685827d8ad0e6ffcb8d 100644
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@@ -739,7 +739,6 @@ func (v Value) IsAPHINode() (rv Value)             { rv.C = C.LLVMIsAPHINode(v.C
 func (v Value) IsASelectInst() (rv Value)          { rv.C = C.LLVMIsASelectInst(v.C); return }
 func (v Value) IsAShuffleVectorInst() (rv Value)   { rv.C = C.LLVMIsAShuffleVectorInst(v.C); return }
 func (v Value) IsAStoreInst() (rv Value)           { rv.C = C.LLVMIsAStoreInst(v.C); return }
-func (v Value) IsATerminatorInst() (rv Value)      { rv.C = C.LLVMIsATerminatorInst(v.C); return }
 func (v Value) IsABranchInst() (rv Value)          { rv.C = C.LLVMIsABranchInst(v.C); return }
 func (v Value) IsAInvokeInst() (rv Value)          { rv.C = C.LLVMIsAInvokeInst(v.C); return }
 func (v Value) IsAReturnInst() (rv Value)          { rv.C = C.LLVMIsAReturnInst(v.C); return }
@@ -1259,6 +1258,19 @@ func InlineAsm(t Type, asmString, constraints string, hasSideEffects, isAlignSta
 	return
 }
 
+// Operations on aggregates
+func (v Value) Indices() []uint32 {
+	num := C.LLVMGetNumIndices(v.C)
+	indicesPtr := C.LLVMGetIndices(v.C)
+	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	rawIndices := (*[1 << 30]C.uint)(unsafe.Pointer(indicesPtr))[:num:num]
+	indices := make([]uint32, num)
+	for i := range indices {
+		indices[i] = uint32(rawIndices[i])
+	}
+	return indices
+}
+
 //-------------------------------------------------------------------------
 // llvm.Builder
 //-------------------------------------------------------------------------
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index 97b6a695fa24d65a43af782444eabfbabfaa93af..f12eb6efa61c797639d0d817ac17e4e04f79427b 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -1887,16 +1887,16 @@ val set_volatile : bool -> llvalue -> unit
 val is_terminator : llvalue -> bool
 
 (** [successor v i] returns the successor at index [i] for the value [v].
-    See the method [llvm::TerminatorInst::getSuccessor]. *)
+    See the method [llvm::Instruction::getSuccessor]. *)
 val successor : llvalue -> int -> llbasicblock
 
 (** [set_successor v i o] sets the successor of the value [v] at the index [i] to
     the value [o].
-    See the method [llvm::TerminatorInst::setSuccessor]. *)
+    See the method [llvm::Instruction::setSuccessor]. *)
 val set_successor : llvalue -> int -> llbasicblock -> unit
 
 (** [num_successors v] returns the number of successors for the value [v].
-    See the method [llvm::TerminatorInst::getNumSuccessors]. *)
+    See the method [llvm::Instruction::getNumSuccessors]. *)
 val num_successors : llvalue -> int
 
 (** [successors v] returns the successors of [v]. *)
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index c637941d81d3a297ae764232182a71eeed512a55..cdf6c6a1206a620f7364761add4a7dbef441212b 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -483,9 +483,9 @@ CAMLprim value llvm_struct_set_body(LLVMTypeRef Ty,
 CAMLprim value llvm_struct_name(LLVMTypeRef Ty)
 {
   CAMLparam0();
+  CAMLlocal1(result);
   const char *C = LLVMGetStructName(Ty);
   if (C) {
-    CAMLlocal1(result);
     result = caml_alloc_small(1, 0);
     Store_field(result, 0, caml_copy_string(C));
     CAMLreturn(result);
@@ -636,6 +636,7 @@ enum ValueKind {
 
 CAMLprim value llvm_classify_value(LLVMValueRef Val) {
   CAMLparam0();
+  CAMLlocal1(result);
   if (!Val)
     CAMLreturn(Val_int(NullValue));
   if (LLVMIsAConstant(Val)) {
@@ -652,7 +653,6 @@ CAMLprim value llvm_classify_value(LLVMValueRef Val) {
     DEFINE_CASE(Val, ConstantVector);
   }
   if (LLVMIsAInstruction(Val)) {
-    CAMLlocal1(result);
     result = caml_alloc_small(1, 0);
     Store_field(result, 0, Val_int(LLVMGetInstructionOpcode(Val)));
     CAMLreturn(result);
@@ -822,12 +822,11 @@ CAMLprim LLVMValueRef llvm_mdnull(LLVMContextRef C) {
 /* llvalue -> string option */
 CAMLprim value llvm_get_mdstring(LLVMValueRef V) {
   CAMLparam0();
+  CAMLlocal2(Option, Str);
   const char *S;
   unsigned Len;
 
   if ((S = LLVMGetMDString(V, &Len))) {
-    CAMLlocal2(Option, Str);
-
     Str = caml_alloc_string(Len);
     memcpy(String_val(Str), S, Len);
     Option = alloc(1,0);
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 4dde95e30f30c6212efa301331fe10c1ed360cc2..189971655583a40532272d608d807212198b14be 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -635,6 +635,7 @@ macro(add_llvm_library name)
     set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS_BUILDTREE_ONLY ${name})
   else()
     if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LTO" OR
+        ${name} STREQUAL "OptRemarks" OR
         (LLVM_LINK_LLVM_DYLIB AND ${name} STREQUAL "LLVM"))
       set(install_dir lib${LLVM_LIBDIR_SUFFIX})
       if(ARG_SHARED OR BUILD_SHARED_LIBS)
@@ -659,7 +660,7 @@ macro(add_llvm_library name)
               ${install_type} DESTINATION ${install_dir}
               COMPONENT ${name})
 
-      if (NOT CMAKE_CONFIGURATION_TYPES)
+      if (NOT LLVM_ENABLE_IDE)
         add_llvm_install_targets(install-${name}
                                  DEPENDS ${name}
                                  COMPONENT ${name})
@@ -890,7 +891,7 @@ macro(add_llvm_tool name)
               RUNTIME DESTINATION ${LLVM_TOOLS_INSTALL_DIR}
               COMPONENT ${name})
 
-      if (NOT CMAKE_CONFIGURATION_TYPES)
+      if (NOT LLVM_ENABLE_IDE)
         add_llvm_install_targets(install-${name}
                                  DEPENDS ${name}
                                  COMPONENT ${name})
@@ -928,7 +929,7 @@ macro(add_llvm_utility name)
     install (TARGETS ${name}
       RUNTIME DESTINATION ${LLVM_UTILS_INSTALL_DIR}
       COMPONENT ${name})
-    if (NOT CMAKE_CONFIGURATION_TYPES)
+    if (NOT LLVM_ENABLE_IDE)
       add_llvm_install_targets(install-${name}
                                DEPENDS ${name}
                                COMPONENT ${name})
@@ -1409,7 +1410,7 @@ function(add_lit_testsuite target comment)
 endfunction()
 
 function(add_lit_testsuites project directory)
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     cmake_parse_arguments(ARG "" "" "PARAMS;DEPENDS;ARGS" ${ARGN})
 
     # Search recursively for test directories by assuming anything not
@@ -1468,7 +1469,7 @@ function(llvm_install_library_symlink name dest type)
           CODE "install_symlink(${full_name} ${full_dest} ${output_dir})"
           COMPONENT ${component})
 
-  if (NOT CMAKE_CONFIGURATION_TYPES AND NOT ARG_ALWAYS_GENERATE)
+  if (NOT LLVM_ENABLE_IDE AND NOT ARG_ALWAYS_GENERATE)
     add_llvm_install_targets(install-${name}
                              DEPENDS ${name} ${dest} install-${dest}
                              COMPONENT ${name})
@@ -1501,7 +1502,7 @@ function(llvm_install_symlink name dest)
           CODE "install_symlink(${full_name} ${full_dest} ${LLVM_TOOLS_INSTALL_DIR})"
           COMPONENT ${component})
 
-  if (NOT CMAKE_CONFIGURATION_TYPES AND NOT ARG_ALWAYS_GENERATE)
+  if (NOT LLVM_ENABLE_IDE AND NOT ARG_ALWAYS_GENERATE)
     add_llvm_install_targets(install-${name}
                              DEPENDS ${name} ${dest} install-${dest}
                              COMPONENT ${name})
diff --git a/cmake/modules/CMakeLists.txt b/cmake/modules/CMakeLists.txt
index 6c316a2f04fb5f5cbca980b2a2c4cc3abeb38c87..f5cc0006fa06ad5fc2a11707be1d3dac062013d4 100644
--- a/cmake/modules/CMakeLists.txt
+++ b/cmake/modules/CMakeLists.txt
@@ -132,7 +132,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
     PATTERN LLVM-Config.cmake EXCLUDE
     PATTERN GetHostTriple.cmake EXCLUDE)
 
-  if (NOT CMAKE_CONFIGURATION_TYPES)
+  if (NOT LLVM_ENABLE_IDE)
     # Add a dummy target so this can be used with LLVM_DISTRIBUTION_COMPONENTS
     add_custom_target(cmake-exports)
     add_llvm_install_targets(install-cmake-exports
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 2c9bd14ad054cc6cf1e20961aa5b8e39e342d503..b590f768244540ffc696a12f73944dadc275fec8 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -23,7 +23,7 @@ string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
 # Ninja Job Pool support
 # The following only works with the Ninja generator in CMake >= 3.0.
 set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
-  "Define the maximum number of concurrent compilation jobs.")
+  "Define the maximum number of concurrent compilation jobs (Ninja only).")
 if(LLVM_PARALLEL_COMPILE_JOBS)
   if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja")
     message(WARNING "Job pooling is only available with Ninja generators.")
@@ -34,7 +34,7 @@ if(LLVM_PARALLEL_COMPILE_JOBS)
 endif()
 
 set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING
-  "Define the maximum number of concurrent link jobs.")
+  "Define the maximum number of concurrent link jobs (Ninja only).")
 if(CMAKE_MAKE_PROGRAM MATCHES "ninja")
   if(NOT LLVM_PARALLEL_LINK_JOBS AND uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
     message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.")
@@ -224,6 +224,10 @@ if(NOT WIN32 AND NOT CYGWIN)
   append_if(SUPPORTS_FVISIBILITY_INLINES_HIDDEN_FLAG "-fvisibility-inlines-hidden" CMAKE_CXX_FLAGS)
 endif()
 
+if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MINGW)
+  add_definitions( -D_FILE_OFFSET_BITS=64 )
+endif()
+
 if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT WIN32 )
   # TODO: support other platforms and toolchains.
   if( LLVM_BUILD_32_BITS )
@@ -576,6 +580,7 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
     append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
   endif()
 
+  add_flag_if_supported("-Wimplicit-fallthrough" IMPLICIT_FALLTHROUGH_FLAG)
   add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
   append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
   append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
@@ -868,12 +873,19 @@ else()
   set(LLVM_ENABLE_PLUGINS ON)
 endif()
 
+# By default we should enable LLVM_ENABLE_IDE only for multi-configuration
+# generators. This option disables optional build system features that make IDEs
+# less usable.
 set(LLVM_ENABLE_IDE_default OFF)
-if (XCODE OR MSVC_IDE OR CMAKE_EXTRA_GENERATOR)
+if (CMAKE_CONFIGURATION_TYPES)
   set(LLVM_ENABLE_IDE_default ON)
 endif()
-option(LLVM_ENABLE_IDE "Generate targets and process sources for use with an IDE"
-    ${LLVM_ENABLE_IDE_default})
+option(LLVM_ENABLE_IDE
+       "Disable optional build system features that cause problems for IDE generators"
+       ${LLVM_ENABLE_IDE_default})
+if (CMAKE_CONFIGURATION_TYPES AND NOT LLVM_ENABLE_IDE)
+  message(WARNING "Disabling LLVM_ENABLE_IDE on multi-configuration generators is not recommended.")
+endif()
 
 function(get_compile_definitions)
   get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index f65f31d797cf6c0384df64ec95af7b0222455748..7cbd2863500cf1ef606db7b16c474580f51d3147 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -52,16 +52,15 @@ function(llvm_process_sources OUT_VAR)
   cmake_parse_arguments(ARG "" "" "ADDITIONAL_HEADERS;ADDITIONAL_HEADER_DIRS" ${ARGN})
   set(sources ${ARG_UNPARSED_ARGUMENTS})
   llvm_check_source_file_list( ${sources} )
-  if( LLVM_ENABLE_IDE )
-    # This adds .td and .h files to the Visual Studio solution:
-    add_td_sources(sources)
-    find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}")
-    if (hdrs)
-      set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON)
-    endif()
-    set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
-    list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs})
+  
+  # This adds .td and .h files to the Visual Studio solution:
+  add_td_sources(sources)
+  find_all_header_files(hdrs "${ARG_ADDITIONAL_HEADER_DIRS}")
+  if (hdrs)
+    set_source_files_properties(${hdrs} PROPERTIES HEADER_FILE_ONLY ON)
   endif()
+  set_source_files_properties(${ARG_ADDITIONAL_HEADERS} PROPERTIES HEADER_FILE_ONLY ON)
+  list(APPEND sources ${ARG_ADDITIONAL_HEADERS} ${hdrs})
 
   set( ${OUT_VAR} ${sources} PARENT_SCOPE )
 endfunction(llvm_process_sources)
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 1ddda1bae9ec5b1aba10329591d7936a46d364a7..03685f9e352834d826da2f6196fbe0f0b7bc3f27 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -78,139 +78,143 @@ names from both the *Processor* and *Alternative Processor* can be used.
   .. table:: AMDGPU Processors
      :name: amdgpu-processor-table
 
-     =========== =============== ============ ===== ========= ======= ==================
-     Processor   Alternative     Target       dGPU/ Target    ROCm    Example
-                 Processor       Triple       APU   Features  Support Products
+     =========== =============== ============ ===== ========== ======= ======================
+     Processor   Alternative     Target       dGPU/ Target     ROCm    Example
+                 Processor       Triple       APU   Features   Support Products
                                  Architecture       Supported
                                                     [Default]
-     =========== =============== ============ ===== ========= ======= ==================
+     =========== =============== ============ ===== ========== ======= ======================
      **Radeon HD 2000/3000 Series (R600)** [AMD-RADEON-HD-2000-3000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``r600``                    ``r600``     dGPU
      ``r630``                    ``r600``     dGPU
      ``rs880``                   ``r600``     dGPU
      ``rv670``                   ``r600``     dGPU
      **Radeon HD 4000 Series (R700)** [AMD-RADEON-HD-4000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``rv710``                   ``r600``     dGPU
      ``rv730``                   ``r600``     dGPU
      ``rv770``                   ``r600``     dGPU
      **Radeon HD 5000 Series (Evergreen)** [AMD-RADEON-HD-5000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``cedar``                   ``r600``     dGPU
      ``cypress``                 ``r600``     dGPU
      ``juniper``                 ``r600``     dGPU
      ``redwood``                 ``r600``     dGPU
      ``sumo``                    ``r600``     dGPU
      **Radeon HD 6000 Series (Northern Islands)** [AMD-RADEON-HD-6000]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``barts``                   ``r600``     dGPU
      ``caicos``                  ``r600``     dGPU
      ``cayman``                  ``r600``     dGPU
      ``turks``                   ``r600``     dGPU
      **GCN GFX6 (Southern Islands (SI))** [AMD-GCN-GFX6]_
-     -----------------------------------------------------------------------------------
+     ----------------------------------------------------------------------------------------
      ``gfx600``  - ``tahiti``    ``amdgcn``   dGPU
      ``gfx601``  - ``hainan``    ``amdgcn``   dGPU
                  - ``oland``
                  - ``pitcairn``
                  - ``verde``
      **GCN GFX7 (Sea Islands (CI))** [AMD-GCN-GFX7]_
-     -----------------------------------------------------------------------------------
-     ``gfx700``  - ``kaveri``    ``amdgcn``   APU                     - A6-7000
-                                                                      - A6 Pro-7050B
-                                                                      - A8-7100
-                                                                      - A8 Pro-7150B
-                                                                      - A10-7300
-                                                                      - A10 Pro-7350B
-                                                                      - FX-7500
-                                                                      - A8-7200P
-                                                                      - A10-7400P
-                                                                      - FX-7600P
-     ``gfx701``  - ``hawaii``    ``amdgcn``   dGPU            ROCm    - FirePro W8100
-                                                                      - FirePro W9100
-                                                                      - FirePro S9150
-                                                                      - FirePro S9170
-     ``gfx702``                  ``amdgcn``   dGPU            ROCm    - Radeon R9 290
-                                                                      - Radeon R9 290x
-                                                                      - Radeon R390
-                                                                      - Radeon R390x
-     ``gfx703``  - ``kabini``    ``amdgcn``   APU                     - E1-2100
-                 - ``mullins``                                        - E1-2200
-                                                                      - E1-2500
-                                                                      - E2-3000
-                                                                      - E2-3800
-                                                                      - A4-5000
-                                                                      - A4-5100
-                                                                      - A6-5200
-                                                                      - A4 Pro-3340B
-     ``gfx704``  - ``bonaire``   ``amdgcn``   dGPU                    - Radeon HD 7790
-                                                                      - Radeon HD 8770
-                                                                      - R7 260
-                                                                      - R7 260X
+     ----------------------------------------------------------------------------------------
+     ``gfx700``  - ``kaveri``    ``amdgcn``   APU                      - A6-7000
+                                                                       - A6 Pro-7050B
+                                                                       - A8-7100
+                                                                       - A8 Pro-7150B
+                                                                       - A10-7300
+                                                                       - A10 Pro-7350B
+                                                                       - FX-7500
+                                                                       - A8-7200P
+                                                                       - A10-7400P
+                                                                       - FX-7600P
+     ``gfx701``  - ``hawaii``    ``amdgcn``   dGPU             ROCm    - FirePro W8100
+                                                                       - FirePro W9100
+                                                                       - FirePro S9150
+                                                                       - FirePro S9170
+     ``gfx702``                  ``amdgcn``   dGPU             ROCm    - Radeon R9 290
+                                                                       - Radeon R9 290x
+                                                                       - Radeon R390
+                                                                       - Radeon R390x
+     ``gfx703``  - ``kabini``    ``amdgcn``   APU                      - E1-2100
+                 - ``mullins``                                         - E1-2200
+                                                                       - E1-2500
+                                                                       - E2-3000
+                                                                       - E2-3800
+                                                                       - A4-5000
+                                                                       - A4-5100
+                                                                       - A6-5200
+                                                                       - A4 Pro-3340B
+     ``gfx704``  - ``bonaire``   ``amdgcn``   dGPU                     - Radeon HD 7790
+                                                                       - Radeon HD 8770
+                                                                       - R7 260
+                                                                       - R7 260X
      **GCN GFX8 (Volcanic Islands (VI))** [AMD-GCN-GFX8]_
-     -----------------------------------------------------------------------------------
-     ``gfx801``  - ``carrizo``   ``amdgcn``   APU   - xnack           - A6-8500P
-                                                      [on]            - Pro A6-8500B
-                                                                      - A8-8600P
-                                                                      - Pro A8-8600B
-                                                                      - FX-8800P
-                                                                      - Pro A12-8800B
-     \                           ``amdgcn``   APU   - xnack   ROCm    - A10-8700P
-                                                      [on]            - Pro A10-8700B
-                                                                      - A10-8780P
-     \                           ``amdgcn``   APU   - xnack           - A10-9600P
-                                                      [on]            - A10-9630P
-                                                                      - A12-9700P
-                                                                      - A12-9730P
-                                                                      - FX-9800P
-                                                                      - FX-9830P
-     \                           ``amdgcn``   APU   - xnack           - E2-9010
-                                                      [on]            - A6-9210
-                                                                      - A9-9410
-     ``gfx802``  - ``iceland``   ``amdgcn``   dGPU  - xnack   ROCm    - FirePro S7150
-                 - ``tonga``                          [off]           - FirePro S7100
-                                                                      - FirePro W7100
-                                                                      - Radeon R285
-                                                                      - Radeon R9 380
-                                                                      - Radeon R9 385
-                                                                      - Mobile FirePro
-                                                                        M7170
-     ``gfx803``  - ``fiji``      ``amdgcn``   dGPU  - xnack   ROCm    - Radeon R9 Nano
-                                                      [off]           - Radeon R9 Fury
-                                                                      - Radeon R9 FuryX
-                                                                      - Radeon Pro Duo
-                                                                      - FirePro S9300x2
-                                                                      - Radeon Instinct MI8
-     \           - ``polaris10`` ``amdgcn``   dGPU  - xnack   ROCm    - Radeon RX 470
-                                                      [off]           - Radeon RX 480
-                                                                      - Radeon Instinct MI6
-     \           - ``polaris11`` ``amdgcn``   dGPU  - xnack   ROCm    - Radeon RX 460
+     ----------------------------------------------------------------------------------------
+     ``gfx801``  - ``carrizo``   ``amdgcn``   APU   - xnack            - A6-8500P
+                                                      [on]             - Pro A6-8500B
+                                                                       - A8-8600P
+                                                                       - Pro A8-8600B
+                                                                       - FX-8800P
+                                                                       - Pro A12-8800B
+     \                           ``amdgcn``   APU   - xnack    ROCm    - A10-8700P
+                                                      [on]             - Pro A10-8700B
+                                                                       - A10-8780P
+     \                           ``amdgcn``   APU   - xnack            - A10-9600P
+                                                      [on]             - A10-9630P
+                                                                       - A12-9700P
+                                                                       - A12-9730P
+                                                                       - FX-9800P
+                                                                       - FX-9830P
+     \                           ``amdgcn``   APU   - xnack            - E2-9010
+                                                      [on]             - A6-9210
+                                                                       - A9-9410
+     ``gfx802``  - ``iceland``   ``amdgcn``   dGPU  - xnack    ROCm    - FirePro S7150
+                 - ``tonga``                          [off]            - FirePro S7100
+                                                                       - FirePro W7100
+                                                                       - Radeon R285
+                                                                       - Radeon R9 380
+                                                                       - Radeon R9 385
+                                                                       - Mobile FirePro
+                                                                         M7170
+     ``gfx803``  - ``fiji``      ``amdgcn``   dGPU  - xnack    ROCm    - Radeon R9 Nano
+                                                      [off]            - Radeon R9 Fury
+                                                                       - Radeon R9 FuryX
+                                                                       - Radeon Pro Duo
+                                                                       - FirePro S9300x2
+                                                                       - Radeon Instinct MI8
+     \           - ``polaris10`` ``amdgcn``   dGPU  - xnack    ROCm    - Radeon RX 470
+                                                      [off]            - Radeon RX 480
+                                                                       - Radeon Instinct MI6
+     \           - ``polaris11`` ``amdgcn``   dGPU  - xnack    ROCm    - Radeon RX 460
                                                       [off]
      ``gfx810``  - ``stoney``    ``amdgcn``   APU   - xnack
                                                       [on]
      **GCN GFX9** [AMD-GCN-GFX9]_
-     -----------------------------------------------------------------------------------
-     ``gfx900``                  ``amdgcn``   dGPU  - xnack   ROCm    - Radeon Vega
-                                                      [off]             Frontier Edition
-                                                                      - Radeon RX Vega 56
-                                                                      - Radeon RX Vega 64
-                                                                      - Radeon RX Vega 64
-                                                                        Liquid
-                                                                      - Radeon Instinct MI25
-     ``gfx902``                  ``amdgcn``   APU   - xnack           - Ryzen 3 2200G
-                                                      [on]            - Ryzen 5 2400G
-     ``gfx904``                  ``amdgcn``   dGPU  - xnack           *TBA*
+     ----------------------------------------------------------------------------------------
+     ``gfx900``                  ``amdgcn``   dGPU  - xnack    ROCm    - Radeon Vega
+                                                      [off]              Frontier Edition
+                                                                       - Radeon RX Vega 56
+                                                                       - Radeon RX Vega 64
+                                                                       - Radeon RX Vega 64
+                                                                         Liquid
+                                                                       - Radeon Instinct MI25
+     ``gfx902``                  ``amdgcn``   APU   - xnack            - Ryzen 3 2200G
+                                                      [on]             - Ryzen 5 2400G
+     ``gfx904``                  ``amdgcn``   dGPU  - xnack            *TBA*
                                                       [off]
-                                                                      .. TODO
-                                                                         Add product
-                                                                         names.
-     ``gfx906``                  ``amdgcn``   dGPU  - xnack           *TBA*
-                                                      [off]
-                                                                      .. TODO
-                                                                         Add product
-                                                                         names.
-     =========== =============== ============ ===== ========= ======= ==================
+                                                                       .. TODO
+                                                                          Add product
+                                                                          names.
+     ``gfx906``                  ``amdgcn``   dGPU  - xnack            - Radeon Instinct MI50
+                                                      [off]            - Radeon Instinct MI60
+                                                      sram-ecc
+                                                      [on]
+     ``gfx909``                  ``amdgcn``   APU   - xnack            *TBA* (Raven Ridge 2)
+                                                      [on]
+                                                                       .. TODO
+                                                                          Add product
+                                                                          names.
+     =========== =============== ============ ===== ========== ======= ======================
 
 .. _amdgpu-target-features:
 
@@ -241,24 +245,26 @@ For example:
   .. table:: AMDGPU Target Features
      :name: amdgpu-target-feature-table
 
-     ============== ==================================================
-     Target Feature Description
-     ============== ==================================================
-     -m[no-]xnack   Enable/disable generating code that has
-                    memory clauses that are compatible with
-                    having XNACK replay enabled.
-
-                    This is used for demand paging and page
-                    migration. If XNACK replay is enabled in
-                    the device, then if a page fault occurs
-                    the code may execute incorrectly if the
-                    ``xnack`` feature is not enabled. Executing
-                    code that has the feature enabled on a
-                    device that does not have XNACK replay
-                    enabled will execute correctly, but may
-                    be less performant than code with the
-                    feature disabled.
-     ============== ==================================================
+     =============== ==================================================
+     Target Feature  Description
+     =============== ==================================================
+     -m[no-]xnack    Enable/disable generating code that has
+                     memory clauses that are compatible with
+                     having XNACK replay enabled.
+
+                     This is used for demand paging and page
+                     migration. If XNACK replay is enabled in
+                     the device, then if a page fault occurs
+                     the code may execute incorrectly if the
+                     ``xnack`` feature is not enabled. Executing
+                     code that has the feature enabled on a
+                     device that does not have XNACK replay
+                     enabled will execute correctly, but may
+                     be less performant than code with the
+                     feature disabled.
+     -m[no-]sram-ecc Enable/disable generating code that assumes SRAM
+                     ECC is enabled/disabled.
+     =============== ==================================================
 
 .. _amdgpu-address-spaces:
 
@@ -544,6 +550,17 @@ The AMDGPU backend uses the following ELF header:
                                                   be 0.
                                                   See
                                                   :ref:`amdgpu-target-features`.
+     ``EF_AMDGPU_SRAM_ECC``            0x00000200 Indicates if the ``sram-ecc``
+                                                  target feature is
+                                                  enabled for all code
+                                                  contained in the code object.
+                                                  If the processor
+                                                  does not support the
+                                                  ``sram-ecc`` target
+                                                  feature then must
+                                                  be 0.
+                                                  See
+                                                  :ref:`amdgpu-target-features`.
      ================================= ========== =============================
 
   .. table:: AMDGPU ``EF_AMDGPU_MACH`` Values
@@ -589,6 +606,7 @@ The AMDGPU backend uses the following ELF header:
      ``EF_AMDGPU_MACH_AMDGCN_GFX904``  0x02e      ``gfx904``
      ``EF_AMDGPU_MACH_AMDGCN_GFX906``  0x02f      ``gfx906``
      *reserved*                        0x030      Reserved.
+     ``EF_AMDGPU_MACH_AMDGCN_GFX909``  0x031      ``gfx909``
      ================================= ========== =============================
 
 Sections
diff --git a/docs/AdvancedBuilds.rst b/docs/AdvancedBuilds.rst
index c559bdeb28023288365df561af3f74a09f6aa257..d2a2ef58b23e4c85a6bea3a2f16baa9a49e36afa 100644
--- a/docs/AdvancedBuilds.rst
+++ b/docs/AdvancedBuilds.rst
@@ -41,6 +41,16 @@ This command itself isn't terribly useful because it assumes default
 configurations for each stage. The next series of examples utilize CMake cache
 scripts to provide more complex options.
 
+By default, only a few CMake options will be passed between stages.
+The list, called _BOOTSTRAP_DEFAULT_PASSTHROUGH, is defined in clang/CMakeLists.txt.
+To force the passing of the variables between stages, use the -DCLANG_BOOTSTRAP_PASSTHROUGH
+CMake option, each variable separated by a ";". As example:
+
+.. code-block:: console
+
+  $ cmake -G Ninja -DCLANG_ENABLE_BOOTSTRAP=On -DCLANG_BOOTSTRAP_PASSTHROUGH="CMAKE_INSTALL_PREFIX;CMAKE_VERBOSE_MAKEFILE" <path to source>
+  $ ninja stage2
+
 The clang build system refers to builds as stages. A stage1 build is a standard
 build using the compiler installed on the host, and a stage2 build is built
 using the stage1 compiler. This nomenclature holds up to more stages too. In
diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst
index 9bd8bd4ae744afdfe446e90682cd3c505d01a72f..e09587179ec3b56bbea10b818f628295101314b8 100644
--- a/docs/BranchWeightMetadata.rst
+++ b/docs/BranchWeightMetadata.rst
@@ -9,10 +9,10 @@ Introduction
 ============
 
 Branch Weight Metadata represents branch weights as its likeliness to be taken
-(see :doc:`BlockFrequencyTerminology`). Metadata is assigned to the
-``TerminatorInst`` as a ``MDNode`` of the ``MD_prof`` kind. The first operator
-is always a ``MDString`` node with the string "branch_weights".  Number of
-operators depends on the terminator type.
+(see :doc:`BlockFrequencyTerminology`). Metadata is assigned to an
+``Instruction`` that is a terminator as a ``MDNode`` of the ``MD_prof`` kind.
+The first operator is always a ``MDString`` node with the string
+"branch_weights".  Number of operators depends on the terminator type.
 
 Branch weights might be fetch from the profiling file, or generated based on
 `__builtin_expect`_ instruction.
diff --git a/docs/BugLifeCycle.rst b/docs/BugLifeCycle.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c74aa1d3a62bc0e8752915a44b8988101caa5393
--- /dev/null
+++ b/docs/BugLifeCycle.rst
@@ -0,0 +1,140 @@
+===================
+LLVM Bug Life Cycle
+===================
+
+.. contents::
+   :local:
+
+
+
+Introduction - Achieving consistency in how we deal with bug reports
+====================================================================
+
+We aim to achieve a basic level of consistency in how reported bugs evolve from
+being reported, to being worked on, and finally getting closed out. The
+consistency helps reporters, developers and others to gain a better
+understanding of what a particular bug state actually means and what to expect
+might happen next.
+
+At the same time, we aim to not over-specify the life cycle of bugs in the
+`the LLVM Bug Tracking System <https://bugs.llvm.org/enter_bug.cgi>`_, as the
+overall goal is to make it easier to work with and understand the bug reports.
+
+The main parts of the life cycle documented here are:
+
+#. `Reporting`_
+#. `Triaging`_
+#. `Actively working on fixing`_
+#. `Closing`_
+
+Furthermore, some of the metadata in the bug tracker, such as who to notify on
+newly reported bugs or what the breakdown into products & components is we use,
+needs to be maintained. See the following for details:
+
+#. `Maintenance of Bug products/component metadata`_
+#. `Maintenance of cc-by-default settings`_
+
+
+.. _Reporting:
+
+Reporting bugs
+==============
+
+See :doc:`HowToSubmitABug` on further details on how to submit good bug reports.
+
+Make sure that you have one or more people on cc on the bug report that you
+think will react to it. We aim to automatically add specific people on cc for
+most products/components, but may not always succeed in doing so.
+
+If you know the area of LLVM code the root cause of the bug is in, good
+candidates to add as cc may be the same people you'd ask for a code review in
+that area. See :ref:`finding-potential-reviewers` for more details.
+
+
+.. _Triaging:
+
+Triaging bugs
+=============
+
+Bugs with status NEW indicate that they still need to be triaged.
+When triage is complete, the status of the bug is moved to CONFIRMED.
+
+The goal of triaging a bug is to make sure a newly reported bug ends up in a
+good, actionable, state. Try to answer the following questions while triaging.
+
+* Is the reported behavior actually wrong?
+
+  * E.g. does a miscompile example depend on undefined behavior?
+
+* Can you easily reproduce the bug?
+
+  * If not, are there reasonable excuses why it cannot easily be reproduced?
+
+* Is it related to an already reported bug?
+
+  * Use the "See also"/"depends on"/"blocks" fields if so.
+  * Close it as a duplicate if so, pointing to the issue it duplicates.
+
+* Are the following fields filled in correctly?
+
+  * Product
+  * Component
+  * Title
+
+* CC others not already cc’ed that you happen to know would be good to pull in.
+* Add the "beginner" keyword if you think this would be a good bug to be fixed
+  by someone new to LLVM.
+
+.. _Actively working on fixing:
+
+Actively working on fixing bugs
+===============================
+
+Please remember to assign the bug to yourself if you're actively working on
+fixing it and to unassign it when you're no longer actively working on it.  You
+unassign a bug by setting the Assignee field to "unassignedbugs@nondot.org".
+
+.. _Closing:
+
+Resolving/Closing bugs
+======================
+
+For simplicity, we only have 1 status for all resolved or closed bugs:
+RESOLVED.
+
+Resolving bugs is good! Make sure to properly record the reason for resolving.
+Examples of reasons for resolving are:
+
+* Revision NNNNNN fixed the bug.
+* The bug cannot be reproduced with revision NNNNNN.
+* The circumstances for the bug don't apply anymore.
+* There is a sound reason for not fixing it (WONTFIX).
+* There is a specific and plausible reason to think that a given bug is
+  otherwise inapplicable or obsolete.
+
+  * One example is an old open bug that doesn't contain enough information to
+    clearly understand the problem being reported (e.g. not reproducible). It is
+    fine to resolve such a bug e.g. with resolution WORKSFORME and leaving a
+    comment to encourage the reporter to reopen the bug with more information
+    if it's still reproducable on their end.
+
+If a bug is resolved, please fill in the revision number it was fixed in in the
+"Fixed by Commit(s)" field.
+
+
+.. _Maintenance of Bug products/component metadata:
+
+Maintenance of products/components metadata
+===========================================
+
+Please raise a bug against "Bugzilla Admin"/"Products" to request any changes
+to be made to the breakdown of products & components modeled in Bugzilla.
+
+
+.. _Maintenance of cc-by-default settings:
+
+Maintenance of cc-by-default settings
+=====================================
+
+Please raise a bug against "Bugzilla Admin"/"Products" to request any changes
+to be made to the cc-by-default settings for specific components.
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 75df8a6226893050345619bf7c0db90d610ad449..6581b33ba1c716ccc6e0f53c39b5b2fb3745907e 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -24,6 +24,9 @@ match.  The file to verify is read from standard input unless the
 OPTIONS
 -------
 
+Options are parsed from the environment variable ``FILECHECK_OPTS``
+and from the command line.
+
 .. option:: -help
 
  Print a summary of command line options.
@@ -116,6 +119,10 @@ OPTIONS
   as old tests are migrated to the new non-overlapping ``CHECK-DAG:``
   implementation.
 
+.. option:: --color
+
+  Use colors in output (autodetected by default).
+
 EXIT STATUS
 -----------
 
diff --git a/docs/CommandGuide/llvm-exegesis.rst b/docs/CommandGuide/llvm-exegesis.rst
index 4181a9987213eec01094d92722c429bd34ffc5c9..f27db9e57edc6c4a8d93764ba39178eb6e094911 100644
--- a/docs/CommandGuide/llvm-exegesis.rst
+++ b/docs/CommandGuide/llvm-exegesis.rst
@@ -175,9 +175,10 @@ OPTIONS
  Specify the opcode to measure, by index. See example 1 for details.
  Either `opcode-index`, `opcode-name` or `snippets-file` must be set.
 
-.. option:: -opcode-name=<LLVM opcode name>
+.. option:: -opcode-name=<opcode name 1>,<opcode name 2>,...
 
- Specify the opcode to measure, by name. See example 1 for details.
+ Specify the opcode to measure, by name. Several opcodes can be specified as
+ a comma-separated list. See example 1 for details.
  Either `opcode-index`, `opcode-name` or `snippets-file` must be set.
 
  .. option:: -snippets-file=<filename>
@@ -223,6 +224,10 @@ OPTIONS
 
  If set, ignore instructions that do not have a sched class (class idx = 0).
 
+ .. option:: -mcpu=<cpu name>
+
+  If set, measure the cpu characteristics using the counters for this CPU. This
+  is useful when creating new sched models (the host CPU is unknown to LLVM).
 
 EXIT STATUS
 -----------
diff --git a/docs/CommandGuide/tblgen.rst b/docs/CommandGuide/tblgen.rst
index 55b542948469d1d9f256497c59f1d631e285a53a..3105e0c8076710abdcdbaa2ca800150b8c837e9b 100644
--- a/docs/CommandGuide/tblgen.rst
+++ b/docs/CommandGuide/tblgen.rst
@@ -130,6 +130,10 @@ OPTIONS
 
  Generate enhanced disassembly info.
 
+.. option:: -gen-exegesis
+
+ Generate llvm-exegesis tables.
+
 .. option:: -version
 
  Show the version number of this program.
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index 4eda6c77b9f7ea239b17c6dbb50f278b749566e4..9125197a73a86a16a20036d5882ae915971a3690 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -730,10 +730,6 @@ code already distributed under a more liberal license (like the UIUC license),
 and GPL-containing subprojects are kept in separate SVN repositories whose
 LICENSE.txt files specifically indicate that they contain GPL code.
 
-We have no plans to change the license of LLVM.  If you have questions or
-comments about the license, please contact the `LLVM Developer's Mailing
-List <mailto:llvm-dev@lists.llvm.org>`_.
-
 Patents
 -------
 
diff --git a/docs/HowToBuildWithPGO.rst b/docs/HowToBuildWithPGO.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ba93bc64a294ab9b2cdd1763160620336bf6070a
--- /dev/null
+++ b/docs/HowToBuildWithPGO.rst
@@ -0,0 +1,163 @@
+=============================================================
+How To Build Clang and LLVM with Profile-Guided Optimizations
+=============================================================
+
+Introduction
+============
+
+PGO (Profile-Guided Optimization) allows your compiler to better optimize code
+for how it actually runs. Users report that applying this to Clang and LLVM can
+decrease overall compile time by 20%.
+
+This guide walks you through how to build Clang with PGO, though it also applies
+to other subprojects, such as LLD.
+
+
+Using the script
+================
+
+We have a script at ``utils/collect_and_build_with_pgo.py``. This script is
+tested on a few Linux flavors, and requires a checkout of LLVM, Clang, and
+compiler-rt. Despite the the name, it performs four clean builds of Clang, so it
+can take a while to run to completion. Please see the script's ``--help`` for
+more information on how to run it, and the different options available to you.
+If you want to get the most out of PGO for a particular use-case (e.g. compiling
+a specific large piece of software), please do read the section below on
+'benchmark' selection.
+
+Please note that this script is only tested on a few Linux distros. Patches to
+add support for other platforms, as always, are highly appreciated. :)
+
+This script also supports a ``--dry-run`` option, which causes it to print
+important commands instead of running them.
+
+
+Selecting 'benchmarks'
+======================
+
+PGO does best when the profiles gathered represent how the user plans to use the
+compiler. Notably, highly accurate profiles of llc building x86_64 code aren't
+incredibly helpful if you're going to be targeting ARM.
+
+By default, the script above does two things to get solid coverage. It:
+
+- runs all of Clang and LLVM's lit tests, and
+- uses the instrumented Clang to build Clang, LLVM, and all of the other
+  LLVM subprojects available to it.
+
+Together, these should give you:
+
+- solid coverage of building C++,
+- good coverage of building C,
+- great coverage of running optimizations,
+- great coverage of the backend for your host's architecture, and
+- some coverage of other architectures (if other arches are supported backends).
+
+Altogether, this should cover a diverse set of uses for Clang and LLVM. If you
+have very specific needs (e.g. your compiler is meant to compile a large browser
+for four different platforms, or similar), you may want to do something else.
+This is configurable in the script itself.
+
+
+Building Clang with PGO
+=======================
+
+If you prefer to not use the script, this briefly goes over how to build
+Clang/LLVM with PGO.
+
+First, you should have at least LLVM, Clang, and compiler-rt checked out
+locally.
+
+Next, at a high level, you're going to need to do the following:
+
+1. Build a standard Release Clang and the relevant libclang_rt.profile library
+2. Build Clang using the Clang you built above, but with instrumentation
+3. Use the instrumented Clang to generate profiles, which consists of two steps:
+
+  - Running the instrumented Clang/LLVM/lld/etc. on tasks that represent how
+    users will use said tools.
+  - Using a tool to convert the "raw" profiles generated above into a single,
+    final PGO profile.
+
+4. Build a final release Clang (along with whatever other binaries you need)
+   using the profile collected from your benchmark
+
+In more detailed steps:
+
+1. Configure a Clang build as you normally would. It's highly recommended that
+   you use the Release configuration for this, since it will be used to build
+   another Clang. Because you need Clang and supporting libraries, you'll want
+   to build the ``all`` target (e.g. ``ninja all`` or ``make -j4 all``).
+
+2. Configure a Clang build as above, but add the following CMake args:
+
+   - ``-DLLVM_BUILD_INSTRUMENTED=IR`` -- This causes us to build everything
+     with instrumentation.
+   - ``-DLLVM_BUILD_RUNTIME=No`` -- A few projects have bad interactions when
+     built with profiling, and aren't necessary to build. This flag turns them
+     off.
+   - ``-DCMAKE_C_COMPILER=/path/to/stage1/clang`` - Use the Clang we built in
+     step 1.
+   - ``-DCMAKE_CXX_COMPILER=/path/to/stage1/clang++`` - Same as above.
+
+ In this build directory, you simply need to build the ``clang`` target (and
+ whatever supporting tooling your benchmark requires).
+
+3. As mentioned above, this has two steps: gathering profile data, and then
+   massaging it into a useful form:
+
+   a. Build your benchmark using the Clang generated in step 2. The 'standard'
+      benchmark recommended is to run ``check-clang`` and ``check-llvm`` in your
+      instrumented Clang's build directory, and to do a full build of Clang/LLVM
+      using your instrumented Clang. So, create yet another build directory,
+      with the following CMake arguments:
+
+      - ``-DCMAKE_C_COMPILER=/path/to/stage2/clang`` - Use the Clang we built in
+        step 2.
+      - ``-DCMAKE_CXX_COMPILER=/path/to/stage2/clang++`` - Same as above.
+
+      If your users are fans of debug info, you may want to consider using
+      ``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` instead of
+      ``-DCMAKE_BUILD_TYPE=Release``. This will grant better coverage of
+      debug info pieces of clang, but will take longer to complete and will
+      result in a much larger build directory.
+
+      It's recommended to build the ``all`` target with your instrumented Clang,
+      since more coverage is often better.
+
+  b. You should now have a few ``*.profdata`` files in
+     ``path/to/stage2/profiles/``. You need to merge these using
+     ``llvm-profdata`` (even if you only have one! The profile merge transforms
+     profraw into actual profile data, as well). This can be done with
+     ``/path/to/stage1/llvm-profdata -merge
+     -output=/path/to/output/profdata.prof path/to/stage2/profiles/*.profdata``.
+
+4. Now, build your final, PGO-optimized Clang. To do this, you'll want to pass
+   the following additional arguments to CMake.
+
+   - ``-DLLVM_PROFDATA_FILE=/path/to/output/profdata.prof`` - Use the PGO
+     profile from the previous step.
+   - ``-DCMAKE_C_COMPILER=/path/to/stage1/clang`` - Use the Clang we built in
+     step 1.
+   - ``-DCMAKE_CXX_COMPILER=/path/to/stage1/clang++`` - Same as above.
+
+   From here, you can build whatever targets you need.
+
+   .. note::
+     You may see warnings about a mismatched profile in the build output. These
+     are generally harmless. To silence them, you can add
+     ``-DCMAKE_C_FLAGS='-Wno-backend-plugin'
+     -DCMAKE_CXX_FLAGS='-Wno-backend-plugin'`` to your CMake invocation.
+
+
+Congrats! You now have a Clang built with profile-guided optimizations, and you
+can delete all but the final build directory if you'd like.
+
+If this worked well for you and you plan on doing it often, there's a slight
+optimization that can be made: LLVM and Clang have a tool called tblgen that's
+built and run during the build process. While it's potentially nice to build
+this for coverage as part of step 3, none of your other builds should benefit
+from building it. You can pass the CMake options
+``-DCLANG_TABLEGEN=/path/to/stage1/bin/clang-tblgen
+-DLLVM_TABLEGEN=/path/to/stage1/bin/llvm-tblgen`` to steps 2 and onward to avoid
+these useless rebuilds.
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 9fcfd29a6e85888daef0d0e3e797792223467d44..06e092fb9fc5269b60fd6f5d69a315002d933be9 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -1450,6 +1450,10 @@ example:
 ``noredzone``
     This attribute indicates that the code generator should not use a
     red zone, even if the target-specific ABI normally permits it.
+``indirect-tls-seg-refs``
+    This attribute indicates that the code generator should not use
+    direct TLS access through segment registers, even if the
+    target-specific ABI normally permits it.
 ``noreturn``
     This function attribute indicates that the function never returns
     normally. This produces undefined behavior at runtime if the
@@ -2922,7 +2926,7 @@ Simple Constants
     hexadecimal notation (see below). The assembler requires the exact
     decimal value of a floating-point constant. For example, the
     assembler accepts 1.25 but rejects 1.3 because 1.3 is a repeating
-    decimal in binary. Floating-point constants must have a 
+    decimal in binary. Floating-point constants must have a
     :ref:`floating-point <t_floating>` type.
 **Null pointer constants**
     The identifier '``null``' is recognized as a null pointer constant
@@ -3327,7 +3331,7 @@ The following is the syntax for constant expressions:
     value won't fit in the integer type, the result is a
     :ref:`poison value <poisonvalues>`.
 ``uitofp (CST to TYPE)``
-    Convert an unsigned integer constant to the corresponding 
+    Convert an unsigned integer constant to the corresponding
     floating-point constant. TYPE must be a scalar or vector floating-point
     type.  CST must be of scalar or vector integer type. Both CST and TYPE must
     be scalars, or vectors of the same number of elements.
@@ -5430,7 +5434,7 @@ Irreducible loop header weights are typically based on profile data.
 '``invariant.group``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The experimental ``invariant.group`` metadata may be attached to 
+The experimental ``invariant.group`` metadata may be attached to
 ``load``/``store`` instructions referencing a single metadata with no entries.
 The existence of the ``invariant.group`` metadata on the instruction tells
 the optimizer that every ``load`` and ``store`` to the same pointer operand
@@ -6871,7 +6875,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fadd``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6879,7 +6883,7 @@ Semantics:
 
 The value produced is the floating-point sum of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -6968,7 +6972,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fsub``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -6976,7 +6980,7 @@ Semantics:
 
 The value produced is the floating-point difference of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7063,7 +7067,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fmul``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7071,7 +7075,7 @@ Semantics:
 
 The value produced is the floating-point product of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7197,7 +7201,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``fdiv``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7205,7 +7209,7 @@ Semantics:
 
 The value produced is the floating-point quotient of the two operands.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -7340,7 +7344,7 @@ Arguments:
 """"""""""
 
 The two arguments to the '``frem``' instruction must be
-:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of 
+:ref:`floating-point <t_floating>` or :ref:`vector <t_vector>` of
 floating-point values. Both arguments must have identical types.
 
 Semantics:
@@ -7348,10 +7352,10 @@ Semantics:
 
 The value produced is the floating-point remainder of the two operands.
 This is the same output as a libm '``fmod``' function, but without any
-possibility of setting ``errno``. The remainder has the same sign as the 
+possibility of setting ``errno``. The remainder has the same sign as the
 dividend.
 This instruction is assumed to execute in the default :ref:`floating-point
-environment <floatenv>`. 
+environment <floatenv>`.
 This instruction can also take any number of :ref:`fast-math
 flags <fastmath>`, which are optimization hints to enable otherwise
 unsafe floating-point optimizations:
@@ -8805,7 +8809,7 @@ Semantics:
 
 The '``fptrunc``' instruction casts a ``value`` from a larger
 :ref:`floating-point <t_floating>` type to a smaller :ref:`floating-point
-<t_floating>` type.  
+<t_floating>` type.
 This instruction is assumed to execute in the default :ref:`floating-point
 environment <floatenv>`.
 
@@ -10324,7 +10328,28 @@ Note that calling this intrinsic does not prevent function inlining or
 other aggressive transformations, so the value returned may not be that
 of the obvious source-language caller.
 
-This intrinsic is only implemented for x86.
+This intrinsic is only implemented for x86 and aarch64.
+
+'``llvm.sponentry``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i8* @llvm.sponentry()
+
+Overview:
+"""""""""
+
+The '``llvm.sponentry``' intrinsic returns the stack pointer value at
+the entry of the current function calling this intrinsic.
+
+Semantics:
+""""""""""
+
+Note this intrinsic is only verified on AArch64.
 
 '``llvm.frameaddress``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -11560,6 +11585,82 @@ NaN, the intrinsic lowering is responsible for quieting the inputs to
 correctly return the non-NaN input (e.g. by using the equivalent of
 ``llvm.canonicalize``).
 
+'``llvm.minimum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.minimum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.minimum.f32(float %Val0, float %Val1)
+      declare double    @llvm.minimum.f64(double %Val0, double %Val1)
+      declare x86_fp80  @llvm.minimum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+      declare fp128     @llvm.minimum.f128(fp128 %Val0, fp128 %Val1)
+      declare ppc_fp128 @llvm.minimum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
+
+Overview:
+"""""""""
+
+The '``llvm.minimum.*``' intrinsics return the minimum of the two
+arguments, propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""""""""""
+If either operand is a NaN, returns NaN. Otherwise returns the lesser
+of the two arguments. -0.0 is considered to be less than +0.0 for this
+intrinsic. Note that these are the semantics specified in the draft of
+IEEE 754-2018.
+
+'``llvm.maximum.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.maximum`` on any
+floating-point or vector of floating-point type. Not all targets support
+all types however.
+
+::
+
+      declare float     @llvm.maximum.f32(float %Val0, float %Val1)
+      declare double    @llvm.maximum.f64(double %Val0, double %Val1)
+      declare x86_fp80  @llvm.maximum.f80(x86_fp80 %Val0, x86_fp80 %Val1)
+      declare fp128     @llvm.maximum.f128(fp128 %Val0, fp128 %Val1)
+      declare ppc_fp128 @llvm.maximum.ppcf128(ppc_fp128 %Val0, ppc_fp128 %Val1)
+
+Overview:
+"""""""""
+
+The '``llvm.maximum.*``' intrinsics return the maximum of the two
+arguments, propagating NaNs and treating -0.0 as less than +0.0.
+
+
+Arguments:
+""""""""""
+
+The arguments and return value are floating-point numbers of the same
+type.
+
+Semantics:
+""""""""""
+If either operand is a NaN, returns NaN. Otherwise returns the greater
+of the two arguments. -0.0 is considered to be less than +0.0 for this
+intrinsic. Note that these are the semantics specified in the draft of
+IEEE 754-2018.
+
 '``llvm.copysign.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -12035,11 +12136,11 @@ Overview:
 
 The '``llvm.fshl``' family of intrinsic functions performs a funnel shift left:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted left, and the most 
-significant bits are extracted to produce a result that is the same size as the 
-original arguments. If the first 2 arguments are identical, this is equivalent 
-to a rotate left operation. For vector types, the operation occurs for each 
-element of the vector. The shift argument is treated as an unsigned amount 
+bits of the wide value), the combined value is shifted left, and the most
+significant bits are extracted to produce a result that is the same size as the
+original arguments. If the first 2 arguments are identical, this is equivalent
+to a rotate left operation. For vector types, the operation occurs for each
+element of the vector. The shift argument is treated as an unsigned amount
 modulo the element size of the arguments.
 
 Arguments:
@@ -12081,11 +12182,11 @@ Overview:
 
 The '``llvm.fshr``' family of intrinsic functions performs a funnel shift right:
 the first two values are concatenated as { %a : %b } (%a is the most significant
-bits of the wide value), the combined value is shifted right, and the least 
-significant bits are extracted to produce a result that is the same size as the 
-original arguments. If the first 2 arguments are identical, this is equivalent 
-to a rotate right operation. For vector types, the operation occurs for each 
-element of the vector. The shift argument is treated as an unsigned amount 
+bits of the wide value), the combined value is shifted right, and the least
+significant bits are extracted to produce a result that is the same size as the
+original arguments. If the first 2 arguments are identical, this is equivalent
+to a rotate right operation. For vector types, the operation occurs for each
+element of the vector. The shift argument is treated as an unsigned amount
 modulo the element size of the arguments.
 
 Arguments:
@@ -13366,7 +13467,7 @@ The '``llvm.masked.expandload``' intrinsic is designed for reading multiple scal
     %Tmp = call <8 x double> @llvm.masked.expandload.v8f64(double* %Bptr, <8 x i1> %Mask, <8 x double> undef)
     ; Store the result in A
     call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %Tmp, <8 x double>* %Aptr, i32 8, <8 x i1> %Mask)
-    
+
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -13423,7 +13524,7 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
     %Tmp = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %Aptr, i32 8, <8 x i1> %Mask, <8 x double> undef)
     ; Store all selected elements consecutively in array B
     call <void> @llvm.masked.compressstore.v8f64(<8 x double> %Tmp, double* %Bptr, <8 x i1> %Mask)
-    
+
     ; %Bptr should be increased on each iteration according to the number of '1' elements in the Mask.
     %MaskI = bitcast <8 x i1> %Mask to i8
     %MaskIPopcnt = call i8 @llvm.ctpop.i8(i8 %MaskI)
@@ -13915,7 +14016,7 @@ value operands and has the same type as the operands.  The remainder has the
 same sign as the dividend.
 
 '``llvm.experimental.constrained.fma``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
 """""""
@@ -14056,7 +14157,7 @@ Overview:
 
 The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand
 raised to the (positive or negative) power specified by the second operand. The
-order of evaluation of multiplications is not defined. When a vector of 
+order of evaluation of multiplications is not defined. When a vector of
 floating-point type is used, the second argument remains a scalar integer value.
 
 
@@ -14382,7 +14483,7 @@ Overview:
 """""""""
 
 The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first
-operand rounded to the nearest integer. It will not raise an inexact 
+operand rounded to the nearest integer. It will not raise an inexact
 floating-point exception if the operand is not an integer.
 
 
@@ -14405,6 +14506,225 @@ mode is determined by the runtime floating-point environment.  The rounding
 mode argument is only intended as information to the compiler.
 
 
+'``llvm.experimental.constrained.maxnum``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.maxnum(<type> <op1>, <type> <op2>
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.maxnum``' intrinsic returns the maximum 
+of the two arguments.
+
+Arguments:
+""""""""""
+
+The first two arguments and the return value are floating-point numbers 
+of the same type.
+
+The third and forth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function follows the IEEE-754 semantics for maxNum. The rounding mode is
+described, not determined, by the rounding mode argument. The actual rounding
+mode is determined by the runtime floating-point environment. The rounding
+mode argument is only intended as information to the compiler.
+
+
+'``llvm.experimental.constrained.minnum``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.minnum(<type> <op1>, <type> <op2>
+                                            metadata <rounding mode>,
+                                            metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.minnum``' intrinsic returns the minimum
+of the two arguments.
+
+Arguments:
+""""""""""
+
+The first two arguments and the return value are floating-point numbers
+of the same type.
+
+The third and forth arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function follows the IEEE-754 semantics for minNum. The rounding mode is
+described, not determined, by the rounding mode argument. The actual rounding
+mode is determined by the runtime floating-point environment. The rounding
+mode argument is only intended as information to the compiler.
+
+
+'``llvm.experimental.constrained.ceil``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.ceil(<type> <op1>,
+                                          metadata <rounding mode>,
+                                          metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.ceil``' intrinsic returns the ceiling of the 
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above. The rounding mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``ceil`` functions
+would and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.floor``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.floor(<type> <op1>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.floor``' intrinsic returns the floor of the 
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above. The rounding mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``floor`` functions
+would and handles error conditions in the same way. 
+
+
+'``llvm.experimental.constrained.round``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.round(<type> <op1>,
+                                           metadata <rounding mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.round``' intrinsic returns the first 
+operand rounded to the nearest integer.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above. The rounding mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``round`` functions
+would and handles error conditions in the same way.
+
+
+'``llvm.experimental.constrained.trunc``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.trunc(<type> <op1>,
+                                           metadata <truncing mode>,
+                                           metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.trunc``' intrinsic returns the first 
+operand rounded to the nearest integer not larger in magnitude than the 
+operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return value are floating-point numbers of the same
+type.
+
+The second and third arguments specify the truncing mode and exception
+behavior as described above. The truncing mode is currently unused for this
+intrinsic.
+
+Semantics:
+""""""""""
+
+This function returns the same values as the libm ``trunc`` functions
+would and handles error conditions in the same way.
+
+
 General Intrinsics
 ------------------
 
@@ -15097,6 +15417,51 @@ Semantics:
 This intrinsic actually does nothing, but optimizers must assume that it
 has externally observable side effects.
 
+'``llvm.is.constant.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use llvm.is.constant with any argument type.
+
+::
+
+      declare i1 @llvm.is.constant.i32(i32 %operand) nounwind readnone
+      declare i1 @llvm.is.constant.f32(float %operand) nounwind readnone
+      declare i1 @llvm.is.constant.TYPENAME(TYPE %operand) nounwind readnone
+
+Overview:
+"""""""""
+
+The '``llvm.is.constant``' intrinsic will return true if the argument
+is known to be a manifest compile-time constant. It is guaranteed to
+fold to either true or false before generating machine code.
+
+Semantics:
+""""""""""
+
+This intrinsic generates no code. If its argument is known to be a
+manifest compile-time constant value, then the intrinsic will be
+converted to a constant true value. Otherwise, it will be converted to
+a constant false value.
+
+In particular, note that if the argument is a constant expression
+which refers to a global (the address of which _is_ a constant, but
+not manifest during the compile), then the intrinsic evaluates to
+false.
+
+The result also intentionally depends on the result of optimization
+passes -- e.g., the result can change depending on whether a
+function gets inlined or not. A function's parameters are
+obviously not constant. However, a call like
+``llvm.is.constant.i32(i32 %param)`` *can* return true after the
+function is inlined, if the value passed to the function parameter was
+a constant.
+
+On the other hand, if constant folding is not run, it will never
+evaluate to true, even in simple cases.
+
 Stack Map Intrinsics
 --------------------
 
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 53cb3b5980a98a5373290366d8ff52727814f9d8..640e1611da6c442d395dc9de9304ab6c85c1460a 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -94,6 +94,12 @@ them to participate. Many people will see the email notification on cfe-commits
 or llvm-commits, and if the subject line suggests the patch is something they
 should look at, they will.
 
+
+.. _finding-potential-reviewers:
+
+Finding potential reviewers
+---------------------------
+
 Here are a couple of ways to pick the initial reviewer(s):
 
 * Use ``svn blame`` and the commit log to find names of people who have
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index 64b7de5be157573a1a03a4a4a01a03d7c52984b3..88c56700eb36967cb821b950c1cc7ba855ae03e1 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -3736,13 +3736,6 @@ Important Subclasses of the ``Instruction`` class
   `ICmpInst <LangRef.html#i_icmp>`_ (integer opreands), and
   `FCmpInst <LangRef.html#i_fcmp>`_ (floating point operands).
 
-.. _TerminatorInst:
-
-* ``TerminatorInst``
-
-  This subclass is the parent of all terminator instructions (those which can
-  terminate a block).
-
 .. _m_Instruction:
 
 Important Public Members of the ``Instruction`` class
@@ -4068,7 +4061,7 @@ This class represents a single entry single exit section of the code, commonly
 known as a basic block by the compiler community.  The ``BasicBlock`` class
 maintains a list of Instruction_\ s, which form the body of the block.  Matching
 the language definition, the last element of this list of instructions is always
-a terminator instruction (a subclass of the TerminatorInst_ class).
+a terminator instruction.
 
 In addition to tracking the list of instructions that make up the block, the
 ``BasicBlock`` class also keeps track of the :ref:`Function <c_Function>` that
@@ -4119,7 +4112,7 @@ Important Public Members of the ``BasicBlock`` class
   Returns a pointer to :ref:`Function <c_Function>` the block is embedded into,
   or a null pointer if it is homeless.
 
-* ``TerminatorInst *getTerminator()``
+* ``Instruction *getTerminator()``
 
   Returns a pointer to the terminator instruction that appears at the end of the
   ``BasicBlock``.  If there is no terminator instruction, or if the last
diff --git a/docs/Proposals/TestSuite.rst b/docs/Proposals/TestSuite.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8c7531783d44bfe44a8d619d888701a7536e8e65
--- /dev/null
+++ b/docs/Proposals/TestSuite.rst
@@ -0,0 +1,317 @@
+=====================
+Test-Suite Extentions
+=====================
+
+.. contents::
+   :depth: 1
+   :local:
+
+Abstract
+========
+
+These are ideas for additional programs, benchmarks, applications and
+algorithms that could be added to the LLVM Test-Suite.
+The test-suite could be much larger than it is now, which would help us
+detecting compiler errors (crashes, miscompiles) during development.
+
+Most probably, the reason why the programs below have not been added to
+the test-suite yet is that nobody has found time to do it. But there
+might be other issues as well, such as
+
+ * Licensing (Support can still be added as external module,
+              like for the SPEC benchmarks)
+
+ * Language (in particular, there is no official LLVM frontend
+             for FORTRAN yet)
+
+ * Parallelism (currently, all programs in test-suite use
+                one thread only)
+
+Benchmarks
+==========
+
+SPEC CPU 2017
+-------------
+https://www.spec.org/cpu2017/
+
+The following have not been included yet because they contain Fortran
+code.
+
+In case of cactuBSSN only a small portion is Fortran. The hosts's
+Fortran compiler could be used for these parts.
+
+Note that CMake's Ninja generator has difficulties with Fortran. See the
+`CMake documentation <https://cmake.org/cmake/help/v3.13/generator/Ninja.html#fortran-support>`_
+for details.
+
+ * 503.bwaves_r/603.bwaves_s
+ * 507.cactuBSSN_r
+ * 521.wrf_r/621.wrf_s
+ * 527.cam4_r/627.cam4_s
+ * 628.pop2_s
+ * 548.exchange2_r/648.exchange2_s
+ * 549.fotonik3d_r/649.fotonik3d_s
+ * 554.roms_r/654.roms_s
+
+SPEC OMP2012
+------------
+https://www.spec.org/omp2012/
+
+ * 350.md
+ * 351.bwaves
+ * 352.nab
+ * 357.bt331
+ * 358.botsalgn
+ * 359.botsspar
+ * 360.ilbdc
+ * 362.fma3d
+ * 363.swim
+ * 367.imagick
+ * 370.mgrid331
+ * 371.applu331
+ * 372.smithwa
+ * 376.kdtree
+
+OpenCV
+------
+https://opencv.org/
+
+OpenMP 4.x SIMD Benchmarks
+--------------------------
+https://github.com/flwende/simd_benchmarks
+
+PWM-benchmarking
+----------------
+https://github.com/tbepler/PWM-benchmarking
+
+SLAMBench
+---------
+https://github.com/pamela-project/slambench
+
+FireHose
+--------
+http://firehose.sandia.gov/
+
+A Benchmark for the C/C++ Standard Library
+------------------------------------------
+https://github.com/hiraditya/std-benchmark
+
+OpenBenchmarking.org CPU / Processor Suite
+------------------------------------------
+https://openbenchmarking.org/suite/pts/cpu
+
+This is a subset of the
+`Phoronix Test Suite <https://github.com/phoronix-test-suite/phoronix-test-suite/>`_
+and is itself a collection of benchmark suites
+
+Parboil Benchmarks
+------------------
+http://impact.crhc.illinois.edu/parboil/parboil.aspx
+
+MachSuite
+---------
+https://breagen.github.io/MachSuite/
+
+Rodinia
+-------
+http://lava.cs.virginia.edu/Rodinia/download_links.htm
+
+Rodinia has already been partially included in
+MultiSource/Benchmarks/Rodinia. Benchmarks still missing are:
+
+ * streamcluster
+ * particlefilter
+ * nw
+ * nn
+ * myocyte
+ * mummergpu
+ * lud
+ * leukocyte
+ * lavaMD
+ * kmeans
+ * hotspot3D
+ * heartwall
+ * cfd
+ * bfs
+ * b+tree
+
+vecmathlib tests harness
+------------------------
+https://bitbucket.org/eschnett/vecmathlib/wiki/Home
+
+PARSEC
+------
+http://parsec.cs.princeton.edu/
+
+Graph500 reference implementations
+----------------------------------
+https://github.com/graph500/graph500/tree/v2-spec
+
+NAS Parallel Benchmarks
+-----------------------
+https://www.nas.nasa.gov/publications/npb.html
+
+The official benchmark is written in Fortran, but an unofficial
+C-translation is available as well:
+https://github.com/benchmark-subsetting/NPB3.0-omp-C
+
+DARPA HPCS SSCA#2 C/OpenMP reference implementation
+---------------------------------------------------
+http://www.highproductivity.org/SSCABmks.htm
+
+This web site does not exist any more, but there seems to be a copy of
+some of the benchmarks
+https://github.com/gtcasl/hpc-benchmarks/tree/master/SSCA2v2.2
+
+Kokkos
+------
+https://github.com/kokkos/kokkos-kernels/tree/master/perf_test
+https://github.com/kokkos/kokkos/tree/master/benchmarks
+
+PolyMage
+--------
+https://github.com/bondhugula/polymage-benchmarks
+
+PolyBench
+---------
+https://sourceforge.net/projects/polybench/
+
+A modified version of Polybench 3.2 is already presented in
+SingleSource/Benchmarks/Polybench. A newer version 4.2.1 is available.
+
+High Performance Geometric Multigrid
+------------------------------------
+https://crd.lbl.gov/departments/computer-science/PAR/research/hpgmg/
+
+RAJA Performance Suite
+----------------------
+https://github.com/LLNL/RAJAPerf
+
+CORAL-2 Benchmarks
+------------------
+https://asc.llnl.gov/coral-2-benchmarks/
+
+Many of its programs have already been integreated in
+MultiSource/Benchmarks/DOE-ProxyApps-C and
+MultiSource/Benchmarks/DOE-ProxyApps-C++.
+
+ * Nekbone
+ * QMCPack
+ * LAMMPS
+ * Kripke
+ * Quicksilver
+ * PENNANT
+ * Big Data Analytic Suite
+ * Deep Learning Suite
+ * Stream
+ * Stride
+ * ML/DL micro-benchmark
+ * Pynamic
+ * ACME
+ * VPIC
+ * Laghos
+ * Parallel Integer Sort
+ * Havoq
+
+NWChem
+------
+http://www.nwchem-sw.org/index.php/Benchmarks
+
+TVM
+----
+https://github.com/dmlc/tvm/tree/master/apps/benchmark
+
+HydroBench
+----------
+https://github.com/HydroBench/Hydro
+
+ParRes
+------
+https://github.com/ParRes/Kernels/tree/master/Cxx11
+
+Applications/Libraries
+======================
+
+GnuPG
+-----
+https://gnupg.org/
+
+Blitz++
+-------
+https://sourceforge.net/projects/blitz/
+
+FFmpeg
+------
+https://ffmpeg.org/
+
+FreePOOMA
+---------
+http://www.nongnu.org/freepooma/
+
+FTensors
+--------
+http://www.wlandry.net/Projects/FTensor
+
+rawspeed
+--------
+https://github.com/darktable-org/rawspeed
+
+Its test dataset is 756 MB in size, which is too large to be included
+into the test-suite repository.
+
+Generic Algorithms
+==================
+
+Image processing
+----------------
+
+Resampling
+``````````
+
+ * Bilinear
+ * Bicubic
+ * Lanczos
+
+Dither
+``````
+
+ * Threshold
+ * Random
+ * Halftone
+ * Bayer
+ * Floyd-Steinberg
+ * Jarvis
+ * Stucki
+ * Burkes
+ * Sierra
+ * Atkinson
+ * Gradient-based
+
+Feature detection
+`````````````````
+
+ * Harris
+ * Histogram of Oriented Gradients
+
+Color conversion
+````````````````
+
+ * RGB to grayscale
+ * HSL to RGB
+
+Graph
+-----
+
+Search Algorithms
+`````````````````
+
+ * Breadth-First-Search
+ * Depth-First-Search
+ * Dijkstra's algorithm
+ * A-Star
+
+Spanning Tree
+`````````````
+
+ * Kruskal's algorithm
+ * Prim's algorithm
diff --git a/docs/index.rst b/docs/index.rst
index 7edfdd241918c501279bc923e4690e999e89181e..df70de095bd94761c7d8c01ce6d18dfe609765b4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -68,6 +68,7 @@ representation.
    CMakePrimer
    AdvancedBuilds
    HowToBuildOnARM
+   HowToBuildWithPGO
    HowToCrossCompileBuiltinsOnArm
    HowToCrossCompileLLVM
    CommandGuide/index
@@ -107,6 +108,9 @@ representation.
 :doc:`HowToBuildOnARM`
    Notes on building and testing LLVM/Clang on ARM.
 
+:doc:`HowToBuildWithPGO`
+    Notes on building LLVM/Clang with PGO.
+
 :doc:`HowToCrossCompileBuiltinsOnArm`
    Notes on cross-building and testing the compiler-rt builtins for Arm.
 
@@ -450,6 +454,7 @@ Information about LLVM's development process.
    Packaging
    ReleaseProcess
    Phabricator
+   BugLifeCycle
 
 :doc:`Contributing`
    An overview on how to contribute to LLVM.
@@ -480,6 +485,9 @@ Information about LLVM's development process.
    Describes how to use the Phabricator code review tool hosted on
    http://reviews.llvm.org/ and its command line interface, Arcanist.
 
+:doc:`BugLifeCycle`
+   Describes how bugs are reported, triaged and closed.
+
 Community
 =========
 
@@ -559,6 +567,7 @@ can be better.
 
    CodeOfConduct
    Proposals/GitHubMove
+   Proposals/TestSuite
    Proposals/VectorizationPlan
 
 :doc:`CodeOfConduct`
@@ -568,6 +577,9 @@ can be better.
 :doc:`Proposals/GitHubMove`
    Proposal to move from SVN/Git to GitHub.
 
+:doc:`Proposals/TestSuite`
+   Proposals for additional benchmarks/programs for llvm's test-suite.
+
 :doc:`Proposals/VectorizationPlan`
    Proposal to model the process and upgrade the infrastructure of LLVM's Loop Vectorizer.
 
diff --git a/docs/tutorial/BuildingAJIT1.rst b/docs/tutorial/BuildingAJIT1.rst
index 2b83df42fc247028cbe692746d46b94446f3f452..fcb755bd286f18bc8e5d3d30c2f683a21ad3f8ea 100644
--- a/docs/tutorial/BuildingAJIT1.rst
+++ b/docs/tutorial/BuildingAJIT1.rst
@@ -8,18 +8,19 @@ Building a JIT: Starting out with KaleidoscopeJIT
 Chapter 1 Introduction
 ======================
 
-**Warning: This text is currently out of date due to ORC API updates.**
+**Warning: This tutorial is currently being updated to account for ORC API
+changes. Only Chapters 1 and 2 are up-to-date.**
 
-**The example code has been updated and can be used. The text will be updated
-once the API churn dies down.**
+**Example code from Chapters 3 to 5 will compile and run, but has not been
+updated**
 
 Welcome to Chapter 1 of the "Building an ORC-based JIT in LLVM" tutorial. This
 tutorial runs through the implementation of a JIT compiler using LLVM's
 On-Request-Compilation (ORC) APIs. It begins with a simplified version of the
 KaleidoscopeJIT class used in the
 `Implementing a language with LLVM <LangImpl01.html>`_ tutorials and then
-introduces new features like optimization, lazy compilation and remote
-execution.
+introduces new features like concurrent compilation, optimization, lazy
+compilation and remote execution.
 
 The goal of this tutorial is to introduce you to LLVM's ORC JIT APIs, show how
 these APIs interact with other parts of LLVM, and to teach you how to recombine
@@ -45,11 +46,9 @@ The structure of the tutorial is:
 - `Chapter #5 <BuildingAJIT5.html>`_: Add process isolation by JITing code into
   a remote process with reduced privileges using the JIT Remote APIs.
 
-To provide input for our JIT we will use the Kaleidoscope REPL from
-`Chapter 7 <LangImpl07.html>`_ of the "Implementing a language in LLVM tutorial",
-with one minor modification: We will remove the FunctionPassManager from the
-code for that chapter and replace it with optimization support in our JIT class
-in Chapter #2.
+To provide input for our JIT we will use a lightly modified version of the
+Kaleidoscope REPL from `Chapter 7 <LangImpl07.html>`_ of the "Implementing a
+language in LLVM tutorial".
 
 Finally, a word on API generations: ORC is the 3rd generation of LLVM JIT API.
 It was preceded by MCJIT, and before that by the (now deleted) legacy JIT.
@@ -63,32 +62,29 @@ JIT API Basics
 
 The purpose of a JIT compiler is to compile code "on-the-fly" as it is needed,
 rather than compiling whole programs to disk ahead of time as a traditional
-compiler does. To support that aim our initial, bare-bones JIT API will be:
+compiler does. To support that aim our initial, bare-bones JIT API will have
+just two functions:
 
-1. Handle addModule(Module &M) -- Make the given IR module available for
-   execution.
-2. JITSymbol findSymbol(const std::string &Name) -- Search for pointers to
+1. ``Error addModule(std::unique_ptr<Module> M)``: Make the given IR module
+   available for execution.
+2. ``Expected<JITEvaluatedSymbol> lookup()``: Search for pointers to
    symbols (functions or variables) that have been added to the JIT.
-3. void removeModule(Handle H) -- Remove a module from the JIT, releasing any
-   memory that had been used for the compiled code.
 
 A basic use-case for this API, executing the 'main' function from a module,
 will look like:
 
 .. code-block:: c++
 
-  std::unique_ptr<Module> M = buildModule();
   JIT J;
-  Handle H = J.addModule(*M);
-  int (*Main)(int, char*[]) = (int(*)(int, char*[]))J.getSymbolAddress("main");
+  J.addModule(buildModule());
+  auto *Main = (int(*)(int, char*[]))J.lookup("main").getAddress();
   int Result = Main();
-  J.removeModule(H);
 
 The APIs that we build in these tutorials will all be variations on this simple
-theme. Behind the API we will refine the implementation of the JIT to add
-support for optimization and lazy compilation. Eventually we will extend the
-API itself to allow higher-level program representations (e.g. ASTs) to be
-added to the JIT.
+theme. Behind this API we will refine the implementation of the JIT to add
+support for concurrent compilation, optimization and lazy compilation.
+Eventually we will extend the API itself to allow higher-level program
+representations (e.g. ASTs) to be added to the JIT.
 
 KaleidoscopeJIT
 ===============
@@ -100,12 +96,10 @@ the REPL code from `Chapter 7 <LangImpl07.html>`_ of that tutorial to supply the
 input for our JIT: Each time the user enters an expression the REPL will add a
 new IR module containing the code for that expression to the JIT. If the
 expression is a top-level expression like '1+1' or 'sin(x)', the REPL will also
-use the findSymbol method of our JIT class find and execute the code for the
-expression, and then use the removeModule method to remove the code again
-(since there's no way to re-invoke an anonymous expression). In later chapters
-of this tutorial we'll modify the REPL to enable new interactions with our JIT
-class, but for now we will take this setup for granted and focus our attention on
-the implementation of our JIT itself.
+use the lookup method of our JIT class find and execute the code for the
+expression. In later chapters of this tutorial we will modify the REPL to enable
+new interactions with our JIT class, but for now we will take this setup for
+granted and focus our attention on the implementation of our JIT itself.
 
 Our KaleidoscopeJIT class is defined in the KaleidoscopeJIT.h header. After the
 usual include guards and #includes [2]_, we get to the definition of our class:
@@ -115,216 +109,155 @@ usual include guards and #includes [2]_, we get to the definition of our class:
   #ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
   #define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 
-  #include "llvm/ADT/STLExtras.h"
-  #include "llvm/ExecutionEngine/ExecutionEngine.h"
+  #include "llvm/ADT/StringRef.h"
   #include "llvm/ExecutionEngine/JITSymbol.h"
-  #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-  #include "llvm/ExecutionEngine/SectionMemoryManager.h"
   #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+  #include "llvm/ExecutionEngine/Orc/Core.h"
+  #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
   #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-  #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+  #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
   #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+  #include "llvm/ExecutionEngine/SectionMemoryManager.h"
   #include "llvm/IR/DataLayout.h"
-  #include "llvm/IR/Mangler.h"
-  #include "llvm/Support/DynamicLibrary.h"
-  #include "llvm/Support/raw_ostream.h"
-  #include "llvm/Target/TargetMachine.h"
-  #include <algorithm>
+  #include "llvm/IR/LLVMContext.h"
   #include <memory>
-  #include <string>
-  #include <vector>
 
   namespace llvm {
   namespace orc {
 
   class KaleidoscopeJIT {
   private:
-    std::unique_ptr<TargetMachine> TM;
-    const DataLayout DL;
+    ExecutionSession ES;
     RTDyldObjectLinkingLayer ObjectLayer;
-    IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+    IRCompileLayer CompileLayer;
+
+    DataLayout DL;
+    MangleAndInterner Mangle;
+    ThreadSafeContext Ctx;
 
   public:
-    using ModuleHandle = decltype(CompileLayer)::ModuleHandleT;
-
-Our class begins with four members: A TargetMachine, TM, which will be used to
-build our LLVM compiler instance; A DataLayout, DL, which will be used for
-symbol mangling (more on that later), and two ORC *layers*: an
-RTDyldObjectLinkingLayer and a CompileLayer. We'll be talking more about layers
-in the next chapter, but for now you can think of them as analogous to LLVM
-Passes: they wrap up useful JIT utilities behind an easy to compose interface.
-The first layer, ObjectLayer, is the foundation of our JIT: it takes in-memory
-object files produced by a compiler and links them on the fly to make them
-executable. This JIT-on-top-of-a-linker design was introduced in MCJIT, however
-the linker was hidden inside the MCJIT class. In ORC we expose the linker so
-that clients can access and configure it directly if they need to. In this
-tutorial our ObjectLayer will just be used to support the next layer in our
-stack: the CompileLayer, which will be responsible for taking LLVM IR, compiling
-it, and passing the resulting in-memory object files down to the object linking
-layer below.
-
-That's it for member variables, after that we have a single typedef:
-ModuleHandle. This is the handle type that will be returned from our JIT's
-addModule method, and can be passed to the removeModule method to remove a
-module. The IRCompileLayer class already provides a convenient handle type
-(IRCompileLayer::ModuleHandleT), so we just alias our ModuleHandle to this.
+    KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL)
+        : ObjectLayer(ES,
+                      []() { return llvm::make_unique<SectionMemoryManager>(); }),
+          CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))),
+          DL(std::move(DL)), Mangle(ES, this->DL),
+          Ctx(llvm::make_unique<LLVMContext>()) {
+      ES.getMainJITDylib().setGenerator(
+          cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
+    }
+
+Our class begins with six member variables: An ExecutionSession member, ``ES``,
+which provides context for our running JIT'd code (including the string pool,
+global mutex, and error reporting facilities); An RTDyldObjectLinkingLayer,
+``ObjectLayer``, that can be used to add object files to our JIT (though we will
+not use it directly); An IRCompileLayer, ``CompileLayer``, that can be used to
+add LLVM Modules to our JIT (and which builds on the ObjectLayer), A DataLayout
+and MangleAndInterner, ``DL`` and ``Mangle``, that will be used for symbol mangling
+(more on that later); and finally an LLVMContext that clients will use when
+building IR files for the JIT.
+
+Next up we have our class constructor, which takes a `JITTargetMachineBuilder``
+that will be used by our IRCompiler, and a ``DataLayout`` that we will use to
+initialize our DL member. The constructor begins by initializing our
+ObjectLayer.  The ObjectLayer requires a reference to the ExecutionSession, and
+a function object that will build a JIT memory manager for each module that is
+added (a JIT memory manager manages memory allocations, memory permissions, and
+registration of exception handlers for JIT'd code). For this we use a lambda
+that returns a SectionMemoryManager, an off-the-shelf utility that provides all
+the basic memory management functionality required for this chapter. Next we
+initialize our CompileLayer. The CompileLayer needs three things: (1) A
+reference to the ExecutionSession, (2) A reference to our object layer, and (3)
+a compiler instance to use to perform the actual compilation from IR to object
+files. We use the off-the-shelf ConcurrentIRCompiler utility as our compiler,
+which we construct using this constructor's JITTargetMachineBuilder argument.
+The ConcurrentIRCompiler utility will use the JITTargetMachineBuilder to build
+llvm TargetMachines (which are not thread safe) as needed for compiles. After
+this, we initialize our supporting members: ``DL``, ``Mangler`` and ``Ctx`` with
+the input DataLayout, the ExecutionSession and DL member, and a new default
+constucted LLVMContext respectively. Now that our members have been initialized,
+so the one thing that remains to do is to tweak the configuration of the
+*JITDylib* that we will store our code in. We want to modify this dylib to
+contain not only the symbols that we add to it, but also the symbols from our
+REPL process as well. We do this by attaching a
+``DynamicLibrarySearchGenerator`` instance using the
+``DynamicLibrarySearchGenerator::GetForCurrentProcess`` method.
+
 
 .. code-block:: c++
 
-  KaleidoscopeJIT()
-      : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-        ObjectLayer([]() { return std::make_shared<SectionMemoryManager>(); }),
-        CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
-    llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
-  }
+  static Expected<std::unique_ptr<KaleidoscopeJIT>> Create() {
+    auto JTMB = JITTargetMachineBuilder::detectHost();
 
-  TargetMachine &getTargetMachine() { return *TM; }
-
-Next up we have our class constructor. We begin by initializing TM using the
-EngineBuilder::selectTarget helper method which constructs a TargetMachine for
-the current process. Then we use our newly created TargetMachine to initialize
-DL, our DataLayout. After that we need to initialize our ObjectLayer. The
-ObjectLayer requires a function object that will build a JIT memory manager for
-each module that is added (a JIT memory manager manages memory allocations,
-memory permissions, and registration of exception handlers for JIT'd code). For
-this we use a lambda that returns a SectionMemoryManager, an off-the-shelf
-utility that provides all the basic memory management functionality required for
-this chapter. Next we initialize our CompileLayer. The CompileLayer needs two
-things: (1) A reference to our object layer, and (2) a compiler instance to use
-to perform the actual compilation from IR to object files. We use the
-off-the-shelf SimpleCompiler instance for now. Finally, in the body of the
-constructor, we call the DynamicLibrary::LoadLibraryPermanently method with a
-nullptr argument. Normally the LoadLibraryPermanently method is called with the
-path of a dynamic library to load, but when passed a null pointer it will 'load'
-the host process itself, making its exported symbols available for execution.
+    if (!JTMB)
+      return JTMB.takeError();
 
-.. code-block:: c++
+    auto DL = JTMB->getDefaultDataLayoutForTarget();
+    if (!DL)
+      return DL.takeError();
 
-  ModuleHandle addModule(std::unique_ptr<Module> M) {
-    // Build our symbol resolver:
-    // Lambda 1: Look back into the JIT itself to find symbols that are part of
-    //           the same "logical dylib".
-    // Lambda 2: Search for external symbols in the host process.
-    auto Resolver = createLambdaResolver(
-        [&](const std::string &Name) {
-          if (auto Sym = CompileLayer.findSymbol(Name, false))
-            return Sym;
-          return JITSymbol(nullptr);
-        },
-        [](const std::string &Name) {
-          if (auto SymAddr =
-                RTDyldMemoryManager::getSymbolAddressInProcess(Name))
-            return JITSymbol(SymAddr, JITSymbolFlags::Exported);
-          return JITSymbol(nullptr);
-        });
-
-    // Add the set to the JIT with the resolver we created above and a newly
-    // created SectionMemoryManager.
-    return cantFail(CompileLayer.addModule(std::move(M),
-                                           std::move(Resolver)));
+    return llvm::make_unique<KaleidoscopeJIT>(std::move(*JTMB), std::move(*DL));
   }
 
-Now we come to the first of our JIT API methods: addModule. This method is
-responsible for adding IR to the JIT and making it available for execution. In
-this initial implementation of our JIT we will make our modules "available for
-execution" by adding them straight to the CompileLayer, which will immediately
-compile them. In later chapters we will teach our JIT to defer compilation
-of individual functions until they're actually called.
-
-To add our module to the CompileLayer we need to supply both the module and a
-symbol resolver. The symbol resolver is responsible for supplying the JIT with
-an address for each *external symbol* in the module we are adding. External
-symbols are any symbol not defined within the module itself, including calls to
-functions outside the JIT and calls to functions defined in other modules that
-have already been added to the JIT. (It may seem as though modules added to the
-JIT should know about one another by default, but since we would still have to
-supply a symbol resolver for references to code outside the JIT it turns out to
-be easier to re-use this one mechanism for all symbol resolution.) This has the
-added benefit that the user has full control over the symbol resolution
-process. Should we search for definitions within the JIT first, then fall back
-on external definitions? Or should we prefer external definitions where
-available and only JIT code if we don't already have an available
-implementation? By using a single symbol resolution scheme we are free to choose
-whatever makes the most sense for any given use case.
-
-Building a symbol resolver is made especially easy by the *createLambdaResolver*
-function. This function takes two lambdas [3]_ and returns a JITSymbolResolver
-instance. The first lambda is used as the implementation of the resolver's
-findSymbolInLogicalDylib method, which searches for symbol definitions that
-should be thought of as being part of the same "logical" dynamic library as this
-Module. If you are familiar with static linking: this means that
-findSymbolInLogicalDylib should expose symbols with common linkage and hidden
-visibility. If all this sounds foreign you can ignore the details and just
-remember that this is the first method that the linker will use to try to find a
-symbol definition. If the findSymbolInLogicalDylib method returns a null result
-then the linker will call the second symbol resolver method, called findSymbol,
-which searches for symbols that should be thought of as external to (but
-visibile from) the module and its logical dylib. In this tutorial we will adopt
-the following simple scheme: All modules added to the JIT will behave as if they
-were linked into a single, ever-growing logical dylib. To implement this our
-first lambda (the one defining findSymbolInLogicalDylib) will just search for
-JIT'd code by calling the CompileLayer's findSymbol method. If we don't find a
-symbol in the JIT itself we'll fall back to our second lambda, which implements
-findSymbol. This will use the RTDyldMemoryManager::getSymbolAddressInProcess
-method to search for the symbol within the program itself. If we can't find a
-symbol definition via either of these paths, the JIT will refuse to accept our
-module, returning a "symbol not found" error.
-
-Now that we've built our symbol resolver, we're ready to add our module to the
-JIT. We do this by calling the CompileLayer's addModule method. The addModule
-method returns an ``Expected<CompileLayer::ModuleHandle>``, since in more
-advanced JIT configurations it could fail. In our basic configuration we know
-that it will always succeed so we use the cantFail utility to assert that no
-error occurred, and extract the handle value. Since we have already typedef'd
-our ModuleHandle type to be the same as the CompileLayer's handle type, we can
-return the unwrapped handle directly.
+  const DataLayout &getDataLayout() const { return DL; }
 
-.. code-block:: c++
+  LLVMContext &getContext() { return *Ctx.getContext(); }
 
-  JITSymbol findSymbol(const std::string Name) {
-    std::string MangledName;
-    raw_string_ostream MangledNameStream(MangledName);
-    Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
-    return CompileLayer.findSymbol(MangledNameStream.str(), true);
-  }
+Next we have a named constructor, ``Create``, which will build a KaleidoscopeJIT
+instance that is configured to generate code for our host process. It does this
+by first generating a JITTargetMachineBuilder instance using that clases's
+detectHost method and then using that instance to generate a datalayout for
+the target process. Each of these operations can fail, so each returns its
+result wrapped in an Expected value [3]_ that we must check for error before
+continuing. If both operations succeed we can unwrap their results (using the
+dereference operator) and pass them into KaleidoscopeJIT's constructor on the
+last line of the function.
+
+Following the named constructor we have the ``getDataLayout()`` and
+``getContext()`` methods. These are used to make data structures created and
+managed by the JIT (especially the LLVMContext) available to the REPL code that
+will build our IR modules.
 
-  JITTargetAddress getSymbolAddress(const std::string Name) {
-    return cantFail(findSymbol(Name).getAddress());
+.. code-block:: c++
+
+  void addModule(std::unique_ptr<Module> M) {
+    cantFail(CompileLayer.add(ES.getMainJITDylib(),
+                              ThreadSafeModule(std::move(M), Ctx)));
   }
 
-  void removeModule(ModuleHandle H) {
-    cantFail(CompileLayer.removeModule(H));
+  Expected<JITEvaluatedSymbol> lookup(StringRef Name) {
+    return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str()));
   }
 
-Now that we can add code to our JIT, we need a way to find the symbols we've
-added to it. To do that we call the findSymbol method on our CompileLayer, but
-with a twist: We have to *mangle* the name of the symbol we're searching for
-first. The ORC JIT components use mangled symbols internally the same way a
-static compiler and linker would, rather than using plain IR symbol names. This
-allows JIT'd code to interoperate easily with precompiled code in the
-application or shared libraries. The kind of mangling will depend on the
-DataLayout, which in turn depends on the target platform. To allow us to remain
-portable and search based on the un-mangled name, we just re-produce this
-mangling ourselves.
-
-Next we have a convenience function, getSymbolAddress, which returns the address
-of a given symbol. Like CompileLayer's addModule function, JITSymbol's getAddress
-function is allowed to fail [4]_, however we know that it will not in our simple
-example, so we wrap it in a call to cantFail.
-
-We now come to the last method in our JIT API: removeModule. This method is
-responsible for destructing the MemoryManager and SymbolResolver that were
-added with a given module, freeing any resources they were using in the
-process. In our Kaleidoscope demo we rely on this method to remove the module
-representing the most recent top-level expression, preventing it from being
-treated as a duplicate definition when the next top-level expression is
-entered. It is generally good to free any module that you know you won't need
-to call further, just to free up the resources dedicated to it. However, you
-don't strictly need to do this: All resources will be cleaned up when your
-JIT class is destructed, if they haven't been freed before then. Like
-``CompileLayer::addModule`` and ``JITSymbol::getAddress``, removeModule may
-fail in general but will never fail in our example, so we wrap it in a call to
-cantFail.
+Now we come to the first of our JIT API methods: addModule. This method is
+responsible for adding IR to the JIT and making it available for execution. In
+this initial implementation of our JIT we will make our modules "available for
+execution" by adding them to the CompileLayer, which will it turn store the
+Module in the main JITDylib. This process will create new symbol table entries
+in the JITDylib for each definition in the module, and will defer compilation of
+the module until any of its definitions is looked up. Note that this is not lazy
+compilation: just referencing a definition, even if it is never used, will be
+enough to trigger compilation. In later chapters we will teach our JIT to defer
+compilation of functions until they're actually called.  To add our Module we
+must first wrap it in a ThreadSafeModule instance, which manages the lifetime of
+the Module's LLVMContext (our Ctx member) in a thread-friendly way. In our
+example, all modules will share the Ctx member, which will exist for the
+duration of the JIT. Once we switch to concurrent compilation in later chapters
+we will use a new context per module.
+
+Our last method is ``lookup``, which allows us to look up addresses for
+function and variable definitions added to the JIT based on their symbol names.
+As noted above, lookup will implicitly trigger compilation for any symbol
+that has not already been compiled. Our lookup method calls through to
+`ExecutionSession::lookup`, passing in a list of dylibs to search (in our case
+just the main dylib), and the symbol name to search for, with a twist: We have
+to *mangle* the name of the symbol we're searching for first. The ORC JIT
+components use mangled symbols internally the same way a static compiler and
+linker would, rather than using plain IR symbol names. This allows JIT'd code
+to interoperate easily with precompiled code in the application or shared
+libraries. The kind of mangling will depend on the DataLayout, which in turn
+depends on the target platform. To allow us to remain portable and search based
+on the un-mangled name, we just re-produce this mangling ourselves using our
+``Mangle`` member function object.
 
 This brings us to the end of Chapter 1 of Building a JIT. You now have a basic
 but fully functioning JIT stack that you can use to take LLVM IR and make it
@@ -362,42 +295,29 @@ Here is the code:
 .. [2] +-----------------------------+-----------------------------------------------+
        |         File                |               Reason for inclusion            |
        +=============================+===============================================+
-       |      STLExtras.h            | LLVM utilities that are useful when working   |
-       |                             | with the STL.                                 |
+       |        JITSymbol.h          | Defines the lookup result type                |
+       |                             | JITEvaluatedSymbol                            |
        +-----------------------------+-----------------------------------------------+
-       |   ExecutionEngine.h         | Access to the EngineBuilder::selectTarget     |
-       |                             | method.                                       |
+       |       CompileUtils.h        | Provides the SimpleCompiler class.            |
        +-----------------------------+-----------------------------------------------+
-       |                             | Access to the                                 |
-       | RTDyldMemoryManager.h       | RTDyldMemoryManager::getSymbolAddressInProcess|
-       |                             | method.                                       |
+       |           Core.h            | Core utilities such as ExecutionSession and   |
+       |                             | JITDylib.                                     |
        +-----------------------------+-----------------------------------------------+
-       |    CompileUtils.h           | Provides the SimpleCompiler class.            |
+       |      ExecutionUtils.h       | Provides the DynamicLibrarySearchGenerator    |
+       |                             | class.                                        |
        +-----------------------------+-----------------------------------------------+
-       |   IRCompileLayer.h          | Provides the IRCompileLayer class.            |
+       |      IRCompileLayer.h       | Provides the IRCompileLayer class.            |
        +-----------------------------+-----------------------------------------------+
-       |                             | Access the createLambdaResolver function,     |
-       |   LambdaResolver.h          | which provides easy construction of symbol    |
-       |                             | resolvers.                                    |
+       |  JITTargetMachineBuilder.h  | Provides the JITTargetMachineBuilder class.   |
        +-----------------------------+-----------------------------------------------+
-       |  RTDyldObjectLinkingLayer.h | Provides the RTDyldObjectLinkingLayer class.  |
+       | RTDyldObjectLinkingLayer.h  | Provides the RTDyldObjectLinkingLayer class.  |
        +-----------------------------+-----------------------------------------------+
-       |       Mangler.h             | Provides the Mangler class for platform       |
-       |                             | specific name-mangling.                       |
+       |   SectionMemoryManager.h    | Provides the SectionMemoryManager class.      |
        +-----------------------------+-----------------------------------------------+
-       |   DynamicLibrary.h          | Provides the DynamicLibrary class, which      |
-       |                             | makes symbols in the host process searchable. |
+       |        DataLayout.h         | Provides the DataLayout class.                |
        +-----------------------------+-----------------------------------------------+
-       |                             | A fast output stream class. We use the        |
-       |     raw_ostream.h           | raw_string_ostream subclass for symbol        |
-       |                             | mangling                                      |
+       |        LLVMContext.h        | Provides the LLVMContext class.               |
        +-----------------------------+-----------------------------------------------+
-       |   TargetMachine.h           | LLVM target machine description class.        |
-       +-----------------------------+-----------------------------------------------+
-
-.. [3] Actually they don't have to be lambdas, any object with a call operator
-       will do, including plain old functions or std::functions.
 
-.. [4] ``JITSymbol::getAddress`` will force the JIT to compile the definition of
-       the symbol if it hasn't already been compiled, and since the compilation
-       process could fail getAddress must be able to return this failure.
+.. [3] See the ErrorHandling section in the LLVM Programmer's Manual
+       (http://llvm.org/docs/ProgrammersManual.html#error-handling)
\ No newline at end of file
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
index 0b8bb381d08a76c5167de8a2cb174a56ad32f670..1df5aff086935677b114ae4d2f68198fc325deed 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
@@ -14,24 +14,18 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 #define LLVM_EXECUTIONENGINE_ORC_KALEIDOSCOPEJIT_H
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
+#include "llvm/IR/LLVMContext.h"
 #include <memory>
-#include <string>
-#include <vector>
 
 namespace llvm {
 namespace orc {
@@ -39,59 +33,48 @@ namespace orc {
 class KaleidoscopeJIT {
 private:
   ExecutionSession ES;
-  std::shared_ptr<SymbolResolver> Resolver;
-  std::unique_ptr<TargetMachine> TM;
-  const DataLayout DL;
   RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  IRCompileLayer CompileLayer;
+
+  DataLayout DL;
+  MangleAndInterner Mangle;
+  ThreadSafeContext Ctx;
 
 public:
-  KaleidoscopeJIT()
-      : Resolver(createLegacyLookupResolver(
-            ES,
-            [this](const std::string &Name) -> JITSymbol {
-              if (auto Sym = CompileLayer.findSymbol(Name, false))
-                return Sym;
-              else if (auto Err = Sym.takeError())
-                return std::move(Err);
-              if (auto SymAddr =
-                      RTDyldMemoryManager::getSymbolAddressInProcess(Name))
-                return JITSymbol(SymAddr, JITSymbolFlags::Exported);
-              return nullptr;
-            },
-            [](Error Err) { cantFail(std::move(Err), "lookupFlags failed"); })),
-        TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-        ObjectLayer(ES,
-                    [this](VModuleKey) {
-                      return RTDyldObjectLinkingLayer::Resources{
-                          std::make_shared<SectionMemoryManager>(), Resolver};
-                    }),
-        CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
-    llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
+  KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL)
+      : ObjectLayer(ES,
+                    []() { return llvm::make_unique<SectionMemoryManager>(); }),
+        CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))),
+        DL(std::move(DL)), Mangle(ES, this->DL),
+        Ctx(llvm::make_unique<LLVMContext>()) {
+    ES.getMainJITDylib().setGenerator(
+        cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
   }
 
-  TargetMachine &getTargetMachine() { return *TM; }
+  static Expected<std::unique_ptr<KaleidoscopeJIT>> Create() {
+    auto JTMB = JITTargetMachineBuilder::detectHost();
 
-  VModuleKey addModule(std::unique_ptr<Module> M) {
-    // Add the module to the JIT with a new VModuleKey.
-    auto K = ES.allocateVModule();
-    cantFail(CompileLayer.addModule(K, std::move(M)));
-    return K;
-  }
+    if (!JTMB)
+      return JTMB.takeError();
 
-  JITSymbol findSymbol(const std::string Name) {
-    std::string MangledName;
-    raw_string_ostream MangledNameStream(MangledName);
-    Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
-    return CompileLayer.findSymbol(MangledNameStream.str(), true);
+    auto DL = JTMB->getDefaultDataLayoutForTarget();
+    if (!DL)
+      return DL.takeError();
+
+    return llvm::make_unique<KaleidoscopeJIT>(std::move(*JTMB), std::move(*DL));
   }
 
-  JITTargetAddress getSymbolAddress(const std::string Name) {
-    return cantFail(findSymbol(Name).getAddress());
+  const DataLayout &getDataLayout() const { return DL; }
+
+  LLVMContext &getContext() { return *Ctx.getContext(); }
+
+  Error addModule(std::unique_ptr<Module> M) {
+    return CompileLayer.add(ES.getMainJITDylib(),
+                            ThreadSafeModule(std::move(M), Ctx));
   }
 
-  void removeModule(VModuleKey K) {
-    cantFail(CompileLayer.removeModule(K));
+  Expected<JITEvaluatedSymbol> lookup(StringRef Name) {
+    return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str()));
   }
 };
 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
index 7652e80c69a1c57c14f15c34cf727dee3fec4f90..5a66b367c27368a8d383cb9ad8685e4b4cdccee8 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp
@@ -676,10 +676,11 @@ static std::unique_ptr<FunctionAST> ParseDefinition() {
 }
 
 /// toplevelexpr ::= expression
-static std::unique_ptr<FunctionAST> ParseTopLevelExpr() {
+static std::unique_ptr<FunctionAST> ParseTopLevelExpr(unsigned ExprCount) {
   if (auto E = ParseExpression()) {
     // Make an anonymous proto.
-    auto Proto = llvm::make_unique<PrototypeAST>("__anon_expr",
+    auto Proto = llvm::make_unique<PrototypeAST>(("__anon_expr" +
+                                                  Twine(ExprCount)).str(),
                                                  std::vector<std::string>());
     return llvm::make_unique<FunctionAST>(std::move(Proto), std::move(E));
   }
@@ -696,12 +697,13 @@ static std::unique_ptr<PrototypeAST> ParseExtern() {
 // Code Generation
 //===----------------------------------------------------------------------===//
 
-static LLVMContext TheContext;
-static IRBuilder<> Builder(TheContext);
+static std::unique_ptr<KaleidoscopeJIT> TheJIT;
+static LLVMContext *TheContext;
+static std::unique_ptr<IRBuilder<>> Builder;
 static std::unique_ptr<Module> TheModule;
 static std::map<std::string, AllocaInst *> NamedValues;
-static std::unique_ptr<KaleidoscopeJIT> TheJIT;
 static std::map<std::string, std::unique_ptr<PrototypeAST>> FunctionProtos;
+static ExitOnError ExitOnErr;
 
 Value *LogErrorV(const char *Str) {
   LogError(Str);
@@ -729,11 +731,11 @@ static AllocaInst *CreateEntryBlockAlloca(Function *TheFunction,
                                           const std::string &VarName) {
   IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
                    TheFunction->getEntryBlock().begin());
-  return TmpB.CreateAlloca(Type::getDoubleTy(TheContext), nullptr, VarName);
+  return TmpB.CreateAlloca(Type::getDoubleTy(*TheContext), nullptr, VarName);
 }
 
 Value *NumberExprAST::codegen() {
-  return ConstantFP::get(TheContext, APFloat(Val));
+  return ConstantFP::get(*TheContext, APFloat(Val));
 }
 
 Value *VariableExprAST::codegen() {
@@ -743,7 +745,7 @@ Value *VariableExprAST::codegen() {
     return LogErrorV("Unknown variable name");
 
   // Load the value.
-  return Builder.CreateLoad(V, Name.c_str());
+  return Builder->CreateLoad(V, Name.c_str());
 }
 
 Value *UnaryExprAST::codegen() {
@@ -755,7 +757,7 @@ Value *UnaryExprAST::codegen() {
   if (!F)
     return LogErrorV("Unknown unary operator");
 
-  return Builder.CreateCall(F, OperandV, "unop");
+  return Builder->CreateCall(F, OperandV, "unop");
 }
 
 Value *BinaryExprAST::codegen() {
@@ -778,7 +780,7 @@ Value *BinaryExprAST::codegen() {
     if (!Variable)
       return LogErrorV("Unknown variable name");
 
-    Builder.CreateStore(Val, Variable);
+    Builder->CreateStore(Val, Variable);
     return Val;
   }
 
@@ -789,15 +791,15 @@ Value *BinaryExprAST::codegen() {
 
   switch (Op) {
   case '+':
-    return Builder.CreateFAdd(L, R, "addtmp");
+    return Builder->CreateFAdd(L, R, "addtmp");
   case '-':
-    return Builder.CreateFSub(L, R, "subtmp");
+    return Builder->CreateFSub(L, R, "subtmp");
   case '*':
-    return Builder.CreateFMul(L, R, "multmp");
+    return Builder->CreateFMul(L, R, "multmp");
   case '<':
-    L = Builder.CreateFCmpULT(L, R, "cmptmp");
+    L = Builder->CreateFCmpULT(L, R, "cmptmp");
     // Convert bool 0/1 to double 0.0 or 1.0
-    return Builder.CreateUIToFP(L, Type::getDoubleTy(TheContext), "booltmp");
+    return Builder->CreateUIToFP(L, Type::getDoubleTy(*TheContext), "booltmp");
   default:
     break;
   }
@@ -808,7 +810,7 @@ Value *BinaryExprAST::codegen() {
   assert(F && "binary operator not found!");
 
   Value *Ops[] = {L, R};
-  return Builder.CreateCall(F, Ops, "binop");
+  return Builder->CreateCall(F, Ops, "binop");
 }
 
 Value *CallExprAST::codegen() {
@@ -828,7 +830,7 @@ Value *CallExprAST::codegen() {
       return nullptr;
   }
 
-  return Builder.CreateCall(CalleeF, ArgsV, "calltmp");
+  return Builder->CreateCall(CalleeF, ArgsV, "calltmp");
 }
 
 Value *IfExprAST::codegen() {
@@ -837,46 +839,46 @@ Value *IfExprAST::codegen() {
     return nullptr;
 
   // Convert condition to a bool by comparing equal to 0.0.
-  CondV = Builder.CreateFCmpONE(
-      CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
+  CondV = Builder->CreateFCmpONE(
+      CondV, ConstantFP::get(*TheContext, APFloat(0.0)), "ifcond");
 
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Create blocks for the then and else cases.  Insert the 'then' block at the
   // end of the function.
-  BasicBlock *ThenBB = BasicBlock::Create(TheContext, "then", TheFunction);
-  BasicBlock *ElseBB = BasicBlock::Create(TheContext, "else");
-  BasicBlock *MergeBB = BasicBlock::Create(TheContext, "ifcont");
+  BasicBlock *ThenBB = BasicBlock::Create(*TheContext, "then", TheFunction);
+  BasicBlock *ElseBB = BasicBlock::Create(*TheContext, "else");
+  BasicBlock *MergeBB = BasicBlock::Create(*TheContext, "ifcont");
 
-  Builder.CreateCondBr(CondV, ThenBB, ElseBB);
+  Builder->CreateCondBr(CondV, ThenBB, ElseBB);
 
   // Emit then value.
-  Builder.SetInsertPoint(ThenBB);
+  Builder->SetInsertPoint(ThenBB);
 
   Value *ThenV = Then->codegen();
   if (!ThenV)
     return nullptr;
 
-  Builder.CreateBr(MergeBB);
+  Builder->CreateBr(MergeBB);
   // Codegen of 'Then' can change the current block, update ThenBB for the PHI.
-  ThenBB = Builder.GetInsertBlock();
+  ThenBB = Builder->GetInsertBlock();
 
   // Emit else block.
   TheFunction->getBasicBlockList().push_back(ElseBB);
-  Builder.SetInsertPoint(ElseBB);
+  Builder->SetInsertPoint(ElseBB);
 
   Value *ElseV = Else->codegen();
   if (!ElseV)
     return nullptr;
 
-  Builder.CreateBr(MergeBB);
+  Builder->CreateBr(MergeBB);
   // Codegen of 'Else' can change the current block, update ElseBB for the PHI.
-  ElseBB = Builder.GetInsertBlock();
+  ElseBB = Builder->GetInsertBlock();
 
   // Emit merge block.
   TheFunction->getBasicBlockList().push_back(MergeBB);
-  Builder.SetInsertPoint(MergeBB);
-  PHINode *PN = Builder.CreatePHI(Type::getDoubleTy(TheContext), 2, "iftmp");
+  Builder->SetInsertPoint(MergeBB);
+  PHINode *PN = Builder->CreatePHI(Type::getDoubleTy(*TheContext), 2, "iftmp");
 
   PN->addIncoming(ThenV, ThenBB);
   PN->addIncoming(ElseV, ElseBB);
@@ -903,7 +905,7 @@ Value *IfExprAST::codegen() {
 //   br endcond, loop, endloop
 // outloop:
 Value *ForExprAST::codegen() {
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Create an alloca for the variable in the entry block.
   AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
@@ -914,17 +916,17 @@ Value *ForExprAST::codegen() {
     return nullptr;
 
   // Store the value into the alloca.
-  Builder.CreateStore(StartVal, Alloca);
+  Builder->CreateStore(StartVal, Alloca);
 
   // Make the new basic block for the loop header, inserting after current
   // block.
-  BasicBlock *LoopBB = BasicBlock::Create(TheContext, "loop", TheFunction);
+  BasicBlock *LoopBB = BasicBlock::Create(*TheContext, "loop", TheFunction);
 
   // Insert an explicit fall through from the current block to the LoopBB.
-  Builder.CreateBr(LoopBB);
+  Builder->CreateBr(LoopBB);
 
   // Start insertion in LoopBB.
-  Builder.SetInsertPoint(LoopBB);
+  Builder->SetInsertPoint(LoopBB);
 
   // Within the loop, the variable is defined equal to the PHI node.  If it
   // shadows an existing variable, we have to restore it, so save it now.
@@ -945,7 +947,7 @@ Value *ForExprAST::codegen() {
       return nullptr;
   } else {
     // If not specified, use 1.0.
-    StepVal = ConstantFP::get(TheContext, APFloat(1.0));
+    StepVal = ConstantFP::get(*TheContext, APFloat(1.0));
   }
 
   // Compute the end condition.
@@ -955,23 +957,23 @@ Value *ForExprAST::codegen() {
 
   // Reload, increment, and restore the alloca.  This handles the case where
   // the body of the loop mutates the variable.
-  Value *CurVar = Builder.CreateLoad(Alloca, VarName.c_str());
-  Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
-  Builder.CreateStore(NextVar, Alloca);
+  Value *CurVar = Builder->CreateLoad(Alloca, VarName.c_str());
+  Value *NextVar = Builder->CreateFAdd(CurVar, StepVal, "nextvar");
+  Builder->CreateStore(NextVar, Alloca);
 
   // Convert condition to a bool by comparing equal to 0.0.
-  EndCond = Builder.CreateFCmpONE(
-      EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
+  EndCond = Builder->CreateFCmpONE(
+      EndCond, ConstantFP::get(*TheContext, APFloat(0.0)), "loopcond");
 
   // Create the "after loop" block and insert it.
   BasicBlock *AfterBB =
-      BasicBlock::Create(TheContext, "afterloop", TheFunction);
+      BasicBlock::Create(*TheContext, "afterloop", TheFunction);
 
   // Insert the conditional branch into the end of LoopEndBB.
-  Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
+  Builder->CreateCondBr(EndCond, LoopBB, AfterBB);
 
   // Any new code will be inserted in AfterBB.
-  Builder.SetInsertPoint(AfterBB);
+  Builder->SetInsertPoint(AfterBB);
 
   // Restore the unshadowed variable.
   if (OldVal)
@@ -980,13 +982,13 @@ Value *ForExprAST::codegen() {
     NamedValues.erase(VarName);
 
   // for expr always returns 0.0.
-  return Constant::getNullValue(Type::getDoubleTy(TheContext));
+  return Constant::getNullValue(Type::getDoubleTy(*TheContext));
 }
 
 Value *VarExprAST::codegen() {
   std::vector<AllocaInst *> OldBindings;
 
-  Function *TheFunction = Builder.GetInsertBlock()->getParent();
+  Function *TheFunction = Builder->GetInsertBlock()->getParent();
 
   // Register all variables and emit their initializer.
   for (unsigned i = 0, e = VarNames.size(); i != e; ++i) {
@@ -1004,11 +1006,11 @@ Value *VarExprAST::codegen() {
       if (!InitVal)
         return nullptr;
     } else { // If not specified, use 0.0.
-      InitVal = ConstantFP::get(TheContext, APFloat(0.0));
+      InitVal = ConstantFP::get(*TheContext, APFloat(0.0));
     }
 
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
-    Builder.CreateStore(InitVal, Alloca);
+    Builder->CreateStore(InitVal, Alloca);
 
     // Remember the old variable binding so that we can restore the binding when
     // we unrecurse.
@@ -1033,9 +1035,9 @@ Value *VarExprAST::codegen() {
 
 Function *PrototypeAST::codegen() {
   // Make the function type:  double(double,double) etc.
-  std::vector<Type *> Doubles(Args.size(), Type::getDoubleTy(TheContext));
+  std::vector<Type *> Doubles(Args.size(), Type::getDoubleTy(*TheContext));
   FunctionType *FT =
-      FunctionType::get(Type::getDoubleTy(TheContext), Doubles, false);
+      FunctionType::get(Type::getDoubleTy(*TheContext), Doubles, false);
 
   Function *F =
       Function::Create(FT, Function::ExternalLinkage, Name, TheModule.get());
@@ -1062,8 +1064,8 @@ Function *FunctionAST::codegen() {
     BinopPrecedence[P.getOperatorName()] = P.getBinaryPrecedence();
 
   // Create a new basic block to start insertion into.
-  BasicBlock *BB = BasicBlock::Create(TheContext, "entry", TheFunction);
-  Builder.SetInsertPoint(BB);
+  BasicBlock *BB = BasicBlock::Create(*TheContext, "entry", TheFunction);
+  Builder->SetInsertPoint(BB);
 
   // Record the function arguments in the NamedValues map.
   NamedValues.clear();
@@ -1072,7 +1074,7 @@ Function *FunctionAST::codegen() {
     AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName());
 
     // Store the initial value into the alloca.
-    Builder.CreateStore(&Arg, Alloca);
+    Builder->CreateStore(&Arg, Alloca);
 
     // Add arguments to variable symbol table.
     NamedValues[Arg.getName()] = Alloca;
@@ -1080,7 +1082,7 @@ Function *FunctionAST::codegen() {
 
   if (Value *RetVal = Body->codegen()) {
     // Finish off the function.
-    Builder.CreateRet(RetVal);
+    Builder->CreateRet(RetVal);
 
     // Validate the generated code, checking for consistency.
     verifyFunction(*TheFunction);
@@ -1102,8 +1104,11 @@ Function *FunctionAST::codegen() {
 
 static void InitializeModule() {
   // Open a new module.
-  TheModule = llvm::make_unique<Module>("my cool jit", TheContext);
-  TheModule->setDataLayout(TheJIT->getTargetMachine().createDataLayout());
+  TheModule = llvm::make_unique<Module>("my cool jit", *TheContext);
+  TheModule->setDataLayout(TheJIT->getDataLayout());
+
+  // Create a new builder for the module.
+  Builder = llvm::make_unique<IRBuilder<>>(*TheContext);
 }
 
 static void HandleDefinition() {
@@ -1112,7 +1117,7 @@ static void HandleDefinition() {
       fprintf(stderr, "Read function definition:");
       FnIR->print(errs());
       fprintf(stderr, "\n");
-      TheJIT->addModule(std::move(TheModule));
+      ExitOnErr(TheJIT->addModule(std::move(TheModule)));
       InitializeModule();
     }
   } else {
@@ -1136,23 +1141,27 @@ static void HandleExtern() {
 }
 
 static void HandleTopLevelExpression() {
+  static unsigned ExprCount = 0;
+
+  // Update ExprCount. This number will be added to anonymous expressions to
+  // prevent them from clashing.
+  ++ExprCount;
+
   // Evaluate a top-level expression into an anonymous function.
-  if (auto FnAST = ParseTopLevelExpr()) {
+  if (auto FnAST = ParseTopLevelExpr(ExprCount)) {
     if (FnAST->codegen()) {
       // JIT the module containing the anonymous expression, keeping a handle so
       // we can free it later.
-      auto H = TheJIT->addModule(std::move(TheModule));
+      ExitOnErr(TheJIT->addModule(std::move(TheModule)));
       InitializeModule();
 
-      // Get the anonymous expression's address and cast it to the right type,
-      // double(*)(), so we can call it as a native function.
-      double (*FP)() =
-        (double (*)())(intptr_t)TheJIT->getSymbolAddress("__anon_expr");
+      // Get the anonymous expression's JITSymbol.
+      auto Sym =
+        ExitOnErr(TheJIT->lookup(("__anon_expr" + Twine(ExprCount)).str()));
+
+      auto *FP = (double (*)())(intptr_t)Sym.getAddress();
       assert(FP && "Failed to codegen function");
       fprintf(stderr, "Evaluated to %f\n", FP());
-
-      // Delete the anonymous expression module from the JIT.
-      TheJIT->removeModule(H);
     }
   } else {
     // Skip token for error recovery.
@@ -1220,7 +1229,8 @@ int main() {
   fprintf(stderr, "ready> ");
   getNextToken();
 
-  TheJIT = llvm::make_unique<KaleidoscopeJIT>();
+  TheJIT = ExitOnErr(KaleidoscopeJIT::Create());
+  TheContext = &TheJIT->getContext();
 
   InitializeModule();
 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
index 9ea84d1a8581bc744069bae9b0308b562afb676e..7c803b138c0666e9406bccad50f6aeb653c07ef1 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
@@ -47,13 +47,13 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
 public:
   KaleidoscopeJIT()
@@ -73,7 +73,7 @@ public:
         TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(), Resolver};
                     }),
         CompileLayer(ObjectLayer, SimpleCompiler(*TM)),
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
index 80c39bd70f72339df76f60c3a7c4f195ed8bd5d5..ce0111d2f6b845f6a0fb1ef382e9c892aa030df7 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
@@ -51,23 +51,23 @@ private:
   std::map<VModuleKey, std::shared_ptr<SymbolResolver>> Resolvers;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
   std::unique_ptr<JITCompileCallbackManager> CompileCallbackManager;
-  CompileOnDemandLayer<decltype(OptimizeLayer)> CODLayer;
+  LegacyCompileOnDemandLayer<decltype(OptimizeLayer)> CODLayer;
 
 public:
   KaleidoscopeJIT()
       : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey K) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(),
                           Resolvers[K]};
                     }),
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
index 04ad86e34bfb10ae62469fa3c62e3cead70a26f9..ffca65fbcd4f52c30f3fa40508eeb9773b961ba9 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
@@ -77,13 +77,13 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
   std::unique_ptr<JITCompileCallbackManager> CompileCallbackMgr;
   std::unique_ptr<IndirectStubsManager> IndirectStubsMgr;
@@ -108,7 +108,7 @@ public:
         TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey K) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           std::make_shared<SectionMemoryManager>(), Resolver};
                     }),
         CompileLayer(ObjectLayer, SimpleCompiler(*TM)),
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
index 010f54363778b201d6e56b35732801fa6e29d0fd..f1ae5b022895b8c9b2ddbd404072aa4634d5b09b 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
@@ -82,13 +82,13 @@ private:
   std::shared_ptr<SymbolResolver> Resolver;
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  RTDyldObjectLinkingLayer ObjectLayer;
-  IRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
+  LegacyRTDyldObjectLinkingLayer ObjectLayer;
+  LegacyIRCompileLayer<decltype(ObjectLayer), SimpleCompiler> CompileLayer;
 
   using OptimizeFunction =
       std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>;
 
-  IRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
+  LegacyIRTransformLayer<decltype(CompileLayer), OptimizeFunction> OptimizeLayer;
 
   JITCompileCallbackManager *CompileCallbackMgr;
   std::unique_ptr<IndirectStubsManager> IndirectStubsMgr;
@@ -116,7 +116,7 @@ public:
         DL(TM->createDataLayout()),
         ObjectLayer(ES,
                     [this](VModuleKey K) {
-                      return RTDyldObjectLinkingLayer::Resources{
+                      return LegacyRTDyldObjectLinkingLayer::Resources{
                           cantFail(this->Remote.createRemoteMemoryManager()),
                           Resolver};
                     }),
diff --git a/examples/Kaleidoscope/include/KaleidoscopeJIT.h b/examples/Kaleidoscope/include/KaleidoscopeJIT.h
index 7239aea7ba1bcf6e2424ba267a65abd85998bc72..972773a64f7e4f2280599960a4d8f5346570dfe9 100644
--- a/examples/Kaleidoscope/include/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/include/KaleidoscopeJIT.h
@@ -40,8 +40,8 @@ namespace orc {
 
 class KaleidoscopeJIT {
 public:
-  using ObjLayerT = RTDyldObjectLinkingLayer;
-  using CompileLayerT = IRCompileLayer<ObjLayerT, SimpleCompiler>;
+  using ObjLayerT = LegacyRTDyldObjectLinkingLayer;
+  using CompileLayerT = LegacyIRCompileLayer<ObjLayerT, SimpleCompiler>;
 
   KaleidoscopeJIT()
       : Resolver(createLegacyLookupResolver(
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index f7f22387b53e2f5347950fd6a4c4df421c6a9447..c093c0906ce3b32ddf9f19bb7126dcaaba2050ff 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -929,6 +929,44 @@ void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char *Name,
 void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name,
                                  LLVMValueRef Val);
 
+/**
+ * Return the directory of the debug location for this value, which must be
+ * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ * @see llvm::GlobalVariable::getDebugInfo()
+ * @see llvm::Function::getSubprogram()
+ */
+const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length);
+
+/**
+ * Return the filename of the debug location for this value, which must be
+ * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ * @see llvm::GlobalVariable::getDebugInfo()
+ * @see llvm::Function::getSubprogram()
+ */
+const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length);
+
+/**
+ * Return the line number of the debug location for this value, which must be
+ * an llvm::Instruction, llvm::GlobalVariable, or llvm::Function.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ * @see llvm::GlobalVariable::getDebugInfo()
+ * @see llvm::Function::getSubprogram()
+ */
+unsigned LLVMGetDebugLocLine(LLVMValueRef Val);
+
+/**
+ * Return the column number of the debug location for this value, which must be
+ * an llvm::Instruction.
+ *
+ * @see llvm::Instruction::getDebugLoc()
+ */
+unsigned LLVMGetDebugLocColumn(LLVMValueRef Val);
+
 /**
  * Add a function to a module under a specified name.
  *
@@ -1501,16 +1539,15 @@ LLVMTypeRef LLVMX86MMXType(void);
       macro(SelectInst)                     \
       macro(ShuffleVectorInst)              \
       macro(StoreInst)                      \
-      macro(TerminatorInst)                 \
-        macro(BranchInst)                   \
-        macro(IndirectBrInst)               \
-        macro(InvokeInst)                   \
-        macro(ReturnInst)                   \
-        macro(SwitchInst)                   \
-        macro(UnreachableInst)              \
-        macro(ResumeInst)                   \
-        macro(CleanupReturnInst)            \
-        macro(CatchReturnInst)              \
+      macro(BranchInst)                     \
+      macro(IndirectBrInst)                 \
+      macro(InvokeInst)                     \
+      macro(ReturnInst)                     \
+      macro(SwitchInst)                     \
+      macro(UnreachableInst)                \
+      macro(ResumeInst)                     \
+      macro(CleanupReturnInst)              \
+      macro(CatchReturnInst)                \
       macro(FuncletPadInst)                 \
         macro(CatchPadInst)                 \
         macro(CleanupPadInst)               \
@@ -2344,6 +2381,54 @@ void LLVMSetPersonalityFn(LLVMValueRef Fn, LLVMValueRef PersonalityFn);
  */
 unsigned LLVMGetIntrinsicID(LLVMValueRef Fn);
 
+/**
+ * Create or insert the declaration of an intrinsic.  For overloaded intrinsics,
+ * parameter types must be provided to uniquely identify an overload.
+ *
+ * @see llvm::Intrinsic::getDeclaration()
+ */
+LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod,
+                                         unsigned ID,
+                                         LLVMTypeRef *ParamTypes,
+                                         size_t ParamCount);
+
+/**
+ * Retrieves the type of an intrinsic.  For overloaded intrinsics, parameter
+ * types must be provided to uniquely identify an overload.
+ *
+ * @see llvm::Intrinsic::getType()
+ */
+LLVMTypeRef LLVMIntrinsicGetType(LLVMContextRef Ctx, unsigned ID,
+                                 LLVMTypeRef *ParamTypes, size_t ParamCount);
+
+/**
+ * Retrieves the name of an intrinsic.
+ *
+ * @see llvm::Intrinsic::getName()
+ */
+const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength);
+
+/**
+ * Copies the name of an overloaded intrinsic identified by a given list of
+ * parameter types.
+ *
+ * Unlike LLVMIntrinsicGetName, the caller is responsible for freeing the
+ * returned string.
+ *
+ * @see llvm::Intrinsic::getName()
+ */
+const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
+                                            LLVMTypeRef *ParamTypes,
+                                            size_t ParamCount,
+                                            size_t *NameLength);
+
+/**
+ * Obtain if the intrinsic identified by the given ID is overloaded.
+ *
+ * @see llvm::Intrinsic::isOverloaded()
+ */
+LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID);
+
 /**
  * Obtain the calling function of a function.
  *
@@ -2641,7 +2726,7 @@ LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB);
  * If the basic block does not have a terminator (it is not well-formed
  * if it doesn't), then NULL is returned.
  *
- * The returned LLVMValueRef corresponds to a llvm::TerminatorInst.
+ * The returned LLVMValueRef corresponds to an llvm::Instruction.
  *
  * @see llvm::BasicBlock::getTerminator()
  */
@@ -2913,6 +2998,15 @@ LLVMRealPredicate LLVMGetFCmpPredicate(LLVMValueRef Inst);
  */
 LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst);
 
+/**
+ * Determine whether an instruction is a terminator. This routine is named to
+ * be compatible with historical functions that did this by querying the
+ * underlying C++ type.
+ *
+ * @see llvm::Instruction::isTerminator()
+ */
+LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst);
+
 /**
  * @defgroup LLVMCCoreValueInstructionCall Call Sites and Invocations
  *
@@ -3053,8 +3147,8 @@ void LLVMSetUnwindDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
 /**
  * @defgroup LLVMCCoreValueInstructionTerminator Terminators
  *
- * Functions in this group only apply to instructions that map to
- * llvm::TerminatorInst instances.
+ * Functions in this group only apply to instructions for which
+ * LLVMIsATerminatorInst returns true.
  *
  * @{
  */
@@ -3062,21 +3156,21 @@ void LLVMSetUnwindDest(LLVMValueRef InvokeInst, LLVMBasicBlockRef B);
 /**
  * Return the number of successors that this terminator has.
  *
- * @see llvm::TerminatorInst::getNumSuccessors
+ * @see llvm::Instruction::getNumSuccessors
  */
 unsigned LLVMGetNumSuccessors(LLVMValueRef Term);
 
 /**
  * Return the specified successor.
  *
- * @see llvm::TerminatorInst::getSuccessor
+ * @see llvm::Instruction::getSuccessor
  */
 LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i);
 
 /**
  * Update the specified successor to point at the provided block.
  *
- * @see llvm::TerminatorInst::setSuccessor
+ * @see llvm::Instruction::setSuccessor
  */
 void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block);
 
@@ -3427,6 +3521,35 @@ LLVMValueRef LLVMBuildNot(LLVMBuilderRef, LLVMValueRef V, const char *Name);
 LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef, LLVMTypeRef Ty, const char *Name);
 LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef, LLVMTypeRef Ty,
                                   LLVMValueRef Val, const char *Name);
+
+/**
+ * Creates and inserts a memset to the specified pointer and the 
+ * specified value.
+ *
+ * @see llvm::IRRBuilder::CreateMemSet()
+ */
+LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr,
+                             LLVMValueRef Val, LLVMValueRef Len,
+                             unsigned Align);
+/**
+ * Creates and inserts a memcpy between the specified pointers.
+ *
+ * @see llvm::IRRBuilder::CreateMemCpy()
+ */
+LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B, 
+                             LLVMValueRef Dst, unsigned DstAlign,
+                             LLVMValueRef Src, unsigned SrcAlign,
+                             LLVMValueRef Size);
+/**
+ * Creates and inserts a memmove between the specified pointers.
+ *
+ * @see llvm::IRRBuilder::CreateMemMove()
+ */
+LLVMValueRef LLVMBuildMemMove(LLVMBuilderRef B, 
+                              LLVMValueRef Dst, unsigned DstAlign,
+                              LLVMValueRef Src, unsigned SrcAlign,
+                              LLVMValueRef Size);
+
 LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef, LLVMTypeRef Ty, const char *Name);
 LLVMValueRef LLVMBuildArrayAlloca(LLVMBuilderRef, LLVMTypeRef Ty,
                                   LLVMValueRef Val, const char *Name);
diff --git a/include/llvm-c/ExecutionEngine.h b/include/llvm-c/ExecutionEngine.h
index 49ae6fee45f0138f75cc984149ff08a156e1f1fa..e8ebef9ab15d8a709046c167a18e80813288ede4 100644
--- a/include/llvm-c/ExecutionEngine.h
+++ b/include/llvm-c/ExecutionEngine.h
@@ -186,7 +186,7 @@ void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM);
 
 LLVMJITEventListenerRef LLVMCreateGDBRegistrationListener(void);
 LLVMJITEventListenerRef LLVMCreateIntelJITEventListener(void);
-LLVMJITEventListenerRef LLVMCreateOprofileJITEventListener(void);
+LLVMJITEventListenerRef LLVMCreateOProfileJITEventListener(void);
 LLVMJITEventListenerRef LLVMCreatePerfJITEventListener(void);
 
 /**
diff --git a/include/llvm-c/OptRemarks.h b/include/llvm-c/OptRemarks.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a90394e711c57962c462f8b17eb85f86146cf1b
--- /dev/null
+++ b/include/llvm-c/OptRemarks.h
@@ -0,0 +1,204 @@
+/*===-- llvm-c/OptRemarks.h - OptRemarks Public C Interface -------*- C -*-===*\
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This header provides a public interface to an opt-remark library.          *|
+|* LLVM provides an implementation of this interface.                         *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_OPT_REMARKS_H
+#define LLVM_C_OPT_REMARKS_H
+
+#include "llvm-c/Core.h"
+#include "llvm-c/Types.h"
+#ifdef __cplusplus
+#include <cstddef>
+extern "C" {
+#else
+#include <stddef.h>
+#endif /* !defined(__cplusplus) */
+
+/**
+ * @defgroup LLVMCOPTREMARKS OptRemarks
+ * @ingroup LLVMC
+ *
+ * @{
+ */
+
+#define OPT_REMARKS_API_VERSION 0
+
+/**
+ * String containing a buffer and a length. The buffer is not guaranteed to be
+ * zero-terminated.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  const char *Str;
+  uint32_t Len;
+} LLVMOptRemarkStringRef;
+
+/**
+ * DebugLoc containing File, Line and Column.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // File:
+  LLVMOptRemarkStringRef SourceFile;
+  // Line:
+  uint32_t SourceLineNumber;
+  // Column:
+  uint32_t SourceColumnNumber;
+} LLVMOptRemarkDebugLoc;
+
+/**
+ * Element of the "Args" list. The key might give more information about what
+ * are the semantics of the value, e.g. "Callee" will tell you that the value
+ * is a symbol that names a function.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // e.g. "Callee"
+  LLVMOptRemarkStringRef Key;
+  // e.g. "malloc"
+  LLVMOptRemarkStringRef Value;
+
+  // "DebugLoc": Optional
+  LLVMOptRemarkDebugLoc DebugLoc;
+} LLVMOptRemarkArg;
+
+/**
+ * One remark entry.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+typedef struct {
+  // e.g. !Missed, !Passed
+  LLVMOptRemarkStringRef RemarkType;
+  // "Pass": Required
+  LLVMOptRemarkStringRef PassName;
+  // "Name": Required
+  LLVMOptRemarkStringRef RemarkName;
+  // "Function": Required
+  LLVMOptRemarkStringRef FunctionName;
+
+  // "DebugLoc": Optional
+  LLVMOptRemarkDebugLoc DebugLoc;
+  // "Hotness": Optional
+  uint32_t Hotness;
+  // "Args": Optional. It is an array of `num_args` elements.
+  uint32_t NumArgs;
+  LLVMOptRemarkArg *Args;
+} LLVMOptRemarkEntry;
+
+typedef struct LLVMOptRemarkOpaqueParser *LLVMOptRemarkParserRef;
+
+/**
+ * Creates a remark parser that can be used to read and parse the buffer located
+ * in \p Buf of size \p Size.
+ *
+ * \p Buf cannot be NULL.
+ *
+ * This function should be paired with LLVMOptRemarkParserDispose() to avoid
+ * leaking resources.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
+                                                        uint64_t Size);
+
+/**
+ * Returns the next remark in the file.
+ *
+ * The value pointed to by the return value is invalidated by the next call to
+ * LLVMOptRemarkParserGetNext().
+ *
+ * If the parser reaches the end of the buffer, the return value will be NULL.
+ *
+ * In the case of an error, the return value will be NULL, and:
+ *
+ * 1) LLVMOptRemarkParserHasError() will return `1`.
+ *
+ * 2) LLVMOptRemarkParserGetErrorMessage() will return a descriptive error
+ *    message.
+ *
+ * An error may occur if:
+ *
+ * 1) An argument is invalid.
+ *
+ * 2) There is a YAML parsing error. This type of error aborts parsing
+ *    immediately and returns `1`. It can occur on malformed YAML.
+ *
+ * 3) Remark parsing error. If this type of error occurs, the parser won't call
+ *    the handler and will continue to the next one. It can occur on malformed
+ *    remarks, like missing or extra fields in the file.
+ *
+ * Here is a quick example of the usage:
+ *
+ * ```
+ *  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, Size);
+ *  LLVMOptRemarkEntry *Remark = NULL;
+ *  while ((Remark == LLVMOptRemarkParserGetNext(Parser))) {
+ *    // use Remark
+ *  }
+ *  bool HasError = LLVMOptRemarkParserHasError(Parser);
+ *  LLVMOptRemarkParserDispose(Parser);
+ * ```
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMOptRemarkEntry *
+LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns `1` if the parser encountered an error while parsing the buffer.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns a null-terminated string containing an error message.
+ *
+ * In case of no error, the result is `NULL`.
+ *
+ * The memory of the string is bound to the lifetime of \p Parser. If
+ * LLVMOptRemarkParserDispose() is called, the memory of the string will be
+ * released.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern const char *
+LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Releases all the resources used by \p Parser.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser);
+
+/**
+ * Returns the version of the opt-remarks dylib.
+ *
+ * \since OPT_REMARKS_API_VERSION=0
+ */
+extern uint32_t LLVMOptRemarkVersion(void);
+
+/**
+ * @} // endgoup LLVMCOPTREMARKS
+ */
+
+#ifdef __cplusplus
+}
+#endif /* !defined(__cplusplus) */
+
+#endif /* LLVM_C_OPT_REMARKS_H */
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 5c59af4c04ba6a22737cc4109a95c9a875d26ab2..52ed183c78aed1fa2f12eaac1b355d25d13b52aa 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -1243,6 +1243,32 @@ inline APFloat maxnum(const APFloat &A, const APFloat &B) {
   return (A.compare(B) == APFloat::cmpLessThan) ? B : A;
 }
 
+/// Implements IEEE 754-2018 minimum semantics. Returns the smaller of 2
+/// arguments, propagating NaNs and treating -0 as less than +0.
+LLVM_READONLY
+inline APFloat minimum(const APFloat &A, const APFloat &B) {
+  if (A.isNaN())
+    return A;
+  if (B.isNaN())
+    return B;
+  if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative()))
+    return A.isNegative() ? A : B;
+  return (B.compare(A) == APFloat::cmpLessThan) ? B : A;
+}
+
+/// Implements IEEE 754-2018 maximum semantics. Returns the larger of 2
+/// arguments, propagating NaNs and treating -0 as less than +0.
+LLVM_READONLY
+inline APFloat maximum(const APFloat &A, const APFloat &B) {
+  if (A.isNaN())
+    return A;
+  if (B.isNaN())
+    return B;
+  if (A.isZero() && B.isZero() && (A.isNegative() != B.isNegative()))
+    return A.isNegative() ? B : A;
+  return (A.compare(B) == APFloat::cmpLessThan) ? B : A;
+}
+
 } // namespace llvm
 
 #undef APFLOAT_DISPATCH_ON_SEMANTICS
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 380f1db0d04adbf8c28424412cb291fa77fbca62..1f50502fff92bad73eec7b41fc3fec39fd501429 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -25,6 +25,7 @@
 #include <cassert>
 #include <cstddef>
 #include <cstring>
+#include <initializer_list>
 #include <iterator>
 #include <new>
 #include <type_traits>
@@ -38,6 +39,34 @@ namespace detail {
 // implementation without requiring two members.
 template <typename KeyT, typename ValueT>
 struct DenseMapPair : public std::pair<KeyT, ValueT> {
+
+  // FIXME: Switch to inheriting constructors when we drop support for older
+  //        clang versions.
+  // NOTE: This default constructor is declared with '{}' rather than
+  //       '= default' to work around a separate bug in clang-3.8. This can
+  //       also go when we switch to inheriting constructors.
+  DenseMapPair() {}
+
+  DenseMapPair(const KeyT &Key, const ValueT &Value)
+      : std::pair<KeyT, ValueT>(Key, Value) {}
+
+  DenseMapPair(KeyT &&Key, ValueT &&Value)
+      : std::pair<KeyT, ValueT>(std::move(Key), std::move(Value)) {}
+
+  template <typename AltKeyT, typename AltValueT>
+  DenseMapPair(AltKeyT &&AltKey, AltValueT &&AltValue,
+               typename std::enable_if<
+                   std::is_convertible<AltKeyT, KeyT>::value &&
+                   std::is_convertible<AltValueT, ValueT>::value>::type * = 0)
+      : std::pair<KeyT, ValueT>(std::forward<AltKeyT>(AltKey),
+                                std::forward<AltValueT>(AltValue)) {}
+
+  template <typename AltPairT>
+  DenseMapPair(AltPairT &&AltPair,
+               typename std::enable_if<std::is_convertible<
+                   AltPairT, std::pair<KeyT, ValueT>>::value>::type * = 0)
+      : std::pair<KeyT, ValueT>(std::forward<AltPairT>(AltPair)) {}
+
   KeyT &getFirst() { return std::pair<KeyT, ValueT>::first; }
   const KeyT &getFirst() const { return std::pair<KeyT, ValueT>::first; }
   ValueT &getSecond() { return std::pair<KeyT, ValueT>::second; }
@@ -46,9 +75,10 @@ struct DenseMapPair : public std::pair<KeyT, ValueT> {
 
 } // end namespace detail
 
-template <
-    typename KeyT, typename ValueT, typename KeyInfoT = DenseMapInfo<KeyT>,
-    typename Bucket = detail::DenseMapPair<KeyT, ValueT>, bool IsConst = false>
+template <typename KeyT, typename ValueT,
+          typename KeyInfoT = DenseMapInfo<KeyT>,
+          typename Bucket = llvm::detail::DenseMapPair<KeyT, ValueT>,
+          bool IsConst = false>
 class DenseMapIterator;
 
 template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
@@ -639,9 +669,43 @@ public:
   }
 };
 
+/// Equality comparison for DenseMap.
+///
+/// Iterates over elements of LHS confirming that each (key, value) pair in LHS
+/// is also in RHS, and that no additional pairs are in RHS.
+/// Equivalent to N calls to RHS.find and N value comparisons. Amortized
+/// complexity is linear, worst case is O(N^2) (if every hash collides).
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
+bool operator==(
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &LHS,
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &RHS) {
+  if (LHS.size() != RHS.size())
+    return false;
+
+  for (auto &KV : LHS) {
+    auto I = RHS.find(KV.first);
+    if (I == RHS.end() || I->second != KV.second)
+      return false;
+  }
+
+  return true;
+}
+
+/// Inequality comparison for DenseMap.
+///
+/// Equivalent to !(LHS == RHS). See operator== for performance notes.
+template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
+          typename BucketT>
+bool operator!=(
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &LHS,
+    const DenseMapBase<DerivedT, KeyT, ValueT, KeyInfoT, BucketT> &RHS) {
+  return !(LHS == RHS);
+}
+
 template <typename KeyT, typename ValueT,
           typename KeyInfoT = DenseMapInfo<KeyT>,
-          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+          typename BucketT = llvm::detail::DenseMapPair<KeyT, ValueT>>
 class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
                                      KeyT, ValueT, KeyInfoT, BucketT> {
   friend class DenseMapBase<DenseMap, KeyT, ValueT, KeyInfoT, BucketT>;
@@ -676,6 +740,11 @@ public:
     this->insert(I, E);
   }
 
+  DenseMap(std::initializer_list<typename BaseT::value_type> Vals) {
+    init(Vals.size());
+    this->insert(Vals.begin(), Vals.end());
+  }
+
   ~DenseMap() {
     this->destroyAll();
     operator delete(Buckets);
@@ -798,7 +867,7 @@ private:
 
 template <typename KeyT, typename ValueT, unsigned InlineBuckets = 4,
           typename KeyInfoT = DenseMapInfo<KeyT>,
-          typename BucketT = detail::DenseMapPair<KeyT, ValueT>>
+          typename BucketT = llvm::detail::DenseMapPair<KeyT, ValueT>>
 class SmallDenseMap
     : public DenseMapBase<
           SmallDenseMap<KeyT, ValueT, InlineBuckets, KeyInfoT, BucketT>, KeyT,
diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index 52fe4adb5bd3f9eaf64591c3b71f49ec937607e3..e85a38587e41dcd97366b64dc0eaeac4a0c97bca 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/type_traits.h"
 #include <algorithm>
 #include <cstddef>
@@ -67,7 +68,7 @@ public:
   explicit DenseSetImpl(unsigned InitialReserve = 0) : TheMap(InitialReserve) {}
 
   DenseSetImpl(std::initializer_list<ValueT> Elems)
-      : DenseSetImpl(Elems.size()) {
+      : DenseSetImpl(PowerOf2Ceil(Elems.size())) {
     insert(Elems.begin(), Elems.end());
   }
 
@@ -214,6 +215,34 @@ public:
   }
 };
 
+/// Equality comparison for DenseSet.
+///
+/// Iterates over elements of LHS confirming that each element is also a member
+/// of RHS, and that RHS contains no additional values.
+/// Equivalent to N calls to RHS.count. Amortized complexity is linear, worst
+/// case is O(N^2) (if every hash collides).
+template <typename ValueT, typename MapTy, typename ValueInfoT>
+bool operator==(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS,
+                const DenseSetImpl<ValueT, MapTy, ValueInfoT> &RHS) {
+  if (LHS.size() != RHS.size())
+    return false;
+
+  for (auto &E : LHS)
+    if (!RHS.count(E))
+      return false;
+
+  return true;
+}
+
+/// Inequality comparison for DenseSet.
+///
+/// Equivalent to !(LHS == RHS). See operator== for performance notes.
+template <typename ValueT, typename MapTy, typename ValueInfoT>
+bool operator!=(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS,
+                const DenseSetImpl<ValueT, MapTy, ValueInfoT> &RHS) {
+  return !(LHS == RHS);
+}
+
 } // end namespace detail
 
 /// Implements a dense probed hash-table based set.
diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h
index dc8a9b6e78b20961c694d173c93db36fc4198471..d77b12228cb15b219a95416dc342c55b319e26a4 100644
--- a/include/llvm/ADT/PostOrderIterator.h
+++ b/include/llvm/ADT/PostOrderIterator.h
@@ -296,12 +296,15 @@ class ReversePostOrderTraversal {
 
 public:
   using rpo_iterator = typename std::vector<NodeRef>::reverse_iterator;
+  using const_rpo_iterator = typename std::vector<NodeRef>::const_reverse_iterator;
 
   ReversePostOrderTraversal(GraphT G) { Initialize(GT::getEntryNode(G)); }
 
   // Because we want a reverse post order, use reverse iterators from the vector
   rpo_iterator begin() { return Blocks.rbegin(); }
+  const_rpo_iterator begin() const { return Blocks.crbegin(); }
   rpo_iterator end() { return Blocks.rend(); }
+  const_rpo_iterator end() const { return Blocks.crend(); }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index c209c4aede9bb67047d8b12369bdf39ebc48a05b..4a93ee55e76dcae13dd5b1c1aea7f27efc8fd0fe 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -195,6 +195,12 @@ void adl_swap(T &&lhs, T &&rhs) noexcept(
   adl_detail::adl_swap(std::forward<T>(lhs), std::forward<T>(rhs));
 }
 
+/// Test whether \p RangeOrContainer is empty. Similar to C++17 std::empty.
+template <typename T>
+constexpr bool empty(const T &RangeOrContainer) {
+  return adl_begin(RangeOrContainer) == adl_end(RangeOrContainer);
+}
+
 // mapped_iterator - This is a simple iterator adapter that causes a function to
 // be applied whenever operator* is invoked on the iterator.
 
diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 4cbf40c76805eeb8475c98b9e51d3a89a13980e8..84e73bcbace871d48d47c3b3124fe5bc49cecaac 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -261,21 +261,33 @@ class SparseBitVector {
     BITWORD_SIZE = SparseBitVectorElement<ElementSize>::BITWORD_SIZE
   };
 
-  // Pointer to our current Element.
-  ElementListIter CurrElementIter;
   ElementList Elements;
+  // Pointer to our current Element. This has no visible effect on the external
+  // state of a SparseBitVector, it's just used to improve performance in the
+  // common case of testing/modifying bits with similar indices.
+  mutable ElementListIter CurrElementIter;
 
   // This is like std::lower_bound, except we do linear searching from the
   // current position.
-  ElementListIter FindLowerBound(unsigned ElementIndex) {
+  ElementListIter FindLowerBoundImpl(unsigned ElementIndex) const {
+
+    // We cache a non-const iterator so we're forced to resort to const_cast to
+    // get the begin/end in the case where 'this' is const. To avoid duplication
+    // of code with the only difference being whether the const cast is present
+    // 'this' is always const in this particular function and we sort out the
+    // difference in FindLowerBound and FindLowerBoundConst.
+    ElementListIter Begin =
+        const_cast<SparseBitVector<ElementSize> *>(this)->Elements.begin();
+    ElementListIter End =
+        const_cast<SparseBitVector<ElementSize> *>(this)->Elements.end();
 
     if (Elements.empty()) {
-      CurrElementIter = Elements.begin();
-      return Elements.begin();
+      CurrElementIter = Begin;
+      return CurrElementIter;
     }
 
     // Make sure our current iterator is valid.
-    if (CurrElementIter == Elements.end())
+    if (CurrElementIter == End)
       --CurrElementIter;
 
     // Search from our current iterator, either backwards or forwards,
@@ -284,17 +296,23 @@ class SparseBitVector {
     if (CurrElementIter->index() == ElementIndex) {
       return ElementIter;
     } else if (CurrElementIter->index() > ElementIndex) {
-      while (ElementIter != Elements.begin()
+      while (ElementIter != Begin
              && ElementIter->index() > ElementIndex)
         --ElementIter;
     } else {
-      while (ElementIter != Elements.end() &&
+      while (ElementIter != End &&
              ElementIter->index() < ElementIndex)
         ++ElementIter;
     }
     CurrElementIter = ElementIter;
     return ElementIter;
   }
+  ElementListConstIter FindLowerBoundConst(unsigned ElementIndex) const {
+    return FindLowerBoundImpl(ElementIndex);
+  }
+  ElementListIter FindLowerBound(unsigned ElementIndex) {
+    return FindLowerBoundImpl(ElementIndex);
+  }
 
   // Iterator to walk set bits in the bitmap.  This iterator is a lot uglier
   // than it would be, in order to be efficient.
@@ -423,22 +441,12 @@ class SparseBitVector {
 public:
   using iterator = SparseBitVectorIterator;
 
-  SparseBitVector() {
-    CurrElementIter = Elements.begin();
-  }
+  SparseBitVector() : Elements(), CurrElementIter(Elements.begin()) {}
 
-  // SparseBitVector copy ctor.
-  SparseBitVector(const SparseBitVector &RHS) {
-    ElementListConstIter ElementIter = RHS.Elements.begin();
-    while (ElementIter != RHS.Elements.end()) {
-      Elements.push_back(SparseBitVectorElement<ElementSize>(*ElementIter));
-      ++ElementIter;
-    }
-
-    CurrElementIter = Elements.begin ();
-  }
-
-  ~SparseBitVector() = default;
+  SparseBitVector(const SparseBitVector &RHS)
+      : Elements(RHS.Elements), CurrElementIter(Elements.begin()) {}
+  SparseBitVector(SparseBitVector &&RHS)
+      : Elements(std::move(RHS.Elements)), CurrElementIter(Elements.begin()) {}
 
   // Clear.
   void clear() {
@@ -450,26 +458,23 @@ public:
     if (this == &RHS)
       return *this;
 
-    Elements.clear();
-
-    ElementListConstIter ElementIter = RHS.Elements.begin();
-    while (ElementIter != RHS.Elements.end()) {
-      Elements.push_back(SparseBitVectorElement<ElementSize>(*ElementIter));
-      ++ElementIter;
-    }
-
-    CurrElementIter = Elements.begin ();
-
+    Elements = RHS.Elements;
+    CurrElementIter = Elements.begin();
+    return *this;
+  }
+  SparseBitVector &operator=(SparseBitVector &&RHS) {
+    Elements = std::move(RHS.Elements);
+    CurrElementIter = Elements.begin();
     return *this;
   }
 
   // Test, Reset, and Set a bit in the bitmap.
-  bool test(unsigned Idx) {
+  bool test(unsigned Idx) const {
     if (Elements.empty())
       return false;
 
     unsigned ElementIndex = Idx / ElementSize;
-    ElementListIter ElementIter = FindLowerBound(ElementIndex);
+    ElementListConstIter ElementIter = FindLowerBoundConst(ElementIndex);
 
     // If we can't find an element that is supposed to contain this bit, there
     // is nothing more to do.
diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h
index cb40fc1781dbc053194ff34fbc01a763a2381058..7f7ed69a005447c19177a6a599c5f577743a4f75 100644
--- a/include/llvm/ADT/iterator.h
+++ b/include/llvm/ADT/iterator.h
@@ -202,9 +202,7 @@ template <
     typename ReferenceT = typename std::conditional<
         std::is_same<T, typename std::iterator_traits<
                             WrappedIteratorT>::value_type>::value,
-        typename std::iterator_traits<WrappedIteratorT>::reference, T &>::type,
-    // Don't provide these, they are mostly to act as aliases below.
-    typename WrappedTraitsT = std::iterator_traits<WrappedIteratorT>>
+        typename std::iterator_traits<WrappedIteratorT>::reference, T &>::type>
 class iterator_adaptor_base
     : public iterator_facade_base<DerivedT, IteratorCategoryT, T,
                                   DifferenceTypeT, PointerT, ReferenceT> {
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index be3496bbd95518a7b535031baee0f8e4e7f2516c..2efcd9dafa195ad64ad7fa13bd9ce6527f85b9ed 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -335,8 +335,7 @@ public:
 
   /// A convenience wrapper around the primary \c alias interface.
   AliasResult alias(const Value *V1, const Value *V2) {
-    return alias(V1, MemoryLocation::UnknownSize, V2,
-                 MemoryLocation::UnknownSize);
+    return alias(V1, LocationSize::unknown(), V2, LocationSize::unknown());
   }
 
   /// A trivial helper function to check to see if the specified pointers are
@@ -1075,6 +1074,29 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 };
 
+/// A wrapper pass for external alias analyses. This just squirrels away the
+/// callback used to run any analyses and register their results.
+struct ExternalAAWrapperPass : ImmutablePass {
+  using CallbackT = std::function<void(Pass &, Function &, AAResults &)>;
+
+  CallbackT CB;
+
+  static char ID;
+
+  ExternalAAWrapperPass() : ImmutablePass(ID) {
+    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  explicit ExternalAAWrapperPass(CallbackT CB)
+      : ImmutablePass(ID), CB(std::move(CB)) {
+    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+};
+
 FunctionPass *createAAResultsWrapperPass();
 
 /// A wrapper pass around a callback which can be used to populate the
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index cf4981d1eb279bd0af74a07a5b50a9fa176939e3..7ed5cd5c4734601857b7b5fbc82f4a487ab4fb04 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -389,10 +389,6 @@ public:
   /// set is returned.
   AliasSet &getAliasSetFor(const MemoryLocation &MemLoc);
 
-  /// Return true if the specified instruction "may" (or must) alias one of the
-  /// members in any of the sets.
-  bool containsUnknown(const Instruction *I) const;
-
   /// Return the underlying alias analysis object used by this tracker.
   AliasAnalysis &getAliasAnalysis() const { return AA; }
 
@@ -441,12 +437,7 @@ private:
     return *Entry;
   }
 
-  AliasSet &addPointer(Value *P, LocationSize Size, const AAMDNodes &AAInfo,
-                       AliasSet::AccessLattice E);
-  AliasSet &addPointer(MemoryLocation Loc,
-                       AliasSet::AccessLattice E) {
-    return addPointer(const_cast<Value*>(Loc.Ptr), Loc.Size, Loc.AATags, E);
-  }
+  AliasSet &addPointer(MemoryLocation Loc, AliasSet::AccessLattice E);
   AliasSet *mergeAliasSetsForPointer(const Value *Ptr, LocationSize Size,
                                      const AAMDNodes &AAInfo);
 
diff --git a/include/llvm/Analysis/CFG.h b/include/llvm/Analysis/CFG.h
index cccdd1637411852cdd1cc840b80d90cb55538913..caae0b6e2a8ffaba87a4ab28e4fd231230133dc8 100644
--- a/include/llvm/Analysis/CFG.h
+++ b/include/llvm/Analysis/CFG.h
@@ -25,7 +25,6 @@ class DominatorTree;
 class Function;
 class Instruction;
 class LoopInfo;
-class TerminatorInst;
 
 /// Analyze the specified function to find all of the loop backedges in the
 /// function and return them.  This is a relatively cheap (compared to
@@ -46,7 +45,7 @@ unsigned GetSuccessorNumber(const BasicBlock *BB, const BasicBlock *Succ);
 /// edges from a block with multiple successors to a block with multiple
 /// predecessors.
 ///
-bool isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+bool isCriticalEdge(const Instruction *TI, unsigned SuccNum,
                     bool AllowIdenticalEdges = false);
 
 /// Determine whether instruction 'To' is reachable from 'From',
diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index a4b642b9ea3f51db01d371d1ee6733cf5addca77..5996dd90bcfd5a5c0f6ade0f806263e68c3e6d2d 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@@ -150,7 +150,7 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
   /// Display the raw branch weights from PGO.
   std::string getEdgeAttributes(const BasicBlock *Node, succ_const_iterator I,
                                 const Function *F) {
-    const TerminatorInst *TI = Node->getTerminator();
+    const Instruction *TI = Node->getTerminator();
     if (TI->getNumSuccessors() == 1)
       return "";
 
diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fadf52288bc87eb38a043a8b71a3abaebd9835d
--- /dev/null
+++ b/include/llvm/Analysis/DivergenceAnalysis.h
@@ -0,0 +1,178 @@
+//===- llvm/Analysis/DivergenceAnalysis.h - Divergence Analysis -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// The divergence analysis determines which instructions and branches are
+// divergent given a set of divergent source instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include <vector>
+
+namespace llvm {
+class Module;
+class Value;
+class Instruction;
+class Loop;
+class raw_ostream;
+class TargetTransformInfo;
+
+/// \brief Generic divergence analysis for reducible CFGs.
+///
+/// This analysis propagates divergence in a data-parallel context from sources
+/// of divergence to all users. It requires reducible CFGs. All assignments
+/// should be in SSA form.
+class DivergenceAnalysis {
+public:
+  /// \brief This instance will analyze the whole function \p F or the loop \p
+  /// RegionLoop.
+  ///
+  /// \param RegionLoop if non-null the analysis is restricted to \p RegionLoop.
+  /// Otherwise the whole function is analyzed.
+  /// \param IsLCSSAForm whether the analysis may assume that the IR in the
+  /// region in in LCSSA form.
+  DivergenceAnalysis(const Function &F, const Loop *RegionLoop,
+                     const DominatorTree &DT, const LoopInfo &LI,
+                     SyncDependenceAnalysis &SDA, bool IsLCSSAForm);
+
+  /// \brief The loop that defines the analyzed region (if any).
+  const Loop *getRegionLoop() const { return RegionLoop; }
+  const Function &getFunction() const { return F; }
+
+  /// \brief Whether \p BB is part of the region.
+  bool inRegion(const BasicBlock &BB) const;
+  /// \brief Whether \p I is part of the region.
+  bool inRegion(const Instruction &I) const;
+
+  /// \brief Mark \p UniVal as a value that is always uniform.
+  void addUniformOverride(const Value &UniVal);
+
+  /// \brief Mark \p DivVal as a value that is always divergent.
+  void markDivergent(const Value &DivVal);
+
+  /// \brief Propagate divergence to all instructions in the region.
+  /// Divergence is seeded by calls to \p markDivergent.
+  void compute();
+
+  /// \brief Whether any value was marked or analyzed to be divergent.
+  bool hasDetectedDivergence() const { return !DivergentValues.empty(); }
+
+  /// \brief Whether \p Val will always return a uniform value regardless of its
+  /// operands
+  bool isAlwaysUniform(const Value &Val) const;
+
+  /// \brief Whether \p Val is a divergent value
+  bool isDivergent(const Value &Val) const;
+
+  void print(raw_ostream &OS, const Module *) const;
+
+private:
+  bool updateTerminator(const Instruction &Term) const;
+  bool updatePHINode(const PHINode &Phi) const;
+
+  /// \brief Computes whether \p Inst is divergent based on the
+  /// divergence of its operands.
+  ///
+  /// \returns Whether \p Inst is divergent.
+  ///
+  /// This should only be called for non-phi, non-terminator instructions.
+  bool updateNormalInstruction(const Instruction &Inst) const;
+
+  /// \brief Mark users of live-out users as divergent.
+  ///
+  /// \param LoopHeader the header of the divergent loop.
+  ///
+  /// Marks all users of live-out values of the loop headed by \p LoopHeader
+  /// as divergent and puts them on the worklist.
+  void taintLoopLiveOuts(const BasicBlock &LoopHeader);
+
+  /// \brief Push all users of \p Val (in the region) to the worklist
+  void pushUsers(const Value &I);
+
+  /// \brief Push all phi nodes in @block to the worklist
+  void pushPHINodes(const BasicBlock &Block);
+
+  /// \brief Mark \p Block as join divergent
+  ///
+  /// A block is join divergent if two threads may reach it from different
+  /// incoming blocks at the same time.
+  void markBlockJoinDivergent(const BasicBlock &Block) {
+    DivergentJoinBlocks.insert(&Block);
+  }
+
+  /// \brief Whether \p Val is divergent when read in \p ObservingBlock.
+  bool isTemporalDivergent(const BasicBlock &ObservingBlock,
+                           const Value &Val) const;
+
+  /// \brief Whether \p Block is join divergent
+  ///
+  /// (see markBlockJoinDivergent).
+  bool isJoinDivergent(const BasicBlock &Block) const {
+    return DivergentJoinBlocks.find(&Block) != DivergentJoinBlocks.end();
+  }
+
+  /// \brief Propagate control-induced divergence to users (phi nodes and
+  /// instructions).
+  //
+  // \param JoinBlock is a divergent loop exit or join point of two disjoint
+  // paths.
+  // \returns Whether \p JoinBlock is a divergent loop exit of \p TermLoop.
+  bool propagateJoinDivergence(const BasicBlock &JoinBlock,
+                               const Loop *TermLoop);
+
+  /// \brief Propagate induced value divergence due to control divergence in \p
+  /// Term.
+  void propagateBranchDivergence(const Instruction &Term);
+
+  /// \brief Propagate divergent caused by a divergent loop exit.
+  ///
+  /// \param ExitingLoop is a divergent loop.
+  void propagateLoopDivergence(const Loop &ExitingLoop);
+
+private:
+  const Function &F;
+  // If regionLoop != nullptr, analysis is only performed within \p RegionLoop.
+  // Otw, analyze the whole function
+  const Loop *RegionLoop;
+
+  const DominatorTree &DT;
+  const LoopInfo &LI;
+
+  // Recognized divergent loops
+  DenseSet<const Loop *> DivergentLoops;
+
+  // The SDA links divergent branches to divergent control-flow joins.
+  SyncDependenceAnalysis &SDA;
+
+  // Use simplified code path for LCSSA form.
+  bool IsLCSSAForm;
+
+  // Set of known-uniform values.
+  DenseSet<const Value *> UniformOverrides;
+
+  // Blocks with joining divergent control from different predecessors.
+  DenseSet<const BasicBlock *> DivergentJoinBlocks;
+
+  // Detected/marked divergent values.
+  DenseSet<const Value *> DivergentValues;
+
+  // Internal worklist for divergence propagation.
+  std::vector<const Instruction *> Worklist;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_DIVERGENCE_ANALYSIS_H
diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index 529fb75bec9bc45d7751af49ff6ac9d2f1fd3586..4c270354b0c4fd72046276efc12271166377f066 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -46,7 +46,6 @@ const int IndirectCallThreshold = 100;
 const int CallPenalty = 25;
 const int LastCallToStaticBonus = 15000;
 const int ColdccPenalty = 2000;
-const int NoreturnPenalty = 10000;
 /// Do not inline functions which allocate this many bytes on the stack
 /// when the caller is recursive.
 const unsigned TotalAllocaSizeRecursiveCaller = 1024;
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 86b402b2394f8bdc94fb20ff394808c80111b439..c59c86c499404536c640efa718ac01f21fdb9400 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -564,10 +564,10 @@ public:
   /// Print the information about the memory accesses in the loop.
   void print(raw_ostream &OS, unsigned Depth = 0) const;
 
-  /// If the loop has any store of a variant value to an invariant address, then
+  /// If the loop has multiple stores to an invariant address, then
   /// return true, else return false.
-  bool hasVariantStoreToLoopInvariantAddress() const {
-    return HasVariantStoreToLoopInvariantAddress;
+  bool hasMultipleStoresToLoopInvariantAddress() const {
+    return HasMultipleStoresToLoopInvariantAddress;
   }
 
   /// Used to add runtime SCEV checks. Simplifies SCEV expressions and converts
@@ -620,8 +620,8 @@ private:
   /// Cache the result of analyzeLoop.
   bool CanVecMem;
 
-  /// Indicator that there is a store of a variant value to a uniform address.
-  bool HasVariantStoreToLoopInvariantAddress;
+  /// Indicator that there are multiple stores to a uniform address.
+  bool HasMultipleStoresToLoopInvariantAddress;
 
   /// The diagnostics report generated for the analysis.  E.g. why we
   /// couldn't analyze the loop.
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 1c40cffc7f673580c791b14ac35c54793b6a1c50..52340b0cb51ca4e70c6bb626f50aee60b6a44730 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -304,7 +304,7 @@ private:
     /// The maximum size of the dereferences of the pointer.
     ///
     /// May be UnknownSize if the sizes are unknown.
-    LocationSize Size = MemoryLocation::UnknownSize;
+    LocationSize Size = LocationSize::unknown();
     /// The AA tags associated with dereferences of the pointer.
     ///
     /// The members may be null if there are no tags or conflicting tags.
diff --git a/include/llvm/Analysis/MemoryLocation.h b/include/llvm/Analysis/MemoryLocation.h
index 509efa2ca1dae6a313a5a62fc06175bc50a8292f..cf839c5a1eb8764cd1ac8cafcf52d45e62f846e1 100644
--- a/include/llvm/Analysis/MemoryLocation.h
+++ b/include/llvm/Analysis/MemoryLocation.h
@@ -239,7 +239,7 @@ public:
   }
 
   explicit MemoryLocation(const Value *Ptr = nullptr,
-                          LocationSize Size = UnknownSize,
+                          LocationSize Size = LocationSize::unknown(),
                           const AAMDNodes &AATags = AAMDNodes())
       : Ptr(Ptr), Size(Size), AATags(AATags) {}
 
diff --git a/include/llvm/Analysis/MustExecute.h b/include/llvm/Analysis/MustExecute.h
index 40a02735d1b7c51350e45738997c591abd572420..05c28d139889af4b3a1e3a88a11c580b55343640 100644
--- a/include/llvm/Analysis/MustExecute.h
+++ b/include/llvm/Analysis/MustExecute.h
@@ -19,6 +19,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionPrecedenceTracking.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Dominators.h"
@@ -45,33 +46,30 @@ class Loop;
 /// loop were made and the info wasn't recomputed properly, the behavior of all
 /// methods except for computeLoopSafetyInfo is undefined.
 class LoopSafetyInfo {
-  bool MayThrow = false;       // The current loop contains an instruction which
-                               // may throw.
-  bool HeaderMayThrow = false; // Same as previous, but specific to loop header
+  // Used to update funclet bundle operands.
+  DenseMap<BasicBlock *, ColorVector> BlockColors;
 
-  /// Collect all blocks from \p CurLoop which lie on all possible paths from
-  /// the header of \p CurLoop (inclusive) to BB (exclusive) into the set
-  /// \p Predecessors. If \p BB is the header, \p Predecessors will be empty.
-  void collectTransitivePredecessors(
-      const Loop *CurLoop, const BasicBlock *BB,
-      SmallPtrSetImpl<const BasicBlock *> &Predecessors) const;
+protected:
+  /// Computes block colors.
+  void computeBlockColors(const Loop *CurLoop);
 
 public:
-  // Used to update funclet bundle operands.
-  DenseMap<BasicBlock *, ColorVector> BlockColors;
+  /// Returns block colors map that is used to update funclet operand bundles.
+  const DenseMap<BasicBlock *, ColorVector> &getBlockColors() const;
 
-  /// Returns true iff the header block of the loop for which this info is
-  /// calculated contains an instruction that may throw or otherwise exit
-  /// abnormally.
-  bool headerMayThrow() const;
+  /// Copy colors of block \p Old into the block \p New.
+  void copyColors(BasicBlock *New, BasicBlock *Old);
+
+  /// Returns true iff the block \p BB potentially may throw exception. It can
+  /// be false-positive in cases when we want to avoid complex analysis.
+  virtual bool blockMayThrow(const BasicBlock *BB) const = 0;
 
   /// Returns true iff any block of the loop for which this info is contains an
   /// instruction that may throw or otherwise exit abnormally.
-  bool anyBlockMayThrow() const;
+  virtual bool anyBlockMayThrow() const = 0;
 
   /// Return true if we must reach the block \p BB under assumption that the
-  /// loop \p CurLoop is entered and no instruction throws or otherwise exits
-  /// abnormally.
+  /// loop \p CurLoop is entered.
   bool allLoopPathsLeadToBlock(const Loop *CurLoop, const BasicBlock *BB,
                                const DominatorTree *DT) const;
 
@@ -80,16 +78,80 @@ public:
   /// as argument. Updates safety information in LoopSafetyInfo argument.
   /// Note: This is defined to clear and reinitialize an already initialized
   /// LoopSafetyInfo.  Some callers rely on this fact.
-  void computeLoopSafetyInfo(Loop *);
+  virtual void computeLoopSafetyInfo(const Loop *CurLoop) = 0;
+
+  /// Returns true if the instruction in a loop is guaranteed to execute at
+  /// least once (under the assumption that the loop is entered).
+  virtual bool isGuaranteedToExecute(const Instruction &Inst,
+                                     const DominatorTree *DT,
+                                     const Loop *CurLoop) const = 0;
 
   LoopSafetyInfo() = default;
+
+  virtual ~LoopSafetyInfo() = default;
 };
 
-/// Returns true if the instruction in a loop is guaranteed to execute at least
-/// once (under the assumption that the loop is entered).
-bool isGuaranteedToExecute(const Instruction &Inst, const DominatorTree *DT,
-                           const Loop *CurLoop,
-                           const LoopSafetyInfo *SafetyInfo);
+
+/// Simple and conservative implementation of LoopSafetyInfo that can give
+/// false-positive answers to its queries in order to avoid complicated
+/// analysis.
+class SimpleLoopSafetyInfo: public LoopSafetyInfo {
+  bool MayThrow = false;       // The current loop contains an instruction which
+                               // may throw.
+  bool HeaderMayThrow = false; // Same as previous, but specific to loop header
+
+public:
+  virtual bool blockMayThrow(const BasicBlock *BB) const;
+
+  virtual bool anyBlockMayThrow() const;
+
+  virtual void computeLoopSafetyInfo(const Loop *CurLoop);
+
+  virtual bool isGuaranteedToExecute(const Instruction &Inst,
+                                     const DominatorTree *DT,
+                                     const Loop *CurLoop) const;
+
+  SimpleLoopSafetyInfo() : LoopSafetyInfo() {};
+
+  virtual ~SimpleLoopSafetyInfo() {};
+};
+
+/// This implementation of LoopSafetyInfo use ImplicitControlFlowTracking to
+/// give precise answers on "may throw" queries. This implementation uses cache
+/// that should be invalidated by calling the methods insertInstructionTo and
+/// removeInstruction whenever we modify a basic block's contents by adding or
+/// removing instructions.
+class ICFLoopSafetyInfo: public LoopSafetyInfo {
+  bool MayThrow = false;       // The current loop contains an instruction which
+                               // may throw.
+  // Contains information about implicit control flow in this loop's blocks.
+  mutable ImplicitControlFlowTracking ICF;
+
+public:
+  virtual bool blockMayThrow(const BasicBlock *BB) const;
+
+  virtual bool anyBlockMayThrow() const;
+
+  virtual void computeLoopSafetyInfo(const Loop *CurLoop);
+
+  virtual bool isGuaranteedToExecute(const Instruction &Inst,
+                                     const DominatorTree *DT,
+                                     const Loop *CurLoop) const;
+
+  /// Inform the safety info that we are planning to insert a new instruction
+  /// into the basic block \p BB. It will make all cache updates to keep it
+  /// correct after this insertion.
+  void insertInstructionTo(const BasicBlock *BB);
+
+  /// Inform safety info that we are planning to remove the instruction \p Inst
+  /// from its block. It will make all cache updates to keep it correct after
+  /// this removal.
+  void removeInstruction(const Instruction *Inst);
+
+  ICFLoopSafetyInfo(DominatorTree *DT) : LoopSafetyInfo(), ICF(DT) {};
+
+  virtual ~ICFLoopSafetyInfo() {};
+};
 
 }
 
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 89918e3c205bf00a6ebef83a8762ca54d907a7c0..8f4200b07e5c7bd3a09fa7ccfa5c7c62ed5febef 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -1833,6 +1833,10 @@ private:
   const SCEV *getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
                                  SCEV::NoWrapFlags Flags);
 
+  // Get addrec expr already created or create a new one.
+  const SCEV *getOrCreateAddRecExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                    const Loop *L, SCEV::NoWrapFlags Flags);
+
   /// Return x if \p Val is f(x) where f is a 1-1 function.
   const SCEV *stripInjectiveFunctions(const SCEV *Val) const;
 
diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h
index 04e94f7cd5279448ffb930bbe491cc2b6216e19e..02a2e64268b7f91d1130e7cb278978414557d46c 100644
--- a/include/llvm/Analysis/SparsePropagation.h
+++ b/include/llvm/Analysis/SparsePropagation.h
@@ -189,12 +189,12 @@ private:
 
   /// getFeasibleSuccessors - Return a vector of booleans to indicate which
   /// successors are reachable from a given terminator instruction.
-  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs,
+  void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs,
                              bool AggressiveUndef);
 
   void visitInst(Instruction &I);
   void visitPHINode(PHINode &I);
-  void visitTerminatorInst(TerminatorInst &TI);
+  void visitTerminator(Instruction &TI);
 };
 
 //===----------------------------------------------------------------------===//
@@ -286,7 +286,7 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::markEdgeExecutable(
 
 template <class LatticeKey, class LatticeVal, class KeyInfo>
 void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::getFeasibleSuccessors(
-    TerminatorInst &TI, SmallVectorImpl<bool> &Succs, bool AggressiveUndef) {
+    Instruction &TI, SmallVectorImpl<bool> &Succs, bool AggressiveUndef) {
   Succs.resize(TI.getNumSuccessors());
   if (TI.getNumSuccessors() == 0)
     return;
@@ -374,7 +374,7 @@ template <class LatticeKey, class LatticeVal, class KeyInfo>
 bool SparseSolver<LatticeKey, LatticeVal, KeyInfo>::isEdgeFeasible(
     BasicBlock *From, BasicBlock *To, bool AggressiveUndef) {
   SmallVector<bool, 16> SuccFeasible;
-  TerminatorInst *TI = From->getTerminator();
+  Instruction *TI = From->getTerminator();
   getFeasibleSuccessors(*TI, SuccFeasible, AggressiveUndef);
 
   for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
@@ -385,8 +385,8 @@ bool SparseSolver<LatticeKey, LatticeVal, KeyInfo>::isEdgeFeasible(
 }
 
 template <class LatticeKey, class LatticeVal, class KeyInfo>
-void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitTerminatorInst(
-    TerminatorInst &TI) {
+void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitTerminator(
+    Instruction &TI) {
   SmallVector<bool, 16> SuccFeasible;
   getFeasibleSuccessors(TI, SuccFeasible, true);
 
@@ -465,8 +465,8 @@ void SparseSolver<LatticeKey, LatticeVal, KeyInfo>::visitInst(Instruction &I) {
     if (ChangedValue.second != LatticeFunc->getUntrackedVal())
       UpdateState(ChangedValue.first, ChangedValue.second);
 
-  if (TerminatorInst *TI = dyn_cast<TerminatorInst>(&I))
-    visitTerminatorInst(*TI);
+  if (I.isTerminator())
+    visitTerminator(I);
 }
 
 template <class LatticeKey, class LatticeVal, class KeyInfo>
diff --git a/include/llvm/Analysis/SyncDependenceAnalysis.h b/include/llvm/Analysis/SyncDependenceAnalysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..df693d9d8e8c2b2cb1eb78c50b604819df997a2c
--- /dev/null
+++ b/include/llvm/Analysis/SyncDependenceAnalysis.h
@@ -0,0 +1,86 @@
+//===- SyncDependenceAnalysis.h - Divergent Branch Dependence -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file defines the SyncDependenceAnalysis class, which computes for
+// every divergent branch the set of phi nodes that the branch will make
+// divergent.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
+#define LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include <memory>
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class Loop;
+class PostDominatorTree;
+
+using ConstBlockSet = SmallPtrSet<const BasicBlock *, 4>;
+
+/// \brief Relates points of divergent control to join points in
+/// reducible CFGs.
+///
+/// This analysis relates points of divergent control to points of converging
+/// divergent control. The analysis requires all loops to be reducible.
+class SyncDependenceAnalysis {
+  void visitSuccessor(const BasicBlock &succBlock, const Loop *termLoop,
+                      const BasicBlock *defBlock);
+
+public:
+  bool inRegion(const BasicBlock &BB) const;
+
+  ~SyncDependenceAnalysis();
+  SyncDependenceAnalysis(const DominatorTree &DT, const PostDominatorTree &PDT,
+                         const LoopInfo &LI);
+
+  /// \brief Computes divergent join points and loop exits caused by branch
+  /// divergence in \p Term.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from \p Term.
+  /// The set also contains loop exits if there two disjoint paths:
+  /// one from \p Term to the loop exit and another from \p Term to the loop
+  /// header. Those exit blocks are added to the returned set.
+  /// If L is the parent loop of \p Term and an exit of L is in the returned
+  /// set then L is a divergent loop.
+  const ConstBlockSet &join_blocks(const Instruction &Term);
+
+  /// \brief Computes divergent join points and loop exits (in the surrounding
+  /// loop) caused by the divergent loop exits of\p Loop.
+  ///
+  /// The set of blocks which are reachable by disjoint paths from the
+  /// loop exits of \p Loop.
+  /// This treats the loop as a single node in \p Loop's parent loop.
+  /// The returned set has the same properties as for join_blocks(TermInst&).
+  const ConstBlockSet &join_blocks(const Loop &Loop);
+
+private:
+  static ConstBlockSet EmptyBlockSet;
+
+  ReversePostOrderTraversal<const Function *> FuncRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  std::map<const Loop *, std::unique_ptr<ConstBlockSet>> CachedLoopExitJoins;
+  std::map<const Instruction *, std::unique_ptr<ConstBlockSet>>
+      CachedBranchJoins;
+};
+
+} // namespace llvm
+
+#endif // LLVM_ANALYSIS_SYNC_DEPENDENCE_ANALYSIS_H
diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def
index f94debba9c52b8f46fc1c9f94ec98f0450d6acea..518a85ee1a016949d06ed95eea057a14fd5c0f66 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.def
+++ b/include/llvm/Analysis/TargetLibraryInfo.def
@@ -565,6 +565,30 @@ TLI_DEFINE_STRING_INTERNAL("cosl")
 /// char *ctermid(char *s);
 TLI_DEFINE_ENUM_INTERNAL(ctermid)
 TLI_DEFINE_STRING_INTERNAL("ctermid")
+/// int execl(const char *path, const char *arg, ...);
+TLI_DEFINE_ENUM_INTERNAL(execl)
+TLI_DEFINE_STRING_INTERNAL("execl")
+/// int execle(const char *file, const char *arg, ..., char * const envp[]);
+TLI_DEFINE_ENUM_INTERNAL(execle)
+TLI_DEFINE_STRING_INTERNAL("execle")
+/// int execlp(const char *file, const char *arg, ...);
+TLI_DEFINE_ENUM_INTERNAL(execlp)
+TLI_DEFINE_STRING_INTERNAL("execlp")
+/// int execv(const char *path, char *const argv[]);
+TLI_DEFINE_ENUM_INTERNAL(execv)
+TLI_DEFINE_STRING_INTERNAL("execv")
+/// int execvP(const char *file, const char *search_path, char *const argv[]);
+TLI_DEFINE_ENUM_INTERNAL(execvP)
+TLI_DEFINE_STRING_INTERNAL("execvP")
+/// int execve(const char *filename, char *const argv[], char *const envp[]);
+TLI_DEFINE_ENUM_INTERNAL(execve)
+TLI_DEFINE_STRING_INTERNAL("execve")
+/// int execvp(const char *file, char *const argv[]);
+TLI_DEFINE_ENUM_INTERNAL(execvp)
+TLI_DEFINE_STRING_INTERNAL("execvp")
+/// int execvpe(const char *file, char *const argv[], char *const envp[]);
+TLI_DEFINE_ENUM_INTERNAL(execvpe)
+TLI_DEFINE_STRING_INTERNAL("execvpe")
 /// double exp(double x);
 TLI_DEFINE_ENUM_INTERNAL(exp)
 TLI_DEFINE_STRING_INTERNAL("exp")
@@ -709,6 +733,9 @@ TLI_DEFINE_STRING_INTERNAL("fopen")
 /// FILE *fopen64(const char *filename, const char *opentype)
 TLI_DEFINE_ENUM_INTERNAL(fopen64)
 TLI_DEFINE_STRING_INTERNAL("fopen64")
+/// int fork();
+TLI_DEFINE_ENUM_INTERNAL(fork)
+TLI_DEFINE_STRING_INTERNAL("fork")
 /// int fprintf(FILE *stream, const char *format, ...);
 TLI_DEFINE_ENUM_INTERNAL(fprintf)
 TLI_DEFINE_STRING_INTERNAL("fprintf")
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 18b5a5cf0e5c385400d40c52f0221f90f708b99a..eb0e0270157fd05f3eaf4b36f3aeb6e86b3fcd9d 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -587,6 +587,11 @@ public:
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
+  /// Enable matching of interleaved access groups that contain predicated 
+  /// accesses or gaps and therefore vectorized using masked
+  /// vector loads/stores.
+  bool enableMaskedInterleavedAccessVectorization() const;
+
   /// Indicate that it is potentially unsafe to automatically vectorize
   /// floating-point operations because the semantics of vector and scalar
   /// floating-point semantics may differ. For example, ARM NEON v7 SIMD math
@@ -766,7 +771,9 @@ public:
 
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The index and subtype parameters are used by the subvector insertion and
-  /// extraction shuffle kinds.
+  /// extraction shuffle kinds to show the insert/extract point and the type of
+  /// the subvector being inserted/extracted. 
+  /// NOTE: For subvector extractions Tp represents the source type.
   int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index = 0,
                      Type *SubTp = nullptr) const;
 
@@ -821,9 +828,13 @@ public:
   ///    load allows gaps)
   /// \p Alignment is the alignment of the memory operation
   /// \p AddressSpace is address space of the pointer.
+  /// \p UseMaskForCond indicates if the memory access is predicated.
+  /// \p UseMaskForGaps indicates if gaps should be masked.
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace) const;
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false) const;
 
   /// Calculate the cost of performing a vector reduction.
   ///
@@ -1072,6 +1083,7 @@ public:
   virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
       bool IsZeroCmp) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
+  virtual bool enableMaskedInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
                                               unsigned BitWidth,
@@ -1132,7 +1144,9 @@ public:
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
-                                         unsigned AddressSpace) = 0;
+                                         unsigned AddressSpace,
+                                         bool UseMaskForCond = false,
+                                         bool UseMaskForGaps = false) = 0;
   virtual int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                          bool IsPairwiseForm) = 0;
   virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
@@ -1346,6 +1360,9 @@ public:
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool enableMaskedInterleavedAccessVectorization() override {
+    return Impl.enableMaskedInterleavedAccessVectorization();
+  }
   bool isFPVectorizationPotentiallyUnsafe() override {
     return Impl.isFPVectorizationPotentiallyUnsafe();
   }
@@ -1471,9 +1488,11 @@ public:
   }
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace) override {
+                                 unsigned AddressSpace, bool UseMaskForCond,
+                                 bool UseMaskForGaps) override {
     return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
   }
   int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
                                  bool IsPairwiseForm) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index e39fe66c0a479536a514cdbddc07fa24aebcd3e2..5e79c5cdfe0353ad4e6c89f5e32db0fcb8a8b433 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -160,6 +160,7 @@ public:
     case Intrinsic::invariant_end:
     case Intrinsic::launder_invariant_group:
     case Intrinsic::strip_invariant_group:
+    case Intrinsic::is_constant:
     case Intrinsic::lifetime_start:
     case Intrinsic::lifetime_end:
     case Intrinsic::objectsize:
@@ -313,6 +314,8 @@ public:
 
   bool enableInterleavedAccessVectorization() { return false; }
 
+  bool enableMaskedInterleavedAccessVectorization() { return false; }
+
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
   bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -450,8 +453,9 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool UseMaskForCond = false,
+                                      bool UseMaskForGaps = false) {
     return 1;
   }
 
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index 622d932f74fdb94533ad686353ea89fe1e08ee90..797260f439a04078787fad427f874067d3e96980 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -24,6 +24,7 @@ namespace llvm {
 template <typename T> class ArrayRef;
 class DemandedBits;
 class GetElementPtrInst;
+class InterleaveGroup; 
 class Loop;
 class ScalarEvolution;
 class TargetTransformInfo;
@@ -125,6 +126,35 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// Create a mask that filters the members of an interleave group where there
+/// are gaps.
+///
+/// For example, the mask for \p Group with interleave-factor 3
+/// and \p VF 4, that has only its first member present is:
+///
+///   <1,0,0,1,0,0,1,0,0,1,0,0>
+///
+/// Note: The result is a mask of 0's and 1's, as opposed to the other
+/// create[*]Mask() utilities which create a shuffle mask (mask that
+/// consists of indices).
+Constant *createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+                               const InterleaveGroup &Group);
+
+/// Create a mask with replicated elements.
+///
+/// This function creates a shuffle mask for replicating each of the \p VF 
+/// elements in a vector \p ReplicationFactor times. It can be used to
+/// transform a mask of \p VF elements into a mask of
+/// \p VF * \p ReplicationFactor elements used by a predicated
+/// interleaved-group of loads/stores whose Interleaved-factor ==
+/// \p ReplicationFactor.
+///
+/// For example, the mask for \p ReplicationFactor=3 and \p VF=4 is:
+///
+///   <0,0,0,1,1,1,2,2,2,3,3,3>
+Constant *createReplicatedMask(IRBuilder<> &Builder, unsigned ReplicationFactor,
+                               unsigned VF);
+
 /// Create an interleave shuffle mask.
 ///
 /// This function creates a shuffle mask for interleaving \p NumVecs vectors of
@@ -293,6 +323,23 @@ public:
     propagateMetadata(NewInst, VL);
   }
 
+  /// Returns true if this Group requires a scalar iteration to handle gaps.
+  bool requiresScalarEpilogue() const {
+    // If the last member of the Group exists, then a scalar epilog is not
+    // needed for this group.
+    if (getMember(getFactor() - 1))
+      return false;
+
+    // We have a group with gaps. It therefore cannot be a group of stores,
+    // and it can't be a reversed access, because such groups get invalidated.
+    assert(!getMember(0)->mayWriteToMemory() &&
+           "Group should have been invalidated");
+    assert(!isReverse() && "Group should have been invalidated");
+
+    // This is a group of loads, with gaps, and without a last-member
+    return true;
+  }
+
 private:
   unsigned Factor; // Interleave Factor.
   bool Reverse;
@@ -328,20 +375,31 @@ public:
   InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
                         DominatorTree *DT, LoopInfo *LI,
                         const LoopAccessInfo *LAI)
-    : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
+      : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
+
+  ~InterleavedAccessInfo() { reset(); }
 
-  ~InterleavedAccessInfo() {
+  /// Analyze the interleaved accesses and collect them in interleave
+  /// groups. Substitute symbolic strides using \p Strides.
+  /// Consider also predicated loads/stores in the analysis if
+  /// \p EnableMaskedInterleavedGroup is true.
+  void analyzeInterleaving(bool EnableMaskedInterleavedGroup);
+
+  /// Invalidate groups, e.g., in case all blocks in loop will be predicated
+  /// contrary to original assumption. Although we currently prevent group
+  /// formation for predicated accesses, we may be able to relax this limitation
+  /// in the future once we handle more complicated blocks.
+  void reset() {
     SmallPtrSet<InterleaveGroup *, 4> DelSet;
     // Avoid releasing a pointer twice.
     for (auto &I : InterleaveGroupMap)
       DelSet.insert(I.second);
     for (auto *Ptr : DelSet)
       delete Ptr;
+    InterleaveGroupMap.clear();
+    RequiresScalarEpilogue = false;
   }
 
-  /// Analyze the interleaved accesses and collect them in interleave
-  /// groups. Substitute symbolic strides using \p Strides.
-  void analyzeInterleaving();
 
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
@@ -362,6 +420,11 @@ public:
   /// out-of-bounds requires a scalar epilogue iteration for correctness.
   bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
 
+  /// Invalidate groups that require a scalar epilogue (due to gaps). This can
+  /// happen when optimizing for size forbids a scalar epilogue, and the gap
+  /// cannot be filtered by masking the load/store.
+  void invalidateGroupsRequiringScalarEpilogue();
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
diff --git a/include/llvm/BinaryFormat/Dwarf.def b/include/llvm/BinaryFormat/Dwarf.def
index 6b7a7412f4d92dfa43b54dfe59473556bccb945a..512cc64926db5a6e0692895a6af1660c0e9d6a8f 100644
--- a/include/llvm/BinaryFormat/Dwarf.def
+++ b/include/llvm/BinaryFormat/Dwarf.def
@@ -873,6 +873,7 @@ HANDLE_DWARF_SECTION(DebugTypes, ".debug_types", "debug-types")
 HANDLE_DWARF_SECTION(DebugLine, ".debug_line", "debug-line")
 HANDLE_DWARF_SECTION(DebugLineStr, ".debug_line_str", "debug-line-str")
 HANDLE_DWARF_SECTION(DebugLoc, ".debug_loc", "debug-loc")
+HANDLE_DWARF_SECTION(DebugLoclists, ".debug_loclists", "debug-loclists")
 HANDLE_DWARF_SECTION(DebugFrame, ".debug_frame", "debug-frame")
 HANDLE_DWARF_SECTION(DebugMacro, ".debug_macro", "debug-macro")
 HANDLE_DWARF_SECTION(DebugNames, ".debug_names", "debug-names")
diff --git a/include/llvm/BinaryFormat/ELF.h b/include/llvm/BinaryFormat/ELF.h
index 2e778779117b110919e03c8df6793047758d54f5..ebbf830a60e9db9318dcc2d8a580da094250e74e 100644
--- a/include/llvm/BinaryFormat/ELF.h
+++ b/include/llvm/BinaryFormat/ELF.h
@@ -701,6 +701,7 @@ enum : unsigned {
   EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
   EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
   EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
 
   // Reserved for AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
@@ -708,11 +709,14 @@ enum : unsigned {
 
   // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX906,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX909,
 
-  // Indicates if the xnack target feature is enabled for all code contained in
-  // the object.
+  // Indicates if the "xnack" target feature is enabled for all code contained
+  // in the object.
   EF_AMDGPU_XNACK = 0x100,
+  // Indicates if the "sram-ecc" target feature is enabled for all code
+  // contained in the object.
+  EF_AMDGPU_SRAM_ECC = 0x200,
 };
 
 // ELF Relocation types for AMDGPU
@@ -725,6 +729,38 @@ enum {
 #include "ELFRelocs/BPF.def"
 };
 
+// MSP430 specific e_flags
+enum : unsigned {
+  EF_MSP430_MACH_MSP430x11 = 11,
+  EF_MSP430_MACH_MSP430x11x1 = 110,
+  EF_MSP430_MACH_MSP430x12 = 12,
+  EF_MSP430_MACH_MSP430x13 = 13,
+  EF_MSP430_MACH_MSP430x14 = 14,
+  EF_MSP430_MACH_MSP430x15 = 15,
+  EF_MSP430_MACH_MSP430x16 = 16,
+  EF_MSP430_MACH_MSP430x20 = 20,
+  EF_MSP430_MACH_MSP430x22 = 22,
+  EF_MSP430_MACH_MSP430x23 = 23,
+  EF_MSP430_MACH_MSP430x24 = 24,
+  EF_MSP430_MACH_MSP430x26 = 26,
+  EF_MSP430_MACH_MSP430x31 = 31,
+  EF_MSP430_MACH_MSP430x32 = 32,
+  EF_MSP430_MACH_MSP430x33 = 33,
+  EF_MSP430_MACH_MSP430x41 = 41,
+  EF_MSP430_MACH_MSP430x42 = 42,
+  EF_MSP430_MACH_MSP430x43 = 43,
+  EF_MSP430_MACH_MSP430x44 = 44,
+  EF_MSP430_MACH_MSP430X = 45,
+  EF_MSP430_MACH_MSP430x46 = 46,
+  EF_MSP430_MACH_MSP430x47 = 47,
+  EF_MSP430_MACH_MSP430x54 = 54,
+};
+
+// ELF Relocation types for MSP430
+enum {
+#include "ELFRelocs/MSP430.def"
+};
+
 #undef ELF_RELOC
 
 // Section header.
@@ -829,6 +865,8 @@ enum : unsigned {
   SHT_MIPS_DWARF = 0x7000001e,          // DWARF debugging section.
   SHT_MIPS_ABIFLAGS = 0x7000002a,       // ABI information.
 
+  SHT_MSP430_ATTRIBUTES = 0x70000003U,
+
   SHT_HIPROC = 0x7fffffff,              // Highest processor arch-specific type.
   SHT_LOUSER = 0x80000000,              // Lowest type reserved for applications.
   SHT_HIUSER = 0xffffffff               // Highest type reserved for applications.
diff --git a/include/llvm/BinaryFormat/ELFRelocs/MSP430.def b/include/llvm/BinaryFormat/ELFRelocs/MSP430.def
new file mode 100644
index 0000000000000000000000000000000000000000..96990abf2db4825c13271108ddb8e014703640c5
--- /dev/null
+++ b/include/llvm/BinaryFormat/ELFRelocs/MSP430.def
@@ -0,0 +1,16 @@
+
+#ifndef ELF_RELOC
+#error "ELF_RELOC must be defined"
+#endif
+
+ELF_RELOC(R_MSP430_NONE,               0)
+ELF_RELOC(R_MSP430_32,                 1)
+ELF_RELOC(R_MSP430_10_PCREL,           2)
+ELF_RELOC(R_MSP430_16,                 3)
+ELF_RELOC(R_MSP430_16_PCREL,           4)
+ELF_RELOC(R_MSP430_16_BYTE,            5)
+ELF_RELOC(R_MSP430_16_PCREL_BYTE,      6)
+ELF_RELOC(R_MSP430_2X_PCREL,           7)
+ELF_RELOC(R_MSP430_RL_PCREL,           8)
+ELF_RELOC(R_MSP430_8,                  9)
+ELF_RELOC(R_MSP430_SYM_DIFF,           10)
diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h
index 44dd92ea90103422bdd812e2eae28bd804844046..3d25c9d15e4e7ff2fe73c716fd0425efa443bc06 100644
--- a/include/llvm/BinaryFormat/Wasm.h
+++ b/include/llvm/BinaryFormat/Wasm.h
@@ -214,6 +214,7 @@ enum : unsigned {
 
 enum : unsigned {
   WASM_LIMITS_FLAG_HAS_MAX = 0x1,
+  WASM_LIMITS_FLAG_IS_SHARED = 0x2,
 };
 
 // Kind codes used in the custom "name" section
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index b460cdc0ba1e8c209f110cb5e280e470d35ddd09..224a41bc2b7aba965904741c4567ffa942f01ea6 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -80,6 +80,23 @@ private:
   using BaseT = TargetTransformInfoImplCRTPBase<T>;
   using TTI = TargetTransformInfo;
 
+  /// Estimate a cost of Broadcast as an extract and sequence of insert
+  /// operations.
+  unsigned getBroadcastShuffleOverhead(Type *Ty) {
+    assert(Ty->isVectorTy() && "Can only shuffle vectors");
+    unsigned Cost = 0;
+    // Broadcast cost is equal to the cost of extracting the zero'th element
+    // plus the cost of inserting it into every element of the result vector.
+    Cost += static_cast<T *>(this)->getVectorInstrCost(
+        Instruction::ExtractElement, Ty, 0);
+
+    for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, Ty, i);
+    }
+    return Cost;
+  }
+
   /// Estimate a cost of shuffle as a sequence of extract and insert
   /// operations.
   unsigned getPermuteShuffleOverhead(Type *Ty) {
@@ -554,7 +571,10 @@ public:
   unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                           Type *SubTp) {
     switch (Kind) {
+    case TTI::SK_Broadcast:
+      return getBroadcastShuffleOverhead(Tp);
     case TTI::SK_Select:
+    case TTI::SK_Reverse:
     case TTI::SK_Transpose:
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
@@ -783,8 +803,9 @@ public:
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
-                                      unsigned Alignment,
-                                      unsigned AddressSpace) {
+                                      unsigned Alignment, unsigned AddressSpace,
+                                      bool UseMaskForCond = false,
+                                      bool UseMaskForGaps = false) {
     VectorType *VT = dyn_cast<VectorType>(VecTy);
     assert(VT && "Expect a vector type for interleaved memory op");
 
@@ -795,8 +816,13 @@ public:
     VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts);
 
     // Firstly, the cost of load/store operation.
-    unsigned Cost = static_cast<T *>(this)->getMemoryOpCost(
-        Opcode, VecTy, Alignment, AddressSpace);
+    unsigned Cost;
+    if (UseMaskForCond || UseMaskForGaps)
+      Cost = static_cast<T *>(this)->getMaskedMemoryOpCost(
+          Opcode, VecTy, Alignment, AddressSpace);
+    else
+      Cost = static_cast<T *>(this)->getMemoryOpCost(Opcode, VecTy, Alignment,
+                                                     AddressSpace);
 
     // Legalize the vector type, and get the legalized and unlegalized type
     // sizes.
@@ -892,6 +918,40 @@ public:
                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
     }
 
+    if (!UseMaskForCond)
+      return Cost;
+
+    Type *I8Type = Type::getInt8Ty(VT->getContext());
+    VectorType *MaskVT = VectorType::get(I8Type, NumElts);
+    SubVT = VectorType::get(I8Type, NumSubElts);
+
+    // The Mask shuffling cost is extract all the elements of the Mask
+    // and insert each of them Factor times into the wide vector:
+    //
+    // E.g. an interleaved group with factor 3:
+    //    %mask = icmp ult <8 x i32> %vec1, %vec2
+    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
+    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
+    // The cost is estimated as extract all mask elements from the <8xi1> mask
+    // vector and insert them factor times into the <24xi1> shuffled mask
+    // vector.
+    for (unsigned i = 0; i < NumSubElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::ExtractElement, SubVT, i);
+
+    for (unsigned i = 0; i < NumElts; i++)
+      Cost += static_cast<T *>(this)->getVectorInstrCost(
+          Instruction::InsertElement, MaskVT, i);
+
+    // The Gaps mask is invariant and created outside the loop, therefore the
+    // cost of creating it is not accounted for here. However if we have both
+    // a MaskForGaps and some other mask that guards the execution of the
+    // memory access, we need to account for the cost of And-ing the two masks
+    // inside the loop.
+    if (UseMaskForGaps)
+      Cost += static_cast<T *>(this)->getArithmeticInstrCost(
+          BinaryOperator::And, MaskVT); 
+
     return Cost;
   }
 
@@ -1042,12 +1102,12 @@ public:
     case Intrinsic::minnum:
       ISDs.push_back(ISD::FMINNUM);
       if (FMF.noNaNs())
-        ISDs.push_back(ISD::FMINNAN);
+        ISDs.push_back(ISD::FMINIMUM);
       break;
     case Intrinsic::maxnum:
       ISDs.push_back(ISD::FMAXNUM);
       if (FMF.noNaNs())
-        ISDs.push_back(ISD::FMAXNAN);
+        ISDs.push_back(ISD::FMAXIMUM);
       break;
     case Intrinsic::copysign:
       ISDs.push_back(ISD::FCOPYSIGN);
@@ -1284,12 +1344,13 @@ public:
         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
+      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                                 NumVecElts, Ty);
+                                                 NumVecElts, SubTy);
       ArithCost += ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
-      Ty = VectorType::get(ScalarTy, NumVecElts);
+      Ty = SubTy;
       ++LongVectorCount;
     }
     // The minimal length of the vector is limited by the real length of vector
@@ -1297,8 +1358,8 @@ public:
     // reduction operations are performed on the vectors with the same
     // architecture-dependent length.
     ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
-                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                               NumVecElts, Ty);
+                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+                                               0, Ty);
     ArithCost += (NumReduxLevels - LongVectorCount) *
                  ConcreteTTI->getArithmeticInstrCost(Opcode, Ty);
     return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
@@ -1331,15 +1392,16 @@ public:
         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
     while (NumVecElts > MVTLen) {
       NumVecElts /= 2;
+      Type *SubTy = VectorType::get(ScalarTy, NumVecElts);
       // Assume the pairwise shuffles add a cost.
       ShuffleCost += (IsPairwise + 1) *
                      ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                                 NumVecElts, Ty);
+                                                 NumVecElts, SubTy);
       MinMaxCost +=
           ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
           ConcreteTTI->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                           nullptr);
-      Ty = VectorType::get(ScalarTy, NumVecElts);
+      Ty = SubTy;
       CondTy = VectorType::get(ScalarCondTy, NumVecElts);
       ++LongVectorCount;
     }
@@ -1348,8 +1410,8 @@ public:
     // reduction opertions are perfomed on the vectors with the same
     // architecture-dependent length.
     ShuffleCost += (NumReduxLevels - LongVectorCount) * (IsPairwise + 1) *
-                   ConcreteTTI->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
-                                               NumVecElts, Ty);
+                   ConcreteTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
+                                               0, Ty);
     MinMaxCost +=
         (NumReduxLevels - LongVectorCount) *
         (ConcreteTTI->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, nullptr) +
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 873587651efda04580dc633c446a6fec78edb7a9..e1132ac59c829af869ec17919fcdc8940838e7ce 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -14,12 +14,14 @@
 
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "legalizer"
+using namespace llvm::MIPatternMatch;
 
 namespace llvm {
 class LegalizationArtifactCombiner {
@@ -36,15 +38,17 @@ public:
                         SmallVectorImpl<MachineInstr *> &DeadInsts) {
     if (MI.getOpcode() != TargetOpcode::G_ANYEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+
+    // aext(trunc x) - > aext/copy/trunc x
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      unsigned DstReg = MI.getOperand(0).getReg();
-      unsigned SrcReg = DefMI->getOperand(1).getReg();
-      Builder.setInstr(MI);
-      // We get a copy/trunc/extend depending on the sizes
-      Builder.buildAnyExtOrTrunc(DstReg, SrcReg);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      Builder.buildAnyExtOrTrunc(DstReg, TruncSrc);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -55,24 +59,25 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_ZEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
-      unsigned DstReg = MI.getOperand(0).getReg();
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+
+    // zext(trunc x) - > and (aext/copy/trunc x), mask
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_AND, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.setInstr(MI);
-      unsigned ZExtSrc = MI.getOperand(1).getReg();
-      LLT ZExtSrcTy = MRI.getType(ZExtSrc);
-      APInt Mask = APInt::getAllOnesValue(ZExtSrcTy.getSizeInBits());
-      auto MaskCstMIB = Builder.buildConstant(DstTy, Mask.getZExtValue());
-      unsigned TruncSrc = DefMI->getOperand(1).getReg();
-      // We get a copy/trunc/extend depending on the sizes
-      auto SrcCopyOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrc);
-      Builder.buildAnd(DstReg, SrcCopyOrTrunc, MaskCstMIB);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      LLT SrcTy = MRI.getType(SrcReg);
+      APInt Mask = APInt::getAllOnesValue(SrcTy.getSizeInBits());
+      auto MIBMask = Builder.buildConstant(DstTy, Mask.getZExtValue());
+      Builder.buildAnd(DstReg, Builder.buildAnyExtOrTrunc(DstTy, TruncSrc),
+                       MIBMask);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
@@ -83,33 +88,34 @@ public:
 
     if (MI.getOpcode() != TargetOpcode::G_SEXT)
       return false;
-    if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_TRUNC,
-                                           MI.getOperand(1).getReg(), MRI)) {
-      unsigned DstReg = MI.getOperand(0).getReg();
+
+    Builder.setInstr(MI);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+
+    // sext(trunc x) - > ashr (shl (aext/copy/trunc x), c), c
+    unsigned TruncSrc;
+    if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) {
       LLT DstTy = MRI.getType(DstReg);
       if (isInstUnsupported({TargetOpcode::G_SHL, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_ASHR, {DstTy}}) ||
           isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
         return false;
       LLVM_DEBUG(dbgs() << ".. Combine MI: " << MI;);
-      Builder.setInstr(MI);
-      unsigned SExtSrc = MI.getOperand(1).getReg();
-      LLT SExtSrcTy = MRI.getType(SExtSrc);
-      unsigned SizeDiff = DstTy.getSizeInBits() - SExtSrcTy.getSizeInBits();
-      auto SizeDiffMIB = Builder.buildConstant(DstTy, SizeDiff);
-      unsigned TruncSrcReg = DefMI->getOperand(1).getReg();
-      // We get a copy/trunc/extend depending on the sizes
-      auto SrcCopyExtOrTrunc = Builder.buildAnyExtOrTrunc(DstTy, TruncSrcReg);
-      auto ShlMIB = Builder.buildInstr(TargetOpcode::G_SHL, DstTy,
-                                       SrcCopyExtOrTrunc, SizeDiffMIB);
-      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, ShlMIB, SizeDiffMIB);
-      markInstAndDefDead(MI, *DefMI, DeadInsts);
+      LLT SrcTy = MRI.getType(SrcReg);
+      unsigned ShAmt = DstTy.getSizeInBits() - SrcTy.getSizeInBits();
+      auto MIBShAmt = Builder.buildConstant(DstTy, ShAmt);
+      auto MIBShl = Builder.buildInstr(
+          TargetOpcode::G_SHL, DstTy,
+          Builder.buildAnyExtOrTrunc(DstTy, TruncSrc), MIBShAmt);
+      Builder.buildInstr(TargetOpcode::G_ASHR, DstReg, MIBShl, MIBShAmt);
+      markInstAndDefDead(MI, *MRI.getVRegDef(SrcReg), DeadInsts);
       return true;
     }
     return tryFoldImplicitDef(MI, DeadInsts);
   }
 
-  /// Try to fold sb = EXTEND (G_IMPLICIT_DEF sa) -> sb = G_IMPLICIT_DEF
+  /// Try to fold G_[ASZ]EXT (G_IMPLICIT_DEF).
   bool tryFoldImplicitDef(MachineInstr &MI,
                           SmallVectorImpl<MachineInstr *> &DeadInsts) {
     unsigned Opcode = MI.getOpcode();
@@ -119,13 +125,25 @@ public:
 
     if (MachineInstr *DefMI = getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
                                            MI.getOperand(1).getReg(), MRI)) {
+      Builder.setInstr(MI);
       unsigned DstReg = MI.getOperand(0).getReg();
       LLT DstTy = MRI.getType(DstReg);
-      if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}}))
-        return false;
-      LLVM_DEBUG(dbgs() << ".. Combine EXT(IMPLICIT_DEF) " << MI;);
-      Builder.setInstr(MI);
-      Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg);
+
+      if (Opcode == TargetOpcode::G_ANYEXT) {
+        // G_ANYEXT (G_IMPLICIT_DEF) -> G_IMPLICIT_DEF
+        if (isInstUnsupported({TargetOpcode::G_IMPLICIT_DEF, {DstTy}}))
+          return false;
+        LLVM_DEBUG(dbgs() << ".. Combine G_ANYEXT(G_IMPLICIT_DEF): " << MI;);
+        Builder.buildInstr(TargetOpcode::G_IMPLICIT_DEF, DstReg);
+      } else {
+        // G_[SZ]EXT (G_IMPLICIT_DEF) -> G_CONSTANT 0 because the top
+        // bits will be 0 for G_ZEXT and 0/1 for the G_SEXT.
+        if (isInstUnsupported({TargetOpcode::G_CONSTANT, {DstTy}}))
+          return false;
+        LLVM_DEBUG(dbgs() << ".. Combine G_[SZ]EXT(G_IMPLICIT_DEF): " << MI;);
+        Builder.buildConstant(DstReg, 0);
+      }
+
       markInstAndDefDead(MI, *DefMI, DeadInsts);
       return true;
     }
@@ -277,6 +295,19 @@ private:
     auto Step = LI.getAction(Query);
     return Step.Action == Unsupported || Step.Action == NotFound;
   }
+
+  /// Looks through copy instructions and returns the actual
+  /// source register.
+  unsigned lookThroughCopyInstrs(unsigned Reg) {
+    unsigned TmpReg;
+    while (mi_match(Reg, MRI, m_Copy(m_Reg(TmpReg)))) {
+      if (MRI.getType(TmpReg).isValid())
+        Reg = TmpReg;
+      else
+        break;
+    }
+    return Reg;
+  }
 };
 
 } // namespace llvm
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index a8c26082f2210f496f8f6b32ecadf56214c21604..755805de1b08e35bc4121cabb713053cfee44ab7 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -121,7 +121,7 @@ struct LegalityQuery {
   ArrayRef<LLT> Types;
 
   struct MemDesc {
-    uint64_t Size;
+    uint64_t SizeInBits;
     AtomicOrdering Ordering;
   };
 
@@ -693,6 +693,8 @@ public:
         },
         [=](const LegalityQuery &Query) {
           LLT VecTy = Query.Types[TypeIdx];
+          if (MaxElements == 1)
+            return std::make_pair(TypeIdx, VecTy.getElementType());
           return std::make_pair(
               TypeIdx, LLT::vector(MaxElements, VecTy.getScalarSizeInBits()));
         });
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index ec9c46140d70556f3df5facb3552cc049ac065ff..ac620e4b69c501353ebf0986ede0751aa775d135 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -70,7 +70,7 @@ namespace ISD {
     /// of the frame or return address to return.  An index of zero corresponds
     /// to the current function's frame or return address, an index of one to
     /// the parent's frame or return address, and so on.
-    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR,
+    FRAMEADDR, RETURNADDR, ADDROFRETURNADDR, SPONENTRY,
 
     /// LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
     /// Materializes the offset from the local object pointer of another
@@ -256,6 +256,22 @@ namespace ISD {
     /// Same for multiplication.
     SMULO, UMULO,
 
+    /// RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2
+    /// integers with the same bit width (W). If the true value of LHS + RHS
+    /// exceeds the largest value that can be represented by W bits, the
+    /// resulting value is this maximum value. Otherwise, if this value is less
+    /// than the smallest value that can be represented by W bits, the
+    /// resulting value is this minimum value.
+    SADDSAT, UADDSAT,
+
+    /// RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2
+    /// integers with the same bit width (W). If the true value of LHS - RHS
+    /// exceeds the largest value that can be represented by W bits, the
+    /// resulting value is this maximum value. Otherwise, if this value is less
+    /// than the smallest value that can be represented by W bits, the
+    /// resulting value is this minimum value.
+    SSUBSAT, USUBSAT,
+
     /// Simple binary floating point operators.
     FADD, FSUB, FMUL, FDIV, FREM,
 
@@ -272,7 +288,8 @@ namespace ISD {
     /// They are used to limit optimizations while the DAG is being optimized.
     STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS,
     STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2,
-    STRICT_FRINT, STRICT_FNEARBYINT,
+    STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FMAXNUM, STRICT_FMINNUM,
+    STRICT_FCEIL, STRICT_FFLOOR, STRICT_FROUND, STRICT_FTRUNC,
 
     /// FMA - Perform a * b + c with no intermediate rounding step.
     FMA,
@@ -556,13 +573,23 @@ namespace ISD {
     FCEIL, FTRUNC, FRINT, FNEARBYINT, FROUND, FFLOOR,
     /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
     /// values.
-    /// In the case where a single input is NaN, the non-NaN input is returned.
+    //
+    /// In the case where a single input is a NaN (either signaling or quiet),
+    /// the non-NaN input is returned.
     ///
     /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
     FMINNUM, FMAXNUM,
-    /// FMINNAN/FMAXNAN - Behave identically to FMINNUM/FMAXNUM, except that
-    /// when a single input is NaN, NaN is returned.
-    FMINNAN, FMAXNAN,
+
+    /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on
+    /// two values, following the IEEE-754 2008 definition. This differs from
+    /// FMINNUM/FMAXNUM in the handling of signaling NaNs. If one input is a
+    /// signaling NaN, returns a quiet NaN.
+    FMINNUM_IEEE, FMAXNUM_IEEE,
+
+    /// FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0
+    /// as less than 0.0. While FMINNUM_IEEE/FMAXNUM_IEEE follow IEEE 754-2008
+    /// semantics, FMINIMUM/FMAXIMUM follow IEEE 754-2018 draft semantics.
+    FMINIMUM, FMAXIMUM,
 
     /// FSINCOS - Compute both fsin and fcos as a single operation.
     FSINCOS,
diff --git a/include/llvm/CodeGen/LiveIntervals.h b/include/llvm/CodeGen/LiveIntervals.h
index 291a07a712cb76459d88e2899cbebbb17035247d..16ab1dc475c41bf2ff5901a92c81de829233dd99 100644
--- a/include/llvm/CodeGen/LiveIntervals.h
+++ b/include/llvm/CodeGen/LiveIntervals.h
@@ -198,10 +198,10 @@ class VirtRegMap;
     void pruneValue(LiveRange &LR, SlotIndex Kill,
                     SmallVectorImpl<SlotIndex> *EndPoints);
 
-    /// This function should not be used. Its intend is to tell you that
-    /// you are doing something wrong if you call pruveValue directly on a
+    /// This function should not be used. Its intent is to tell you that you are
+    /// doing something wrong if you call pruneValue directly on a
     /// LiveInterval. Indeed, you are supposed to call pruneValue on the main
-    /// LiveRange and all the LiveRange of the subranges if any.
+    /// LiveRange and all the LiveRanges of the subranges if any.
     LLVM_ATTRIBUTE_UNUSED void pruneValue(LiveInterval &, SlotIndex,
                                           SmallVectorImpl<SlotIndex> *) {
       llvm_unreachable(
diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h
index 301a45066b4c63a62dd374c7ea1d18f193d937d2..7312902e21b71f0c3873c9a7d0a3f596d14c41ba 100644
--- a/include/llvm/CodeGen/LivePhysRegs.h
+++ b/include/llvm/CodeGen/LivePhysRegs.h
@@ -48,7 +48,8 @@ class raw_ostream;
 /// when walking backward/forward through a basic block.
 class LivePhysRegs {
   const TargetRegisterInfo *TRI = nullptr;
-  SparseSet<unsigned> LiveRegs;
+  using RegisterSet = SparseSet<MCPhysReg, identity<MCPhysReg>>;
+  RegisterSet LiveRegs;
 
 public:
   /// Constructs an unitialized set. init() needs to be called to initialize it.
@@ -76,7 +77,7 @@ public:
   bool empty() const { return LiveRegs.empty(); }
 
   /// Adds a physical register and all its sub-registers to the set.
-  void addReg(unsigned Reg) {
+  void addReg(MCPhysReg Reg) {
     assert(TRI && "LivePhysRegs is not initialized.");
     assert(Reg <= TRI->getNumRegs() && "Expected a physical register.");
     for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
@@ -86,7 +87,7 @@ public:
 
   /// Removes a physical register, all its sub-registers, and all its
   /// super-registers from the set.
-  void removeReg(unsigned Reg) {
+  void removeReg(MCPhysReg Reg) {
     assert(TRI && "LivePhysRegs is not initialized.");
     assert(Reg <= TRI->getNumRegs() && "Expected a physical register.");
     for (MCRegAliasIterator R(Reg, TRI, true); R.isValid(); ++R)
@@ -95,7 +96,7 @@ public:
 
   /// Removes physical registers clobbered by the regmask operand \p MO.
   void removeRegsInMask(const MachineOperand &MO,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers =
+        SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> *Clobbers =
         nullptr);
 
   /// Returns true if register \p Reg is contained in the set. This also
@@ -103,10 +104,10 @@ public:
   /// addReg() always adds all sub-registers to the set as well.
   /// Note: Returns false if just some sub registers are live, use available()
   /// when searching a free register.
-  bool contains(unsigned Reg) const { return LiveRegs.count(Reg); }
+  bool contains(MCPhysReg Reg) const { return LiveRegs.count(Reg); }
 
   /// Returns true if register \p Reg and no aliasing register is in the set.
-  bool available(const MachineRegisterInfo &MRI, unsigned Reg) const;
+  bool available(const MachineRegisterInfo &MRI, MCPhysReg Reg) const;
 
   /// Remove defined registers and regmask kills from the set.
   void removeDefs(const MachineInstr &MI);
@@ -126,7 +127,7 @@ public:
   /// defined or clobbered by a regmask.  The operand will identify whether this
   /// is a regmask or register operand.
   void stepForward(const MachineInstr &MI,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers);
+        SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> &Clobbers);
 
   /// Adds all live-in registers of basic block \p MBB.
   /// Live in registers are the registers in the blocks live-in list and the
@@ -143,7 +144,7 @@ public:
   /// registers.
   void addLiveOutsNoPristines(const MachineBasicBlock &MBB);
 
-  using const_iterator = SparseSet<unsigned>::const_iterator;
+  using const_iterator = RegisterSet::const_iterator;
 
   const_iterator begin() const { return LiveRegs.begin(); }
   const_iterator end() const { return LiveRegs.end(); }
diff --git a/include/llvm/CodeGen/LiveRegUnits.h b/include/llvm/CodeGen/LiveRegUnits.h
index 249545906e01d10c11215aa355783b9cd6956aac..5e9dd8b3cdf6983387469fd9975eb171bbfdca85 100644
--- a/include/llvm/CodeGen/LiveRegUnits.h
+++ b/include/llvm/CodeGen/LiveRegUnits.h
@@ -85,14 +85,14 @@ public:
   bool empty() const { return Units.none(); }
 
   /// Adds register units covered by physical register \p Reg.
-  void addReg(unsigned Reg) {
+  void addReg(MCPhysReg Reg) {
     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit)
       Units.set(*Unit);
   }
 
   /// Adds register units covered by physical register \p Reg that are
   /// part of the lanemask \p Mask.
-  void addRegMasked(unsigned Reg, LaneBitmask Mask) {
+  void addRegMasked(MCPhysReg Reg, LaneBitmask Mask) {
     for (MCRegUnitMaskIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
       LaneBitmask UnitMask = (*Unit).second;
       if (UnitMask.none() || (UnitMask & Mask).any())
@@ -101,7 +101,7 @@ public:
   }
 
   /// Removes all register units covered by physical register \p Reg.
-  void removeReg(unsigned Reg) {
+  void removeReg(MCPhysReg Reg) {
     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit)
       Units.reset(*Unit);
   }
@@ -115,7 +115,7 @@ public:
   void addRegsInMask(const uint32_t *RegMask);
 
   /// Returns true if no part of physical register \p Reg is live.
-  bool available(unsigned Reg) const {
+  bool available(MCPhysReg Reg) const {
     for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
       if (Units.test(*Unit))
         return false;
diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h
index dc9057521e7e48935553786d68b35f9043c2f919..98ac81915dc05c25d153d2b211b9eb798acae5c5 100644
--- a/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/include/llvm/CodeGen/MIRYamlMapping.h
@@ -494,6 +494,7 @@ struct MachineFunction {
   bool FailedISel = false;
   // Register information
   bool TracksRegLiveness = false;
+  bool HasWinCFI = false;
   std::vector<VirtualRegisterDefinition> VirtualRegisters;
   std::vector<MachineFunctionLiveIn> LiveIns;
   Optional<std::vector<FlowStringValue>> CalleeSavedRegisters;
@@ -517,6 +518,7 @@ template <> struct MappingTraits<MachineFunction> {
     YamlIO.mapOptional("selected", MF.Selected, false);
     YamlIO.mapOptional("failedISel", MF.FailedISel, false);
     YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false);
+    YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false);
     YamlIO.mapOptional("registers", MF.VirtualRegisters,
                        std::vector<VirtualRegisterDefinition>());
     YamlIO.mapOptional("liveins", MF.LiveIns,
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 7471b31484644e47c76b316f2826b5563070dbe2..35305bd53b28908fe0fcf45396822838c1043c5c 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -58,6 +58,7 @@ class DILocalVariable;
 class DILocation;
 class Function;
 class GlobalValue;
+class LLVMTargetMachine;
 class MachineConstantPool;
 class MachineFrameInfo;
 class MachineFunction;
@@ -70,7 +71,6 @@ class Pass;
 class PseudoSourceValueManager;
 class raw_ostream;
 class SlotIndexes;
-class TargetMachine;
 class TargetRegisterClass;
 class TargetSubtargetInfo;
 struct WasmEHFuncInfo;
@@ -225,7 +225,7 @@ struct LandingPadInfo {
 
 class MachineFunction {
   const Function &F;
-  const TargetMachine &Target;
+  const LLVMTargetMachine &Target;
   const TargetSubtargetInfo *STI;
   MCContext &Ctx;
   MachineModuleInfo &MMI;
@@ -316,6 +316,9 @@ class MachineFunction {
   /// Map a landing pad's EH symbol to the call site indexes.
   DenseMap<MCSymbol*, SmallVector<unsigned, 4>> LPadToCallSiteMap;
 
+  /// Map a landing pad to its index.
+  DenseMap<const MachineBasicBlock *, unsigned> WasmLPadToIndexMap;
+
   /// Map of invoke call site index values to associated begin EH_LABEL.
   DenseMap<MCSymbol*, unsigned> CallSiteMap;
 
@@ -385,7 +388,7 @@ public:
   using VariableDbgInfoMapTy = SmallVector<VariableDbgInfo, 4>;
   VariableDbgInfoMapTy VariableDbgInfos;
 
-  MachineFunction(const Function &F, const TargetMachine &Target,
+  MachineFunction(const Function &F, const LLVMTargetMachine &Target,
                   const TargetSubtargetInfo &STI, unsigned FunctionNum,
                   MachineModuleInfo &MMI);
   MachineFunction(const MachineFunction &) = delete;
@@ -433,7 +436,7 @@ public:
   unsigned getFunctionNumber() const { return FunctionNumber; }
 
   /// getTarget - Return the target machine this machine code is compiled with
-  const TargetMachine &getTarget() const { return Target; }
+  const LLVMTargetMachine &getTarget() const { return Target; }
 
   /// getSubtarget - Return the subtarget for which this machine code is being
   /// compiled.
@@ -810,7 +813,8 @@ public:
   LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad);
 
   /// Remap landing pad labels and remove any deleted landing pads.
-  void tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap = nullptr);
+  void tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap = nullptr,
+                       bool TidyIfNoBeginLabels = true);
 
   /// Return a reference to the landing pad info for the current function.
   const std::vector<LandingPadInfo> &getLandingPads() const {
@@ -853,6 +857,22 @@ public:
   /// Map the landing pad's EH symbol to the call site indexes.
   void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef<unsigned> Sites);
 
+  /// Map the landing pad to its index. Used for Wasm exception handling.
+  void setWasmLandingPadIndex(const MachineBasicBlock *LPad, unsigned Index) {
+    WasmLPadToIndexMap[LPad] = Index;
+  }
+
+  /// Returns true if the landing pad has an associate index in wasm EH.
+  bool hasWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
+    return WasmLPadToIndexMap.count(LPad);
+  }
+
+  /// Get the index in wasm EH for a given landing pad.
+  unsigned getWasmLandingPadIndex(const MachineBasicBlock *LPad) const {
+    assert(hasWasmLandingPadIndex(LPad));
+    return WasmLPadToIndexMap.lookup(LPad);
+  }
+
   /// Get the call site indexes for a landing pad EH symbol.
   SmallVectorImpl<unsigned> &getCallSiteLandingPad(MCSymbol *Sym) {
     assert(hasCallSiteLandingPad(Sym) &&
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 7c4e771ce7215aad394bfcd59832056f464259f4..ea1a2a536fc7377e0e33eadbee60acbbdb3be6a6 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -408,7 +408,7 @@ public:
   /// Returns the opcode of this MachineInstr.
   unsigned getOpcode() const { return MCID->Opcode; }
 
-  /// Access to explicit operands of the instruction.
+  /// Retuns the total number of operands.
   unsigned getNumOperands() const { return NumOperands; }
 
   const MachineOperand& getOperand(unsigned i) const {
diff --git a/include/llvm/CodeGen/MachineModuleInfo.h b/include/llvm/CodeGen/MachineModuleInfo.h
index 554e89019b76fd8b345e9340ff99f992dfd29b5b..4371420bc7a2bb83f935a0ddb50d47b99c9d1bdd 100644
--- a/include/llvm/CodeGen/MachineModuleInfo.h
+++ b/include/llvm/CodeGen/MachineModuleInfo.h
@@ -46,10 +46,10 @@ namespace llvm {
 class BasicBlock;
 class CallInst;
 class Function;
-class MachineFunction;
+class LLVMTargetMachine;
 class MMIAddrLabelMap;
+class MachineFunction;
 class Module;
-class TargetMachine;
 
 //===----------------------------------------------------------------------===//
 /// This class can be derived from and used by targets to hold private
@@ -76,7 +76,7 @@ protected:
 /// for specific use.
 ///
 class MachineModuleInfo : public ImmutablePass {
-  const TargetMachine &TM;
+  const LLVMTargetMachine &TM;
 
   /// This is the MCContext used for the entire code generator.
   MCContext Context;
@@ -145,7 +145,7 @@ class MachineModuleInfo : public ImmutablePass {
 public:
   static char ID; // Pass identification, replacement for typeid
 
-  explicit MachineModuleInfo(const TargetMachine *TM = nullptr);
+  explicit MachineModuleInfo(const LLVMTargetMachine *TM = nullptr);
   ~MachineModuleInfo() override;
 
   // Initialization and Finalization
diff --git a/include/llvm/CodeGen/MachineOutliner.h b/include/llvm/CodeGen/MachineOutliner.h
index 95bfc24b57ffad600775ca95bf9c4238f1f3fd61..eaa741353abb0c4cb27298ac95bfe5bdeedd8dc5 100644
--- a/include/llvm/CodeGen/MachineOutliner.h
+++ b/include/llvm/CodeGen/MachineOutliner.h
@@ -169,9 +169,6 @@ public:
   /// This is initialized after we go through and create the actual function.
   MachineFunction *MF = nullptr;
 
-  /// A number assigned to this function which appears at the end of its name.
-  unsigned Name;
-
   /// The sequence of integers corresponding to the instructions in this
   /// function.
   std::vector<unsigned> Sequence;
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index a6836a53f048dc344a0f107e5a14492eb6af213c..fef010a23ef9d3b3e9477f4d73fa1d79c64c2392 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -689,15 +689,14 @@ public:
                                                unsigned MinNumRegs = 0);
 
   /// Constrain the register class or the register bank of the virtual register
-  /// \p Reg to be a common subclass and a common bank of both registers
-  /// provided respectively. Do nothing if any of the attributes (classes,
-  /// banks, or low-level types) of the registers are deemed incompatible, or if
-  /// the resulting register will have a class smaller than before and of size
-  /// less than \p MinNumRegs. Return true if such register attributes exist,
-  /// false otherwise.
+  /// \p Reg (and low-level type) to be a common subclass or a common bank of
+  /// both registers provided respectively (and a common low-level type). Do
+  /// nothing if any of the attributes (classes, banks, or low-level types) of
+  /// the registers are deemed incompatible, or if the resulting register will
+  /// have a class smaller than before and of size less than \p MinNumRegs.
+  /// Return true if such register attributes exist, false otherwise.
   ///
-  /// \note Assumes that each register has either a low-level type or a class
-  /// assigned, but not both. Use this method instead of constrainRegClass and
+  /// \note Use this method instead of constrainRegClass and
   /// RegisterBankInfo::constrainGenericRegister everywhere but SelectionDAG
   /// ISel / FastISel and GlobalISel's InstructionSelect pass respectively.
   bool constrainRegAttrs(unsigned Reg, unsigned ConstrainingReg,
diff --git a/include/llvm/CodeGen/RegisterUsageInfo.h b/include/llvm/CodeGen/RegisterUsageInfo.h
index efd175eeed3041b223b1a3b4670caad121281e14..efecc61d9c3039dbb7ef4c12278c78b565f64dce 100644
--- a/include/llvm/CodeGen/RegisterUsageInfo.h
+++ b/include/llvm/CodeGen/RegisterUsageInfo.h
@@ -29,7 +29,7 @@
 namespace llvm {
 
 class Function;
-class TargetMachine;
+class LLVMTargetMachine;
 
 class PhysicalRegisterUsageInfo : public ImmutablePass {
 public:
@@ -41,7 +41,7 @@ public:
   }
 
   /// Set TargetMachine which is used to print analysis.
-  void setTargetMachine(const TargetMachine &TM);
+  void setTargetMachine(const LLVMTargetMachine &TM);
 
   bool doInitialization(Module &M) override;
 
@@ -63,7 +63,7 @@ private:
   /// and 1 means content of register will be preserved around function call.
   DenseMap<const Function *, std::vector<uint32_t>> RegMasks;
 
-  const TargetMachine *TM;
+  const LLVMTargetMachine *TM;
 };
 
 } // end namespace llvm
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index f2b072768b2597e5694e348ed919fbc4f9fc15e1..0870d67db390c4dfee649df311d8e633ce9e49bc 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -33,15 +33,15 @@
 namespace llvm {
 
 template<class Graph> class GraphWriter;
+class LLVMTargetMachine;
 class MachineFunction;
 class MachineRegisterInfo;
 class MCInstrDesc;
 struct MCSchedClassDesc;
-class ScheduleDAG;
 class SDNode;
 class SUnit;
+class ScheduleDAG;
 class TargetInstrInfo;
-class TargetMachine;
 class TargetRegisterClass;
 class TargetRegisterInfo;
 
@@ -558,7 +558,7 @@ class TargetRegisterInfo;
 
   class ScheduleDAG {
   public:
-    const TargetMachine &TM;            ///< Target processor
+    const LLVMTargetMachine &TM;        ///< Target processor
     const TargetInstrInfo *TII;         ///< Target instruction information
     const TargetRegisterInfo *TRI;      ///< Target processor register info
     MachineFunction &MF;                ///< Machine function
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 973a3ddb1bace2bf6766e300db662ce7c9748cd3..3b144b92e2a6eb93499429e32741c2f93ec2d911 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -786,24 +786,6 @@ public:
   /// value assuming it was the smaller SrcTy value.
   SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT);
 
-  /// Return an operation which will any-extend the low lanes of the operand
-  /// into the specified vector type. For example,
-  /// this can convert a v16i8 into a v4i32 by any-extending the low four
-  /// lanes of the operand from i8 to i32.
-  SDValue getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
-
-  /// Return an operation which will sign extend the low lanes of the operand
-  /// into the specified vector type. For example,
-  /// this can convert a v16i8 into a v4i32 by sign extending the low four
-  /// lanes of the operand from i8 to i32.
-  SDValue getSignExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
-
-  /// Return an operation which will zero extend the low lanes of the operand
-  /// into the specified vector type. For example,
-  /// this can convert a v16i8 into a v4i32 by zero extending the low four
-  /// lanes of the operand from i8 to i32.
-  SDValue getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL, EVT VT);
-
   /// Convert Op, which must be of integer type, to the integer type VT,
   /// by using an extension appropriate for the target's
   /// BooleanContent for type OpVT or truncating it.
diff --git a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 580606441a9d0601d0ffad7efd32e456c070dded..2b2c48d57bc0fac3a04f9dbf09491194ab843c68 100644
--- a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -45,18 +45,21 @@ public:
         IsIndexSignExt(IsIndexSignExt) {}
 
   SDValue getBase() { return Base; }
+  SDValue getBase() const { return Base; }
   SDValue getIndex() { return Index; }
+  SDValue getIndex() const { return Index; }
 
-  bool equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG) {
+  bool equalBaseIndex(const BaseIndexOffset &Other,
+                      const SelectionDAG &DAG) const {
     int64_t Off;
     return equalBaseIndex(Other, DAG, Off);
   }
 
-  bool equalBaseIndex(BaseIndexOffset &Other, const SelectionDAG &DAG,
-                      int64_t &Off);
+  bool equalBaseIndex(const BaseIndexOffset &Other, const SelectionDAG &DAG,
+                      int64_t &Off) const;
 
   /// Parses tree in Ptr for base, index, offset addresses.
-  static BaseIndexOffset match(LSBaseSDNode *N, const SelectionDAG &DAG);
+  static BaseIndexOffset match(const LSBaseSDNode *N, const SelectionDAG &DAG);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 28d27b7a459d685fef095d3afda7adba654daced..d125e888a5742d145f10ea43ec8046e917dcc078 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -672,6 +672,12 @@ public:
       case ISD::STRICT_FLOG2:
       case ISD::STRICT_FRINT:
       case ISD::STRICT_FNEARBYINT:
+      case ISD::STRICT_FMAXNUM:
+      case ISD::STRICT_FMINNUM:
+      case ISD::STRICT_FCEIL:
+      case ISD::STRICT_FFLOOR:
+      case ISD::STRICT_FROUND:
+      case ISD::STRICT_FTRUNC:
         return true;
     }
   }
diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index 55082222b7aea7d7fb048a80f36e5bf6c0b41b51..8c8a7be459fd5a605c23b3b16b30fda12e22ff06 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -413,10 +413,14 @@ class raw_ostream;
     /// Returns the base index for the given instruction.
     SlotIndex getInstructionIndex(const MachineInstr &MI) const {
       // Instructions inside a bundle have the same number as the bundle itself.
-      const MachineInstr &BundleStart = *getBundleStart(MI.getIterator());
-      assert(!BundleStart.isDebugInstr() &&
+      auto BundleStart = getBundleStart(MI.getIterator());
+      auto BundleEnd = getBundleEnd(MI.getIterator());
+      // Use the first non-debug instruction in the bundle to get SlotIndex.
+      const MachineInstr &BundleNonDebug =
+          *skipDebugInstructionsForward(BundleStart, BundleEnd);
+      assert(!BundleNonDebug.isDebugInstr() &&
              "Could not use a debug instruction to query mi2iMap.");
-      Mi2IndexMap::const_iterator itr = mi2iMap.find(&BundleStart);
+      Mi2IndexMap::const_iterator itr = mi2iMap.find(&BundleNonDebug);
       assert(itr != mi2iMap.end() && "Instruction not found in maps.");
       return itr->second;
     }
@@ -444,7 +448,7 @@ class raw_ostream;
     /// MI is not required to have an index.
     SlotIndex getIndexBefore(const MachineInstr &MI) const {
       const MachineBasicBlock *MBB = MI.getParent();
-      assert(MBB && "MI must be inserted inna basic block");
+      assert(MBB && "MI must be inserted in a basic block");
       MachineBasicBlock::const_iterator I = MI, B = MBB->begin();
       while (true) {
         if (I == B)
@@ -461,7 +465,7 @@ class raw_ostream;
     /// MI is not required to have an index.
     SlotIndex getIndexAfter(const MachineInstr &MI) const {
       const MachineBasicBlock *MBB = MI.getParent();
-      assert(MBB && "MI must be inserted inna basic block");
+      assert(MBB && "MI must be inserted in a basic block");
       MachineBasicBlock::const_iterator I = MI, E = MBB->end();
       while (true) {
         ++I;
diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h
index a5939070476e3e4f24c858e6e2d67436dbdf5145..38e575b1360fb1e6d38eec123d348ef058d8468b 100644
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@@ -279,7 +279,7 @@ public:
 
   /// Return the preferred vector type legalization action.
   virtual TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const {
+  getPreferredVectorAction(MVT VT) const {
     // The default action for one element vectors is to scalarize
     if (VT.getVectorNumElements() == 1)
       return TypeScalarizeVector;
@@ -819,6 +819,12 @@ public:
       case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
       case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
       case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
+      case ISD::STRICT_FMAXNUM: EqOpc = ISD::FMAXNUM; break;
+      case ISD::STRICT_FMINNUM: EqOpc = ISD::FMINNUM; break;
+      case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break;
+      case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break;
+      case ISD::STRICT_FROUND: EqOpc = ISD::FROUND; break;
+      case ISD::STRICT_FTRUNC: EqOpc = ISD::FTRUNC; break;
     }
 
     auto Action = getOperationAction(EqOpc, VT);
@@ -2058,6 +2064,14 @@ public:
     return true;
   }
 
+  /// Return true if the specified immediate is legal for the value input of a
+  /// store instruction.
+  virtual bool isLegalStoreImmediate(int64_t Value) const {
+    // Default implementation assumes that at least 0 works since it is likely
+    // that a zero register exists or a zero immediate is allowed.
+    return Value == 0;
+  }
+
   /// Return true if it's significantly cheaper to shift a vector by a uniform
   /// scalar than by an amount which will vary across each lane. On x86, for
   /// example, there is a "psllw" instruction for the former case, but no simple
@@ -2091,8 +2105,8 @@ public:
     case ISD::ADDE:
     case ISD::FMINNUM:
     case ISD::FMAXNUM:
-    case ISD::FMINNAN:
-    case ISD::FMAXNAN:
+    case ISD::FMINIMUM:
+    case ISD::FMAXIMUM:
       return true;
     default: return false;
     }
@@ -2908,11 +2922,22 @@ public:
   /// elements, returning true on success. Otherwise, analyze the expression and
   /// return a mask of KnownUndef and KnownZero elements for the expression
   /// (used to simplify the caller). The KnownUndef/Zero elements may only be
-  /// accurate for those bits in the DemandedMask
+  /// accurate for those bits in the DemandedMask.
   virtual bool SimplifyDemandedVectorEltsForTargetNode(
       SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
       APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
+  /// Attempt to simplify any target nodes based on the demanded bits,
+  /// returning true on success. Otherwise, analyze the
+  /// expression and return a mask of KnownOne and KnownZero bits for the
+  /// expression (used to simplify the caller).  The KnownZero/One bits may only
+  /// be accurate for those bits in the DemandedMask.
+  virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                                 const APInt &DemandedBits,
+                                                 KnownBits &Known,
+                                                 TargetLoweringOpt &TLO,
+                                                 unsigned Depth = 0) const;
+
   /// If \p SNaN is false, \returns true if \p Op is known to never be any
   /// NaN. If \p sNaN is true, returns if \p Op is known to never be a signaling
   /// NaN.
@@ -3644,6 +3669,42 @@ public:
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
 
+  /// Expand float to UINT conversion
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandFP_TO_UINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
+  /// Expand UINT(i64) to double(f64) conversion
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandUINT_TO_FP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
+  /// Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
+  SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const;
+
+  /// Expand CTPOP nodes. Expands vector/scalar CTPOP nodes,
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandCTPOP(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
+  /// Expand CTLZ/CTLZ_ZERO_UNDEF nodes. Expands vector/scalar CTLZ nodes,
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandCTLZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
+  /// Expand CTTZ/CTTZ_ZERO_UNDEF nodes. Expands vector/scalar CTTZ nodes,
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandCTTZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns MERGE_VALUEs of the scalar loads with their chains.
@@ -3681,6 +3742,11 @@ public:
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                   SDValue Index) const;
 
+  /// Method for building the DAG expansion of ISD::[US][ADD|SUB]SAT. This
+  /// method accepts integers or vectors of integers as its arguments.
+  SDValue getExpandedSaturationAdditionSubtraction(SDNode *Node,
+                                                   SelectionDAG &DAG) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h
index 8f5c9cb8c3fa8a0c1f4850b36752bc5aa95972db..7fda8751d40ae9a296b0ca177c37f29f5ac854db 100644
--- a/include/llvm/CodeGen/TargetPassConfig.h
+++ b/include/llvm/CodeGen/TargetPassConfig.h
@@ -145,13 +145,13 @@ public:
 
   CodeGenOpt::Level getOptLevel() const;
 
-  /// Describe the status of the codegen
-  /// pipeline set by this target pass config.
-  /// Having a limited codegen pipeline means that options
-  /// have been used to restrict what codegen is doing.
-  /// In particular, that means that codegen won't emit
-  /// assembly code.
-  bool hasLimitedCodeGenPipeline() const;
+  /// Returns true if one of the `-start-after`, `-start-before`, `-stop-after`
+  /// or `-stop-before` options is set.
+  static bool hasLimitedCodeGenPipeline();
+
+  /// Returns true if none of the `-stop-before` and `-stop-after` options is
+  /// set.
+  static bool willCompleteCodeGenPipeline();
 
   /// If hasLimitedCodeGenPipeline is true, this method
   /// returns a string with the name of the options, separated
@@ -159,13 +159,6 @@ public:
   std::string
   getLimitedCodeGenPipelineReason(const char *Separator = "/") const;
 
-  /// Check if the codegen pipeline is limited in such a way that it
-  /// won't be complete. When the codegen pipeline is not complete,
-  /// this means it may not be possible to generate assembly from it.
-  bool willCompleteCodeGenPipeline() const {
-    return !hasLimitedCodeGenPipeline() || (!StopAfter && !StopBefore);
-  }
-
   void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); }
 
   bool getEnableTailMerge() const { return EnableTailMerge; }
diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h
index e28673de2253081cadf66290799f0cc9143aa3b0..968e4c4b810273ae3873459158d7f611358f9079 100644
--- a/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -169,6 +169,19 @@ public:
     return isZeroIdiom(MI, Mask);
   }
 
+  /// Returns true if MI is a candidate for move elimination.
+  ///
+  /// A candidate for move elimination may be optimized out at register renaming
+  /// stage. Subtargets can specify the set of optimizable moves by
+  /// instantiating tablegen class `IsOptimizableRegisterMove` (see
+  /// llvm/Target/TargetInstrPredicate.td).
+  ///
+  /// SubtargetEmitter is responsible for processing all the definitions of class
+  /// IsOptimizableRegisterMove, and auto-generate an override for this method.
+  virtual bool isOptimizableRegisterMove(const MachineInstr *MI) const {
+    return false;
+  }
+
   /// True if the subtarget should run MachineScheduler after aggressive
   /// coalescing.
   ///
diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
index b5479db97a1508dbe8fb199df6bf4c78d9ab7de7..6b5dd2d20d170b60bd8a5bf9ed3b1eb6dadd187f 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
@@ -47,7 +47,7 @@ public:
     return Error::success();
   }
   template <typename T> static Expected<T> deserializeAs(CVSymbol Symbol) {
-    T Record(Symbol.kind());
+    T Record(static_cast<SymbolRecordKind>(Symbol.kind()));
     if (auto EC = deserializeAs<T>(Symbol, Record))
       return std::move(EC);
     return Record;
diff --git a/include/llvm/DebugInfo/CodeView/TypeIndex.h b/include/llvm/DebugInfo/CodeView/TypeIndex.h
index 681b5f3aca9433182e99ff7d3cb312e7ec02a24b..58463a6b13dff3ed560644515b73f931f233d621 100644
--- a/include/llvm/DebugInfo/CodeView/TypeIndex.h
+++ b/include/llvm/DebugInfo/CodeView/TypeIndex.h
@@ -145,6 +145,13 @@ public:
     return TypeIndex(SimpleTypeKind::Void, SimpleTypeMode::NearPointer64);
   }
 
+  static TypeIndex NullptrT() {
+    // std::nullptr_t uses the pointer mode that doesn't indicate bit-width,
+    // presumably because std::nullptr_t is intended to be compatible with any
+    // pointer type.
+    return TypeIndex(SimpleTypeKind::Void, SimpleTypeMode::NearPointer);
+  }
+
   static TypeIndex SignedCharacter() {
     return TypeIndex(SimpleTypeKind::SignedCharacter);
   }
diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h
index ee6f53854e7ab302e6ffb63357deb08c8d6ef0e9..76f1f98ab660ae678ed5e4a50877eed37212aa24 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -429,6 +429,10 @@ public:
     return (Options & ClassOptions::ForwardReference) != ClassOptions::None;
   }
 
+  bool isScoped() const {
+    return (Options & ClassOptions::Scoped) != ClassOptions::None;
+  }
+
   uint16_t getMemberCount() const { return MemberCount; }
   ClassOptions getOptions() const { return Options; }
   TypeIndex getFieldList() const { return FieldList; }
@@ -655,7 +659,17 @@ public:
 
   ArrayRef<TypeIndex> getArgs() const { return ArgIndices; }
 
-  SmallVector<TypeIndex, 4> ArgIndices;
+  /// Indices of known build info arguments.
+  enum BuildInfoArg {
+    CurrentDirectory, ///< Absolute CWD path
+    BuildTool,        ///< Absolute compiler path
+    SourceFile,       ///< Path to main source file, relative or absolute
+    TypeServerPDB,    ///< Absolute path of type server PDB (/Fd)
+    CommandLine,      ///< Full canonical command line (maybe -cc1)
+    MaxArgs
+  };
+
+  SmallVector<TypeIndex, MaxArgs> ArgIndices;
 };
 
 // LF_VFTABLE
diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
index 583740d2eb4b78fc31a8ec474277a45b254a8854..a84f074237de0a8784903627c7b554515711cb98 100644
--- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
+++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
@@ -83,18 +83,21 @@ Error mergeIdRecords(MergingTypeTableBuilder &Dest, ArrayRef<TypeIndex> Types,
 Error mergeTypeAndIdRecords(MergingTypeTableBuilder &DestIds,
                             MergingTypeTableBuilder &DestTypes,
                             SmallVectorImpl<TypeIndex> &SourceToDest,
-                            const CVTypeArray &IdsAndTypes);
+                            const CVTypeArray &IdsAndTypes,
+                            Optional<EndPrecompRecord> &EndPrecomp);
 
 Error mergeTypeAndIdRecords(GlobalTypeTableBuilder &DestIds,
                             GlobalTypeTableBuilder &DestTypes,
                             SmallVectorImpl<TypeIndex> &SourceToDest,
                             const CVTypeArray &IdsAndTypes,
-                            ArrayRef<GloballyHashedType> Hashes);
+                            ArrayRef<GloballyHashedType> Hashes,
+                            Optional<EndPrecompRecord> &EndPrecomp);
 
 Error mergeTypeRecords(GlobalTypeTableBuilder &Dest,
                        SmallVectorImpl<TypeIndex> &SourceToDest,
                        const CVTypeArray &Types,
-                       ArrayRef<GloballyHashedType> Hashes);
+                       ArrayRef<GloballyHashedType> Hashes,
+                       Optional<EndPrecompRecord> &EndPrecomp);
 
 Error mergeIdRecords(GlobalTypeTableBuilder &Dest, ArrayRef<TypeIndex> Types,
                      SmallVectorImpl<TypeIndex> &SourceToDest,
diff --git a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
index 27d56d72f0a7ac819dc796f08680e2d5bc7ff3de..33797419a7b8fb687ab0da09902a81bbeb51dbcc 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
@@ -18,13 +18,13 @@ namespace llvm {
 class DWARFCompileUnit : public DWARFUnit {
 public:
   DWARFCompileUnit(DWARFContext &Context, const DWARFSection &Section,
-                   const DWARFUnitHeader &Header,
-                   const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+                   const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+                   const DWARFSection *RS, const DWARFSection *LocSection,
                    StringRef SS, const DWARFSection &SOS,
                    const DWARFSection *AOS, const DWARFSection &LS, bool LE,
                    bool IsDWO, const DWARFUnitVector &UnitVector)
-      : DWARFUnit(Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
-                  UnitVector) {}
+      : DWARFUnit(Context, Section, Header, DA, RS, LocSection, SS, SOS, AOS,
+                  LS, LE, IsDWO, UnitVector) {}
 
   /// VTable anchor.
   ~DWARFCompileUnit() override;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index c5b98ea5a2aacb321e1f689a95519c63437641f1..221f1f796980f343eb92ce666b54c679f8b6896c 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -76,7 +76,7 @@ class DWARFContext : public DIContext {
 
   DWARFUnitVector DWOUnits;
   std::unique_ptr<DWARFDebugAbbrev> AbbrevDWO;
-  std::unique_ptr<DWARFDebugLocDWO> LocDWO;
+  std::unique_ptr<DWARFDebugLoclists> LocDWO;
 
   /// The maximum DWARF version of all units.
   unsigned MaxVersion = 0;
@@ -262,7 +262,7 @@ public:
   const DWARFDebugAbbrev *getDebugAbbrevDWO();
 
   /// Get a pointer to the parsed DebugLoc object.
-  const DWARFDebugLocDWO *getDebugLocDWO();
+  const DWARFDebugLoclists *getDebugLocDWO();
 
   /// Get a pointer to the parsed DebugAranges object.
   const DWARFDebugAranges *getDebugAranges();
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
index 9a73745fb6b4d82ed7f5fc0f9f7ac7bd9ce91cf7..da2098e15402bb883913216cd4894740a55986f3 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -73,19 +73,21 @@ public:
                                               uint32_t *Offset);
 };
 
-class DWARFDebugLocDWO {
+class DWARFDebugLoclists {
 public:
   struct Entry {
-    uint64_t Start;
-    uint32_t Length;
+    uint8_t Kind;
+    uint64_t Value0;
+    uint64_t Value1;
     SmallVector<char, 4> Loc;
   };
 
   struct LocationList {
     unsigned Offset;
     SmallVector<Entry, 2> Entries;
-    void dump(raw_ostream &OS, bool IsLittleEndian, unsigned AddressSize,
-              const MCRegisterInfo *RegInfo, unsigned Indent) const;
+    void dump(raw_ostream &OS, uint64_t BaseAddr, bool IsLittleEndian,
+              unsigned AddressSize, const MCRegisterInfo *RegInfo,
+              unsigned Indent) const;
   };
 
 private:
@@ -98,15 +100,15 @@ private:
   bool IsLittleEndian;
 
 public:
-  void parse(DataExtractor data);
-  void dump(raw_ostream &OS, const MCRegisterInfo *RegInfo,
+  void parse(DataExtractor data, unsigned Version);
+  void dump(raw_ostream &OS, uint64_t BaseAddr, const MCRegisterInfo *RegInfo,
             Optional<uint64_t> Offset) const;
 
   /// Return the location list at the given offset or nullptr.
   LocationList const *getLocationListAtOffset(uint64_t Offset) const;
 
-  static Optional<LocationList> parseOneLocationList(DataExtractor Data,
-                                                     uint32_t *Offset);
+  static Optional<LocationList>
+  parseOneLocationList(DataExtractor Data, unsigned *Offset, unsigned Version);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index ce7436d9faa387f93053ed159826da0008a2ca92..bc26edf006477350734b9130f53c58aa4394f359 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -18,7 +18,6 @@
 
 namespace llvm {
 
-struct BaseAddress;
 class raw_ostream;
 
 class DWARFDebugRangeList {
@@ -78,7 +77,7 @@ public:
   /// list. Has to be passed base address of the compile unit referencing this
   /// range list.
   DWARFAddressRangesVector
-  getAbsoluteRanges(llvm::Optional<BaseAddress> BaseAddr) const;
+  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr) const;
 };
 
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
index e2e8ab5ed2193bb50d1c43511b26cee861888524..5cc8d789e5980e1311d4cad3907fce076519586b 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
 #define LLVM_DEBUGINFO_DWARFDEBUGRNGLISTS_H
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
@@ -23,6 +24,7 @@ namespace llvm {
 
 class Error;
 class raw_ostream;
+class DWARFUnit;
 
 /// A class representing a single range list entry.
 struct RangeListEntry : public DWARFListEntryBase {
@@ -35,7 +37,9 @@ struct RangeListEntry : public DWARFListEntryBase {
 
   Error extract(DWARFDataExtractor Data, uint32_t End, uint32_t *OffsetPtr);
   void dump(raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
-            uint64_t &CurrentBase, DIDumpOptions DumpOpts) const;
+            uint64_t &CurrentBase, DIDumpOptions DumpOpts,
+            llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                LookupPooledAddress) const;
   bool isSentinel() const { return EntryKind == dwarf::DW_RLE_end_of_list; }
 };
 
@@ -44,7 +48,8 @@ class DWARFDebugRnglist : public DWARFListType<RangeListEntry> {
 public:
   /// Build a DWARFAddressRangesVector from a rangelist.
   DWARFAddressRangesVector
-  getAbsoluteRanges(llvm::Optional<BaseAddress> BaseAddr) const;
+  getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
+                    DWARFUnit &U) const;
 };
 
 class DWARFDebugRnglistTable : public DWARFListTableBase<DWARFDebugRnglist> {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h
index c77034f6348fe336ab3ff3491dc71cb7c3cab78f..baa47c2bfa580494c7bb1996bd1eca24dabbd662 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -404,6 +404,10 @@ public:
       Die = Die.getPreviousSibling();
   }
 
+  llvm::DWARFDie::iterator base() const {
+    return llvm::DWARFDie::iterator(AtEnd ? Die : Die.getSibling());
+  }
+
   reverse_iterator<llvm::DWARFDie::iterator> &operator++() {
     assert(!AtEnd && "Incrementing rend");
     llvm::DWARFDie D = Die.getPreviousSibling();
diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 1b5f71c946f9fb7998eb0702f55a1a412c3ef010..edf9442acd054674da49357d24bde7466b9ffa0d 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -101,6 +101,7 @@ public:
   Optional<int64_t> getAsSignedConstant() const;
   Optional<const char *> getAsCString() const;
   Optional<uint64_t> getAsAddress() const;
+  Optional<SectionedAddress> getAsSectionedAddress() const;
   Optional<uint64_t> getAsSectionOffset() const;
   Optional<ArrayRef<uint8_t>> getAsBlock() const;
   Optional<uint64_t> getAsCStringOffset() const;
@@ -238,6 +239,13 @@ inline Optional<uint64_t> toAddress(const Optional<DWARFFormValue> &V) {
   return None;
 }
 
+inline Optional<SectionedAddress>
+toSectionedAddress(const Optional<DWARFFormValue> &V) {
+  if (V)
+    return V->getAsSectionedAddress();
+  return None;
+}
+
 /// Take an optional DWARFFormValue and extract a address.
 ///
 /// \param V and optional DWARFFormValue to attempt to extract the value from.
diff --git a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
index 8d1ac5c83c234ed0886657f4fd680bcb711c7b2f..073e02903c39e1ebf72416d974a71439c26a227e 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
@@ -24,6 +24,7 @@ class DWARFGdbIndex {
   uint32_t Version;
 
   uint32_t CuListOffset;
+  uint32_t TuListOffset;
   uint32_t AddressAreaOffset;
   uint32_t SymbolTableOffset;
   uint32_t ConstantPoolOffset;
@@ -34,6 +35,13 @@ class DWARFGdbIndex {
   };
   SmallVector<CompUnitEntry, 0> CuList;
 
+  struct TypeUnitEntry {
+    uint64_t Offset;
+    uint64_t TypeOffset;
+    uint64_t TypeSignature;
+  };
+  SmallVector<TypeUnitEntry, 0> TuList;
+
   struct AddressEntry {
     uint64_t LowAddress;  /// The low address.
     uint64_t HighAddress; /// The high address.
@@ -55,6 +63,7 @@ class DWARFGdbIndex {
   uint32_t StringPoolOffset;
 
   void dumpCUList(raw_ostream &OS) const;
+  void dumpTUList(raw_ostream &OS) const;
   void dumpAddressArea(raw_ostream &OS) const;
   void dumpSymbolTable(raw_ostream &OS) const;
   void dumpConstantPool(raw_ostream &OS) const;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
index 8cf9e4008921437805f3e98900572b9fbf7342a3..9b987314f209495505382c2afac559a495295dcb 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFListTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -99,6 +99,7 @@ public:
   uint32_t getHeaderOffset() const { return HeaderOffset; }
   uint8_t getAddrSize() const { return HeaderData.AddrSize; }
   uint32_t getLength() const { return HeaderData.Length; }
+  uint16_t getVersion() const { return HeaderData.Version; }
   StringRef getSectionName() const { return SectionName; }
   StringRef getListTypeString() const { return ListTypeString; }
   dwarf::DwarfFormat getFormat() const { return Format; }
@@ -156,7 +157,10 @@ public:
   uint32_t getHeaderOffset() const { return Header.getHeaderOffset(); }
   uint8_t getAddrSize() const { return Header.getAddrSize(); }
 
-  void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const;
+  void dump(raw_ostream &OS,
+            llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                LookupPooledAddress,
+            DIDumpOptions DumpOpts = {}) const;
 
   /// Return the contents of the offset entry designated by a given index.
   Optional<uint32_t> getOffsetEntry(uint32_t Index) const {
@@ -229,8 +233,11 @@ Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
 }
 
 template <typename DWARFListType>
-void DWARFListTableBase<DWARFListType>::dump(raw_ostream &OS,
-                                             DIDumpOptions DumpOpts) const {
+void DWARFListTableBase<DWARFListType>::dump(
+    raw_ostream &OS,
+    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+        LookupPooledAddress,
+    DIDumpOptions DumpOpts) const {
   Header.dump(OS, DumpOpts);
   OS << HeaderString << "\n";
 
@@ -249,7 +256,7 @@ void DWARFListTableBase<DWARFListType>::dump(raw_ostream &OS,
   for (const auto &List : ListMap)
     for (const auto &Entry : List.second.getEntries())
       Entry.dump(OS, getAddrSize(), MaxEncodingStringLength, CurrentBase,
-                 DumpOpts);
+                 DumpOpts, LookupPooledAddress);
 }
 
 template <typename DWARFListType>
diff --git a/include/llvm/DebugInfo/DWARF/DWARFObject.h b/include/llvm/DebugInfo/DWARF/DWARFObject.h
index 6e8f370f4aeafed9080b646e187d56590b4315b6..5a808b0ec6a97ba13486475b9b8ba602e5a4dffa 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFObject.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFObject.h
@@ -33,11 +33,13 @@ public:
   virtual ArrayRef<SectionName> getSectionNames() const { return {}; }
   virtual bool isLittleEndian() const = 0;
   virtual uint8_t getAddressSize() const { llvm_unreachable("unimplemented"); }
-  virtual const DWARFSection &getInfoSection() const { return Dummy; }
+  virtual void
+  forEachInfoSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual void
   forEachTypesSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual StringRef getAbbrevSection() const { return ""; }
   virtual const DWARFSection &getLocSection() const { return Dummy; }
+  virtual const DWARFSection &getLoclistsSection() const { return Dummy; }
   virtual StringRef getARangeSection() const { return ""; }
   virtual StringRef getDebugFrameSection() const { return ""; }
   virtual StringRef getEHFrameSection() const { return ""; }
@@ -52,7 +54,8 @@ public:
   virtual StringRef getGnuPubNamesSection() const { return ""; }
   virtual StringRef getGnuPubTypesSection() const { return ""; }
   virtual const DWARFSection &getStringOffsetSection() const { return Dummy; }
-  virtual const DWARFSection &getInfoDWOSection() const { return Dummy; }
+  virtual void
+  forEachInfoDWOSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual void
   forEachTypesDWOSections(function_ref<void(const DWARFSection &)> F) const {}
   virtual StringRef getAbbrevDWOSection() const { return ""; }
diff --git a/include/llvm/DebugInfo/DWARF/DWARFSection.h b/include/llvm/DebugInfo/DWARF/DWARFSection.h
index 77045f0794ae826b7caa96fb17e42c6ac95455ff..7f82359652971ac2bf23afcfb16796f09c164386 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFSection.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFSection.h
@@ -23,6 +23,11 @@ struct SectionName {
   bool IsNameUnique;
 };
 
+struct SectionedAddress {
+  uint64_t Address;
+  uint64_t SectionIndex;
+};
+
 } // end namespace llvm
 
 #endif // LLVM_DEBUGINFO_DWARF_DWARFSECTION_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
index 0a5a1aaa79d11f473c2b63cdc641fd8067d371c2..8ca5ba13fc2334fb73eff7a293b29a9c67ae81c8 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
@@ -26,13 +26,13 @@ class raw_ostream;
 class DWARFTypeUnit : public DWARFUnit {
 public:
   DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
-                const DWARFUnitHeader &Header,
-                const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+                const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+                const DWARFSection *RS, const DWARFSection *LocSection,
                 StringRef SS, const DWARFSection &SOS, const DWARFSection *AOS,
                 const DWARFSection &LS, bool LE, bool IsDWO,
                 const DWARFUnitVector &UnitVector)
-      : DWARFUnit(Context, Section, Header, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
-                  UnitVector) {}
+      : DWARFUnit(Context, Section, Header, DA, RS, LocSection, SS, SOS, AOS,
+                  LS, LE, IsDWO, UnitVector) {}
 
   uint64_t getTypeHash() const { return getHeader().getTypeHash(); }
   uint32_t getTypeOffset() const { return getHeader().getTypeOffset(); }
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 39d43b91485f1bf84c0fb393feae33d144ab5751..458278e4282fd777386ae0bc847d9976dcd51579 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -153,18 +153,13 @@ public:
 private:
   void addUnitsImpl(DWARFContext &Context, const DWARFObject &Obj,
                     const DWARFSection &Section, const DWARFDebugAbbrev *DA,
-                    const DWARFSection *RS, StringRef SS,
-                    const DWARFSection &SOS, const DWARFSection *AOS,
-                    const DWARFSection &LS, bool LE, bool IsDWO, bool Lazy,
-                    DWARFSectionKind SectionKind);
+                    const DWARFSection *RS, const DWARFSection *LocSection,
+                    StringRef SS, const DWARFSection &SOS,
+                    const DWARFSection *AOS, const DWARFSection &LS, bool LE,
+                    bool IsDWO, bool Lazy, DWARFSectionKind SectionKind);
 };
 
 /// Represents base address of the CU.
-struct BaseAddress {
-  uint64_t Address;
-  uint64_t SectionIndex;
-};
-
 /// Represents a unit's contribution to the string offsets table.
 struct StrOffsetsContributionDescriptor {
   uint64_t Base = 0;
@@ -198,13 +193,19 @@ class DWARFUnit {
   const DWARFDebugAbbrev *Abbrev;
   const DWARFSection *RangeSection;
   uint32_t RangeSectionBase;
+  /// We either keep track of the location list section or its data, depending
+  /// on whether we are handling a split DWARF section or not.
+  union {
+    const DWARFSection *LocSection;
+    StringRef LocSectionData;
+  };
   const DWARFSection &LineSection;
   StringRef StringSection;
   const DWARFSection &StringOffsetSection;
   const DWARFSection *AddrOffsetSection;
   uint32_t AddrOffsetSectionBase = 0;
   bool isLittleEndian;
-  bool isDWO;
+  bool IsDWO;
   const DWARFUnitVector &UnitVector;
 
   /// Start, length, and DWARF format of the unit's contribution to the string
@@ -215,7 +216,7 @@ class DWARFUnit {
   Optional<DWARFDebugRnglistTable> RngListTable;
 
   mutable const DWARFAbbreviationDeclarationSet *Abbrevs;
-  llvm::Optional<BaseAddress> BaseAddr;
+  llvm::Optional<SectionedAddress> BaseAddr;
   /// The compile unit debug information entry items.
   std::vector<DWARFDebugInfoEntry> DieArray;
 
@@ -245,29 +246,30 @@ protected:
   /// length and form. The given offset is expected to be derived from the unit
   /// DIE's DW_AT_str_offsets_base attribute.
   Optional<StrOffsetsContributionDescriptor>
-  determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
-                                          uint64_t Offset);
+  determineStringOffsetsTableContribution(DWARFDataExtractor &DA);
 
   /// Find the unit's contribution to the string offsets table and determine its
   /// length and form. The given offset is expected to be 0 in a dwo file or,
   /// in a dwp file, the start of the unit's contribution to the string offsets
   /// table section (as determined by the index table).
   Optional<StrOffsetsContributionDescriptor>
-  determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
-                                             uint64_t Offset);
+  determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA);
 
 public:
   DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
-            const DWARFUnitHeader &Header,
-            const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
-            const DWARFSection &SOS, const DWARFSection *AOS,
+            const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+            const DWARFSection *RS, const DWARFSection *LocSection,
+            StringRef SS, const DWARFSection &SOS, const DWARFSection *AOS,
             const DWARFSection &LS, bool LE, bool IsDWO,
             const DWARFUnitVector &UnitVector);
 
   virtual ~DWARFUnit();
 
+  bool isDWOUnit() const { return IsDWO; }
   DWARFContext& getContext() const { return Context; }
   const DWARFSection &getInfoSection() const { return InfoSection; }
+  const DWARFSection *getLocSection() const { return LocSection; }
+  StringRef getLocSectionData() const { return LocSectionData; }
   uint32_t getOffset() const { return Header.getOffset(); }
   const dwarf::FormParams &getFormParams() const {
     return Header.getFormParams();
@@ -301,8 +303,8 @@ public:
     RangeSectionBase = Base;
   }
 
-  bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
-  bool getStringOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+  Optional<SectionedAddress> getAddrOffsetSectionItem(uint32_t Index) const;
+  Optional<uint64_t> getStringOffsetSectionItem(uint32_t Index) const;
 
   DWARFDataExtractor getDebugInfoExtractor() const;
 
@@ -372,7 +374,7 @@ public:
     llvm_unreachable("Invalid UnitType.");
   }
 
-  llvm::Optional<BaseAddress> getBaseAddress();
+  llvm::Optional<SectionedAddress> getBaseAddress();
 
   DWARFDie getUnitDIE(bool ExtractUnitDIEOnly = true) {
     extractDIEsIfNeeded(ExtractUnitDIEOnly);
diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 3ad65cf51b1b7bf869f17ad9666128d3626a9322..e47fbea5646ed523cd55555f9f09bc117ffff623 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -97,6 +97,9 @@ private:
   /// lies between to valid DIEs.
   std::map<uint64_t, std::set<uint32_t>> ReferenceToDIEOffsets;
   uint32_t NumDebugLineErrors = 0;
+  // Used to relax some checks that do not currently work portably
+  bool IsObjectFile;
+  bool IsMachOObject;
 
   raw_ostream &error() const;
   raw_ostream &warn() const;
@@ -286,8 +289,8 @@ private:
 
 public:
   DWARFVerifier(raw_ostream &S, DWARFContext &D,
-                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE())
-      : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)) {}
+                DIDumpOptions DumpOpts = DIDumpOptions::getForSingleDIE());
+
   /// Verify the information in any of the following sections, if available:
   /// .debug_abbrev, debug_abbrev.dwo
   ///
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3b02f07e6485272c45e378133db5bb71764078f
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h
@@ -0,0 +1,36 @@
+//==- DIAEnumFrameData.h --------------------------------------- -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAENUMFRAMEDATA_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
+
+namespace llvm {
+namespace pdb {
+
+class DIAEnumFrameData : public IPDBEnumChildren<IPDBFrameData> {
+public:
+  explicit DIAEnumFrameData(CComPtr<IDiaEnumFrameData> DiaEnumerator);
+
+  uint32_t getChildCount() const override;
+  ChildTypePtr getChildAtIndex(uint32_t Index) const override;
+  ChildTypePtr getNext() override;
+  void reset() override;
+
+private:
+  CComPtr<IDiaEnumFrameData> Enumerator;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ce6cfc93030d043da7dc9e48698db7f3251c722
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/DIA/DIAFrameData.h
@@ -0,0 +1,39 @@
+//===- DIAFrameData.h - DIA Impl. of IPDBFrameData ---------------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_DIA_DIAFRAMEDATA_H
+
+#include "DIASupport.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
+
+namespace llvm {
+namespace pdb {
+
+class DIASession;
+
+class DIAFrameData : public IPDBFrameData {
+public:
+  explicit DIAFrameData(CComPtr<IDiaFrameData> DiaFrameData);
+
+  uint32_t getAddressOffset() const override;
+  uint32_t getAddressSection() const override;
+  uint32_t getLengthBlock() const override;
+  std::string getProgram() const override;
+  uint32_t getRelativeVirtualAddress() const override;
+  uint64_t getVirtualAddress() const override;
+
+private:
+  CComPtr<IDiaFrameData> FrameData;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
index e355605c2960c26a095b32548adc0da589d7e7e9..592e061a8d83f13755b13a6eb097e090e71fcbaa 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIASession.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -85,6 +85,7 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
 private:
   CComPtr<IDiaSession> Session;
 };
diff --git a/include/llvm/DebugInfo/PDB/GenericError.h b/include/llvm/DebugInfo/PDB/GenericError.h
index 4e2e8b163b533e52b575f520545239183ef3d0a0..7b5a85295963b78760b81d3dd8bd36d51da647fa 100644
--- a/include/llvm/DebugInfo/PDB/GenericError.h
+++ b/include/llvm/DebugInfo/PDB/GenericError.h
@@ -21,24 +21,23 @@ enum class pdb_error_code {
   dia_sdk_not_present,
   dia_failed_loading,
   signature_out_of_date,
-  type_server_not_found,
   unspecified,
 };
-} // namespace codeview
+} // namespace pdb
 } // namespace llvm
 
 namespace std {
-    template <>
-    struct is_error_code_enum<llvm::pdb::pdb_error_code> : std::true_type {};
+template <>
+struct is_error_code_enum<llvm::pdb::pdb_error_code> : std::true_type {};
 } // namespace std
 
 namespace llvm {
 namespace pdb {
-    const std::error_category &PDBErrCategory();
+const std::error_category &PDBErrCategory();
 
-    inline std::error_code make_error_code(pdb_error_code E) {
-        return std::error_code(static_cast<int>(E), PDBErrCategory());
-    }
+inline std::error_code make_error_code(pdb_error_code E) {
+  return std::error_code(static_cast<int>(E), PDBErrCategory());
+}
 
 /// Base class for errors originating when parsing raw PDB files
 class PDBError : public ErrorInfo<PDBError, StringError> {
diff --git a/include/llvm/DebugInfo/PDB/IPDBFrameData.h b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
new file mode 100644
index 0000000000000000000000000000000000000000..74679215b88017286aadd7dafc98060ddbae8cd9
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/IPDBFrameData.h
@@ -0,0 +1,36 @@
+//===- IPDBFrameData.h - base interface for frame data ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
+#define LLVM_DEBUGINFO_PDB_IPDBFRAMEDATA_H
+
+#include <cstdint>
+#include <string>
+
+namespace llvm {
+namespace pdb {
+
+/// IPDBFrameData defines an interface used to represent a frame data of some
+/// code block.
+class IPDBFrameData {
+public:
+  virtual ~IPDBFrameData();
+
+  virtual uint32_t getAddressOffset() const = 0;
+  virtual uint32_t getAddressSection() const = 0;
+  virtual uint32_t getLengthBlock() const = 0;
+  virtual std::string getProgram() const = 0;
+  virtual uint32_t getRelativeVirtualAddress() const = 0;
+  virtual uint64_t getVirtualAddress() const = 0;
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/IPDBSession.h b/include/llvm/DebugInfo/PDB/IPDBSession.h
index 24573cdb7797883f821f52b6af591106e3d93c29..88fd02c0a345287a8c82a213416a2dd627c1c908 100644
--- a/include/llvm/DebugInfo/PDB/IPDBSession.h
+++ b/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -91,6 +91,9 @@ public:
 
   virtual std::unique_ptr<IPDBEnumSectionContribs>
   getSectionContribs() const = 0;
+
+  virtual std::unique_ptr<IPDBEnumFrameData>
+  getFrameData() const = 0;
 };
 } // namespace pdb
 } // namespace llvm
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index 07ce85ef820da92e56e417f12ef228bf5671dfcb..4878e47d31217f7d4871365d3ae172331e2df960 100644
--- a/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -93,6 +93,8 @@ public:
 
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override;
 
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override;
+
   PDBFile &getPDBFile() { return *Pdb; }
   const PDBFile &getPDBFile() const { return *Pdb; }
 
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index 00cc720336cf1337d8c9979e4175660286dc93a4..b76576a7a263bb4675a9dfab39cd348549a46450 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -61,6 +61,10 @@ public:
   Expected<codeview::TypeIndex>
   findFullDeclForForwardRef(codeview::TypeIndex ForwardRefTI) const;
 
+  std::vector<codeview::TypeIndex> findRecordsByName(StringRef Name) const;
+
+  codeview::CVType getType(codeview::TypeIndex Index);
+
   BinarySubstreamRef getTypeRecordsSubstream() const;
 
   Error commit();
diff --git a/include/llvm/DebugInfo/PDB/PDBTypes.h b/include/llvm/DebugInfo/PDB/PDBTypes.h
index 6247018ce0f12ee4f5b359369bcea82fedcc12c1..917f3ed73910af8f87682ebc702afbe92135c5bc 100644
--- a/include/llvm/DebugInfo/PDB/PDBTypes.h
+++ b/include/llvm/DebugInfo/PDB/PDBTypes.h
@@ -12,6 +12,7 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include <cctype>
 #include <cstddef>
@@ -71,6 +72,7 @@ using IPDBEnumLineNumbers = IPDBEnumChildren<IPDBLineNumber>;
 using IPDBEnumTables = IPDBEnumChildren<IPDBTable>;
 using IPDBEnumInjectedSources = IPDBEnumChildren<IPDBInjectedSource>;
 using IPDBEnumSectionContribs = IPDBEnumChildren<IPDBSectionContrib>;
+using IPDBEnumFrameData = IPDBEnumChildren<IPDBFrameData>;
 
 /// Specifies which PDB reader implementation is to be used.  Only a value
 /// of PDB_ReaderType::DIA is currently supported, but Native is in the works.
diff --git a/include/llvm/Demangle/ItaniumDemangle.h b/include/llvm/Demangle/ItaniumDemangle.h
index bc60bc3454e30f81525b14cbbd65e60d3279e508..c5619a15bbee9370355c1c9a48748d0ba72b28a3 100644
--- a/include/llvm/Demangle/ItaniumDemangle.h
+++ b/include/llvm/Demangle/ItaniumDemangle.h
@@ -2134,8 +2134,7 @@ public:
   }
 };
 
-template <typename Alloc>
-struct Db {
+template <typename Derived, typename Alloc> struct AbstractManglingParser {
   const char *First;
   const char *Last;
 
@@ -2167,7 +2166,10 @@ struct Db {
 
   Alloc ASTAllocator;
 
-  Db(const char *First_, const char *Last_) : First(First_), Last(Last_) {}
+  AbstractManglingParser(const char *First_, const char *Last_)
+      : First(First_), Last(Last_) {}
+
+  Derived &getDerived() { return static_cast<Derived &>(*this); }
 
   void reset(const char *First_, const char *Last_) {
     First = First_;
@@ -2274,7 +2276,7 @@ struct Db {
     FunctionRefQual ReferenceQualifier = FrefQualNone;
     size_t ForwardTemplateRefsBegin;
 
-    NameState(Db *Enclosing)
+    NameState(AbstractManglingParser *Enclosing)
         : ForwardTemplateRefsBegin(Enclosing->ForwardTemplateRefs.size()) {}
   };
 
@@ -2324,35 +2326,36 @@ const char* parse_discriminator(const char* first, const char* last);
 //
 // <unscoped-template-name> ::= <unscoped-name>
 //                          ::= <substitution>
-template<typename Alloc> Node *Db<Alloc>::parseName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseName(NameState *State) {
   consumeIf('L'); // extension
 
   if (look() == 'N')
-    return parseNestedName(State);
+    return getDerived().parseNestedName(State);
   if (look() == 'Z')
-    return parseLocalName(State);
+    return getDerived().parseLocalName(State);
 
   //        ::= <unscoped-template-name> <template-args>
   if (look() == 'S' && look(1) != 't') {
-    Node *S = parseSubstitution();
+    Node *S = getDerived().parseSubstitution();
     if (S == nullptr)
       return nullptr;
     if (look() != 'I')
       return nullptr;
-    Node *TA = parseTemplateArgs(State != nullptr);
+    Node *TA = getDerived().parseTemplateArgs(State != nullptr);
     if (TA == nullptr)
       return nullptr;
     if (State) State->EndsWithTemplateArgs = true;
     return make<NameWithTemplateArgs>(S, TA);
   }
 
-  Node *N = parseUnscopedName(State);
+  Node *N = getDerived().parseUnscopedName(State);
   if (N == nullptr)
     return nullptr;
   //        ::= <unscoped-template-name> <template-args>
   if (look() == 'I') {
     Subs.push_back(N);
-    Node *TA = parseTemplateArgs(State != nullptr);
+    Node *TA = getDerived().parseTemplateArgs(State != nullptr);
     if (TA == nullptr)
       return nullptr;
     if (State) State->EndsWithTemplateArgs = true;
@@ -2365,10 +2368,11 @@ template<typename Alloc> Node *Db<Alloc>::parseName(NameState *State) {
 // <local-name> := Z <function encoding> E <entity name> [<discriminator>]
 //              := Z <function encoding> E s [<discriminator>]
 //              := Z <function encoding> Ed [ <parameter number> ] _ <entity name>
-template<typename Alloc> Node *Db<Alloc>::parseLocalName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseLocalName(NameState *State) {
   if (!consumeIf('Z'))
     return nullptr;
-  Node *Encoding = parseEncoding();
+  Node *Encoding = getDerived().parseEncoding();
   if (Encoding == nullptr || !consumeIf('E'))
     return nullptr;
 
@@ -2384,13 +2388,13 @@ template<typename Alloc> Node *Db<Alloc>::parseLocalName(NameState *State) {
     parseNumber(true);
     if (!consumeIf('_'))
       return nullptr;
-    Node *N = parseName(State);
+    Node *N = getDerived().parseName(State);
     if (N == nullptr)
       return nullptr;
     return make<LocalName>(Encoding, N);
   }
 
-  Node *Entity = parseName(State);
+  Node *Entity = getDerived().parseName(State);
   if (Entity == nullptr)
     return nullptr;
   First = parse_discriminator(First, Last);
@@ -2400,14 +2404,16 @@ template<typename Alloc> Node *Db<Alloc>::parseLocalName(NameState *State) {
 // <unscoped-name> ::= <unqualified-name>
 //                 ::= St <unqualified-name>   # ::std::
 // extension       ::= StL<unqualified-name>
-template<typename Alloc> Node *Db<Alloc>::parseUnscopedName(NameState *State) {
- if (consumeIf("StL") || consumeIf("St")) {
-   Node *R = parseUnqualifiedName(State);
-   if (R == nullptr)
-     return nullptr;
-   return make<StdQualifiedName>(R);
- }
- return parseUnqualifiedName(State);
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseUnscopedName(NameState *State) {
+  if (consumeIf("StL") || consumeIf("St")) {
+    Node *R = getDerived().parseUnqualifiedName(State);
+    if (R == nullptr)
+      return nullptr;
+    return make<StdQualifiedName>(R);
+  }
+  return getDerived().parseUnqualifiedName(State);
 }
 
 // <unqualified-name> ::= <operator-name> [abi-tags]
@@ -2415,27 +2421,28 @@ template<typename Alloc> Node *Db<Alloc>::parseUnscopedName(NameState *State) {
 //                    ::= <source-name>
 //                    ::= <unnamed-type-name>
 //                    ::= DC <source-name>+ E      # structured binding declaration
-template<typename Alloc>
-Node *Db<Alloc>::parseUnqualifiedName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseUnqualifiedName(NameState *State) {
   // <ctor-dtor-name>s are special-cased in parseNestedName().
   Node *Result;
   if (look() == 'U')
-    Result = parseUnnamedTypeName(State);
+    Result = getDerived().parseUnnamedTypeName(State);
   else if (look() >= '1' && look() <= '9')
-    Result = parseSourceName(State);
+    Result = getDerived().parseSourceName(State);
   else if (consumeIf("DC")) {
     size_t BindingsBegin = Names.size();
     do {
-      Node *Binding = parseSourceName(State);
+      Node *Binding = getDerived().parseSourceName(State);
       if (Binding == nullptr)
         return nullptr;
       Names.push_back(Binding);
     } while (!consumeIf('E'));
     Result = make<StructuredBindingName>(popTrailingNodeArray(BindingsBegin));
   } else
-    Result = parseOperatorName(State);
+    Result = getDerived().parseOperatorName(State);
   if (Result != nullptr)
-    Result = parseAbiTags(Result);
+    Result = getDerived().parseAbiTags(Result);
   return Result;
 }
 
@@ -2445,7 +2452,9 @@ Node *Db<Alloc>::parseUnqualifiedName(NameState *State) {
 // <closure-type-name> ::= Ul <lambda-sig> E [ <nonnegative number> ] _
 //
 // <lambda-sig> ::= <parameter type>+  # Parameter types or "v" if the lambda has no parameters
-template<typename Alloc> Node *Db<Alloc>::parseUnnamedTypeName(NameState *) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseUnnamedTypeName(NameState *) {
   if (consumeIf("Ut")) {
     StringView Count = parseNumber();
     if (!consumeIf('_'))
@@ -2458,7 +2467,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnnamedTypeName(NameState *) {
     if (!consumeIf("vE")) {
       size_t ParamsBegin = Names.size();
       do {
-        Node *P = parseType();
+        Node *P = getDerived().parseType();
         if (P == nullptr)
           return nullptr;
         Names.push_back(P);
@@ -2474,7 +2483,8 @@ template<typename Alloc> Node *Db<Alloc>::parseUnnamedTypeName(NameState *) {
 }
 
 // <source-name> ::= <positive length number> <identifier>
-template<typename Alloc> Node *Db<Alloc>::parseSourceName(NameState *) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSourceName(NameState *) {
   size_t Length = 0;
   if (parsePositiveInteger(&Length))
     return nullptr;
@@ -2538,7 +2548,9 @@ template<typename Alloc> Node *Db<Alloc>::parseSourceName(NameState *) {
 //                   ::= rS    # >>=
 //                   ::= ss    # <=> C++2a
 //                   ::= v <digit> <source-name>        # vendor extended operator
-template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseOperatorName(NameState *State) {
   switch (look()) {
   case 'a':
     switch (look(1)) {
@@ -2578,7 +2590,7 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
       SwapAndRestore<bool> SavePermit(PermitForwardTemplateReferences,
                                       PermitForwardTemplateReferences ||
                                           State != nullptr);
-      Node* Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       if (State) State->CtorDtorConversion = true;
@@ -2642,7 +2654,7 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
     //                   ::= li <source-name>  # operator ""
     case 'i': {
       First += 2;
-      Node *SN = parseSourceName(State);
+      Node *SN = getDerived().parseSourceName(State);
       if (SN == nullptr)
         return nullptr;
       return make<LiteralOperator>(SN);
@@ -2763,7 +2775,7 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
   case 'v':
     if (std::isdigit(look(1))) {
       First += 2;
-      Node *SN = parseSourceName(State);
+      Node *SN = getDerived().parseSourceName(State);
       if (SN == nullptr)
         return nullptr;
       return make<ConversionOperatorType>(SN);
@@ -2781,8 +2793,10 @@ template<typename Alloc> Node *Db<Alloc>::parseOperatorName(NameState *State) {
 //                  ::= D1  # complete object destructor
 //                  ::= D2  # base object destructor
 //   extension      ::= D5    # ?
-template<typename Alloc>
-Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseCtorDtorName(Node *&SoFar,
+                                                          NameState *State) {
   if (SoFar->getKind() == Node::KSpecialSubstitution) {
     auto SSK = static_cast<SpecialSubstitution *>(SoFar)->SSK;
     switch (SSK) {
@@ -2793,6 +2807,7 @@ Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
       SoFar = make<ExpandedSpecialSubstitution>(SSK);
       if (!SoFar)
         return nullptr;
+      break;
     default:
       break;
     }
@@ -2806,7 +2821,7 @@ Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
     ++First;
     if (State) State->CtorDtorConversion = true;
     if (IsInherited) {
-      if (parseName(State) == nullptr)
+      if (getDerived().parseName(State) == nullptr)
         return nullptr;
     }
     return make<CtorDtorName>(SoFar, false, Variant);
@@ -2840,7 +2855,9 @@ Node *Db<Alloc>::parseCtorDtorName(Node *&SoFar, NameState *State) {
 // <template-prefix> ::= <prefix> <template unqualified-name>
 //                   ::= <template-param>
 //                   ::= <substitution>
-template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseNestedName(NameState *State) {
   if (!consumeIf('N'))
     return nullptr;
 
@@ -2881,7 +2898,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <template-param>
     if (look() == 'T') {
-      if (!PushComponent(parseTemplateParam()))
+      if (!PushComponent(getDerived().parseTemplateParam()))
         return nullptr;
       Subs.push_back(SoFar);
       continue;
@@ -2889,7 +2906,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <template-prefix> <template-args>
     if (look() == 'I') {
-      Node *TA = parseTemplateArgs(State != nullptr);
+      Node *TA = getDerived().parseTemplateArgs(State != nullptr);
       if (TA == nullptr || SoFar == nullptr)
         return nullptr;
       SoFar = make<NameWithTemplateArgs>(SoFar, TA);
@@ -2902,7 +2919,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <decltype>
     if (look() == 'D' && (look(1) == 't' || look(1) == 'T')) {
-      if (!PushComponent(parseDecltype()))
+      if (!PushComponent(getDerived().parseDecltype()))
         return nullptr;
       Subs.push_back(SoFar);
       continue;
@@ -2910,7 +2927,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 
     //          ::= <substitution>
     if (look() == 'S' && look(1) != 't') {
-      Node *S = parseSubstitution();
+      Node *S = getDerived().parseSubstitution();
       if (!PushComponent(S))
         return nullptr;
       if (SoFar != S)
@@ -2922,9 +2939,9 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
     if (look() == 'C' || (look() == 'D' && look(1) != 'C')) {
       if (SoFar == nullptr)
         return nullptr;
-      if (!PushComponent(parseCtorDtorName(SoFar, State)))
+      if (!PushComponent(getDerived().parseCtorDtorName(SoFar, State)))
         return nullptr;
-      SoFar = parseAbiTags(SoFar);
+      SoFar = getDerived().parseAbiTags(SoFar);
       if (SoFar == nullptr)
         return nullptr;
       Subs.push_back(SoFar);
@@ -2932,7 +2949,7 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
     }
 
     //          ::= <prefix> <unqualified-name>
-    if (!PushComponent(parseUnqualifiedName(State)))
+    if (!PushComponent(getDerived().parseUnqualifiedName(State)))
       return nullptr;
     Subs.push_back(SoFar);
   }
@@ -2945,12 +2962,13 @@ template<typename Alloc> Node *Db<Alloc>::parseNestedName(NameState *State) {
 }
 
 // <simple-id> ::= <source-name> [ <template-args> ]
-template<typename Alloc> Node *Db<Alloc>::parseSimpleId() {
-  Node *SN = parseSourceName(/*NameState=*/nullptr);
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSimpleId() {
+  Node *SN = getDerived().parseSourceName(/*NameState=*/nullptr);
   if (SN == nullptr)
     return nullptr;
   if (look() == 'I') {
-    Node *TA = parseTemplateArgs();
+    Node *TA = getDerived().parseTemplateArgs();
     if (TA == nullptr)
       return nullptr;
     return make<NameWithTemplateArgs>(SN, TA);
@@ -2960,12 +2978,13 @@ template<typename Alloc> Node *Db<Alloc>::parseSimpleId() {
 
 // <destructor-name> ::= <unresolved-type>  # e.g., ~T or ~decltype(f())
 //                   ::= <simple-id>        # e.g., ~A<2*N>
-template<typename Alloc> Node *Db<Alloc>::parseDestructorName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseDestructorName() {
   Node *Result;
   if (std::isdigit(look()))
-    Result = parseSimpleId();
+    Result = getDerived().parseSimpleId();
   else
-    Result = parseUnresolvedType();
+    Result = getDerived().parseUnresolvedType();
   if (Result == nullptr)
     return nullptr;
   return make<DtorName>(Result);
@@ -2974,22 +2993,23 @@ template<typename Alloc> Node *Db<Alloc>::parseDestructorName() {
 // <unresolved-type> ::= <template-param>
 //                   ::= <decltype>
 //                   ::= <substitution>
-template<typename Alloc> Node *Db<Alloc>::parseUnresolvedType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseUnresolvedType() {
   if (look() == 'T') {
-    Node *TP = parseTemplateParam();
+    Node *TP = getDerived().parseTemplateParam();
     if (TP == nullptr)
       return nullptr;
     Subs.push_back(TP);
     return TP;
   }
   if (look() == 'D') {
-    Node *DT = parseDecltype();
+    Node *DT = getDerived().parseDecltype();
     if (DT == nullptr)
       return nullptr;
     Subs.push_back(DT);
     return DT;
   }
-  return parseSubstitution();
+  return getDerived().parseSubstitution();
 }
 
 // <base-unresolved-name> ::= <simple-id>                                # unresolved name
@@ -2999,20 +3019,21 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedType() {
 //                        ::= on <operator-name> <template-args>         # unresolved operator template-id
 //                        ::= dn <destructor-name>                       # destructor or pseudo-destructor;
 //                                                                         # e.g. ~X or ~X<N-1>
-template<typename Alloc> Node *Db<Alloc>::parseBaseUnresolvedName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseBaseUnresolvedName() {
   if (std::isdigit(look()))
-    return parseSimpleId();
+    return getDerived().parseSimpleId();
 
   if (consumeIf("dn"))
-    return parseDestructorName();
+    return getDerived().parseDestructorName();
 
   consumeIf("on");
 
-  Node *Oper = parseOperatorName(/*NameState=*/nullptr);
+  Node *Oper = getDerived().parseOperatorName(/*NameState=*/nullptr);
   if (Oper == nullptr)
     return nullptr;
   if (look() == 'I') {
-    Node *TA = parseTemplateArgs();
+    Node *TA = getDerived().parseTemplateArgs();
     if (TA == nullptr)
       return nullptr;
     return make<NameWithTemplateArgs>(Oper, TA);
@@ -3031,18 +3052,19 @@ template<typename Alloc> Node *Db<Alloc>::parseBaseUnresolvedName() {
 //  (ignored)        ::= srN <unresolved-type>  <unresolved-qualifier-level>+ E <base-unresolved-name>
 //
 // <unresolved-qualifier-level> ::= <simple-id>
-template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseUnresolvedName() {
   Node *SoFar = nullptr;
 
   // srN <unresolved-type> [<template-args>] <unresolved-qualifier-level>* E <base-unresolved-name>
   // srN <unresolved-type>                   <unresolved-qualifier-level>+ E <base-unresolved-name>
   if (consumeIf("srN")) {
-    SoFar = parseUnresolvedType();
+    SoFar = getDerived().parseUnresolvedType();
     if (SoFar == nullptr)
       return nullptr;
 
     if (look() == 'I') {
-      Node *TA = parseTemplateArgs();
+      Node *TA = getDerived().parseTemplateArgs();
       if (TA == nullptr)
         return nullptr;
       SoFar = make<NameWithTemplateArgs>(SoFar, TA);
@@ -3051,7 +3073,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
     }
 
     while (!consumeIf('E')) {
-      Node *Qual = parseSimpleId();
+      Node *Qual = getDerived().parseSimpleId();
       if (Qual == nullptr)
         return nullptr;
       SoFar = make<QualifiedName>(SoFar, Qual);
@@ -3059,7 +3081,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
         return nullptr;
     }
 
-    Node *Base = parseBaseUnresolvedName();
+    Node *Base = getDerived().parseBaseUnresolvedName();
     if (Base == nullptr)
       return nullptr;
     return make<QualifiedName>(SoFar, Base);
@@ -3069,7 +3091,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
 
   // [gs] <base-unresolved-name>                     # x or (with "gs") ::x
   if (!consumeIf("sr")) {
-    SoFar = parseBaseUnresolvedName();
+    SoFar = getDerived().parseBaseUnresolvedName();
     if (SoFar == nullptr)
       return nullptr;
     if (Global)
@@ -3080,7 +3102,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
   // [gs] sr <unresolved-qualifier-level>+ E   <base-unresolved-name>
   if (std::isdigit(look())) {
     do {
-      Node *Qual = parseSimpleId();
+      Node *Qual = getDerived().parseSimpleId();
       if (Qual == nullptr)
         return nullptr;
       if (SoFar)
@@ -3096,12 +3118,12 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
   //      sr <unresolved-type>                 <base-unresolved-name>
   //      sr <unresolved-type> <template-args> <base-unresolved-name>
   else {
-    SoFar = parseUnresolvedType();
+    SoFar = getDerived().parseUnresolvedType();
     if (SoFar == nullptr)
       return nullptr;
 
     if (look() == 'I') {
-      Node *TA = parseTemplateArgs();
+      Node *TA = getDerived().parseTemplateArgs();
       if (TA == nullptr)
         return nullptr;
       SoFar = make<NameWithTemplateArgs>(SoFar, TA);
@@ -3112,7 +3134,7 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
 
   assert(SoFar != nullptr);
 
-  Node *Base = parseBaseUnresolvedName();
+  Node *Base = getDerived().parseBaseUnresolvedName();
   if (Base == nullptr)
     return nullptr;
   return make<QualifiedName>(SoFar, Base);
@@ -3120,7 +3142,8 @@ template<typename Alloc> Node *Db<Alloc>::parseUnresolvedName() {
 
 // <abi-tags> ::= <abi-tag> [<abi-tags>]
 // <abi-tag> ::= B <source-name>
-template<typename Alloc> Node *Db<Alloc>::parseAbiTags(Node *N) {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseAbiTags(Node *N) {
   while (consumeIf('B')) {
     StringView SN = parseBareSourceName();
     if (SN.empty())
@@ -3133,8 +3156,9 @@ template<typename Alloc> Node *Db<Alloc>::parseAbiTags(Node *N) {
 }
 
 // <number> ::= [n] <non-negative decimal integer>
-template<typename Alloc>
-StringView Db<Alloc>::parseNumber(bool AllowNegative) {
+template <typename Alloc, typename Derived>
+StringView
+AbstractManglingParser<Alloc, Derived>::parseNumber(bool AllowNegative) {
   const char *Tmp = First;
   if (AllowNegative)
     consumeIf('n');
@@ -3146,7 +3170,8 @@ StringView Db<Alloc>::parseNumber(bool AllowNegative) {
 }
 
 // <positive length number> ::= [0-9]*
-template<typename Alloc> bool Db<Alloc>::parsePositiveInteger(size_t *Out) {
+template <typename Alloc, typename Derived>
+bool AbstractManglingParser<Alloc, Derived>::parsePositiveInteger(size_t *Out) {
   *Out = 0;
   if (look() < '0' || look() > '9')
     return true;
@@ -3157,7 +3182,8 @@ template<typename Alloc> bool Db<Alloc>::parsePositiveInteger(size_t *Out) {
   return false;
 }
 
-template<typename Alloc> StringView Db<Alloc>::parseBareSourceName() {
+template <typename Alloc, typename Derived>
+StringView AbstractManglingParser<Alloc, Derived>::parseBareSourceName() {
   size_t Int = 0;
   if (parsePositiveInteger(&Int) || numLeft() < Int)
     return StringView();
@@ -3174,7 +3200,8 @@ template<typename Alloc> StringView Db<Alloc>::parseBareSourceName() {
 //
 // <ref-qualifier> ::= R                   # & ref-qualifier
 // <ref-qualifier> ::= O                   # && ref-qualifier
-template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseFunctionType() {
   Qualifiers CVQuals = parseCVQualifiers();
 
   Node *ExceptionSpec = nullptr;
@@ -3183,7 +3210,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
     if (!ExceptionSpec)
       return nullptr;
   } else if (consumeIf("DO")) {
-    Node *E = parseExpr();
+    Node *E = getDerived().parseExpr();
     if (E == nullptr || !consumeIf('E'))
       return nullptr;
     ExceptionSpec = make<NoexceptSpec>(E);
@@ -3192,7 +3219,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
   } else if (consumeIf("Dw")) {
     size_t SpecsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return nullptr;
       Names.push_back(T);
@@ -3208,7 +3235,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
   if (!consumeIf('F'))
     return nullptr;
   consumeIf('Y'); // extern "C"
-  Node *ReturnType = parseType();
+  Node *ReturnType = getDerived().parseType();
   if (ReturnType == nullptr)
     return nullptr;
 
@@ -3227,7 +3254,7 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
       ReferenceQualifier = FrefQualRValue;
       break;
     }
-    Node *T = parseType();
+    Node *T = getDerived().parseType();
     if (T == nullptr)
       return nullptr;
     Names.push_back(T);
@@ -3243,7 +3270,8 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionType() {
 //                         ::= Dv [<dimension expression>] _ <element type>
 // <extended element type> ::= <element type>
 //                         ::= p # AltiVec vector pixel
-template<typename Alloc> Node *Db<Alloc>::parseVectorType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseVectorType() {
   if (!consumeIf("Dv"))
     return nullptr;
   if (look() >= '1' && look() <= '9') {
@@ -3252,24 +3280,24 @@ template<typename Alloc> Node *Db<Alloc>::parseVectorType() {
       return nullptr;
     if (consumeIf('p'))
       return make<PixelVectorType>(DimensionNumber);
-    Node *ElemType = parseType();
+    Node *ElemType = getDerived().parseType();
     if (ElemType == nullptr)
       return nullptr;
     return make<VectorType>(ElemType, DimensionNumber);
   }
 
   if (!consumeIf('_')) {
-    Node *DimExpr = parseExpr();
+    Node *DimExpr = getDerived().parseExpr();
     if (!DimExpr)
       return nullptr;
     if (!consumeIf('_'))
       return nullptr;
-    Node *ElemType = parseType();
+    Node *ElemType = getDerived().parseType();
     if (!ElemType)
       return nullptr;
     return make<VectorType>(ElemType, DimExpr);
   }
-  Node *ElemType = parseType();
+  Node *ElemType = getDerived().parseType();
   if (!ElemType)
     return nullptr;
   return make<VectorType>(ElemType, StringView());
@@ -3277,12 +3305,13 @@ template<typename Alloc> Node *Db<Alloc>::parseVectorType() {
 
 // <decltype>  ::= Dt <expression> E  # decltype of an id-expression or class member access (C++0x)
 //             ::= DT <expression> E  # decltype of an expression (C++0x)
-template<typename Alloc> Node *Db<Alloc>::parseDecltype() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseDecltype() {
   if (!consumeIf('D'))
     return nullptr;
   if (!consumeIf('t') && !consumeIf('T'))
     return nullptr;
-  Node *E = parseExpr();
+  Node *E = getDerived().parseExpr();
   if (E == nullptr)
     return nullptr;
   if (!consumeIf('E'))
@@ -3292,7 +3321,8 @@ template<typename Alloc> Node *Db<Alloc>::parseDecltype() {
 
 // <array-type> ::= A <positive dimension number> _ <element type>
 //              ::= A [<dimension expression>] _ <element type>
-template<typename Alloc> Node *Db<Alloc>::parseArrayType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseArrayType() {
   if (!consumeIf('A'))
     return nullptr;
 
@@ -3303,7 +3333,7 @@ template<typename Alloc> Node *Db<Alloc>::parseArrayType() {
     if (!consumeIf('_'))
       return nullptr;
   } else if (!consumeIf('_')) {
-    Node *DimExpr = parseExpr();
+    Node *DimExpr = getDerived().parseExpr();
     if (DimExpr == nullptr)
       return nullptr;
     if (!consumeIf('_'))
@@ -3311,20 +3341,21 @@ template<typename Alloc> Node *Db<Alloc>::parseArrayType() {
     Dimension = DimExpr;
   }
 
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (Ty == nullptr)
     return nullptr;
   return make<ArrayType>(Ty, Dimension);
 }
 
 // <pointer-to-member-type> ::= M <class type> <member type>
-template<typename Alloc> Node *Db<Alloc>::parsePointerToMemberType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parsePointerToMemberType() {
   if (!consumeIf('M'))
     return nullptr;
-  Node *ClassType = parseType();
+  Node *ClassType = getDerived().parseType();
   if (ClassType == nullptr)
     return nullptr;
-  Node *MemberType = parseType();
+  Node *MemberType = getDerived().parseType();
   if (MemberType == nullptr)
     return nullptr;
   return make<PointerToMemberType>(ClassType, MemberType);
@@ -3334,7 +3365,8 @@ template<typename Alloc> Node *Db<Alloc>::parsePointerToMemberType() {
 //                   ::= Ts <name>  # dependent elaborated type specifier using 'struct' or 'class'
 //                   ::= Tu <name>  # dependent elaborated type specifier using 'union'
 //                   ::= Te <name>  # dependent elaborated type specifier using 'enum'
-template<typename Alloc> Node *Db<Alloc>::parseClassEnumType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseClassEnumType() {
   StringView ElabSpef;
   if (consumeIf("Ts"))
     ElabSpef = "struct";
@@ -3343,7 +3375,7 @@ template<typename Alloc> Node *Db<Alloc>::parseClassEnumType() {
   else if (consumeIf("Te"))
     ElabSpef = "enum";
 
-  Node *Name = parseName();
+  Node *Name = getDerived().parseName();
   if (Name == nullptr)
     return nullptr;
 
@@ -3356,7 +3388,8 @@ template<typename Alloc> Node *Db<Alloc>::parseClassEnumType() {
 // <qualified-type>     ::= <qualifiers> <type>
 // <qualifiers> ::= <extended-qualifier>* <CV-qualifiers>
 // <extended-qualifier> ::= U <source-name> [<template-args>] # vendor extended type qualifier
-template<typename Alloc> Node *Db<Alloc>::parseQualifiedType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseQualifiedType() {
   if (consumeIf('U')) {
     StringView Qual = parseBareSourceName();
     if (Qual.empty())
@@ -3375,20 +3408,20 @@ template<typename Alloc> Node *Db<Alloc>::parseQualifiedType() {
       }
       if (Proto.empty())
         return nullptr;
-      Node *Child = parseQualifiedType();
+      Node *Child = getDerived().parseQualifiedType();
       if (Child == nullptr)
         return nullptr;
       return make<ObjCProtoName>(Child, Proto);
     }
 
-    Node *Child = parseQualifiedType();
+    Node *Child = getDerived().parseQualifiedType();
     if (Child == nullptr)
       return nullptr;
     return make<VendorExtQualType>(Child, Qual);
   }
 
   Qualifiers Quals = parseCVQualifiers();
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (Ty == nullptr)
     return nullptr;
   if (Quals != QualNone)
@@ -3416,7 +3449,8 @@ template<typename Alloc> Node *Db<Alloc>::parseQualifiedType() {
 //
 // <objc-name> ::= <k0 number> objcproto <k1 number> <identifier>  # k0 = 9 + <number of digits in k1> + k1
 // <objc-type> ::= <source-name>  # PU<11+>objcproto 11objc_object<source-name> 11objc_object -> id<source-name>
-template<typename Alloc> Node *Db<Alloc>::parseType() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseType() {
   Node *Result = nullptr;
 
   if (TypeCallback != nullptr)
@@ -3436,13 +3470,13 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
         (look(AfterQuals) == 'D' &&
          (look(AfterQuals + 1) == 'o' || look(AfterQuals + 1) == 'O' ||
           look(AfterQuals + 1) == 'w' || look(AfterQuals + 1) == 'x'))) {
-      Result = parseFunctionType();
+      Result = getDerived().parseFunctionType();
       break;
     }
     LLVM_FALLTHROUGH;
   }
   case 'U': {
-    Result = parseQualifiedType();
+    Result = getDerived().parseQualifiedType();
     break;
   }
   // <builtin-type> ::= v    # void
@@ -3580,18 +3614,18 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
     //             ::= <decltype>
     case 't':
     case 'T': {
-      Result = parseDecltype();
+      Result = getDerived().parseDecltype();
       break;
     }
     // extension   ::= <vector-type> # <vector-type> starts with Dv
     case 'v': {
-      Result = parseVectorType();
+      Result = getDerived().parseVectorType();
       break;
     }
     //           ::= Dp <type>       # pack expansion (C++0x)
     case 'p': {
       First += 2;
-      Node *Child = parseType();
+      Node *Child = getDerived().parseType();
       if (!Child)
         return nullptr;
       Result = make<ParameterPackExpansion>(Child);
@@ -3603,34 +3637,34 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
     case 'w':
     // Transaction safe function type.
     case 'x':
-      Result = parseFunctionType();
+      Result = getDerived().parseFunctionType();
       break;
     }
     break;
   //             ::= <function-type>
   case 'F': {
-    Result = parseFunctionType();
+    Result = getDerived().parseFunctionType();
     break;
   }
   //             ::= <array-type>
   case 'A': {
-    Result = parseArrayType();
+    Result = getDerived().parseArrayType();
     break;
   }
   //             ::= <pointer-to-member-type>
   case 'M': {
-    Result = parsePointerToMemberType();
+    Result = getDerived().parsePointerToMemberType();
     break;
   }
   //             ::= <template-param>
   case 'T': {
     // This could be an elaborate type specifier on a <class-enum-type>.
     if (look(1) == 's' || look(1) == 'u' || look(1) == 'e') {
-      Result = parseClassEnumType();
+      Result = getDerived().parseClassEnumType();
       break;
     }
 
-    Result = parseTemplateParam();
+    Result = getDerived().parseTemplateParam();
     if (Result == nullptr)
       return nullptr;
 
@@ -3645,7 +3679,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
     // parse them, take the second production.
 
     if (TryToParseTemplateArgs && look() == 'I') {
-      Node *TA = parseTemplateArgs();
+      Node *TA = getDerived().parseTemplateArgs();
       if (TA == nullptr)
         return nullptr;
       Result = make<NameWithTemplateArgs>(Result, TA);
@@ -3655,7 +3689,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= P <type>        # pointer
   case 'P': {
     ++First;
-    Node *Ptr = parseType();
+    Node *Ptr = getDerived().parseType();
     if (Ptr == nullptr)
       return nullptr;
     Result = make<PointerType>(Ptr);
@@ -3664,7 +3698,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= R <type>        # l-value reference
   case 'R': {
     ++First;
-    Node *Ref = parseType();
+    Node *Ref = getDerived().parseType();
     if (Ref == nullptr)
       return nullptr;
     Result = make<ReferenceType>(Ref, ReferenceKind::LValue);
@@ -3673,7 +3707,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= O <type>        # r-value reference (C++11)
   case 'O': {
     ++First;
-    Node *Ref = parseType();
+    Node *Ref = getDerived().parseType();
     if (Ref == nullptr)
       return nullptr;
     Result = make<ReferenceType>(Ref, ReferenceKind::RValue);
@@ -3682,7 +3716,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= C <type>        # complex pair (C99)
   case 'C': {
     ++First;
-    Node *P = parseType();
+    Node *P = getDerived().parseType();
     if (P == nullptr)
       return nullptr;
     Result = make<PostfixQualifiedType>(P, " complex");
@@ -3691,7 +3725,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= G <type>        # imaginary (C99)
   case 'G': {
     ++First;
-    Node *P = parseType();
+    Node *P = getDerived().parseType();
     if (P == nullptr)
       return P;
     Result = make<PostfixQualifiedType>(P, " imaginary");
@@ -3700,7 +3734,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   //             ::= <substitution>  # See Compression below
   case 'S': {
     if (look(1) && look(1) != 't') {
-      Node *Sub = parseSubstitution();
+      Node *Sub = getDerived().parseSubstitution();
       if (Sub == nullptr)
         return nullptr;
 
@@ -3715,7 +3749,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
       // parse them, take the second production.
 
       if (TryToParseTemplateArgs && look() == 'I') {
-        Node *TA = parseTemplateArgs();
+        Node *TA = getDerived().parseTemplateArgs();
         if (TA == nullptr)
           return nullptr;
         Result = make<NameWithTemplateArgs>(Sub, TA);
@@ -3730,7 +3764,7 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   }
   //        ::= <class-enum-type>
   default: {
-    Result = parseClassEnumType();
+    Result = getDerived().parseClassEnumType();
     break;
   }
   }
@@ -3743,24 +3777,28 @@ template<typename Alloc> Node *Db<Alloc>::parseType() {
   return Result;
 }
 
-template<typename Alloc> Node *Db<Alloc>::parsePrefixExpr(StringView Kind) {
-  Node *E = parseExpr();
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parsePrefixExpr(StringView Kind) {
+  Node *E = getDerived().parseExpr();
   if (E == nullptr)
     return nullptr;
   return make<PrefixExpr>(Kind, E);
 }
 
-template<typename Alloc> Node *Db<Alloc>::parseBinaryExpr(StringView Kind) {
-  Node *LHS = parseExpr();
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseBinaryExpr(StringView Kind) {
+  Node *LHS = getDerived().parseExpr();
   if (LHS == nullptr)
     return nullptr;
-  Node *RHS = parseExpr();
+  Node *RHS = getDerived().parseExpr();
   if (RHS == nullptr)
     return nullptr;
   return make<BinaryExpr>(LHS, Kind, RHS);
 }
 
-template<typename Alloc> Node *Db<Alloc>::parseIntegerLiteral(StringView Lit) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseIntegerLiteral(StringView Lit) {
   StringView Tmp = parseNumber(true);
   if (!Tmp.empty() && consumeIf('E'))
     return make<IntegerLiteral>(Lit, Tmp);
@@ -3768,7 +3806,8 @@ template<typename Alloc> Node *Db<Alloc>::parseIntegerLiteral(StringView Lit) {
 }
 
 // <CV-Qualifiers> ::= [r] [V] [K]
-template<typename Alloc> Qualifiers Db<Alloc>::parseCVQualifiers() {
+template <typename Alloc, typename Derived>
+Qualifiers AbstractManglingParser<Alloc, Derived>::parseCVQualifiers() {
   Qualifiers CVR = QualNone;
   if (consumeIf('r'))
     CVR |= QualRestrict;
@@ -3783,7 +3822,8 @@ template<typename Alloc> Qualifiers Db<Alloc>::parseCVQualifiers() {
 //                  ::= fp <top-level CV-Qualifiers> <parameter-2 non-negative number> _   # L == 0, second and later parameters
 //                  ::= fL <L-1 non-negative number> p <top-level CV-Qualifiers> _         # L > 0, first parameter
 //                  ::= fL <L-1 non-negative number> p <top-level CV-Qualifiers> <parameter-2 non-negative number> _   # L > 0, second and later parameters
-template<typename Alloc> Node *Db<Alloc>::parseFunctionParam() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseFunctionParam() {
   if (consumeIf("fp")) {
     parseCVQualifiers();
     StringView Num = parseNumber();
@@ -3810,26 +3850,27 @@ template<typename Alloc> Node *Db<Alloc>::parseFunctionParam() {
 // [gs] na <expression>* _ <type> E                     # new[] (expr-list) type
 // [gs] na <expression>* _ <type> <initializer>         # new[] (expr-list) type (init)
 // <initializer> ::= pi <expression>* E                 # parenthesized initialization
-template<typename Alloc> Node *Db<Alloc>::parseNewExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseNewExpr() {
   bool Global = consumeIf("gs");
   bool IsArray = look(1) == 'a';
   if (!consumeIf("nw") && !consumeIf("na"))
     return nullptr;
   size_t Exprs = Names.size();
   while (!consumeIf('_')) {
-    Node *Ex = parseExpr();
+    Node *Ex = getDerived().parseExpr();
     if (Ex == nullptr)
       return nullptr;
     Names.push_back(Ex);
   }
   NodeArray ExprList = popTrailingNodeArray(Exprs);
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (Ty == nullptr)
     return Ty;
   if (consumeIf("pi")) {
     size_t InitsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *Init = parseExpr();
+      Node *Init = getDerived().parseExpr();
       if (Init == nullptr)
         return Init;
       Names.push_back(Init);
@@ -3843,13 +3884,14 @@ template<typename Alloc> Node *Db<Alloc>::parseNewExpr() {
 
 // cv <type> <expression>                               # conversion with one argument
 // cv <type> _ <expression>* E                          # conversion with a different number of arguments
-template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseConversionExpr() {
   if (!consumeIf("cv"))
     return nullptr;
   Node *Ty;
   {
     SwapAndRestore<bool> SaveTemp(TryToParseTemplateArgs, false);
-    Ty = parseType();
+    Ty = getDerived().parseType();
   }
 
   if (Ty == nullptr)
@@ -3858,7 +3900,7 @@ template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
   if (consumeIf('_')) {
     size_t ExprsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *E = parseExpr();
+      Node *E = getDerived().parseExpr();
       if (E == nullptr)
         return E;
       Names.push_back(E);
@@ -3867,7 +3909,7 @@ template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
     return make<ConversionExpr>(Ty, Exprs);
   }
 
-  Node *E[1] = {parseExpr()};
+  Node *E[1] = {getDerived().parseExpr()};
   if (E[0] == nullptr)
     return nullptr;
   return make<ConversionExpr>(Ty, makeNodeArray(E, E + 1));
@@ -3879,13 +3921,14 @@ template<typename Alloc> Node *Db<Alloc>::parseConversionExpr() {
 //                ::= L <nullptr type> E                                 # nullptr literal (i.e., "LDnE")
 // FIXME:         ::= L <type> <real-part float> _ <imag-part float> E   # complex floating point literal (C 2000)
 //                ::= L <mangled-name> E                                 # external name
-template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseExprPrimary() {
   if (!consumeIf('L'))
     return nullptr;
   switch (look()) {
   case 'w':
     ++First;
-    return parseIntegerLiteral("wchar_t");
+    return getDerived().parseIntegerLiteral("wchar_t");
   case 'b':
     if (consumeIf("b0E"))
       return make<BoolExpr>(0);
@@ -3894,55 +3937,55 @@ template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
     return nullptr;
   case 'c':
     ++First;
-    return parseIntegerLiteral("char");
+    return getDerived().parseIntegerLiteral("char");
   case 'a':
     ++First;
-    return parseIntegerLiteral("signed char");
+    return getDerived().parseIntegerLiteral("signed char");
   case 'h':
     ++First;
-    return parseIntegerLiteral("unsigned char");
+    return getDerived().parseIntegerLiteral("unsigned char");
   case 's':
     ++First;
-    return parseIntegerLiteral("short");
+    return getDerived().parseIntegerLiteral("short");
   case 't':
     ++First;
-    return parseIntegerLiteral("unsigned short");
+    return getDerived().parseIntegerLiteral("unsigned short");
   case 'i':
     ++First;
-    return parseIntegerLiteral("");
+    return getDerived().parseIntegerLiteral("");
   case 'j':
     ++First;
-    return parseIntegerLiteral("u");
+    return getDerived().parseIntegerLiteral("u");
   case 'l':
     ++First;
-    return parseIntegerLiteral("l");
+    return getDerived().parseIntegerLiteral("l");
   case 'm':
     ++First;
-    return parseIntegerLiteral("ul");
+    return getDerived().parseIntegerLiteral("ul");
   case 'x':
     ++First;
-    return parseIntegerLiteral("ll");
+    return getDerived().parseIntegerLiteral("ll");
   case 'y':
     ++First;
-    return parseIntegerLiteral("ull");
+    return getDerived().parseIntegerLiteral("ull");
   case 'n':
     ++First;
-    return parseIntegerLiteral("__int128");
+    return getDerived().parseIntegerLiteral("__int128");
   case 'o':
     ++First;
-    return parseIntegerLiteral("unsigned __int128");
+    return getDerived().parseIntegerLiteral("unsigned __int128");
   case 'f':
     ++First;
-    return parseFloatingLiteral<float>();
+    return getDerived().template parseFloatingLiteral<float>();
   case 'd':
     ++First;
-    return parseFloatingLiteral<double>();
+    return getDerived().template parseFloatingLiteral<double>();
   case 'e':
     ++First;
-    return parseFloatingLiteral<long double>();
+    return getDerived().template parseFloatingLiteral<long double>();
   case '_':
     if (consumeIf("_Z")) {
-      Node *R = parseEncoding();
+      Node *R = getDerived().parseEncoding();
       if (R != nullptr && consumeIf('E'))
         return R;
     }
@@ -3953,7 +3996,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
     return nullptr;
   default: {
     // might be named type
-    Node *T = parseType();
+    Node *T = getDerived().parseType();
     if (T == nullptr)
       return nullptr;
     StringView N = parseNumber();
@@ -3973,45 +4016,46 @@ template<typename Alloc> Node *Db<Alloc>::parseExprPrimary() {
 //                     ::= di <field source-name> <braced-expression>    # .name = expr
 //                     ::= dx <index expression> <braced-expression>     # [expr] = expr
 //                     ::= dX <range begin expression> <range end expression> <braced-expression>
-template<typename Alloc> Node *Db<Alloc>::parseBracedExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseBracedExpr() {
   if (look() == 'd') {
     switch (look(1)) {
     case 'i': {
       First += 2;
-      Node *Field = parseSourceName(/*NameState=*/nullptr);
+      Node *Field = getDerived().parseSourceName(/*NameState=*/nullptr);
       if (Field == nullptr)
         return nullptr;
-      Node *Init = parseBracedExpr();
+      Node *Init = getDerived().parseBracedExpr();
       if (Init == nullptr)
         return nullptr;
       return make<BracedExpr>(Field, Init, /*isArray=*/false);
     }
     case 'x': {
       First += 2;
-      Node *Index = parseExpr();
+      Node *Index = getDerived().parseExpr();
       if (Index == nullptr)
         return nullptr;
-      Node *Init = parseBracedExpr();
+      Node *Init = getDerived().parseBracedExpr();
       if (Init == nullptr)
         return nullptr;
       return make<BracedExpr>(Index, Init, /*isArray=*/true);
     }
     case 'X': {
       First += 2;
-      Node *RangeBegin = parseExpr();
+      Node *RangeBegin = getDerived().parseExpr();
       if (RangeBegin == nullptr)
         return nullptr;
-      Node *RangeEnd = parseExpr();
+      Node *RangeEnd = getDerived().parseExpr();
       if (RangeEnd == nullptr)
         return nullptr;
-      Node *Init = parseBracedExpr();
+      Node *Init = getDerived().parseBracedExpr();
       if (Init == nullptr)
         return nullptr;
       return make<BracedRangeExpr>(RangeBegin, RangeEnd, Init);
     }
     }
   }
-  return parseExpr();
+  return getDerived().parseExpr();
 }
 
 // (not yet in the spec)
@@ -4019,7 +4063,8 @@ template<typename Alloc> Node *Db<Alloc>::parseBracedExpr() {
 //             ::= fR <binary-operator-name> <expression> <expression>
 //             ::= fl <binary-operator-name> <expression>
 //             ::= fr <binary-operator-name> <expression>
-template<typename Alloc> Node *Db<Alloc>::parseFoldExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseFoldExpr() {
   if (!consumeIf('f'))
     return nullptr;
 
@@ -4069,11 +4114,11 @@ template<typename Alloc> Node *Db<Alloc>::parseFoldExpr() {
   else if (consumeIf("rS")) OperatorName = ">>=";
   else return nullptr;
 
-  Node *Pack = parseExpr(), *Init = nullptr;
+  Node *Pack = getDerived().parseExpr(), *Init = nullptr;
   if (Pack == nullptr)
     return nullptr;
   if (HasInitializer) {
-    Init = parseExpr();
+    Init = getDerived().parseExpr();
     if (Init == nullptr)
       return nullptr;
   }
@@ -4128,49 +4173,50 @@ template<typename Alloc> Node *Db<Alloc>::parseFoldExpr() {
 //              ::= fl <binary-operator-name> <expression>
 //              ::= fr <binary-operator-name> <expression>
 //              ::= <expr-primary>
-template<typename Alloc> Node *Db<Alloc>::parseExpr() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseExpr() {
   bool Global = consumeIf("gs");
   if (numLeft() < 2)
     return nullptr;
 
   switch (*First) {
   case 'L':
-    return parseExprPrimary();
+    return getDerived().parseExprPrimary();
   case 'T':
-    return parseTemplateParam();
+    return getDerived().parseTemplateParam();
   case 'f': {
     // Disambiguate a fold expression from a <function-param>.
     if (look(1) == 'p' || (look(1) == 'L' && std::isdigit(look(2))))
-      return parseFunctionParam();
-    return parseFoldExpr();
+      return getDerived().parseFunctionParam();
+    return getDerived().parseFoldExpr();
   }
   case 'a':
     switch (First[1]) {
     case 'a':
       First += 2;
-      return parseBinaryExpr("&&");
+      return getDerived().parseBinaryExpr("&&");
     case 'd':
       First += 2;
-      return parsePrefixExpr("&");
+      return getDerived().parsePrefixExpr("&");
     case 'n':
       First += 2;
-      return parseBinaryExpr("&");
+      return getDerived().parseBinaryExpr("&");
     case 'N':
       First += 2;
-      return parseBinaryExpr("&=");
+      return getDerived().parseBinaryExpr("&=");
     case 'S':
       First += 2;
-      return parseBinaryExpr("=");
+      return getDerived().parseBinaryExpr("=");
     case 't': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<EnclosingExpr>("alignof (", Ty, ")");
     }
     case 'z': {
       First += 2;
-      Node *Ty = parseExpr();
+      Node *Ty = getDerived().parseExpr();
       if (Ty == nullptr)
         return nullptr;
       return make<EnclosingExpr>("alignof (", Ty, ")");
@@ -4182,10 +4228,10 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     // cc <type> <expression>                               # const_cast<type>(expression)
     case 'c': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return Ty;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("const_cast", Ty, Ex);
@@ -4193,12 +4239,12 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     // cl <expression>+ E                                   # call
     case 'l': {
       First += 2;
-      Node *Callee = parseExpr();
+      Node *Callee = getDerived().parseExpr();
       if (Callee == nullptr)
         return Callee;
       size_t ExprsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *E = parseExpr();
+        Node *E = getDerived().parseExpr();
         if (E == nullptr)
           return E;
         Names.push_back(E);
@@ -4207,104 +4253,104 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     }
     case 'm':
       First += 2;
-      return parseBinaryExpr(",");
+      return getDerived().parseBinaryExpr(",");
     case 'o':
       First += 2;
-      return parsePrefixExpr("~");
+      return getDerived().parsePrefixExpr("~");
     case 'v':
-      return parseConversionExpr();
+      return getDerived().parseConversionExpr();
     }
     return nullptr;
   case 'd':
     switch (First[1]) {
     case 'a': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<DeleteExpr>(Ex, Global, /*is_array=*/true);
     }
     case 'c': {
       First += 2;
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return T;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("dynamic_cast", T, Ex);
     }
     case 'e':
       First += 2;
-      return parsePrefixExpr("*");
+      return getDerived().parsePrefixExpr("*");
     case 'l': {
       First += 2;
-      Node *E = parseExpr();
+      Node *E = getDerived().parseExpr();
       if (E == nullptr)
         return E;
       return make<DeleteExpr>(E, Global, /*is_array=*/false);
     }
     case 'n':
-      return parseUnresolvedName();
+      return getDerived().parseUnresolvedName();
     case 's': {
       First += 2;
-      Node *LHS = parseExpr();
+      Node *LHS = getDerived().parseExpr();
       if (LHS == nullptr)
         return nullptr;
-      Node *RHS = parseExpr();
+      Node *RHS = getDerived().parseExpr();
       if (RHS == nullptr)
         return nullptr;
       return make<MemberExpr>(LHS, ".*", RHS);
     }
     case 't': {
       First += 2;
-      Node *LHS = parseExpr();
+      Node *LHS = getDerived().parseExpr();
       if (LHS == nullptr)
         return LHS;
-      Node *RHS = parseExpr();
+      Node *RHS = getDerived().parseExpr();
       if (RHS == nullptr)
         return nullptr;
       return make<MemberExpr>(LHS, ".", RHS);
     }
     case 'v':
       First += 2;
-      return parseBinaryExpr("/");
+      return getDerived().parseBinaryExpr("/");
     case 'V':
       First += 2;
-      return parseBinaryExpr("/=");
+      return getDerived().parseBinaryExpr("/=");
     }
     return nullptr;
   case 'e':
     switch (First[1]) {
     case 'o':
       First += 2;
-      return parseBinaryExpr("^");
+      return getDerived().parseBinaryExpr("^");
     case 'O':
       First += 2;
-      return parseBinaryExpr("^=");
+      return getDerived().parseBinaryExpr("^=");
     case 'q':
       First += 2;
-      return parseBinaryExpr("==");
+      return getDerived().parseBinaryExpr("==");
     }
     return nullptr;
   case 'g':
     switch (First[1]) {
     case 'e':
       First += 2;
-      return parseBinaryExpr(">=");
+      return getDerived().parseBinaryExpr(">=");
     case 't':
       First += 2;
-      return parseBinaryExpr(">");
+      return getDerived().parseBinaryExpr(">");
     }
     return nullptr;
   case 'i':
     switch (First[1]) {
     case 'x': {
       First += 2;
-      Node *Base = parseExpr();
+      Node *Base = getDerived().parseExpr();
       if (Base == nullptr)
         return nullptr;
-      Node *Index = parseExpr();
+      Node *Index = getDerived().parseExpr();
       if (Index == nullptr)
         return Index;
       return make<ArraySubscriptExpr>(Base, Index);
@@ -4313,7 +4359,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
       First += 2;
       size_t InitsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *E = parseBracedExpr();
+        Node *E = getDerived().parseBracedExpr();
         if (E == nullptr)
           return nullptr;
         Names.push_back(E);
@@ -4326,37 +4372,37 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'e':
       First += 2;
-      return parseBinaryExpr("<=");
+      return getDerived().parseBinaryExpr("<=");
     case 's':
       First += 2;
-      return parseBinaryExpr("<<");
+      return getDerived().parseBinaryExpr("<<");
     case 'S':
       First += 2;
-      return parseBinaryExpr("<<=");
+      return getDerived().parseBinaryExpr("<<=");
     case 't':
       First += 2;
-      return parseBinaryExpr("<");
+      return getDerived().parseBinaryExpr("<");
     }
     return nullptr;
   case 'm':
     switch (First[1]) {
     case 'i':
       First += 2;
-      return parseBinaryExpr("-");
+      return getDerived().parseBinaryExpr("-");
     case 'I':
       First += 2;
-      return parseBinaryExpr("-=");
+      return getDerived().parseBinaryExpr("-=");
     case 'l':
       First += 2;
-      return parseBinaryExpr("*");
+      return getDerived().parseBinaryExpr("*");
     case 'L':
       First += 2;
-      return parseBinaryExpr("*=");
+      return getDerived().parseBinaryExpr("*=");
     case 'm':
       First += 2;
       if (consumeIf('_'))
-        return parsePrefixExpr("--");
-      Node *Ex = parseExpr();
+        return getDerived().parsePrefixExpr("--");
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return nullptr;
       return make<PostfixExpr>(Ex, "--");
@@ -4366,19 +4412,19 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'a':
     case 'w':
-      return parseNewExpr();
+      return getDerived().parseNewExpr();
     case 'e':
       First += 2;
-      return parseBinaryExpr("!=");
+      return getDerived().parseBinaryExpr("!=");
     case 'g':
       First += 2;
-      return parsePrefixExpr("-");
+      return getDerived().parsePrefixExpr("-");
     case 't':
       First += 2;
-      return parsePrefixExpr("!");
+      return getDerived().parsePrefixExpr("!");
     case 'x':
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<EnclosingExpr>("noexcept (", Ex, ")");
@@ -4387,47 +4433,47 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
   case 'o':
     switch (First[1]) {
     case 'n':
-      return parseUnresolvedName();
+      return getDerived().parseUnresolvedName();
     case 'o':
       First += 2;
-      return parseBinaryExpr("||");
+      return getDerived().parseBinaryExpr("||");
     case 'r':
       First += 2;
-      return parseBinaryExpr("|");
+      return getDerived().parseBinaryExpr("|");
     case 'R':
       First += 2;
-      return parseBinaryExpr("|=");
+      return getDerived().parseBinaryExpr("|=");
     }
     return nullptr;
   case 'p':
     switch (First[1]) {
     case 'm':
       First += 2;
-      return parseBinaryExpr("->*");
+      return getDerived().parseBinaryExpr("->*");
     case 'l':
       First += 2;
-      return parseBinaryExpr("+");
+      return getDerived().parseBinaryExpr("+");
     case 'L':
       First += 2;
-      return parseBinaryExpr("+=");
+      return getDerived().parseBinaryExpr("+=");
     case 'p': {
       First += 2;
       if (consumeIf('_'))
-        return parsePrefixExpr("++");
-      Node *Ex = parseExpr();
+        return getDerived().parsePrefixExpr("++");
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<PostfixExpr>(Ex, "++");
     }
     case 's':
       First += 2;
-      return parsePrefixExpr("+");
+      return getDerived().parsePrefixExpr("+");
     case 't': {
       First += 2;
-      Node *L = parseExpr();
+      Node *L = getDerived().parseExpr();
       if (L == nullptr)
         return nullptr;
-      Node *R = parseExpr();
+      Node *R = getDerived().parseExpr();
       if (R == nullptr)
         return nullptr;
       return make<MemberExpr>(L, "->", R);
@@ -4437,13 +4483,13 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
   case 'q':
     if (First[1] == 'u') {
       First += 2;
-      Node *Cond = parseExpr();
+      Node *Cond = getDerived().parseExpr();
       if (Cond == nullptr)
         return nullptr;
-      Node *LHS = parseExpr();
+      Node *LHS = getDerived().parseExpr();
       if (LHS == nullptr)
         return nullptr;
-      Node *RHS = parseExpr();
+      Node *RHS = getDerived().parseExpr();
       if (RHS == nullptr)
         return nullptr;
       return make<ConditionalExpr>(Cond, LHS, RHS);
@@ -4453,59 +4499,59 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'c': {
       First += 2;
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return T;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("reinterpret_cast", T, Ex);
     }
     case 'm':
       First += 2;
-      return parseBinaryExpr("%");
+      return getDerived().parseBinaryExpr("%");
     case 'M':
       First += 2;
-      return parseBinaryExpr("%=");
+      return getDerived().parseBinaryExpr("%=");
     case 's':
       First += 2;
-      return parseBinaryExpr(">>");
+      return getDerived().parseBinaryExpr(">>");
     case 'S':
       First += 2;
-      return parseBinaryExpr(">>=");
+      return getDerived().parseBinaryExpr(">>=");
     }
     return nullptr;
   case 's':
     switch (First[1]) {
     case 'c': {
       First += 2;
-      Node *T = parseType();
+      Node *T = getDerived().parseType();
       if (T == nullptr)
         return T;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<CastExpr>("static_cast", T, Ex);
     }
     case 'p': {
       First += 2;
-      Node *Child = parseExpr();
+      Node *Child = getDerived().parseExpr();
       if (Child == nullptr)
         return nullptr;
       return make<ParameterPackExpansion>(Child);
     }
     case 'r':
-      return parseUnresolvedName();
+      return getDerived().parseUnresolvedName();
     case 't': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return Ty;
       return make<EnclosingExpr>("sizeof (", Ty, ")");
     }
     case 'z': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<EnclosingExpr>("sizeof (", Ex, ")");
@@ -4513,12 +4559,12 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     case 'Z':
       First += 2;
       if (look() == 'T') {
-        Node *R = parseTemplateParam();
+        Node *R = getDerived().parseTemplateParam();
         if (R == nullptr)
           return nullptr;
         return make<SizeofParamPackExpr>(R);
       } else if (look() == 'f') {
-        Node *FP = parseFunctionParam();
+        Node *FP = getDerived().parseFunctionParam();
         if (FP == nullptr)
           return nullptr;
         return make<EnclosingExpr>("sizeof... (", FP, ")");
@@ -4528,7 +4574,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
       First += 2;
       size_t ArgsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *Arg = parseTemplateArg();
+        Node *Arg = getDerived().parseTemplateArg();
         if (Arg == nullptr)
           return nullptr;
         Names.push_back(Arg);
@@ -4544,26 +4590,26 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
     switch (First[1]) {
     case 'e': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return Ex;
       return make<EnclosingExpr>("typeid (", Ex, ")");
     }
     case 'i': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return Ty;
       return make<EnclosingExpr>("typeid (", Ty, ")");
     }
     case 'l': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       size_t InitsBegin = Names.size();
       while (!consumeIf('E')) {
-        Node *E = parseBracedExpr();
+        Node *E = getDerived().parseBracedExpr();
         if (E == nullptr)
           return nullptr;
         Names.push_back(E);
@@ -4575,7 +4621,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
       return make<NameType>("throw");
     case 'w': {
       First += 2;
-      Node *Ex = parseExpr();
+      Node *Ex = getDerived().parseExpr();
       if (Ex == nullptr)
         return nullptr;
       return make<ThrowExpr>(Ex);
@@ -4591,7 +4637,7 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
   case '7':
   case '8':
   case '9':
-    return parseUnresolvedName();
+    return getDerived().parseUnresolvedName();
   }
   return nullptr;
 }
@@ -4604,7 +4650,8 @@ template<typename Alloc> Node *Db<Alloc>::parseExpr() {
 //
 // <v-offset>  ::= <offset number> _ <virtual offset number>
 //               # virtual base override, with vcall offset
-template<typename Alloc> bool Db<Alloc>::parseCallOffset() {
+template <typename Alloc, typename Derived>
+bool AbstractManglingParser<Alloc, Derived>::parseCallOffset() {
   // Just scan through the call offset, we never add this information into the
   // output.
   if (consumeIf('h'))
@@ -4633,14 +4680,15 @@ template<typename Alloc> bool Db<Alloc>::parseCallOffset() {
 //                ::= GR <object name> <seq-id> _    # Subsequent temporaries
 //      extension ::= TC <first type> <number> _ <second type> # construction vtable for second-in-first
 //      extension ::= GR <object name> # reference temporary for object
-template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSpecialName() {
   switch (look()) {
   case 'T':
     switch (look(1)) {
     // TV <type>    # virtual table
     case 'V': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("vtable for ", Ty);
@@ -4648,7 +4696,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TT <type>    # VTT structure (construction vtable index)
     case 'T': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("VTT for ", Ty);
@@ -4656,7 +4704,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TI <type>    # typeinfo structure
     case 'I': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("typeinfo for ", Ty);
@@ -4664,7 +4712,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TS <type>    # typeinfo name (null-terminated byte string)
     case 'S': {
       First += 2;
-      Node *Ty = parseType();
+      Node *Ty = getDerived().parseType();
       if (Ty == nullptr)
         return nullptr;
       return make<SpecialName>("typeinfo name for ", Ty);
@@ -4674,7 +4722,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
       First += 2;
       if (parseCallOffset() || parseCallOffset())
         return nullptr;
-      Node *Encoding = parseEncoding();
+      Node *Encoding = getDerived().parseEncoding();
       if (Encoding == nullptr)
         return nullptr;
       return make<SpecialName>("covariant return thunk to ", Encoding);
@@ -4683,12 +4731,12 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     //               # construction vtable for second-in-first
     case 'C': {
       First += 2;
-      Node *FirstType = parseType();
+      Node *FirstType = getDerived().parseType();
       if (FirstType == nullptr)
         return nullptr;
       if (parseNumber(true).empty() || !consumeIf('_'))
         return nullptr;
-      Node *SecondType = parseType();
+      Node *SecondType = getDerived().parseType();
       if (SecondType == nullptr)
         return nullptr;
       return make<CtorVtableSpecialName>(SecondType, FirstType);
@@ -4696,7 +4744,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TW <object name> # Thread-local wrapper
     case 'W': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       return make<SpecialName>("thread-local wrapper routine for ", Name);
@@ -4704,7 +4752,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // TH <object name> # Thread-local initialization
     case 'H': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       return make<SpecialName>("thread-local initialization routine for ", Name);
@@ -4715,7 +4763,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
       bool IsVirt = look() == 'v';
       if (parseCallOffset())
         return nullptr;
-      Node *BaseEncoding = parseEncoding();
+      Node *BaseEncoding = getDerived().parseEncoding();
       if (BaseEncoding == nullptr)
         return nullptr;
       if (IsVirt)
@@ -4729,7 +4777,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // GV <object name> # Guard variable for one-time initialization
     case 'V': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       return make<SpecialName>("guard variable for ", Name);
@@ -4739,7 +4787,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
     // GR <object name> <seq-id> _    # Subsequent temporaries
     case 'R': {
       First += 2;
-      Node *Name = parseName();
+      Node *Name = getDerived().parseName();
       if (Name == nullptr)
         return nullptr;
       size_t Count;
@@ -4756,9 +4804,10 @@ template<typename Alloc> Node *Db<Alloc>::parseSpecialName() {
 // <encoding> ::= <function name> <bare-function-type>
 //            ::= <data name>
 //            ::= <special-name>
-template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseEncoding() {
   if (look() == 'G' || look() == 'T')
-    return parseSpecialName();
+    return getDerived().parseSpecialName();
 
   auto IsEndOfEncoding = [&] {
     // The set of chars that can potentially follow an <encoding> (none of which
@@ -4768,7 +4817,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
   };
 
   NameState NameInfo(this);
-  Node *Name = parseName(&NameInfo);
+  Node *Name = getDerived().parseName(&NameInfo);
   if (Name == nullptr)
     return nullptr;
 
@@ -4782,7 +4831,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
   if (consumeIf("Ua9enable_ifI")) {
     size_t BeforeArgs = Names.size();
     while (!consumeIf('E')) {
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       if (Arg == nullptr)
         return nullptr;
       Names.push_back(Arg);
@@ -4794,7 +4843,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
 
   Node *ReturnType = nullptr;
   if (!NameInfo.CtorDtorConversion && NameInfo.EndsWithTemplateArgs) {
-    ReturnType = parseType();
+    ReturnType = getDerived().parseType();
     if (ReturnType == nullptr)
       return nullptr;
   }
@@ -4806,7 +4855,7 @@ template<typename Alloc> Node *Db<Alloc>::parseEncoding() {
 
   size_t ParamsBegin = Names.size();
   do {
-    Node *Ty = parseType();
+    Node *Ty = getDerived().parseType();
     if (Ty == nullptr)
       return nullptr;
     Names.push_back(Ty);
@@ -4852,9 +4901,9 @@ struct FloatData<long double>
     static constexpr const char *spec = "%LaL";
 };
 
-template<typename Alloc>
-template<class Float>
-Node *Db<Alloc>::parseFloatingLiteral() {
+template <typename Alloc, typename Derived>
+template <class Float>
+Node *AbstractManglingParser<Alloc, Derived>::parseFloatingLiteral() {
   const size_t N = FloatData<Float>::mangled_size;
   if (numLeft() <= N)
     return nullptr;
@@ -4869,7 +4918,8 @@ Node *Db<Alloc>::parseFloatingLiteral() {
 }
 
 // <seq-id> ::= <0-9A-Z>+
-template<typename Alloc> bool Db<Alloc>::parseSeqId(size_t *Out) {
+template <typename Alloc, typename Derived>
+bool AbstractManglingParser<Alloc, Derived>::parseSeqId(size_t *Out) {
   if (!(look() >= '0' && look() <= '9') &&
       !(look() >= 'A' && look() <= 'Z'))
     return true;
@@ -4900,7 +4950,8 @@ template<typename Alloc> bool Db<Alloc>::parseSeqId(size_t *Out) {
 // <substitution> ::= Si # ::std::basic_istream<char,  std::char_traits<char> >
 // <substitution> ::= So # ::std::basic_ostream<char,  std::char_traits<char> >
 // <substitution> ::= Sd # ::std::basic_iostream<char, std::char_traits<char> >
-template<typename Alloc> Node *Db<Alloc>::parseSubstitution() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseSubstitution() {
   if (!consumeIf('S'))
     return nullptr;
 
@@ -4939,7 +4990,7 @@ template<typename Alloc> Node *Db<Alloc>::parseSubstitution() {
     // Itanium C++ ABI 5.1.2: If a name that would use a built-in <substitution>
     // has ABI tags, the tags are appended to the substitution; the result is a
     // substitutable component.
-    Node *WithTags = parseAbiTags(SpecialSub);
+    Node *WithTags = getDerived().parseAbiTags(SpecialSub);
     if (WithTags != SpecialSub) {
       Subs.push_back(WithTags);
       SpecialSub = WithTags;
@@ -4966,7 +5017,8 @@ template<typename Alloc> Node *Db<Alloc>::parseSubstitution() {
 
 // <template-param> ::= T_    # first template parameter
 //                  ::= T <parameter-2 non-negative number> _
-template<typename Alloc> Node *Db<Alloc>::parseTemplateParam() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseTemplateParam() {
   if (!consumeIf('T'))
     return nullptr;
 
@@ -5007,11 +5059,12 @@ template<typename Alloc> Node *Db<Alloc>::parseTemplateParam() {
 //                ::= <expr-primary>            # simple expressions
 //                ::= J <template-arg>* E       # argument pack
 //                ::= LZ <encoding> E           # extension
-template<typename Alloc> Node *Db<Alloc>::parseTemplateArg() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parseTemplateArg() {
   switch (look()) {
   case 'X': {
     ++First;
-    Node *Arg = parseExpr();
+    Node *Arg = getDerived().parseExpr();
     if (Arg == nullptr || !consumeIf('E'))
       return nullptr;
     return Arg;
@@ -5020,7 +5073,7 @@ template<typename Alloc> Node *Db<Alloc>::parseTemplateArg() {
     ++First;
     size_t ArgsBegin = Names.size();
     while (!consumeIf('E')) {
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       if (Arg == nullptr)
         return nullptr;
       Names.push_back(Arg);
@@ -5032,23 +5085,24 @@ template<typename Alloc> Node *Db<Alloc>::parseTemplateArg() {
     //                ::= LZ <encoding> E           # extension
     if (look(1) == 'Z') {
       First += 2;
-      Node *Arg = parseEncoding();
+      Node *Arg = getDerived().parseEncoding();
       if (Arg == nullptr || !consumeIf('E'))
         return nullptr;
       return Arg;
     }
     //                ::= <expr-primary>            # simple expressions
-    return parseExprPrimary();
+    return getDerived().parseExprPrimary();
   }
   default:
-    return parseType();
+    return getDerived().parseType();
   }
 }
 
 // <template-args> ::= I <template-arg>* E
 //     extension, the abi says <template-arg>+
-template <typename Alloc>
-Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
+template <typename Derived, typename Alloc>
+Node *
+AbstractManglingParser<Derived, Alloc>::parseTemplateArgs(bool TagTemplates) {
   if (!consumeIf('I'))
     return nullptr;
 
@@ -5061,7 +5115,7 @@ Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
   while (!consumeIf('E')) {
     if (TagTemplates) {
       auto OldParams = std::move(TemplateParams);
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       TemplateParams = std::move(OldParams);
       if (Arg == nullptr)
         return nullptr;
@@ -5075,7 +5129,7 @@ Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
       }
       TemplateParams.push_back(TableEntry);
     } else {
-      Node *Arg = parseTemplateArg();
+      Node *Arg = getDerived().parseTemplateArg();
       if (Arg == nullptr)
         return nullptr;
       Names.push_back(Arg);
@@ -5089,9 +5143,10 @@ Node *Db<Alloc>::parseTemplateArgs(bool TagTemplates) {
 // extension      ::= ___Z <encoding> _block_invoke
 // extension      ::= ___Z <encoding> _block_invoke<decimal-digit>+
 // extension      ::= ___Z <encoding> _block_invoke_<decimal-digit>+
-template<typename Alloc> Node *Db<Alloc>::parse() {
+template <typename Derived, typename Alloc>
+Node *AbstractManglingParser<Derived, Alloc>::parse() {
   if (consumeIf("_Z")) {
-    Node *Encoding = parseEncoding();
+    Node *Encoding = getDerived().parseEncoding();
     if (Encoding == nullptr)
       return nullptr;
     if (look() == '.') {
@@ -5104,7 +5159,7 @@ template<typename Alloc> Node *Db<Alloc>::parse() {
   }
 
   if (consumeIf("___Z")) {
-    Node *Encoding = parseEncoding();
+    Node *Encoding = getDerived().parseEncoding();
     if (Encoding == nullptr || !consumeIf("_block_invoke"))
       return nullptr;
     bool RequireNumber = consumeIf('_');
@@ -5117,12 +5172,18 @@ template<typename Alloc> Node *Db<Alloc>::parse() {
     return make<SpecialName>("invocation function for block in ", Encoding);
   }
 
-  Node *Ty = parseType();
+  Node *Ty = getDerived().parseType();
   if (numLeft() != 0)
     return nullptr;
   return Ty;
 }
 
+template <typename Alloc>
+struct ManglingParser : AbstractManglingParser<ManglingParser<Alloc>, Alloc> {
+  using AbstractManglingParser<ManglingParser<Alloc>,
+                               Alloc>::AbstractManglingParser;
+};
+
 }  // namespace itanium_demangle
 }  // namespace llvm
 
diff --git a/include/llvm/Demangle/MicrosoftDemangle.h b/include/llvm/Demangle/MicrosoftDemangle.h
new file mode 100644
index 0000000000000000000000000000000000000000..b186758ebe24ba5aa2573fc682fce34206187fbf
--- /dev/null
+++ b/include/llvm/Demangle/MicrosoftDemangle.h
@@ -0,0 +1,276 @@
+//===------------------------- MicrosoftDemangle.h --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEMANGLE_MICROSOFT_DEMANGLE_H
+#define LLVM_DEMANGLE_MICROSOFT_DEMANGLE_H
+
+#include "llvm/Demangle/Compiler.h"
+#include "llvm/Demangle/MicrosoftDemangleNodes.h"
+#include "llvm/Demangle/StringView.h"
+#include "llvm/Demangle/Utility.h"
+
+#include <utility>
+
+namespace llvm {
+namespace ms_demangle {
+// This memory allocator is extremely fast, but it doesn't call dtors
+// for allocated objects. That means you can't use STL containers
+// (such as std::vector) with this allocator. But it pays off --
+// the demangler is 3x faster with this allocator compared to one with
+// STL containers.
+constexpr size_t AllocUnit = 4096;
+
+class ArenaAllocator {
+  struct AllocatorNode {
+    uint8_t *Buf = nullptr;
+    size_t Used = 0;
+    size_t Capacity = 0;
+    AllocatorNode *Next = nullptr;
+  };
+
+  void addNode(size_t Capacity) {
+    AllocatorNode *NewHead = new AllocatorNode;
+    NewHead->Buf = new uint8_t[Capacity];
+    NewHead->Next = Head;
+    NewHead->Capacity = Capacity;
+    Head = NewHead;
+    NewHead->Used = 0;
+  }
+
+public:
+  ArenaAllocator() { addNode(AllocUnit); }
+
+  ~ArenaAllocator() {
+    while (Head) {
+      assert(Head->Buf);
+      delete[] Head->Buf;
+      AllocatorNode *Next = Head->Next;
+      delete Head;
+      Head = Next;
+    }
+  }
+
+  char *allocUnalignedBuffer(size_t Length) {
+    uint8_t *Buf = Head->Buf + Head->Used;
+
+    Head->Used += Length;
+    if (Head->Used > Head->Capacity) {
+      // It's possible we need a buffer which is larger than our default unit
+      // size, so we need to be careful to add a node with capacity that is at
+      // least as large as what we need.
+      addNode(std::max(AllocUnit, Length));
+      Head->Used = Length;
+      Buf = Head->Buf;
+    }
+
+    return reinterpret_cast<char *>(Buf);
+  }
+
+  template <typename T, typename... Args> T *allocArray(size_t Count) {
+
+    size_t Size = Count * sizeof(T);
+    assert(Head && Head->Buf);
+
+    size_t P = (size_t)Head->Buf + Head->Used;
+    uintptr_t AlignedP =
+        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
+    uint8_t *PP = (uint8_t *)AlignedP;
+    size_t Adjustment = AlignedP - P;
+
+    Head->Used += Size + Adjustment;
+    if (Head->Used < Head->Capacity)
+      return new (PP) T[Count]();
+
+    addNode(AllocUnit);
+    Head->Used = Size;
+    return new (Head->Buf) T[Count]();
+  }
+
+  template <typename T, typename... Args> T *alloc(Args &&... ConstructorArgs) {
+
+    size_t Size = sizeof(T);
+    assert(Head && Head->Buf);
+
+    size_t P = (size_t)Head->Buf + Head->Used;
+    uintptr_t AlignedP =
+        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
+    uint8_t *PP = (uint8_t *)AlignedP;
+    size_t Adjustment = AlignedP - P;
+
+    Head->Used += Size + Adjustment;
+    if (Head->Used < Head->Capacity)
+      return new (PP) T(std::forward<Args>(ConstructorArgs)...);
+
+    addNode(AllocUnit);
+    Head->Used = Size;
+    return new (Head->Buf) T(std::forward<Args>(ConstructorArgs)...);
+  }
+
+private:
+  AllocatorNode *Head = nullptr;
+};
+
+struct BackrefContext {
+  static constexpr size_t Max = 10;
+
+  TypeNode *FunctionParams[Max];
+  size_t FunctionParamCount = 0;
+
+  // The first 10 BackReferences in a mangled name can be back-referenced by
+  // special name @[0-9]. This is a storage for the first 10 BackReferences.
+  NamedIdentifierNode *Names[Max];
+  size_t NamesCount = 0;
+};
+
+enum class QualifierMangleMode { Drop, Mangle, Result };
+
+enum NameBackrefBehavior : uint8_t {
+  NBB_None = 0,          // don't save any names as backrefs.
+  NBB_Template = 1 << 0, // save template instanations.
+  NBB_Simple = 1 << 1,   // save simple names.
+};
+
+enum class FunctionIdentifierCodeGroup { Basic, Under, DoubleUnder };
+
+// Demangler class takes the main role in demangling symbols.
+// It has a set of functions to parse mangled symbols into Type instances.
+// It also has a set of functions to convert Type instances to strings.
+class Demangler {
+public:
+  Demangler() = default;
+  virtual ~Demangler() = default;
+
+  // You are supposed to call parse() first and then check if error is true.  If
+  // it is false, call output() to write the formatted name to the given stream.
+  SymbolNode *parse(StringView &MangledName);
+
+  TagTypeNode *parseTagUniqueName(StringView &MangledName);
+
+  // True if an error occurred.
+  bool Error = false;
+
+  void dumpBackReferences();
+
+private:
+  SymbolNode *demangleEncodedSymbol(StringView &MangledName,
+                                    QualifiedNameNode *QN);
+
+  VariableSymbolNode *demangleVariableEncoding(StringView &MangledName,
+                                               StorageClass SC);
+  FunctionSymbolNode *demangleFunctionEncoding(StringView &MangledName);
+
+  Qualifiers demanglePointerExtQualifiers(StringView &MangledName);
+
+  // Parser functions. This is a recursive-descent parser.
+  TypeNode *demangleType(StringView &MangledName, QualifierMangleMode QMM);
+  PrimitiveTypeNode *demanglePrimitiveType(StringView &MangledName);
+  CustomTypeNode *demangleCustomType(StringView &MangledName);
+  TagTypeNode *demangleClassType(StringView &MangledName);
+  PointerTypeNode *demanglePointerType(StringView &MangledName);
+  PointerTypeNode *demangleMemberPointerType(StringView &MangledName);
+  FunctionSignatureNode *demangleFunctionType(StringView &MangledName,
+                                              bool HasThisQuals);
+
+  ArrayTypeNode *demangleArrayType(StringView &MangledName);
+
+  NodeArrayNode *demangleTemplateParameterList(StringView &MangledName);
+  NodeArrayNode *demangleFunctionParameterList(StringView &MangledName);
+
+  std::pair<uint64_t, bool> demangleNumber(StringView &MangledName);
+  uint64_t demangleUnsigned(StringView &MangledName);
+  int64_t demangleSigned(StringView &MangledName);
+
+  void memorizeString(StringView s);
+  void memorizeIdentifier(IdentifierNode *Identifier);
+
+  /// Allocate a copy of \p Borrowed into memory that we own.
+  StringView copyString(StringView Borrowed);
+
+  QualifiedNameNode *demangleFullyQualifiedTypeName(StringView &MangledName);
+  QualifiedNameNode *demangleFullyQualifiedSymbolName(StringView &MangledName);
+
+  IdentifierNode *demangleUnqualifiedTypeName(StringView &MangledName,
+                                              bool Memorize);
+  IdentifierNode *demangleUnqualifiedSymbolName(StringView &MangledName,
+                                                NameBackrefBehavior NBB);
+
+  QualifiedNameNode *demangleNameScopeChain(StringView &MangledName,
+                                            IdentifierNode *UnqualifiedName);
+  IdentifierNode *demangleNameScopePiece(StringView &MangledName);
+
+  NamedIdentifierNode *demangleBackRefName(StringView &MangledName);
+  IdentifierNode *demangleTemplateInstantiationName(StringView &MangledName,
+                                                    NameBackrefBehavior NBB);
+  IdentifierNode *demangleFunctionIdentifierCode(StringView &MangledName);
+  IdentifierNode *
+  demangleFunctionIdentifierCode(StringView &MangledName,
+                                 FunctionIdentifierCodeGroup Group);
+  StructorIdentifierNode *demangleStructorIdentifier(StringView &MangledName,
+                                                     bool IsDestructor);
+  ConversionOperatorIdentifierNode *
+  demangleConversionOperatorIdentifier(StringView &MangledName);
+  LiteralOperatorIdentifierNode *
+  demangleLiteralOperatorIdentifier(StringView &MangledName);
+
+  SymbolNode *demangleSpecialIntrinsic(StringView &MangledName);
+  SpecialTableSymbolNode *
+  demangleSpecialTableSymbolNode(StringView &MangledName,
+                                 SpecialIntrinsicKind SIK);
+  LocalStaticGuardVariableNode *
+  demangleLocalStaticGuard(StringView &MangledName);
+  VariableSymbolNode *demangleUntypedVariable(ArenaAllocator &Arena,
+                                              StringView &MangledName,
+                                              StringView VariableName);
+  VariableSymbolNode *
+  demangleRttiBaseClassDescriptorNode(ArenaAllocator &Arena,
+                                      StringView &MangledName);
+  FunctionSymbolNode *demangleInitFiniStub(StringView &MangledName,
+                                           bool IsDestructor);
+
+  NamedIdentifierNode *demangleSimpleName(StringView &MangledName,
+                                          bool Memorize);
+  NamedIdentifierNode *demangleAnonymousNamespaceName(StringView &MangledName);
+  NamedIdentifierNode *demangleLocallyScopedNamePiece(StringView &MangledName);
+  EncodedStringLiteralNode *demangleStringLiteral(StringView &MangledName);
+  FunctionSymbolNode *demangleVcallThunkNode(StringView &MangledName);
+
+  StringView demangleSimpleString(StringView &MangledName, bool Memorize);
+
+  FuncClass demangleFunctionClass(StringView &MangledName);
+  CallingConv demangleCallingConvention(StringView &MangledName);
+  StorageClass demangleVariableStorageClass(StringView &MangledName);
+  void demangleThrowSpecification(StringView &MangledName);
+  wchar_t demangleWcharLiteral(StringView &MangledName);
+  uint8_t demangleCharLiteral(StringView &MangledName);
+
+  std::pair<Qualifiers, bool> demangleQualifiers(StringView &MangledName);
+
+  // Memory allocator.
+  ArenaAllocator Arena;
+
+  // A single type uses one global back-ref table for all function params.
+  // This means back-refs can even go "into" other types.  Examples:
+  //
+  //  // Second int* is a back-ref to first.
+  //  void foo(int *, int*);
+  //
+  //  // Second int* is not a back-ref to first (first is not a function param).
+  //  int* foo(int*);
+  //
+  //  // Second int* is a back-ref to first (ALL function types share the same
+  //  // back-ref map.
+  //  using F = void(*)(int*);
+  //  F G(int *);
+  BackrefContext Backrefs;
+};
+
+} // namespace ms_demangle
+} // namespace llvm
+
+#endif // LLVM_DEMANGLE_MICROSOFT_DEMANGLE_H
diff --git a/lib/Demangle/MicrosoftDemangleNodes.h b/include/llvm/Demangle/MicrosoftDemangleNodes.h
similarity index 86%
rename from lib/Demangle/MicrosoftDemangleNodes.h
rename to include/llvm/Demangle/MicrosoftDemangleNodes.h
index caa7eb3b5262bcb86a62e0efb9173d3bc94cb413..1d0b66a7bf41c86b6a69fda67e0e9ce5eb574b4b 100644
--- a/lib/Demangle/MicrosoftDemangleNodes.h
+++ b/include/llvm/Demangle/MicrosoftDemangleNodes.h
@@ -10,104 +10,6 @@ class OutputStream;
 namespace llvm {
 namespace ms_demangle {
 
-// This memory allocator is extremely fast, but it doesn't call dtors
-// for allocated objects. That means you can't use STL containers
-// (such as std::vector) with this allocator. But it pays off --
-// the demangler is 3x faster with this allocator compared to one with
-// STL containers.
-constexpr size_t AllocUnit = 4096;
-
-class ArenaAllocator {
-  struct AllocatorNode {
-    uint8_t *Buf = nullptr;
-    size_t Used = 0;
-    size_t Capacity = 0;
-    AllocatorNode *Next = nullptr;
-  };
-
-  void addNode(size_t Capacity) {
-    AllocatorNode *NewHead = new AllocatorNode;
-    NewHead->Buf = new uint8_t[Capacity];
-    NewHead->Next = Head;
-    NewHead->Capacity = Capacity;
-    Head = NewHead;
-    NewHead->Used = 0;
-  }
-
-public:
-  ArenaAllocator() { addNode(AllocUnit); }
-
-  ~ArenaAllocator() {
-    while (Head) {
-      assert(Head->Buf);
-      delete[] Head->Buf;
-      AllocatorNode *Next = Head->Next;
-      delete Head;
-      Head = Next;
-    }
-  }
-
-  char *allocUnalignedBuffer(size_t Length) {
-    uint8_t *Buf = Head->Buf + Head->Used;
-
-    Head->Used += Length;
-    if (Head->Used > Head->Capacity) {
-      // It's possible we need a buffer which is larger than our default unit
-      // size, so we need to be careful to add a node with capacity that is at
-      // least as large as what we need.
-      addNode(std::max(AllocUnit, Length));
-      Head->Used = Length;
-      Buf = Head->Buf;
-    }
-
-    return reinterpret_cast<char *>(Buf);
-  }
-
-  template <typename T, typename... Args>
-  T *allocArray(size_t Count) {
-
-    size_t Size = Count * sizeof(T);
-    assert(Head && Head->Buf);
-
-    size_t P = (size_t)Head->Buf + Head->Used;
-    uintptr_t AlignedP =
-        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
-    uint8_t *PP = (uint8_t *)AlignedP;
-    size_t Adjustment = AlignedP - P;
-
-    Head->Used += Size + Adjustment;
-    if (Head->Used < Head->Capacity)
-      return new (PP) T[Count]();
-
-    addNode(AllocUnit);
-    Head->Used = Size;
-    return new (Head->Buf) T[Count]();
-  }
-
-  template <typename T, typename... Args> T *alloc(Args &&... ConstructorArgs) {
-
-    size_t Size = sizeof(T);
-    assert(Head && Head->Buf);
-
-    size_t P = (size_t)Head->Buf + Head->Used;
-    uintptr_t AlignedP =
-        (((size_t)P + alignof(T) - 1) & ~(size_t)(alignof(T) - 1));
-    uint8_t *PP = (uint8_t *)AlignedP;
-    size_t Adjustment = AlignedP - P;
-
-    Head->Used += Size + Adjustment;
-    if (Head->Used < Head->Capacity)
-      return new (PP) T(std::forward<Args>(ConstructorArgs)...);
-
-    addNode(AllocUnit);
-    Head->Used = Size;
-    return new (Head->Buf) T(std::forward<Args>(ConstructorArgs)...);
-  }
-
-private:
-  AllocatorNode *Head = nullptr;
-};
-
 // Storage classes
 enum Qualifiers : uint8_t {
   Q_None = 0,
diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index 1ce772ccde956fc82fcdecab3996c77040d61908..589ca612f0466198400112b9d7f8f52f16af5001 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h
@@ -35,34 +35,12 @@ class ObjectFile;
 
 } // end namespace object
 
-/// JITEvent_EmittedFunctionDetails - Helper struct for containing information
-/// about a generated machine code function.
-struct JITEvent_EmittedFunctionDetails {
-  struct LineStart {
-    /// The address at which the current line changes.
-    uintptr_t Address;
-
-    /// The new location information.  These can be translated to DebugLocTuples
-    /// using MF->getDebugLocTuple().
-    DebugLoc Loc;
-  };
-
-  /// The machine function the struct contains information for.
-  const MachineFunction *MF;
-
-  /// The list of line boundary information, sorted by address.
-  std::vector<LineStart> LineStarts;
-};
-
 /// JITEventListener - Abstract interface for use by the JIT to notify clients
 /// about significant events during compilation. For example, to notify
 /// profilers and debuggers that need to know where functions have been emitted.
 ///
 /// The default implementation of each method does nothing.
 class JITEventListener {
-public:
-  using EmittedFunctionDetails = JITEvent_EmittedFunctionDetails;
-
 public:
   JITEventListener() = default;
   virtual ~JITEventListener() = default;
diff --git a/include/llvm/ExecutionEngine/JITSymbol.h b/include/llvm/ExecutionEngine/JITSymbol.h
index 18b972ed8291b8b269a387d3f37e2a73cdfbcda4..05c9590726dfc0130c5f1476f59210b98c6290d7 100644
--- a/include/llvm/ExecutionEngine/JITSymbol.h
+++ b/include/llvm/ExecutionEngine/JITSymbol.h
@@ -40,6 +40,18 @@ class SymbolRef;
 /// Represents an address in the target process's address space.
 using JITTargetAddress = uint64_t;
 
+/// Convert a JITTargetAddress to a pointer.
+template <typename T> T jitTargetAddressToPointer(JITTargetAddress Addr) {
+  static_assert(std::is_pointer<T>::value, "T must be a pointer type");
+  uintptr_t IntPtr = static_cast<uintptr_t>(Addr);
+  assert(IntPtr == Addr && "JITTargetAddress value out of range for uintptr_t");
+  return reinterpret_cast<T>(IntPtr);
+}
+
+template <typename T> JITTargetAddress pointerToJITTargetAddress(T *Ptr) {
+  return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(Ptr));
+}
+
 /// Flags for symbols in the JIT.
 class JITSymbolFlags {
 public:
diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
index 2003f8e43b88ea297240cba6b4260b7018ee5824..884878925cde8e81070d517f241bd79639bf380c 100644
--- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h
@@ -62,7 +62,7 @@ namespace orc {
 
 class ExtractingIRMaterializationUnit;
 
-class CompileOnDemandLayer2 : public IRLayer {
+class CompileOnDemandLayer : public IRLayer {
   friend class PartitioningIRMaterializationUnit;
 
 public:
@@ -84,8 +84,8 @@ public:
   /// symbol in them is requested.
   static Optional<GlobalValueSet> compileWholeModule(GlobalValueSet Requested);
 
-  /// Construct a CompileOnDemandLayer2.
-  CompileOnDemandLayer2(ExecutionSession &ES, IRLayer &BaseLayer,
+  /// Construct a CompileOnDemandLayer.
+  CompileOnDemandLayer(ExecutionSession &ES, IRLayer &BaseLayer,
                         LazyCallThroughManager &LCTMgr,
                         IndirectStubsManagerBuilder BuildIndirectStubsManager);
 
@@ -94,8 +94,7 @@ public:
 
   /// Emits the given module. This should not be called by clients: it will be
   /// called by the JIT when a definition added via the add method is requested.
-  void emit(MaterializationResponsibility R, VModuleKey K,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
 private:
   struct PerDylibResources {
@@ -142,7 +141,7 @@ private:
 template <typename BaseLayerT,
           typename CompileCallbackMgrT = JITCompileCallbackManager,
           typename IndirectStubsMgrT = IndirectStubsManager>
-class CompileOnDemandLayer {
+class LegacyCompileOnDemandLayer {
 private:
   template <typename MaterializerFtor>
   class LambdaMaterializer final : public ValueMaterializer {
@@ -266,13 +265,13 @@ public:
       std::function<void(VModuleKey K, std::shared_ptr<SymbolResolver> R)>;
 
   /// Construct a compile-on-demand layer instance.
-  CompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer,
-                       SymbolResolverGetter GetSymbolResolver,
-                       SymbolResolverSetter SetSymbolResolver,
-                       PartitioningFtor Partition,
-                       CompileCallbackMgrT &CallbackMgr,
-                       IndirectStubsManagerBuilderT CreateIndirectStubsManager,
-                       bool CloneStubsIntoPartitions = true)
+  LegacyCompileOnDemandLayer(ExecutionSession &ES, BaseLayerT &BaseLayer,
+                             SymbolResolverGetter GetSymbolResolver,
+                             SymbolResolverSetter SetSymbolResolver,
+                             PartitioningFtor Partition,
+                             CompileCallbackMgrT &CallbackMgr,
+                             IndirectStubsManagerBuilderT CreateIndirectStubsManager,
+                             bool CloneStubsIntoPartitions = true)
       : ES(ES), BaseLayer(BaseLayer),
         GetSymbolResolver(std::move(GetSymbolResolver)),
         SetSymbolResolver(std::move(SetSymbolResolver)),
@@ -280,7 +279,7 @@ public:
         CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)),
         CloneStubsIntoPartitions(CloneStubsIntoPartitions) {}
 
-  ~CompileOnDemandLayer() {
+  ~LegacyCompileOnDemandLayer() {
     // FIXME: Report error on log.
     while (!LogicalDylibs.empty())
       consumeError(removeModule(LogicalDylibs.begin()->first));
diff --git a/include/llvm/ExecutionEngine/Orc/CompileUtils.h b/include/llvm/ExecutionEngine/Orc/CompileUtils.h
index 3d02f9d05e4997ab4baf33793f46df9700ad4b1b..f34f88311ba526b2bb45f94e5a5c6abd4c96fa8b 100644
--- a/include/llvm/ExecutionEngine/Orc/CompileUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/CompileUtils.h
@@ -38,7 +38,7 @@ namespace orc {
 
 /// Simple compile functor: Takes a single IR module and returns an ObjectFile.
 /// This compiler supports a single compilation thread and LLVMContext only.
-/// For multithreaded compilation, use MultiThreadedSimpleCompiler below.
+/// For multithreaded compilation, use ConcurrentIRCompiler below.
 class SimpleCompiler {
 public:
   using CompileResult = std::unique_ptr<MemoryBuffer>;
@@ -105,10 +105,10 @@ private:
 ///
 /// This class creates a new TargetMachine and SimpleCompiler instance for each
 /// compile.
-class MultiThreadedSimpleCompiler {
+class ConcurrentIRCompiler {
 public:
-  MultiThreadedSimpleCompiler(JITTargetMachineBuilder JTMB,
-                              ObjectCache *ObjCache = nullptr)
+  ConcurrentIRCompiler(JITTargetMachineBuilder JTMB,
+                       ObjectCache *ObjCache = nullptr)
       : JTMB(std::move(JTMB)), ObjCache(ObjCache) {}
 
   void setObjectCache(ObjectCache *ObjCache) { this->ObjCache = ObjCache; }
diff --git a/include/llvm/ExecutionEngine/Orc/Core.h b/include/llvm/ExecutionEngine/Orc/Core.h
index f3ea2aef6209d5abe1283f911c3d5fa243f112e7..39d306e0bd4c4cf51f33c520fee4c03b98b12115 100644
--- a/include/llvm/ExecutionEngine/Orc/Core.h
+++ b/include/llvm/ExecutionEngine/Orc/Core.h
@@ -20,10 +20,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 
-#include <list>
-#include <map>
 #include <memory>
-#include <set>
 #include <vector>
 
 #define DEBUG_TYPE "orc"
@@ -44,21 +41,21 @@ using VModuleKey = uint64_t;
 
 /// A set of symbol names (represented by SymbolStringPtrs for
 //         efficiency).
-using SymbolNameSet = std::set<SymbolStringPtr>;
+using SymbolNameSet = DenseSet<SymbolStringPtr>;
 
 /// A map from symbol names (as SymbolStringPtrs) to JITSymbols
 ///        (address/flags pairs).
-using SymbolMap = std::map<SymbolStringPtr, JITEvaluatedSymbol>;
+using SymbolMap = DenseMap<SymbolStringPtr, JITEvaluatedSymbol>;
 
 /// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags.
-using SymbolFlagsMap = std::map<SymbolStringPtr, JITSymbolFlags>;
+using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
 
 /// A base class for materialization failures that allows the failing
 ///        symbols to be obtained for logging.
-using SymbolDependenceMap = std::map<JITDylib *, SymbolNameSet>;
+using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
 
-/// A list of JITDylib pointers.
-using JITDylibList = std::vector<JITDylib *>;
+/// A list of (JITDylib*, bool) pairs.
+using JITDylibSearchList = std::vector<std::pair<JITDylib *, bool>>;
 
 /// Render a SymbolStringPtr.
 raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym);
@@ -88,8 +85,8 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps);
 /// Render a MaterializationUnit.
 raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU);
 
-/// Render a JITDylibList.
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs);
+/// Render a JITDylibSearchList.
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs);
 
 /// Callback to notify client that symbols have been resolved.
 using SymbolsResolvedCallback = std::function<void(Expected<SymbolMap>)>;
@@ -171,6 +168,9 @@ public:
   ///        into.
   JITDylib &getTargetJITDylib() const { return JD; }
 
+  /// Returns the VModuleKey for this instance.
+  VModuleKey getVModuleKey() const { return K; }
+
   /// Returns the symbol flags map for this responsibility instance.
   /// Note: The returned flags may have transient flags (Lazy, Materializing)
   /// set. These should be stripped with JITSymbolFlags::stripTransientFlags
@@ -221,7 +221,8 @@ public:
   /// Delegates responsibility for the given symbols to the returned
   /// materialization responsibility. Useful for breaking up work between
   /// threads, or different kinds of materialization processes.
-  MaterializationResponsibility delegate(const SymbolNameSet &Symbols);
+  MaterializationResponsibility delegate(const SymbolNameSet &Symbols,
+                                         VModuleKey NewKey = VModuleKey());
 
   void addDependencies(const SymbolStringPtr &Name,
                        const SymbolDependenceMap &Dependencies);
@@ -232,10 +233,12 @@ public:
 private:
   /// Create a MaterializationResponsibility for the given JITDylib and
   ///        initial symbols.
-  MaterializationResponsibility(JITDylib &JD, SymbolFlagsMap SymbolFlags);
+  MaterializationResponsibility(JITDylib &JD, SymbolFlagsMap SymbolFlags,
+                                VModuleKey K);
 
   JITDylib &JD;
   SymbolFlagsMap SymbolFlags;
+  VModuleKey K;
 };
 
 /// A MaterializationUnit represents a set of symbol definitions that can
@@ -248,8 +251,8 @@ private:
 /// stronger definition is added or already present.
 class MaterializationUnit {
 public:
-  MaterializationUnit(SymbolFlagsMap InitalSymbolFlags)
-      : SymbolFlags(std::move(InitalSymbolFlags)) {}
+  MaterializationUnit(SymbolFlagsMap InitalSymbolFlags, VModuleKey K)
+      : SymbolFlags(std::move(InitalSymbolFlags)), K(std::move(K)) {}
 
   virtual ~MaterializationUnit() {}
 
@@ -264,7 +267,8 @@ public:
   /// ExecutionSession::DispatchMaterializationFunction) to trigger
   /// materialization of this MaterializationUnit.
   void doMaterialize(JITDylib &JD) {
-    materialize(MaterializationResponsibility(JD, std::move(SymbolFlags)));
+    materialize(MaterializationResponsibility(JD, std::move(SymbolFlags),
+                                              std::move(K)));
   }
 
   /// Called by JITDylibs to notify MaterializationUnits that the given symbol
@@ -276,6 +280,7 @@ public:
 
 protected:
   SymbolFlagsMap SymbolFlags;
+  VModuleKey K;
 
 private:
   virtual void anchor();
@@ -301,7 +306,7 @@ using MaterializationUnitList =
 /// materialized.
 class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit {
 public:
-  AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols);
+  AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols, VModuleKey K);
 
   StringRef getName() const override;
 
@@ -324,9 +329,9 @@ private:
 /// \endcode
 ///
 inline std::unique_ptr<AbsoluteSymbolsMaterializationUnit>
-absoluteSymbols(SymbolMap Symbols) {
+absoluteSymbols(SymbolMap Symbols, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<AbsoluteSymbolsMaterializationUnit>(
-      std::move(Symbols));
+      std::move(Symbols), std::move(K));
 }
 
 struct SymbolAliasMapEntry {
@@ -339,20 +344,22 @@ struct SymbolAliasMapEntry {
 };
 
 /// A map of Symbols to (Symbol, Flags) pairs.
-using SymbolAliasMap = std::map<SymbolStringPtr, SymbolAliasMapEntry>;
+using SymbolAliasMap = DenseMap<SymbolStringPtr, SymbolAliasMapEntry>;
 
 /// A materialization unit for symbol aliases. Allows existing symbols to be
 /// aliased with alternate flags.
 class ReExportsMaterializationUnit : public MaterializationUnit {
 public:
   /// SourceJD is allowed to be nullptr, in which case the source JITDylib is
-  /// taken to be whatever JITDylib these definitions are materialized in. This
-  /// is useful for defining aliases within a JITDylib.
+  /// taken to be whatever JITDylib these definitions are materialized in (and
+  /// MatchNonExported has no effect). This is useful for defining aliases
+  /// within a JITDylib.
   ///
   /// Note: Care must be taken that no sets of aliases form a cycle, as such
   ///       a cycle will result in a deadlock when any symbol in the cycle is
   ///       resolved.
-  ReExportsMaterializationUnit(JITDylib *SourceJD, SymbolAliasMap Aliases);
+  ReExportsMaterializationUnit(JITDylib *SourceJD, bool MatchNonExported,
+                               SymbolAliasMap Aliases, VModuleKey K);
 
   StringRef getName() const override;
 
@@ -362,6 +369,7 @@ private:
   static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases);
 
   JITDylib *SourceJD = nullptr;
+  bool MatchNonExported = false;
   SymbolAliasMap Aliases;
 };
 
@@ -377,17 +385,21 @@ private:
 ///     return Err;
 /// \endcode
 inline std::unique_ptr<ReExportsMaterializationUnit>
-symbolAliases(SymbolAliasMap Aliases) {
-  return llvm::make_unique<ReExportsMaterializationUnit>(nullptr,
-                                                         std::move(Aliases));
+symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) {
+  return llvm::make_unique<ReExportsMaterializationUnit>(
+      nullptr, true, std::move(Aliases), std::move(K));
 }
 
 /// Create a materialization unit for re-exporting symbols from another JITDylib
 /// with alternative names/flags.
+/// If MatchNonExported is true then non-exported symbols from SourceJD can be
+/// re-exported. If it is false, attempts to re-export a non-exported symbol
+/// will result in a "symbol not found" error.
 inline std::unique_ptr<ReExportsMaterializationUnit>
-reexports(JITDylib &SourceJD, SymbolAliasMap Aliases) {
-  return llvm::make_unique<ReExportsMaterializationUnit>(&SourceJD,
-                                                         std::move(Aliases));
+reexports(JITDylib &SourceJD, SymbolAliasMap Aliases,
+          bool MatchNonExported = false, VModuleKey K = VModuleKey()) {
+  return llvm::make_unique<ReExportsMaterializationUnit>(
+      &SourceJD, MatchNonExported, std::move(Aliases), std::move(K));
 }
 
 /// Build a SymbolAliasMap for the common case where you want to re-export
@@ -395,15 +407,23 @@ reexports(JITDylib &SourceJD, SymbolAliasMap Aliases) {
 Expected<SymbolAliasMap>
 buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols);
 
-class ReexportsFallbackDefinitionGenerator {
+/// ReexportsGenerator can be used with JITDylib::setGenerator to automatically
+/// re-export a subset of the source JITDylib's symbols in the target.
+class ReexportsGenerator {
 public:
   using SymbolPredicate = std::function<bool(SymbolStringPtr)>;
-  ReexportsFallbackDefinitionGenerator(JITDylib &BackingJD,
-                                       SymbolPredicate Allow);
+
+  /// Create a reexports generator. If an Allow predicate is passed, only
+  /// symbols for which the predicate returns true will be reexported. If no
+  /// Allow predicate is passed, all symbols will be exported.
+  ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false,
+                     SymbolPredicate Allow = SymbolPredicate());
+
   SymbolNameSet operator()(JITDylib &JD, const SymbolNameSet &Names);
 
 private:
-  JITDylib &BackingJD;
+  JITDylib &SourceJD;
+  bool MatchNonExported = false;
   SymbolPredicate Allow;
 };
 
@@ -478,11 +498,11 @@ class JITDylib {
   friend class ExecutionSession;
   friend class MaterializationResponsibility;
 public:
-  using FallbackDefinitionGeneratorFunction = std::function<SymbolNameSet(
+  using GeneratorFunction = std::function<SymbolNameSet(
       JITDylib &Parent, const SymbolNameSet &Names)>;
 
   using AsynchronousSymbolQuerySet =
-      std::set<std::shared_ptr<AsynchronousSymbolQuery>>;
+    std::set<std::shared_ptr<AsynchronousSymbolQuery>>;
 
   JITDylib(const JITDylib &) = delete;
   JITDylib &operator=(const JITDylib &) = delete;
@@ -495,12 +515,12 @@ public:
   /// Get a reference to the ExecutionSession for this JITDylib.
   ExecutionSession &getExecutionSession() const { return ES; }
 
-  /// Set a fallback defenition generator. If set, lookup and lookupFlags will
-  /// pass the unresolved symbols set to the fallback definition generator,
-  /// allowing it to add a new definition to the JITDylib.
-  void setFallbackDefinitionGenerator(
-      FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator) {
-    this->FallbackDefinitionGenerator = std::move(FallbackDefinitionGenerator);
+  /// Set a definition generator. If set, whenever a symbol fails to resolve
+  /// within this JITDylib, lookup and lookupFlags will pass the unresolved
+  /// symbols set to the definition generator. The generator can optionally
+  /// add a definition for the unresolved symbols to the dylib.
+  void setGenerator(GeneratorFunction DefGenerator) {
+    this->DefGenerator = std::move(DefGenerator);
   }
 
   /// Set the search order to be used when fixing up definitions in JITDylib.
@@ -522,16 +542,18 @@ public:
   /// as the first in the search order (instead of this dylib) ensures that
   /// definitions within this dylib resolve to the lazy-compiling stubs,
   /// rather than immediately materializing the definitions in this dylib.
-  void setSearchOrder(JITDylibList NewSearchOrder,
-                      bool SearchThisJITDylibFirst = true);
+  void setSearchOrder(JITDylibSearchList NewSearchOrder,
+                      bool SearchThisJITDylibFirst = true,
+                      bool MatchNonExportedInThisDylib = true);
 
   /// Add the given JITDylib to the search order for definitions in this
   /// JITDylib.
-  void addToSearchOrder(JITDylib &JD);
+  void addToSearchOrder(JITDylib &JD, bool MatcNonExported = false);
 
   /// Replace OldJD with NewJD in the search order if OldJD is present.
   /// Otherwise this operation is a no-op.
-  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD);
+  void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
+                            bool MatchNonExported = false);
 
   /// Remove the given JITDylib from the search order for this JITDylib if it is
   /// present. Otherwise this operation is a no-op.
@@ -540,7 +562,7 @@ public:
   /// Do something with the search order (run under the session lock).
   template <typename Func>
   auto withSearchOrderDo(Func &&F)
-      -> decltype(F(std::declval<const JITDylibList &>()));
+      -> decltype(F(std::declval<const JITDylibSearchList &>()));
 
   /// Define all symbols provided by the materialization unit to be part of this
   /// JITDylib.
@@ -602,7 +624,7 @@ private:
   };
 
   using UnmaterializedInfosMap =
-      std::map<SymbolStringPtr, std::shared_ptr<UnmaterializedInfo>>;
+      DenseMap<SymbolStringPtr, std::shared_ptr<UnmaterializedInfo>>;
 
   struct MaterializingInfo {
     AsynchronousSymbolQueryList PendingQueries;
@@ -611,7 +633,7 @@ private:
     bool IsEmitted = false;
   };
 
-  using MaterializingInfosMap = std::map<SymbolStringPtr, MaterializingInfo>;
+  using MaterializingInfosMap = DenseMap<SymbolStringPtr, MaterializingInfo>;
 
   using LookupImplActionFlags = enum {
     None = 0,
@@ -628,10 +650,12 @@ private:
                                 const SymbolNameSet &Names);
 
   void lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                  SymbolNameSet &Unresolved, MaterializationUnitList &MUs);
+                  SymbolNameSet &Unresolved, bool MatchNonExported,
+                  MaterializationUnitList &MUs);
 
   void lodgeQueryImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                      SymbolNameSet &Unresolved, MaterializationUnitList &MUs);
+                      SymbolNameSet &Unresolved, bool MatchNonExported,
+                      MaterializationUnitList &MUs);
 
   LookupImplActionFlags
   lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
@@ -665,8 +689,8 @@ private:
   SymbolMap Symbols;
   UnmaterializedInfosMap UnmaterializedInfos;
   MaterializingInfosMap MaterializingInfos;
-  FallbackDefinitionGeneratorFunction FallbackDefinitionGenerator;
-  JITDylibList SearchOrder;
+  GeneratorFunction DefGenerator;
+  JITDylibSearchList SearchOrder;
 };
 
 /// An ExecutionSession represents a running JIT program.
@@ -750,6 +774,10 @@ public:
 
   /// Search the given JITDylib list for the given symbols.
   ///
+  /// SearchOrder lists the JITDylibs to search. For each dylib, the associated
+  /// boolean indicates whether the search should match against non-exported
+  /// (hidden visibility) symbols in that dylib (true means match against
+  /// non-exported symbols, false means do not match).
   ///
   /// The OnResolve callback will be called once all requested symbols are
   /// resolved, or if an error occurs prior to resolution.
@@ -766,7 +794,7 @@ public:
   /// dependenant symbols for this query (e.g. it is being made by a top level
   /// client to get an address to call) then the value NoDependenciesToRegister
   /// can be used.
-  void lookup(const JITDylibList &JDs, SymbolNameSet Symbols,
+  void lookup(const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
               SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
               RegisterDependenciesFunction RegisterDependencies);
 
@@ -777,20 +805,29 @@ public:
   /// or an error occurs. If WaitUntilReady is false and an error occurs
   /// after resolution, the function will return a success value, but the
   /// error will be reported via reportErrors.
-  Expected<SymbolMap> lookup(const JITDylibList &JDs,
+  Expected<SymbolMap> lookup(const JITDylibSearchList &SearchOrder,
                              const SymbolNameSet &Symbols,
-                             RegisterDependenciesFunction RegisterDependencies,
+                             RegisterDependenciesFunction RegisterDependencies =
+                                 NoDependenciesToRegister,
                              bool WaitUntilReady = true);
 
-  /// Convenience version of the blocking version of lookup above. Uses the main
-  /// JITDylib's search order as the lookup order, and registers no
-  /// dependencies.
-  Expected<SymbolMap> lookup(const SymbolNameSet &Symbols) {
-    return getMainJITDylib().withSearchOrderDo(
-        [&](const JITDylibList &SearchOrder) {
-          return lookup(SearchOrder, Symbols, NoDependenciesToRegister, true);
-        });
-  }
+  /// Convenience version of blocking lookup.
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol.
+  Expected<JITEvaluatedSymbol> lookup(const JITDylibSearchList &SearchOrder,
+                                      SymbolStringPtr Symbol);
+
+  /// Convenience version of blocking lookup.
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol. The search will not find non-exported symbols.
+  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
+                                      SymbolStringPtr Symbol);
+
+  /// Convenience version of blocking lookup.
+  /// Searches each of the JITDylibs in the search order in turn for the given
+  /// symbol. The search will not find non-exported symbols.
+  Expected<JITEvaluatedSymbol> lookup(ArrayRef<JITDylib *> SearchOrder,
+                                      StringRef Symbol);
 
   /// Materialize the given unit.
   void dispatchMaterialization(JITDylib &JD,
@@ -836,7 +873,7 @@ private:
 
 template <typename Func>
 auto JITDylib::withSearchOrderDo(Func &&F)
-    -> decltype(F(std::declval<const JITDylibList &>())) {
+    -> decltype(F(std::declval<const JITDylibSearchList &>())) {
   return ES.runSessionLocked([&]() { return F(SearchOrder); });
 }
 
@@ -873,16 +910,6 @@ Error JITDylib::define(std::unique_ptr<MaterializationUnitType> &MU) {
   });
 }
 
-/// Look up the given names in the given JITDylibs.
-/// JDs will be searched in order and no JITDylib pointer may be null.
-/// All symbols must be found within the given JITDylibs or an error
-/// will be returned.
-Expected<SymbolMap> lookup(const JITDylibList &JDs, SymbolNameSet Names);
-
-/// Look up a symbol by searching a list of JITDylibs.
-Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
-                                    SymbolStringPtr Name);
-
 /// Mangles symbol names then uniques them in the context of an
 /// ExecutionSession.
 class MangleAndInterner {
diff --git a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index 52250662a9545a91ed73ec77226f6ffebb5aa53e..88559f822e5d06c505d05030055138535b427a55 100644
--- a/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -94,11 +94,11 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M);
 /// Convenience class for recording constructor/destructor names for
 ///        later execution.
 template <typename JITLayerT>
-class CtorDtorRunner {
+class LegacyCtorDtorRunner {
 public:
   /// Construct a CtorDtorRunner for the given range using the given
   ///        name mangling function.
-  CtorDtorRunner(std::vector<std::string> CtorDtorNames, VModuleKey K)
+  LegacyCtorDtorRunner(std::vector<std::string> CtorDtorNames, VModuleKey K)
       : CtorDtorNames(std::move(CtorDtorNames)), K(K) {}
 
   /// Run the recorded constructors/destructors through the given JIT
@@ -129,9 +129,9 @@ private:
   orc::VModuleKey K;
 };
 
-class CtorDtorRunner2 {
+class CtorDtorRunner {
 public:
-  CtorDtorRunner2(JITDylib &JD) : JD(JD) {}
+  CtorDtorRunner(JITDylib &JD) : JD(JD) {}
   void add(iterator_range<CtorDtorIterator> CtorDtors);
   Error run();
 
@@ -177,11 +177,11 @@ protected:
                                void *DSOHandle);
 };
 
-class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
+class LegacyLocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
 public:
   /// Create a runtime-overrides class.
   template <typename MangleFtorT>
-  LocalCXXRuntimeOverrides(const MangleFtorT &Mangle) {
+  LegacyLocalCXXRuntimeOverrides(const MangleFtorT &Mangle) {
     addOverride(Mangle("__dso_handle"), toTargetAddress(&DSOHandleOverride));
     addOverride(Mangle("__cxa_atexit"), toTargetAddress(&CXAAtExitOverride));
   }
@@ -202,7 +202,7 @@ private:
   StringMap<JITTargetAddress> CXXRuntimeOverrides;
 };
 
-class LocalCXXRuntimeOverrides2 : public LocalCXXRuntimeOverridesBase {
+class LocalCXXRuntimeOverrides : public LocalCXXRuntimeOverridesBase {
 public:
   Error enable(JITDylib &JD, MangleAndInterner &Mangler);
 };
@@ -212,32 +212,30 @@ public:
 /// If an instance of this class is attached to a JITDylib as a fallback
 /// definition generator, then any symbol found in the given DynamicLibrary that
 /// passes the 'Allow' predicate will be added to the JITDylib.
-class DynamicLibraryFallbackGenerator {
+class DynamicLibrarySearchGenerator {
 public:
   using SymbolPredicate = std::function<bool(SymbolStringPtr)>;
 
-  static bool AllowAll(SymbolStringPtr Name) { return true; }
-
-  /// Create a DynamicLibraryFallbackGenerator that searches for symbols in the
+  /// Create a DynamicLibrarySearchGenerator that searches for symbols in the
   /// given sys::DynamicLibrary.
-  /// Only symbols that match the 'Allow' predicate will be searched for.
-  DynamicLibraryFallbackGenerator(sys::DynamicLibrary Dylib,
-                                  const DataLayout &DL,
-                                  SymbolPredicate Allow = AllowAll);
+  /// If the Allow predicate is given then only symbols matching the predicate
+  /// will be searched for in the DynamicLibrary. If the predicate is not given
+  /// then all symbols will be searched for.
+  DynamicLibrarySearchGenerator(sys::DynamicLibrary Dylib, const DataLayout &DL,
+                                SymbolPredicate Allow = SymbolPredicate());
 
   /// Permanently loads the library at the given path and, on success, returns
-  /// a DynamicLibraryFallbackGenerator that will search it for symbol
-  /// definitions matching the Allow predicate.
-  /// On failure returns the reason the library failed to load.
-  static Expected<DynamicLibraryFallbackGenerator>
+  /// a DynamicLibrarySearchGenerator that will search it for symbol definitions
+  /// in the library. On failure returns the reason the library failed to load.
+  static Expected<DynamicLibrarySearchGenerator>
   Load(const char *FileName, const DataLayout &DL,
-       SymbolPredicate Allow = AllowAll);
+       SymbolPredicate Allow = SymbolPredicate());
 
-  /// Creates a DynamicLibraryFallbackGenerator that searches for symbols in
+  /// Creates a DynamicLibrarySearchGenerator that searches for symbols in
   /// the current process.
-  static Expected<DynamicLibraryFallbackGenerator>
-  CreateForCurrentProcess(const DataLayout &DL,
-                          SymbolPredicate Allow = AllowAll) {
+  static Expected<DynamicLibrarySearchGenerator>
+  GetForCurrentProcess(const DataLayout &DL,
+                       SymbolPredicate Allow = SymbolPredicate()) {
     return Load(nullptr, DL, std::move(Allow));
   }
 
diff --git a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
index cb8df26bfdc30a3f4e3e39264e88fec668e868f5..30d71e69cd70cb7236e132da76dc5c22de19478a 100644
--- a/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/IRCompileLayer.h
@@ -28,7 +28,7 @@ class Module;
 
 namespace orc {
 
-class IRCompileLayer2 : public IRLayer {
+class IRCompileLayer : public IRLayer {
 public:
   using CompileFunction =
       std::function<Expected<std::unique_ptr<MemoryBuffer>>(Module &)>;
@@ -36,13 +36,12 @@ public:
   using NotifyCompiledFunction =
       std::function<void(VModuleKey K, ThreadSafeModule TSM)>;
 
-  IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
-                  CompileFunction Compile);
+  IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
+                 CompileFunction Compile);
 
   void setNotifyCompiled(NotifyCompiledFunction NotifyCompiled);
 
-  void emit(MaterializationResponsibility R, VModuleKey K,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
 private:
   mutable std::mutex IRLayerMutex;
@@ -57,15 +56,15 @@ private:
 /// object file and adds this module file to the layer below, which must
 /// implement the object layer concept.
 template <typename BaseLayerT, typename CompileFtor>
-class IRCompileLayer {
+class LegacyIRCompileLayer {
 public:
   /// Callback type for notifications when modules are compiled.
   using NotifyCompiledCallback =
       std::function<void(VModuleKey K, std::unique_ptr<Module>)>;
 
-  /// Construct an IRCompileLayer with the given BaseLayer, which must
+  /// Construct an LegacyIRCompileLayer with the given BaseLayer, which must
   ///        implement the ObjectLayer concept.
-  IRCompileLayer(
+  LegacyIRCompileLayer(
       BaseLayerT &BaseLayer, CompileFtor Compile,
       NotifyCompiledCallback NotifyCompiled = NotifyCompiledCallback())
       : BaseLayer(BaseLayer), Compile(std::move(Compile)),
diff --git a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
index d5f91cef359d5ca770e42fc2a739ebc0059ea5fe..49e65b9f2a805fe8ae559171149c11ebddcf9a76 100644
--- a/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/IRTransformLayer.h
@@ -23,20 +23,19 @@ namespace llvm {
 class Module;
 namespace orc {
 
-class IRTransformLayer2 : public IRLayer {
+class IRTransformLayer : public IRLayer {
 public:
   using TransformFunction = std::function<Expected<ThreadSafeModule>(
       ThreadSafeModule, const MaterializationResponsibility &R)>;
 
-  IRTransformLayer2(ExecutionSession &ES, IRLayer &BaseLayer,
-                    TransformFunction Transform = identityTransform);
+  IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer,
+                   TransformFunction Transform = identityTransform);
 
   void setTransform(TransformFunction Transform) {
     this->Transform = std::move(Transform);
   }
 
-  void emit(MaterializationResponsibility R, VModuleKey K,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
   static ThreadSafeModule
   identityTransform(ThreadSafeModule TSM,
@@ -54,11 +53,11 @@ private:
 ///   This layer applies a user supplied transform to each module that is added,
 /// then adds the transformed module to the layer below.
 template <typename BaseLayerT, typename TransformFtor>
-class IRTransformLayer {
+class LegacyIRTransformLayer {
 public:
 
-  /// Construct an IRTransformLayer with the given BaseLayer
-  IRTransformLayer(BaseLayerT &BaseLayer,
+  /// Construct an LegacyIRTransformLayer with the given BaseLayer
+  LegacyIRTransformLayer(BaseLayerT &BaseLayer,
                    TransformFtor Transform = TransformFtor())
     : BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
diff --git a/include/llvm/ExecutionEngine/Orc/LLJIT.h b/include/llvm/ExecutionEngine/Orc/LLJIT.h
index 400d4cbe7f09bcfd73f0a86491eff48cc68404f8..ce3e5d519c73cf64b0e56d97650938ad8d0f00e5 100644
--- a/include/llvm/ExecutionEngine/Orc/LLJIT.h
+++ b/include/llvm/ExecutionEngine/Orc/LLJIT.h
@@ -49,6 +49,11 @@ public:
   /// Returns a reference to the JITDylib representing the JIT'd main program.
   JITDylib &getMainJITDylib() { return Main; }
 
+  /// Create a new JITDylib with the given name and return a reference to it.
+  JITDylib &createJITDylib(std::string Name) {
+    return ES->createJITDylib(std::move(Name));
+  }
+
   /// Convenience method for defining an absolute symbol.
   Error defineAbsolute(StringRef Name, JITEvaluatedSymbol Address);
 
@@ -99,7 +104,7 @@ public:
   Error runDestructors() { return DtorRunner.run(); }
 
   /// Returns a reference to the ObjLinkingLayer
-  RTDyldObjectLinkingLayer2 &getObjLinkingLayer() { return ObjLinkingLayer; }
+  RTDyldObjectLinkingLayer &getObjLinkingLayer() { return ObjLinkingLayer; }
 
 protected:
 
@@ -111,8 +116,6 @@ protected:
   LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
         DataLayout DL, unsigned NumCompileThreads);
 
-  std::unique_ptr<RuntimeDyld::MemoryManager> getMemoryManager(VModuleKey K);
-
   std::string mangle(StringRef UnmangledName);
 
   Error applyDataLayout(Module &M);
@@ -125,10 +128,10 @@ protected:
   DataLayout DL;
   std::unique_ptr<ThreadPool> CompileThreads;
 
-  RTDyldObjectLinkingLayer2 ObjLinkingLayer;
-  IRCompileLayer2 CompileLayer;
+  RTDyldObjectLinkingLayer ObjLinkingLayer;
+  IRCompileLayer CompileLayer;
 
-  CtorDtorRunner2 CtorRunner, DtorRunner;
+  CtorDtorRunner CtorRunner, DtorRunner;
 };
 
 /// An extended version of LLJIT that supports lazy function-at-a-time
@@ -141,17 +144,17 @@ public:
   /// LLLazyJIT with the given number of compile threads.
   static Expected<std::unique_ptr<LLLazyJIT>>
   Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-         unsigned NumCompileThreads = 0);
+         JITTargetAddress ErrorAddr, unsigned NumCompileThreads = 0);
 
   /// Set an IR transform (e.g. pass manager pipeline) to run on each function
   /// when it is compiled.
-  void setLazyCompileTransform(IRTransformLayer2::TransformFunction Transform) {
+  void setLazyCompileTransform(IRTransformLayer::TransformFunction Transform) {
     TransformLayer.setTransform(std::move(Transform));
   }
 
   /// Sets the partition function.
   void
-  setPartitionFunction(CompileOnDemandLayer2::PartitionFunction Partition) {
+  setPartitionFunction(CompileOnDemandLayer::PartitionFunction Partition) {
     CODLayer.setPartitionFunction(std::move(Partition));
   }
 
@@ -180,8 +183,8 @@ private:
   std::unique_ptr<LazyCallThroughManager> LCTMgr;
   std::function<std::unique_ptr<IndirectStubsManager>()> ISMBuilder;
 
-  IRTransformLayer2 TransformLayer;
-  CompileOnDemandLayer2 CODLayer;
+  IRTransformLayer TransformLayer;
+  CompileOnDemandLayer CODLayer;
 };
 
 } // End namespace orc
diff --git a/include/llvm/ExecutionEngine/Orc/Layer.h b/include/llvm/ExecutionEngine/Orc/Layer.h
index 3bd23ae54165dc8ce81e7e4eae014fbde6203089..cd797445a2e6b3476de2a15a193d9ef94ac65f91 100644
--- a/include/llvm/ExecutionEngine/Orc/Layer.h
+++ b/include/llvm/ExecutionEngine/Orc/Layer.h
@@ -49,17 +49,11 @@ public:
 
   /// Adds a MaterializationUnit representing the given IR to the given
   /// JITDylib.
-  virtual Error add(JITDylib &JD, VModuleKey K, ThreadSafeModule TSM);
-
-  /// Adds a MaterializationUnit representing the given IR to the main
-  /// JITDylib.
-  Error add(VModuleKey K, ThreadSafeModule TSM) {
-    return add(ES.getMainJITDylib(), K, std::move(TSM));
-  }
+  virtual Error add(JITDylib &JD, ThreadSafeModule TSM,
+                    VModuleKey K = VModuleKey());
 
   /// Emit should materialize the given IR.
-  virtual void emit(MaterializationResponsibility R, VModuleKey K,
-                    ThreadSafeModule TSM) = 0;
+  virtual void emit(MaterializationResponsibility R, ThreadSafeModule TSM) = 0;
 
 private:
   bool CloneToNewContextOnEmit = false;
@@ -76,14 +70,16 @@ public:
 
   /// Create an IRMaterializationLayer. Scans the module to build the
   /// SymbolFlags and SymbolToDefinition maps.
-  IRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM);
+  IRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM,
+                        VModuleKey K);
 
   /// Create an IRMaterializationLayer from a module, and pre-existing
   /// SymbolFlags and SymbolToDefinition maps. The maps must provide
   /// entries for each definition in M.
   /// This constructor is useful for delegating work from one
   /// IRMaterializationUnit to another.
-  IRMaterializationUnit(ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
+  IRMaterializationUnit(ThreadSafeModule TSM, VModuleKey K,
+                        SymbolFlagsMap SymbolFlags,
                         SymbolNameToDefinitionMap SymbolToDefinition);
 
   /// Return the ModuleIdentifier as the name for this MaterializationUnit.
@@ -125,16 +121,11 @@ public:
 
   /// Adds a MaterializationUnit representing the given IR to the given
   /// JITDylib.
-  virtual Error add(JITDylib &JD, VModuleKey K, std::unique_ptr<MemoryBuffer> O);
-
-  /// Adds a MaterializationUnit representing the given object to the main
-  /// JITDylib.
-  Error add(VModuleKey K, std::unique_ptr<MemoryBuffer> O) {
-    return add(ES.getMainJITDylib(), K, std::move(O));
-  }
+  virtual Error add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O,
+                    VModuleKey K = VModuleKey());
 
   /// Emit should materialize the given IR.
-  virtual void emit(MaterializationResponsibility R, VModuleKey K,
+  virtual void emit(MaterializationResponsibility R,
                     std::unique_ptr<MemoryBuffer> O) = 0;
 
 private:
@@ -161,7 +152,6 @@ private:
   void discard(const JITDylib &JD, const SymbolStringPtr &Name) override;
 
   ObjectLayer &L;
-  VModuleKey K;
   std::unique_ptr<MemoryBuffer> O;
 };
 
diff --git a/include/llvm/ExecutionEngine/Orc/LazyReexports.h b/include/llvm/ExecutionEngine/Orc/LazyReexports.h
index 8f897009ac28b0a48e22e6e3c0e3044882777f32..b5041325bce2152939951ffe5641b24f70667ed6 100644
--- a/include/llvm/ExecutionEngine/Orc/LazyReexports.h
+++ b/include/llvm/ExecutionEngine/Orc/LazyReexports.h
@@ -159,7 +159,8 @@ public:
   LazyReexportsMaterializationUnit(LazyCallThroughManager &LCTManager,
                                    IndirectStubsManager &ISManager,
                                    JITDylib &SourceJD,
-                                   SymbolAliasMap CallableAliases);
+                                   SymbolAliasMap CallableAliases,
+                                   VModuleKey K);
 
   StringRef getName() const override;
 
@@ -182,9 +183,10 @@ private:
 inline std::unique_ptr<LazyReexportsMaterializationUnit>
 lazyReexports(LazyCallThroughManager &LCTManager,
               IndirectStubsManager &ISManager, JITDylib &SourceJD,
-              SymbolAliasMap CallableAliases) {
+              SymbolAliasMap CallableAliases, VModuleKey K = VModuleKey()) {
   return llvm::make_unique<LazyReexportsMaterializationUnit>(
-      LCTManager, ISManager, SourceJD, std::move(CallableAliases));
+      LCTManager, ISManager, SourceJD, std::move(CallableAliases),
+      std::move(K));
 }
 
 } // End namespace orc
diff --git a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
index c6b43a9c8ed6b8e91c54bfc06c3d60561443fca8..44d6b490e19da5d790fde83645d57bf0b133b40c 100644
--- a/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/ObjectTransformLayer.h
@@ -23,16 +23,16 @@
 namespace llvm {
 namespace orc {
 
-class ObjectTransformLayer2 : public ObjectLayer {
+class ObjectTransformLayer : public ObjectLayer {
 public:
   using TransformFunction =
       std::function<Expected<std::unique_ptr<MemoryBuffer>>(
           std::unique_ptr<MemoryBuffer>)>;
 
-  ObjectTransformLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
-                        TransformFunction Transform);
+  ObjectTransformLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
+                       TransformFunction Transform);
 
-  void emit(MaterializationResponsibility R, VModuleKey K,
+  void emit(MaterializationResponsibility R,
             std::unique_ptr<MemoryBuffer> O) override;
 
 private:
@@ -46,11 +46,11 @@ private:
 /// immediately applies the user supplied functor to each object, then adds
 /// the set of transformed objects to the layer below.
 template <typename BaseLayerT, typename TransformFtor>
-class ObjectTransformLayer {
+class LegacyObjectTransformLayer {
 public:
   /// Construct an ObjectTransformLayer with the given BaseLayer
-  ObjectTransformLayer(BaseLayerT &BaseLayer,
-                       TransformFtor Transform = TransformFtor())
+  LegacyObjectTransformLayer(BaseLayerT &BaseLayer,
+                             TransformFtor Transform = TransformFtor())
       : BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
   /// Apply the transform functor to each object in the object set, then
diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
index 99468e269d31537a924152913bf682364301d7fc..3e07f5cf37423fddf34d846746e6845b1184a012 100644
--- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
+++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h
@@ -118,30 +118,33 @@ public:
         Unmapped.back().RemoteCodeAddr =
             Client.reserveMem(Id, CodeSize, CodeAlign);
 
-        LLVM_DEBUG(dbgs() << "  code: "
-                          << format("0x%016x", Unmapped.back().RemoteCodeAddr)
-                          << " (" << CodeSize << " bytes, alignment "
-                          << CodeAlign << ")\n");
+        LLVM_DEBUG(
+            dbgs() << "  code: "
+                   << format("0x%016" PRIx64, Unmapped.back().RemoteCodeAddr)
+                   << " (" << CodeSize << " bytes, alignment " << CodeAlign
+                   << ")\n");
       }
 
       if (RODataSize != 0) {
         Unmapped.back().RemoteRODataAddr =
             Client.reserveMem(Id, RODataSize, RODataAlign);
 
-        LLVM_DEBUG(dbgs() << "  ro-data: "
-                          << format("0x%016x", Unmapped.back().RemoteRODataAddr)
-                          << " (" << RODataSize << " bytes, alignment "
-                          << RODataAlign << ")\n");
+        LLVM_DEBUG(
+            dbgs() << "  ro-data: "
+                   << format("0x%016" PRIx64, Unmapped.back().RemoteRODataAddr)
+                   << " (" << RODataSize << " bytes, alignment " << RODataAlign
+                   << ")\n");
       }
 
       if (RWDataSize != 0) {
         Unmapped.back().RemoteRWDataAddr =
             Client.reserveMem(Id, RWDataSize, RWDataAlign);
 
-        LLVM_DEBUG(dbgs() << "  rw-data: "
-                          << format("0x%016x", Unmapped.back().RemoteRWDataAddr)
-                          << " (" << RWDataSize << " bytes, alignment "
-                          << RWDataAlign << ")\n");
+        LLVM_DEBUG(
+            dbgs() << "  rw-data: "
+                   << format("0x%016" PRIx64, Unmapped.back().RemoteRWDataAddr)
+                   << " (" << RWDataSize << " bytes, alignment " << RWDataAlign
+                   << ")\n");
       }
     }
 
@@ -269,9 +272,9 @@ public:
       for (auto &Alloc : Allocs) {
         NextAddr = alignTo(NextAddr, Alloc.getAlign());
         Dyld.mapSectionAddress(Alloc.getLocalAddress(), NextAddr);
-        LLVM_DEBUG(dbgs() << "     "
-                          << static_cast<void *>(Alloc.getLocalAddress())
-                          << " -> " << format("0x%016x", NextAddr) << "\n");
+        LLVM_DEBUG(
+            dbgs() << "     " << static_cast<void *>(Alloc.getLocalAddress())
+                   << " -> " << format("0x%016" PRIx64, NextAddr) << "\n");
         Alloc.setRemoteAddress(NextAddr);
 
         // Only advance NextAddr if it was non-null to begin with,
@@ -293,7 +296,7 @@ public:
           LLVM_DEBUG(dbgs() << "  copying section: "
                             << static_cast<void *>(Alloc.getLocalAddress())
                             << " -> "
-                            << format("0x%016x", Alloc.getRemoteAddress())
+                            << format("0x%016" PRIx64, Alloc.getRemoteAddress())
                             << " (" << Alloc.getSize() << " bytes)\n";);
 
           if (Client.writeMem(Alloc.getRemoteAddress(), Alloc.getLocalAddress(),
@@ -306,7 +309,8 @@ public:
                           << (Permissions & sys::Memory::MF_WRITE ? 'W' : '-')
                           << (Permissions & sys::Memory::MF_EXEC ? 'X' : '-')
                           << " permissions on block: "
-                          << format("0x%016x", RemoteSegmentAddr) << "\n");
+                          << format("0x%016" PRIx64, RemoteSegmentAddr)
+                          << "\n");
         if (Client.setProtections(Id, RemoteSegmentAddr, Permissions))
           return true;
       }
@@ -510,8 +514,8 @@ public:
   /// Call the int(void) function at the given address in the target and return
   /// its result.
   Expected<int> callIntVoid(JITTargetAddress Addr) {
-    LLVM_DEBUG(dbgs() << "Calling int(*)(void) " << format("0x%016x", Addr)
-                      << "\n");
+    LLVM_DEBUG(dbgs() << "Calling int(*)(void) "
+                      << format("0x%016" PRIx64, Addr) << "\n");
     return callB<exec::CallIntVoid>(Addr);
   }
 
@@ -520,15 +524,15 @@ public:
   Expected<int> callMain(JITTargetAddress Addr,
                          const std::vector<std::string> &Args) {
     LLVM_DEBUG(dbgs() << "Calling int(*)(int, char*[]) "
-                      << format("0x%016x", Addr) << "\n");
+                      << format("0x%016" PRIx64, Addr) << "\n");
     return callB<exec::CallMain>(Addr, Args);
   }
 
   /// Call the void() function at the given address in the target and wait for
   /// it to finish.
   Error callVoidVoid(JITTargetAddress Addr) {
-    LLVM_DEBUG(dbgs() << "Calling void(*)(void) " << format("0x%016x", Addr)
-                      << "\n");
+    LLVM_DEBUG(dbgs() << "Calling void(*)(void) "
+                      << format("0x%016" PRIx64, Addr) << "\n");
     return callB<exec::CallVoidVoid>(Addr);
   }
 
diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
index 0c30520a21bb8f749026815f98d08ef0a99f9d65..401f6e3fa811612cc9970e26ff20484adc9f2147 100644
--- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -36,7 +36,7 @@
 namespace llvm {
 namespace orc {
 
-class RTDyldObjectLinkingLayer2 : public ObjectLayer {
+class RTDyldObjectLinkingLayer : public ObjectLayer {
 public:
   /// Functor for receiving object-loaded notifications.
   using NotifyLoadedFunction =
@@ -47,17 +47,17 @@ public:
   using NotifyEmittedFunction = std::function<void(VModuleKey)>;
 
   using GetMemoryManagerFunction =
-      std::function<std::unique_ptr<RuntimeDyld::MemoryManager>(VModuleKey)>;
+      std::function<std::unique_ptr<RuntimeDyld::MemoryManager>()>;
 
   /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
   ///        and NotifyEmitted functors.
-  RTDyldObjectLinkingLayer2(
+  RTDyldObjectLinkingLayer(
       ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager,
       NotifyLoadedFunction NotifyLoaded = NotifyLoadedFunction(),
       NotifyEmittedFunction NotifyEmitted = NotifyEmittedFunction());
 
   /// Emit the object.
-  void emit(MaterializationResponsibility R, VModuleKey K,
+  void emit(MaterializationResponsibility R,
             std::unique_ptr<MemoryBuffer> O) override;
 
   /// Set the 'ProcessAllSections' flag.
@@ -66,7 +66,7 @@ public:
   /// the memory manager, rather than just the sections required for execution.
   ///
   /// This is kludgy, and may be removed in the future.
-  RTDyldObjectLinkingLayer2 &setProcessAllSections(bool ProcessAllSections) {
+  RTDyldObjectLinkingLayer &setProcessAllSections(bool ProcessAllSections) {
     this->ProcessAllSections = ProcessAllSections;
     return *this;
   }
@@ -79,13 +79,13 @@ public:
   ///
   /// FIXME: We should be able to remove this if/when COFF properly tracks
   /// exported symbols.
-  RTDyldObjectLinkingLayer2 &
+  RTDyldObjectLinkingLayer &
   setOverrideObjectFlagsWithResponsibilityFlags(bool OverrideObjectFlags) {
     this->OverrideObjectFlags = OverrideObjectFlags;
     return *this;
   }
 
-  /// If set, this RTDyldObjectLinkingLayer2 instance will claim responsibility
+  /// If set, this RTDyldObjectLinkingLayer instance will claim responsibility
   /// for any symbols provided by a given object file that were not already in
   /// the MaterializationResponsibility instance. Setting this flag allows
   /// higher-level program representations (e.g. LLVM IR) to be added based on
@@ -96,7 +96,7 @@ public:
   /// deterministically). If this option is set, clashes for the additional
   /// symbols may not be detected until late, and detection may depend on
   /// the flow of control through JIT'd code. Use with care.
-  RTDyldObjectLinkingLayer2 &
+  RTDyldObjectLinkingLayer &
   setAutoClaimResponsibilityForObjectSymbols(bool AutoClaimObjectSymbols) {
     this->AutoClaimObjectSymbols = AutoClaimObjectSymbols;
     return *this;
@@ -118,10 +118,10 @@ private:
   bool ProcessAllSections = false;
   bool OverrideObjectFlags = false;
   bool AutoClaimObjectSymbols = false;
-  std::map<VModuleKey, std::shared_ptr<RuntimeDyld::MemoryManager>> MemMgrs;
+  std::vector<std::unique_ptr<RuntimeDyld::MemoryManager>> MemMgrs;
 };
 
-class RTDyldObjectLinkingLayerBase {
+class LegacyRTDyldObjectLinkingLayerBase {
 public:
   using ObjectPtr = std::unique_ptr<MemoryBuffer>;
 
@@ -173,10 +173,10 @@ protected:
 /// object files to be loaded into memory, linked, and the addresses of their
 /// symbols queried. All objects added to this layer can see each other's
 /// symbols.
-class RTDyldObjectLinkingLayer : public RTDyldObjectLinkingLayerBase {
+class LegacyRTDyldObjectLinkingLayer : public LegacyRTDyldObjectLinkingLayerBase {
 public:
 
-  using RTDyldObjectLinkingLayerBase::ObjectPtr;
+  using LegacyRTDyldObjectLinkingLayerBase::ObjectPtr;
 
   /// Functor for receiving object-loaded notifications.
   using NotifyLoadedFtor =
@@ -197,7 +197,7 @@ private:
   template <typename MemoryManagerPtrT>
   class ConcreteLinkedObject : public LinkedObject {
   public:
-    ConcreteLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K,
+    ConcreteLinkedObject(LegacyRTDyldObjectLinkingLayer &Parent, VModuleKey K,
                          OwnedObject Obj, MemoryManagerPtrT MemMgr,
                          std::shared_ptr<SymbolResolver> Resolver,
                          bool ProcessAllSections)
@@ -313,7 +313,7 @@ private:
     };
 
     VModuleKey K;
-    RTDyldObjectLinkingLayer &Parent;
+    LegacyRTDyldObjectLinkingLayer &Parent;
     MemoryManagerPtrT MemMgr;
     OwnedObject ObjForNotify;
     std::unique_ptr<PreFinalizeContents> PFC;
@@ -321,7 +321,7 @@ private:
 
   template <typename MemoryManagerPtrT>
   std::unique_ptr<ConcreteLinkedObject<MemoryManagerPtrT>>
-  createLinkedObject(RTDyldObjectLinkingLayer &Parent, VModuleKey K,
+  createLinkedObject(LegacyRTDyldObjectLinkingLayer &Parent, VModuleKey K,
                      OwnedObject Obj, MemoryManagerPtrT MemMgr,
                      std::shared_ptr<SymbolResolver> Resolver,
                      bool ProcessAllSections) {
@@ -341,7 +341,7 @@ public:
 
   /// Construct an ObjectLinkingLayer with the given NotifyLoaded,
   ///        and NotifyFinalized functors.
-  RTDyldObjectLinkingLayer(
+  LegacyRTDyldObjectLinkingLayer(
       ExecutionSession &ES, ResourcesGetter GetResources,
       NotifyLoadedFtor NotifyLoaded = NotifyLoadedFtor(),
       NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor(),
diff --git a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
index 4c45cfd199dd21c5982003d86d67a5b9a2f022dd..717076e25609018c23362262993dc9fcc80dd2d3 100644
--- a/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
+++ b/include/llvm/ExecutionEngine/Orc/SymbolStringPool.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H
 #define LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include <atomic>
 #include <mutex>
@@ -49,10 +50,13 @@ private:
 /// Pointer to a pooled string representing a symbol name.
 class SymbolStringPtr {
   friend class SymbolStringPool;
+  friend struct DenseMapInfo<SymbolStringPtr>;
   friend bool operator==(const SymbolStringPtr &LHS,
                          const SymbolStringPtr &RHS);
   friend bool operator<(const SymbolStringPtr &LHS, const SymbolStringPtr &RHS);
 
+  static SymbolStringPool::PoolMapEntry Tombstone;
+
 public:
   SymbolStringPtr() = default;
   SymbolStringPtr(const SymbolStringPtr &Other)
@@ -142,6 +146,29 @@ inline bool SymbolStringPool::empty() const {
 }
 
 } // end namespace orc
+
+template <>
+struct DenseMapInfo<orc::SymbolStringPtr> {
+
+  static orc::SymbolStringPtr getEmptyKey() {
+    return orc::SymbolStringPtr();
+  }
+
+  static orc::SymbolStringPtr getTombstoneKey() {
+    return orc::SymbolStringPtr(&orc::SymbolStringPtr::Tombstone);
+  }
+
+  static unsigned getHashValue(orc::SymbolStringPtr V) {
+    uintptr_t IV = reinterpret_cast<uintptr_t>(V.S);
+    return unsigned(IV) ^ unsigned(IV >> 9);
+  }
+
+  static bool isEqual(const orc::SymbolStringPtr &LHS,
+                      const orc::SymbolStringPtr &RHS) {
+    return LHS.S == RHS.S;
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_EXECUTIONENGINE_ORC_SYMBOLSTRINGPOOL_H
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 1ee19975af754fa09298a1bbf90fdda2925f48fe..7244bba1ca59e36f29b03645a3fc05a9ff2bce2c 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -38,7 +38,6 @@ class LandingPadInst;
 class LLVMContext;
 class Module;
 class PHINode;
-class TerminatorInst;
 class ValueSymbolTable;
 
 /// LLVM Basic Block Representation
@@ -50,12 +49,12 @@ class ValueSymbolTable;
 /// represents a label to which a branch can jump.
 ///
 /// A well formed basic block is formed of a list of non-terminating
-/// instructions followed by a single TerminatorInst instruction.
-/// TerminatorInst's may not occur in the middle of basic blocks, and must
-/// terminate the blocks. The BasicBlock class allows malformed basic blocks to
-/// occur because it may be useful in the intermediate stage of constructing or
-/// modifying a program. However, the verifier will ensure that basic blocks
-/// are "well formed".
+/// instructions followed by a single terminator instruction. Terminator
+/// instructions may not occur in the middle of basic blocks, and must terminate
+/// the blocks. The BasicBlock class allows malformed basic blocks to occur
+/// because it may be useful in the intermediate stage of constructing or
+/// modifying a program. However, the verifier will ensure that basic blocks are
+/// "well formed".
 class BasicBlock final : public Value, // Basic blocks are data objects also
                          public ilist_node_with_parent<BasicBlock, Function> {
 public:
@@ -120,10 +119,10 @@ public:
 
   /// Returns the terminator instruction if the block is well formed or null
   /// if the block is not well formed.
-  const TerminatorInst *getTerminator() const LLVM_READONLY;
-  TerminatorInst *getTerminator() {
-    return const_cast<TerminatorInst *>(
-                        static_cast<const BasicBlock *>(this)->getTerminator());
+  const Instruction *getTerminator() const LLVM_READONLY;
+  Instruction *getTerminator() {
+    return const_cast<Instruction *>(
+        static_cast<const BasicBlock *>(this)->getTerminator());
   }
 
   /// Returns the call instruction calling \@llvm.experimental.deoptimize
diff --git a/include/llvm/IR/CFG.h b/include/llvm/IR/CFG.h
index fd384ef4949fcec474b93ff37cf5345191d57001..4140c8a212e7eb1f300c1fc451e0eab1fd9f35ce 100644
--- a/include/llvm/IR/CFG.h
+++ b/include/llvm/IR/CFG.h
@@ -73,7 +73,7 @@ public:
 
   inline reference operator*() const {
     assert(!It.atEnd() && "pred_iterator out of range!");
-    return cast<TerminatorInst>(*It)->getParent();
+    return cast<Instruction>(*It)->getParent();
   }
   inline pointer *operator->() const { return &operator*(); }
 
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 1b91537c5d9bc7e3755861e59bf313ed39be63bf..630f47e8bb5763712f8043b05053e6582b10bb23 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -158,7 +158,7 @@ public:
   /// Returns the number of non-debug IR instructions in this function.
   /// This is equivalent to the sum of the sizes of each basic block contained
   /// within this function.
-  unsigned getInstructionCount();
+  unsigned getInstructionCount() const;
 
   /// Returns the FunctionType for me.
   FunctionType *getFunctionType() const {
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 0af53c5b3f412287dae12c4a51ed432a434bde7b..e89c44380d0b0435248f49c13228cab292e4b9e9 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -705,6 +705,16 @@ public:
     return CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS, nullptr, Name);
   }
 
+  /// Create call to the minimum intrinsic.
+  CallInst *CreateMinimum(Value *LHS, Value *RHS, const Twine &Name = "") {
+    return CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS, nullptr, Name);
+  }
+
+  /// Create call to the maximum intrinsic.
+  CallInst *CreateMaximum(Value *LHS, Value *RHS, const Twine &Name = "") {
+    return CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS, nullptr, Name);
+  }
+
 private:
   /// Create a call to a masked intrinsic with given Id.
   CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
diff --git a/include/llvm/IR/InstVisitor.h b/include/llvm/IR/InstVisitor.h
index 55536f237d4535c2f43e1f3aa1f45e3ea940dc09..554417f984aa5cc1e1d0d5b666a854fb346fd866 100644
--- a/include/llvm/IR/InstVisitor.h
+++ b/include/llvm/IR/InstVisitor.h
@@ -166,15 +166,6 @@ public:
   // Specific Instruction type classes... note that all of the casts are
   // necessary because we use the instruction classes as opaque types...
   //
-  RetTy visitReturnInst(ReturnInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitBranchInst(BranchInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitSwitchInst(SwitchInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitIndirectBrInst(IndirectBrInst &I)    { DELEGATE(TerminatorInst);}
-  RetTy visitResumeInst(ResumeInst &I)            { DELEGATE(TerminatorInst);}
-  RetTy visitUnreachableInst(UnreachableInst &I)  { DELEGATE(TerminatorInst);}
-  RetTy visitCleanupReturnInst(CleanupReturnInst &I) { DELEGATE(TerminatorInst);}
-  RetTy visitCatchReturnInst(CatchReturnInst &I)  { DELEGATE(TerminatorInst); }
-  RetTy visitCatchSwitchInst(CatchSwitchInst &I)  { DELEGATE(TerminatorInst);}
   RetTy visitICmpInst(ICmpInst &I)                { DELEGATE(CmpInst);}
   RetTy visitFCmpInst(FCmpInst &I)                { DELEGATE(CmpInst);}
   RetTy visitAllocaInst(AllocaInst &I)            { DELEGATE(UnaryInstruction);}
@@ -236,6 +227,37 @@ public:
     return static_cast<SubClass*>(this)->visitCallSite(&I);
   }
 
+  // While terminators don't have a distinct type modeling them, we support
+  // intercepting them with dedicated a visitor callback.
+  RetTy visitReturnInst(ReturnInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitBranchInst(BranchInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitSwitchInst(SwitchInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitIndirectBrInst(IndirectBrInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitResumeInst(ResumeInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitUnreachableInst(UnreachableInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitCleanupReturnInst(CleanupReturnInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitCatchReturnInst(CatchReturnInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitCatchSwitchInst(CatchSwitchInst &I) {
+    return static_cast<SubClass *>(this)->visitTerminator(I);
+  }
+  RetTy visitTerminator(Instruction &I)    { DELEGATE(Instruction);}
+
   // Next level propagators: If the user does not overload a specific
   // instruction type, they can overload one of these to get the whole class
   // of instructions...
@@ -243,7 +265,6 @@ public:
   RetTy visitCastInst(CastInst &I)                { DELEGATE(UnaryInstruction);}
   RetTy visitBinaryOperator(BinaryOperator &I)    { DELEGATE(Instruction);}
   RetTy visitCmpInst(CmpInst &I)                  { DELEGATE(Instruction);}
-  RetTy visitTerminatorInst(TerminatorInst &I)    { DELEGATE(Instruction);}
   RetTy visitUnaryInstruction(UnaryInstruction &I){ DELEGATE(Instruction);}
 
   // Provide a special visitor for a 'callsite' that visits both calls and
@@ -256,7 +277,7 @@ public:
       DELEGATE(Instruction);
 
     assert(CS.isInvoke());
-    DELEGATE(TerminatorInst);
+    return static_cast<SubClass *>(this)->visitTerminator(I);
   }
 
   // If the user wants a 'default' case, they can choose to override this
diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h
index 95cdb70e4fda1abba2f4afc93a2f09070a4c6dae..f8d23c7f6140f2ef78278f94166cc17bb6e1ca2a 100644
--- a/include/llvm/IR/InstrTypes.h
+++ b/include/llvm/IR/InstrTypes.h
@@ -45,34 +45,6 @@
 
 namespace llvm {
 
-//===----------------------------------------------------------------------===//
-//                            TerminatorInst Class
-//===----------------------------------------------------------------------===//
-
-/// Subclasses of this class are all able to terminate a basic
-/// block. Thus, these are all the flow control type of operations.
-///
-class TerminatorInst : public Instruction {
-protected:
-  TerminatorInst(Type *Ty, Instruction::TermOps iType,
-                 Use *Ops, unsigned NumOps,
-                 Instruction *InsertBefore = nullptr)
-    : Instruction(Ty, iType, Ops, NumOps, InsertBefore) {}
-
-  TerminatorInst(Type *Ty, Instruction::TermOps iType,
-                 Use *Ops, unsigned NumOps, BasicBlock *InsertAtEnd)
-    : Instruction(Ty, iType, Ops, NumOps, InsertAtEnd) {}
-
-public:
-  // Methods for support type inquiry through isa, cast, and dyn_cast:
-  static bool classof(const Instruction *I) {
-    return I->isTerminator();
-  }
-  static bool classof(const Value *V) {
-    return isa<Instruction>(V) && classof(cast<Instruction>(V));
-  }
-};
-
 //===----------------------------------------------------------------------===//
 //                          UnaryInstruction Class
 //===----------------------------------------------------------------------===//
@@ -336,22 +308,6 @@ public:
   static BinaryOperator *CreateNot(Value *Op, const Twine &Name,
                                    BasicBlock *InsertAtEnd);
 
-  /// Check if the given Value is a NEG, FNeg, or NOT instruction.
-  ///
-  static bool isNeg(const Value *V);
-  static bool isFNeg(const Value *V, bool IgnoreZeroSign=false);
-  static bool isNot(const Value *V);
-
-  /// Helper functions to extract the unary argument of a NEG, FNEG or NOT
-  /// operation implemented via Sub, FSub, or Xor.
-  ///
-  static const Value *getNegArgument(const Value *BinOp);
-  static       Value *getNegArgument(      Value *BinOp);
-  static const Value *getFNegArgument(const Value *BinOp);
-  static       Value *getFNegArgument(      Value *BinOp);
-  static const Value *getNotArgument(const Value *BinOp);
-  static       Value *getNotArgument(      Value *BinOp);
-
   BinaryOps getOpcode() const {
     return static_cast<BinaryOps>(Instruction::getOpcode());
   }
@@ -721,7 +677,8 @@ public:
 protected:
   CmpInst(Type *ty, Instruction::OtherOps op, Predicate pred,
           Value *LHS, Value *RHS, const Twine &Name = "",
-          Instruction *InsertBefore = nullptr);
+          Instruction *InsertBefore = nullptr,
+          Instruction *FlagsSource = nullptr);
 
   CmpInst(Type *ty, Instruction::OtherOps op, Predicate pred,
           Value *LHS, Value *RHS, const Twine &Name,
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 8bdc935425d2d862a5e8e22881cab5a5b9fb04bd..7b2c13c5328239fbf03877f632fc946694376de3 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -1299,12 +1299,13 @@ public:
 
   /// Constructor with no-insertion semantics
   FCmpInst(
-    Predicate pred, ///< The predicate to use for the comparison
+    Predicate Pred, ///< The predicate to use for the comparison
     Value *LHS,     ///< The left-hand-side of the expression
     Value *RHS,     ///< The right-hand-side of the expression
-    const Twine &NameStr = "" ///< Name of the instruction
-  ) : CmpInst(makeCmpResultType(LHS->getType()),
-              Instruction::FCmp, pred, LHS, RHS, NameStr) {
+    const Twine &NameStr = "", ///< Name of the instruction
+    Instruction *FlagsSource = nullptr
+  ) : CmpInst(makeCmpResultType(LHS->getType()), Instruction::FCmp, Pred, LHS,
+              RHS, NameStr, nullptr, FlagsSource) {
     AssertOK();
   }
 
@@ -1357,8 +1358,6 @@ class InvokeInst;
 
 template <class T> struct CallBaseParent { using type = Instruction; };
 
-template <> struct CallBaseParent<InvokeInst> { using type = TerminatorInst; };
-
 //===----------------------------------------------------------------------===//
 /// Base class for all callable instructions (InvokeInst and CallInst)
 /// Holds everything related to calling a function, abstracting from the base
@@ -1525,7 +1524,7 @@ public:
   /// indirect function invocation.
   ///
   Function *getCalledFunction() const {
-    return dyn_cast<Function>(Op<-InstTy::ArgOffset>());
+    return dyn_cast_or_null<Function>(Op<-InstTy::ArgOffset>());
   }
 
   /// Determine whether this call has the given attribute.
@@ -3265,7 +3264,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value)
 /// Return a value (possibly void), from a function.  Execution
 /// does not continue in this function any longer.
 ///
-class ReturnInst : public TerminatorInst {
+class ReturnInst : public Instruction {
   ReturnInst(const ReturnInst &RI);
 
 private:
@@ -3325,8 +3324,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned idx) const {
     llvm_unreachable("ReturnInst has no successors!");
   }
@@ -3349,7 +3346,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReturnInst, Value)
 //===---------------------------------------------------------------------------
 /// Conditional or Unconditional Branch instruction.
 ///
-class BranchInst : public TerminatorInst {
+class BranchInst : public Instruction {
   /// Ops list - Branches are strange.  The operands are ordered:
   ///  [Cond, FalseDest,] TrueDest.  This makes some accessors faster because
   /// they don't have to check for cond/uncond branchness. These are mostly
@@ -3493,7 +3490,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
 //===---------------------------------------------------------------------------
 /// Multiway switch
 ///
-class SwitchInst : public TerminatorInst {
+class SwitchInst : public Instruction {
   unsigned ReservedSpace;
 
   // Operand[0]    = Value to switch on
@@ -3576,7 +3573,7 @@ public:
     /// Returns number of current case.
     unsigned getCaseIndex() const { return Index; }
 
-    /// Returns TerminatorInst's successor index for current case successor.
+    /// Returns successor index for current case successor.
     unsigned getSuccessorIndex() const {
       assert(((unsigned)Index == DefaultPseudoIndex ||
               (unsigned)Index < SI->getNumCases()) &&
@@ -3632,7 +3629,7 @@ public:
     CaseIteratorImpl(SwitchInstT *SI, unsigned CaseNum) : Case(SI, CaseNum) {}
 
     /// Initializes case iterator for given SwitchInst and for given
-    /// TerminatorInst's successor index.
+    /// successor index.
     static CaseIteratorImpl fromSuccessorIndex(SwitchInstT *SI,
                                                unsigned SuccessorIndex) {
       assert(SuccessorIndex < SI->getNumSuccessors() &&
@@ -3850,7 +3847,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
 //===---------------------------------------------------------------------------
 /// Indirect Branch Instruction.
 ///
-class IndirectBrInst : public TerminatorInst {
+class IndirectBrInst : public Instruction {
   unsigned ReservedSpace;
 
   // Operand[0]   = Address to jump to
@@ -4226,7 +4223,7 @@ InvokeInst::InvokeInst(Value *Func, BasicBlock *IfNormal,
 //===---------------------------------------------------------------------------
 /// Resume the propagation of an exception.
 ///
-class ResumeInst : public TerminatorInst {
+class ResumeInst : public Instruction {
   ResumeInst(const ResumeInst &RI);
 
   explicit ResumeInst(Value *Exn, Instruction *InsertBefore=nullptr);
@@ -4264,8 +4261,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned idx) const {
     llvm_unreachable("ResumeInst has no successors!");
   }
@@ -4285,7 +4280,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
 //===----------------------------------------------------------------------===//
 //                         CatchSwitchInst Class
 //===----------------------------------------------------------------------===//
-class CatchSwitchInst : public TerminatorInst {
+class CatchSwitchInst : public Instruction {
   /// The number of operands actually allocated.  NumOperands is
   /// the number actually in use.
   unsigned ReservedSpace;
@@ -4551,7 +4546,7 @@ public:
 //                               CatchReturnInst Class
 //===----------------------------------------------------------------------===//
 
-class CatchReturnInst : public TerminatorInst {
+class CatchReturnInst : public Instruction {
   CatchReturnInst(const CatchReturnInst &RI);
   CatchReturnInst(Value *CatchPad, BasicBlock *BB, Instruction *InsertBefore);
   CatchReturnInst(Value *CatchPad, BasicBlock *BB, BasicBlock *InsertAtEnd);
@@ -4611,8 +4606,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned Idx) const {
     assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
     return getSuccessor();
@@ -4634,7 +4627,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchReturnInst, Value)
 //                               CleanupReturnInst Class
 //===----------------------------------------------------------------------===//
 
-class CleanupReturnInst : public TerminatorInst {
+class CleanupReturnInst : public Instruction {
 private:
   CleanupReturnInst(const CleanupReturnInst &RI);
   CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values,
@@ -4707,8 +4700,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned Idx) const {
     assert(Idx == 0);
     return getUnwindDest();
@@ -4741,7 +4732,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value)
 /// presence of this instruction indicates some higher level knowledge that the
 /// end of the block cannot be reached.
 ///
-class UnreachableInst : public TerminatorInst {
+class UnreachableInst : public Instruction {
 protected:
   // Note: Instruction needs to be a friend here to call cloneImpl.
   friend class Instruction;
@@ -4768,8 +4759,6 @@ public:
   }
 
 private:
-  friend TerminatorInst;
-
   BasicBlock *getSuccessor(unsigned idx) const {
     llvm_unreachable("UnreachableInst has no successors!");
   }
diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index 32a62a4cafc7ec7b4af553e6694032b1d12f78fa..80a7a705257483d4f5cd6db0f178353acd5c22a8 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@@ -251,6 +251,12 @@ namespace llvm {
       case Intrinsic::experimental_constrained_log2:
       case Intrinsic::experimental_constrained_rint:
       case Intrinsic::experimental_constrained_nearbyint:
+      case Intrinsic::experimental_constrained_maxnum:
+      case Intrinsic::experimental_constrained_minnum:
+      case Intrinsic::experimental_constrained_ceil:
+      case Intrinsic::experimental_constrained_floor:
+      case Intrinsic::experimental_constrained_round:
+      case Intrinsic::experimental_constrained_trunc:
         return true;
       default: return false;
       }
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index b405e86ef402cd56f0c644709a303a1aadf06b6f..04de1ca63a26e877df3ad76b1966e71f9a9e9519 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -320,6 +320,7 @@ def int_gcwrite : Intrinsic<[],
 def int_returnaddress : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_addressofreturnaddress : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_frameaddress  : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_sponentry  : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_read_register  : Intrinsic<[llvm_anyint_ty], [llvm_metadata_ty],
                                    [IntrReadMem], "llvm.read_register">;
 def int_write_register : Intrinsic<[], [llvm_metadata_ty, llvm_anyint_ty],
@@ -453,6 +454,14 @@ def int_maxnum : Intrinsic<[llvm_anyfloat_ty],
   [LLVMMatchType<0>, LLVMMatchType<0>],
   [IntrNoMem, IntrSpeculatable, Commutative]
 >;
+def int_minimum : Intrinsic<[llvm_anyfloat_ty],
+  [LLVMMatchType<0>, LLVMMatchType<0>],
+  [IntrNoMem, IntrSpeculatable, Commutative]
+>;
+def int_maximum : Intrinsic<[llvm_anyfloat_ty],
+  [LLVMMatchType<0>, LLVMMatchType<0>],
+  [IntrNoMem, IntrSpeculatable, Commutative]
+>;
 
 // NOTE: these are internal interfaces.
 def int_setjmp     : Intrinsic<[llvm_i32_ty],  [llvm_ptr_ty]>;
@@ -557,9 +566,35 @@ let IntrProperties = [IntrInaccessibleMemOnly] in {
                                                          [ LLVMMatchType<0>,
                                                            llvm_metadata_ty,
                                                            llvm_metadata_ty ]>;
+  def int_experimental_constrained_maxnum : Intrinsic<[ llvm_anyfloat_ty ],
+                                                      [ LLVMMatchType<0>,
+                                                        LLVMMatchType<0>,
+                                                        llvm_metadata_ty,
+                                                        llvm_metadata_ty ]>;
+  def int_experimental_constrained_minnum : Intrinsic<[ llvm_anyfloat_ty ],
+                                                      [ LLVMMatchType<0>,
+                                                        LLVMMatchType<0>,
+                                                        llvm_metadata_ty,
+                                                        llvm_metadata_ty ]>;
+  def int_experimental_constrained_ceil : Intrinsic<[ llvm_anyfloat_ty ],
+                                                    [ LLVMMatchType<0>,
+                                                      llvm_metadata_ty,
+                                                      llvm_metadata_ty ]>;
+  def int_experimental_constrained_floor : Intrinsic<[ llvm_anyfloat_ty ],
+                                                     [ LLVMMatchType<0>,
+                                                       llvm_metadata_ty,
+                                                       llvm_metadata_ty ]>;
+  def int_experimental_constrained_round : Intrinsic<[ llvm_anyfloat_ty ],
+                                                     [ LLVMMatchType<0>,
+                                                      llvm_metadata_ty,
+                                                      llvm_metadata_ty ]>;
+  def int_experimental_constrained_trunc : Intrinsic<[ llvm_anyfloat_ty ],
+                                                     [ LLVMMatchType<0>,
+                                                       llvm_metadata_ty,
+                                                       llvm_metadata_ty ]>;
 }
 // FIXME: Add intrinsics for fcmp, fptrunc, fpext, fptoui and fptosi.
-// FIXME: Add intrinsics for fabs, copysign, floor, ceil, trunc and round?
+// FIXME: Add intrinsics for fabs and copysign? 
 
 
 //===------------------------- Expect Intrinsics --------------------------===//
@@ -700,6 +735,21 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
                                        [LLVMMatchType<0>, LLVMMatchType<0>],
                                        [IntrNoMem, IntrSpeculatable]>;
 
+//===------------------------- Fixed Point Intrinsics ---------------------===//
+//
+def int_sadd_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable, Commutative]>;
+def int_uadd_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable, Commutative]>;
+def int_ssub_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable]>;
+def int_usub_sat : Intrinsic<[llvm_anyint_ty],
+                             [LLVMMatchType<0>, LLVMMatchType<0>],
+                             [IntrNoMem, IntrSpeculatable]>;
+
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],
@@ -850,6 +900,10 @@ def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>;
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+// Intrinsic to detect whether its argument is a constant.
+def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem], "llvm.is.constant">;
+
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
index 688e863c1afe9c578a7a539cbf81663409b9f82c..ff25750fe399bc8f7fae882096e333ee123543d3 100644
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -44,6 +44,12 @@ def int_aarch64_dmb : GCCBuiltin<"__builtin_arm_dmb">, MSBuiltin<"__dmb">, Intri
 def int_aarch64_dsb : GCCBuiltin<"__builtin_arm_dsb">, MSBuiltin<"__dsb">, Intrinsic<[], [llvm_i32_ty]>;
 def int_aarch64_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">, Intrinsic<[], [llvm_i32_ty]>;
 
+// A space-consuming intrinsic primarily for testing block and jump table
+// placements. The first argument is the number of bytes this "instruction"
+// takes up, the second and return value are essentially chains, used to force
+// ordering during ISel.
+def int_aarch64_space : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], []>;
+
 }
 
 //===----------------------------------------------------------------------===//
@@ -154,6 +160,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
+
+  class AdvSIMD_FP16FML_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty],
+                [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -424,6 +435,12 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
   // v8.2-A Dot Product
   def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic;
+
+  // v8.2-A FP16 Fused Multiply-Add Long
+  def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
+  def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
+  def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic;
+  def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic;
 }
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index ccf43a61c3de924ab05205e57d1a6c534e6c9bbc..67e7da7797a4b908e8c0143aa85e8d4d23474afa 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1484,18 +1484,10 @@ def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
   [llvm_i64_ty], [IntrConvergent]
 >;
 
-def int_amdgcn_break : Intrinsic<[llvm_i64_ty],
-  [llvm_i64_ty], [IntrNoMem, IntrConvergent]
->;
-
 def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty],
   [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
 >;
 
-def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty],
-  [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
->;
-
 def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
   [llvm_i64_ty], [IntrConvergent]
 >;
diff --git a/include/llvm/IR/IntrinsicsWebAssembly.td b/include/llvm/IR/IntrinsicsWebAssembly.td
index 54408d317d273dd0e1e50a9c8ee9f13e28575797..ff5964c3aaba884ccb44fed7a5c11b8d709e391f 100644
--- a/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -36,6 +36,17 @@ def int_wasm_mem_grow : Intrinsic<[llvm_anyint_ty],
 def int_wasm_current_memory : Intrinsic<[llvm_anyint_ty], [], [IntrReadMem]>;
 def int_wasm_grow_memory : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], []>;
 
+//===----------------------------------------------------------------------===//
+// Saturating float-to-int conversions
+//===----------------------------------------------------------------------===//
+
+def int_wasm_trunc_saturate_signed : Intrinsic<[llvm_anyint_ty],
+                                               [llvm_anyfloat_ty],
+                                               [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_trunc_saturate_unsigned : Intrinsic<[llvm_anyint_ty],
+                                                 [llvm_anyfloat_ty],
+                                                 [IntrNoMem, IntrSpeculatable]>;
+
 //===----------------------------------------------------------------------===//
 // Exception handling intrinsics
 //===----------------------------------------------------------------------===//
@@ -60,7 +71,8 @@ def int_wasm_catch : Intrinsic<[llvm_ptr_ty], [llvm_i32_ty],
 // WebAssembly EH must maintain the landingpads in the order assigned to them
 // by WasmEHPrepare pass to generate landingpad table in EHStreamer. This is
 // used in order to give them the indices in WasmEHPrepare.
-def int_wasm_landingpad_index: Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>;
+def int_wasm_landingpad_index: Intrinsic<[], [llvm_token_ty, llvm_i32_ty],
+                                         [IntrNoMem]>;
 
 // Returns LSDA address of the current function.
 def int_wasm_lsda : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
@@ -91,14 +103,6 @@ def int_wasm_atomic_notify:
 // SIMD intrinsics
 //===----------------------------------------------------------------------===//
 
-def int_wasm_add_saturate_signed :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
-def int_wasm_add_saturate_unsigned :
-  Intrinsic<[llvm_anyvector_ty],
-            [LLVMMatchType<0>, LLVMMatchType<0>],
-            [IntrNoMem, IntrSpeculatable]>;
 def int_wasm_sub_saturate_signed :
   Intrinsic<[llvm_anyvector_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>],
diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index 778907b05eb4fb13183f2bbdd099f2d801e68dc1..9a456acf96652e8cdd2e429fe602d87a6b0299fc 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -408,6 +408,7 @@ public:
     return const_cast<GlobalValueSummary &>(
                          static_cast<const AliasSummary *>(this)->getAliasee());
   }
+  bool hasAliaseeGUID() const { return AliaseeGUID != 0; }
   const GlobalValue::GUID &getAliaseeGUID() const {
     assert(AliaseeGUID && "Unexpected missing aliasee GUID");
     return AliaseeGUID;
@@ -477,13 +478,17 @@ public:
         TypeCheckedLoadConstVCalls;
   };
 
-  /// Function attribute flags. Used to track if a function accesses memory,
-  /// recurses or aliases.
+  /// Flags specific to function summaries.
   struct FFlags {
+    // Function attribute flags. Used to track if a function accesses memory,
+    // recurses or aliases.
     unsigned ReadNone : 1;
     unsigned ReadOnly : 1;
     unsigned NoRecurse : 1;
     unsigned ReturnDoesNotAlias : 1;
+
+    // Indicate if the global value cannot be inlined.
+    unsigned NoInline : 1;
   };
 
   /// Create an empty FunctionSummary (with specified call edges).
@@ -510,8 +515,7 @@ private:
   /// during the initial compile step when the summary index is first built.
   unsigned InstCount;
 
-  /// Function attribute flags. Used to track if a function accesses memory,
-  /// recurses or aliases.
+  /// Function summary specific flags.
   FFlags FunFlags;
 
   /// List of <CalleeValueInfo, CalleeInfo> call edge pairs from this function.
@@ -545,7 +549,7 @@ public:
     return GVS->getSummaryKind() == FunctionKind;
   }
 
-  /// Get function attribute flags.
+  /// Get function summary flags.
   FFlags fflags() const { return FunFlags; }
 
   /// Get the instruction count recorded for this function.
diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 7c058342265513731fdaa260eb444f248a7eeaea..dd30072ce57110a849e10dc7149cde2c7daa423f 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -659,11 +659,32 @@ inline BinaryOp_match<LHS, RHS, Instruction::FSub> m_FSub(const LHS &L,
   return BinaryOp_match<LHS, RHS, Instruction::FSub>(L, R);
 }
 
+template <typename Op_t> struct FNeg_match {
+  Op_t X;
+
+  FNeg_match(const Op_t &Op) : X(Op) {}
+  template <typename OpTy> bool match(OpTy *V) {
+    auto *FPMO = dyn_cast<FPMathOperator>(V);
+    if (!FPMO || FPMO->getOpcode() != Instruction::FSub)
+      return false;
+    if (FPMO->hasNoSignedZeros()) {
+      // With 'nsz', any zero goes.
+      if (!cstfp_pred_ty<is_any_zero_fp>().match(FPMO->getOperand(0)))
+        return false;
+    } else {
+      // Without 'nsz', we need fsub -0.0, X exactly.
+      if (!cstfp_pred_ty<is_neg_zero_fp>().match(FPMO->getOperand(0)))
+        return false;
+    }
+    return X.match(FPMO->getOperand(1));
+  }
+};
+
 /// Match 'fneg X' as 'fsub -0.0, X'.
-template <typename RHS>
-inline BinaryOp_match<cstfp_pred_ty<is_neg_zero_fp>, RHS, Instruction::FSub>
-m_FNeg(const RHS &X) {
-  return m_FSub(m_NegZeroFP(), X);
+template <typename OpTy>
+inline FNeg_match<OpTy>
+m_FNeg(const OpTy &X) {
+  return FNeg_match<OpTy>(X);
 }
 
 /// Match 'fneg X' as 'fsub +-0.0, X'.
diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h
index 57bba5e34840d53621dc98624accc46084bad1c7..c0ad32f485c3f0a126ec10204bfc1ebbb02225d5 100644
--- a/include/llvm/LTO/Config.h
+++ b/include/llvm/LTO/Config.h
@@ -73,6 +73,9 @@ struct Config {
   /// Sample PGO profile path.
   std::string SampleProfile;
 
+  /// Name remapping file for profile data.
+  std::string ProfileRemapping;
+
   /// The directory to store .dwo files.
   std::string DwoDir;
 
diff --git a/include/llvm/LTO/legacy/LTOCodeGenerator.h b/include/llvm/LTO/legacy/LTOCodeGenerator.h
index f48ab02863a5b4da219a9bbebaa0ac9e4bfca798..8f23b7cb4574f0ab8b6b6acad549b0ec167bbd8d 100644
--- a/include/llvm/LTO/legacy/LTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/LTOCodeGenerator.h
@@ -48,6 +48,9 @@
 #include <string>
 #include <vector>
 
+/// Enable global value internalization in LTO.
+extern llvm::cl::opt<bool> EnableLTOInternalization;
+
 namespace llvm {
 template <typename T> class ArrayRef;
   class LLVMContext;
@@ -233,7 +236,7 @@ private:
   unsigned OptLevel = 2;
   lto_diagnostic_handler_t DiagHandler = nullptr;
   void *DiagContext = nullptr;
-  bool ShouldInternalize = true;
+  bool ShouldInternalize = EnableLTOInternalization;
   bool ShouldEmbedUselists = false;
   bool ShouldRestoreGlobalsLinkage = false;
   TargetMachine::CodeGenFileType FileType = TargetMachine::CGFT_ObjectFile;
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 030d3c05aa5ad56ba05c163f2628ce76b38e5d3b..07835c21fcedb1696559d4b1436a1538b0dffe22 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -165,6 +165,11 @@ public:
     return 0;
   }
 
+  /// Check whether a given symbol has been flagged with MICROMIPS flag.
+  virtual bool isMicroMips(const MCSymbol *Sym) const {
+    return false;
+  }
+
   /// Handles all target related code padding when starting to write a new
   /// basic block to an object file.
   ///
diff --git a/include/llvm/MC/MCInstrAnalysis.h b/include/llvm/MC/MCInstrAnalysis.h
index 950a1afeef50b07f6e85b15edbf0bb384f4a76e3..200f10f7d64beaa698d3e9560d1a81c2e3c86f3f 100644
--- a/include/llvm/MC/MCInstrAnalysis.h
+++ b/include/llvm/MC/MCInstrAnalysis.h
@@ -136,6 +136,17 @@ public:
     return isZeroIdiom(MI, Mask, CPUID);
   }
 
+  /// Returns true if MI is a candidate for move elimination.
+  ///
+  /// Different subtargets may apply different constraints to optimizable
+  /// register moves. For example, on most X86 subtargets, a candidate for move
+  /// elimination cannot specify the same register for both source and
+  /// destination.
+  virtual bool isOptimizableRegisterMove(const MCInst &MI,
+                                         unsigned CPUID) const {
+    return false;
+  }
+
   /// Given a branch instruction try to get the address the branch
   /// targets. Return true on success, and the address in Target.
   virtual bool
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 8cf9e1cc55a064d76190f953134d274e76ec5c2e..729aa23ef333a3117602f8f72d7de99d1066bbd6 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -117,6 +117,8 @@ protected:
   MCSection *DwarfAddrSection;
   /// The DWARF v5 range list section.
   MCSection *DwarfRnglistsSection;
+  /// The DWARF v5 locations list section.
+  MCSection *DwarfLoclistsSection;
 
   /// The DWARF v5 range list section for fission.
   MCSection *DwarfRnglistsDWOSection;
@@ -258,6 +260,7 @@ public:
   MCSection *getDwarfARangesSection() const { return DwarfARangesSection; }
   MCSection *getDwarfRangesSection() const { return DwarfRangesSection; }
   MCSection *getDwarfRnglistsSection() const { return DwarfRnglistsSection; }
+  MCSection *getDwarfLoclistsSection() const { return DwarfLoclistsSection; }
   MCSection *getDwarfMacinfoSection() const { return DwarfMacinfoSection; }
 
   MCSection *getDwarfDebugNamesSection() const {
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 207183a69b0ed2fca84af0309175736b139ed28b..2e9b8dfa3b260f7b59252d4a1d9a4762cb04750e 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -30,7 +30,6 @@ class AsmLexer : public MCAsmLexer {
   StringRef CurBuf;
   bool IsAtStartOfLine = true;
   bool IsAtStartOfStatement = true;
-  bool IsParsingMSInlineAsm = false;
   bool IsPeeking = false;
 
 protected:
@@ -44,7 +43,6 @@ public:
   ~AsmLexer() override;
 
   void setBuffer(StringRef Buf, const char *ptr = nullptr);
-  void setParsingMSInlineAsm(bool V) { IsParsingMSInlineAsm = V; }
 
   StringRef LexUntilEndOfStatement() override;
 
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 8ff0df2a185ccef26a9db9c3f32b47f7b2e114be..ea13d1cdc09f04907402243461ec53f09900ee52 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -50,6 +50,7 @@ protected: // Can only create subclasses.
   bool SkipSpace = true;
   bool AllowAtInIdentifier;
   bool IsAtStartOfStatement = true;
+  bool LexMasmIntegers = false;
   AsmCommentConsumer *CommentConsumer = nullptr;
 
   MCAsmLexer();
@@ -146,6 +147,10 @@ public:
   void setCommentConsumer(AsmCommentConsumer *CommentConsumer) {
     this->CommentConsumer = CommentConsumer;
   }
+
+  /// Set whether to lex masm-style binary and hex literals. They look like
+  /// 0b1101 and 0ABCh respectively.
+  void setLexMasmIntegers(bool V) { LexMasmIntegers = V; }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 0d56f36fbae88361ec9f29138f1563cf4d58eaf5..b80289878e6e61f34dcfe8b5bfe296a5e9bb6f9c 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -122,17 +122,18 @@ public:
 private:
   MCTargetAsmParser *TargetParser = nullptr;
 
-  unsigned ShowParsedOperands : 1;
-
 protected: // Can only create subclasses.
   MCAsmParser();
 
+  SmallVector<MCPendingError, 0> PendingErrors;
+
   /// Flag tracking whether any errors have been encountered.
   bool HadError = false;
+
   /// Enable print [latency:throughput] in output file.
   bool EnablePrintSchedInfo = false;
 
-  SmallVector<MCPendingError, 1> PendingErrors;
+  bool ShowParsedOperands = false;
 
 public:
   MCAsmParser(const MCAsmParser &) = delete;
@@ -166,7 +167,7 @@ public:
   void setShowParsedOperands(bool Value) { ShowParsedOperands = Value; }
 
   void setEnablePrintSchedInfo(bool Value) { EnablePrintSchedInfo = Value; }
-  bool shouldPrintSchedInfo() { return EnablePrintSchedInfo; }
+  bool shouldPrintSchedInfo() const { return EnablePrintSchedInfo; }
 
   /// Run the parser on the input source buffer.
   virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 9f53a468903d9679f3ad4b8b672810d42038c179..41305296b004f67b97c6b07c637f65d3473d828a 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -142,6 +142,7 @@ struct MCSchedClassDesc {
 struct MCRegisterCostEntry {
   unsigned RegisterClassID;
   unsigned Cost;
+  bool AllowMoveElimination;
 };
 
 /// A register file descriptor.
@@ -159,6 +160,12 @@ struct MCRegisterFileDesc {
   uint16_t NumRegisterCostEntries;
   // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
   uint16_t RegisterCostEntryIdx;
+  // A value of zero means: there is no limit in the number of moves that can be
+  // eliminated every cycle.
+  uint16_t MaxMovesEliminatedPerCycle;
+  // Ture if this register file only knows how to optimize register moves from
+  // known zero registers.
+  bool AllowZeroMoveEliminationOnly;
 };
 
 /// Provide extra details about the machine processor.
@@ -176,22 +183,6 @@ struct MCExtraProcessorInfo {
   unsigned NumRegisterFiles;
   const MCRegisterCostEntry *RegisterCostTable;
   unsigned NumRegisterCostEntries;
-
-  struct PfmCountersInfo {
-    // An optional name of a performance counter that can be used to measure
-    // cycles.
-    const char *CycleCounter;
-
-    // An optional name of a performance counter that can be used to measure
-    // uops.
-    const char *UopsCounter;
-
-    // For each MCProcResourceDesc defined by the processor, an optional list of
-    // names of performance counters that can be used to measure the resource
-    // utilization.
-    const char **IssueCounters;
-  };
-  PfmCountersInfo PfmCounters;
 };
 
 /// Machine model for scheduling, bundling, and heuristics.
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 91fb4e537b4c8eae882176d4ddcc2b5fa490a445..edf0a72d9c12d3848529a541cf46ff0f14729c28 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -109,6 +109,11 @@ public:
 
   virtual void emitValue(const MCExpr *Value);
 
+  /// Emit the bytes in \p Data into the output.
+  ///
+  /// This is used to emit bytes in \p Data as sequence of .byte directives.
+  virtual void emitRawBytes(StringRef Data);
+
   virtual void finish();
 };
 
@@ -193,10 +198,6 @@ class MCStreamer {
 
   WinEH::FrameInfo *CurrentWinFrameInfo;
 
-  /// Retreive the current frame info if one is available and it is not yet
-  /// closed. Otherwise, issue an error and return null.
-  WinEH::FrameInfo *EnsureValidWinFrameInfo(SMLoc Loc);
-
   /// Tracks an index to represent the order a symbol was emitted in.
   /// Zero means we did not emit that symbol.
   DenseMap<const MCSymbol *, unsigned> SymbolOrdering;
@@ -219,10 +220,6 @@ protected:
   virtual void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame);
   virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &CurFrame);
 
-  /// When emitting an object file, create and emit a real label. When emitting
-  /// textual assembly, this should do nothing to avoid polluting our output.
-  virtual MCSymbol *EmitCFILabel();
-
   WinEH::FrameInfo *getCurrentWinFrameInfo() {
     return CurrentWinFrameInfo;
   }
@@ -261,6 +258,14 @@ public:
     return TargetStreamer.get();
   }
 
+  /// When emitting an object file, create and emit a real label. When emitting
+  /// textual assembly, this should do nothing to avoid polluting our output.
+  virtual MCSymbol *EmitCFILabel();
+
+  /// Retreive the current frame info if one is available and it is not yet
+  /// closed. Otherwise, issue an error and return null.
+  WinEH::FrameInfo *EnsureValidWinFrameInfo(SMLoc Loc);
+
   unsigned getNumFrameInfos() { return DwarfFrameInfos.size(); }
   ArrayRef<MCDwarfFrameInfo> getDwarfFrameInfos() const {
     return DwarfFrameInfos;
@@ -870,7 +875,7 @@ public:
 
   virtual MCSymbol *getDwarfLineTableSymbol(unsigned CUID);
   virtual void EmitCFISections(bool EH, bool Debug);
-  void EmitCFIStartProc(bool IsSimple);
+  void EmitCFIStartProc(bool IsSimple, SMLoc Loc = SMLoc());
   void EmitCFIEndProc();
   virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
   virtual void EmitCFIDefCfaOffset(int64_t Offset);
@@ -894,6 +899,11 @@ public:
 
   virtual void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc = SMLoc());
   virtual void EmitWinCFIEndProc(SMLoc Loc = SMLoc());
+  /// This is used on platforms, such as Windows on ARM64, that require function
+  /// or funclet sizes to be emitted in .xdata before the End marker is emitted
+  /// for the frame.  We cannot use the End marker, as it is not set at the
+  /// point of emitting .xdata, in order to indicate that the frame is active.
+  virtual void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc = SMLoc());
   virtual void EmitWinCFIStartChained(SMLoc Loc = SMLoc());
   virtual void EmitWinCFIEndChained(SMLoc Loc = SMLoc());
   virtual void EmitWinCFIPushReg(unsigned Register, SMLoc Loc = SMLoc());
diff --git a/include/llvm/MC/MCWin64EH.h b/include/llvm/MC/MCWin64EH.h
index 83ea738de8c3d918a1416b5c9cf0697681413253..1a9f6f403d7c6a62647415a2d9e105925879a20d 100644
--- a/include/llvm/MC/MCWin64EH.h
+++ b/include/llvm/MC/MCWin64EH.h
@@ -56,6 +56,14 @@ public:
   void Emit(MCStreamer &Streamer) const override;
   void EmitUnwindInfo(MCStreamer &Streamer, WinEH::FrameInfo *FI) const override;
 };
+
+class ARM64UnwindEmitter : public WinEH::UnwindEmitter {
+public:
+  void Emit(MCStreamer &Streamer) const override;
+  void EmitUnwindInfo(MCStreamer &Streamer,
+                      WinEH::FrameInfo *FI) const override;
+};
+
 }
 } // end namespace llvm
 
diff --git a/include/llvm/MC/MCWinEH.h b/include/llvm/MC/MCWinEH.h
index 4ca52a6654eb7f51ffd24ce82064d281d0935dc8..98ef0367a11d115dc4f536dd079c8e86393a0260 100644
--- a/include/llvm/MC/MCWinEH.h
+++ b/include/llvm/MC/MCWinEH.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_MC_MCWINEH_H
 #define LLVM_MC_MCWINEH_H
 
+#include "llvm/ADT/MapVector.h"
 #include <vector>
 
 namespace llvm {
@@ -20,9 +21,9 @@ class MCSymbol;
 namespace WinEH {
 struct Instruction {
   const MCSymbol *Label;
-  const unsigned Offset;
-  const unsigned Register;
-  const unsigned Operation;
+  unsigned Offset;
+  unsigned Register;
+  unsigned Operation;
 
   Instruction(unsigned Op, MCSymbol *L, unsigned Reg, unsigned Off)
     : Label(L), Offset(Off), Register(Reg), Operation(Op) {}
@@ -31,6 +32,7 @@ struct Instruction {
 struct FrameInfo {
   const MCSymbol *Begin = nullptr;
   const MCSymbol *End = nullptr;
+  const MCSymbol *FuncletOrFuncEnd = nullptr;
   const MCSymbol *ExceptionHandler = nullptr;
   const MCSymbol *Function = nullptr;
   const MCSymbol *PrologEnd = nullptr;
@@ -43,6 +45,7 @@ struct FrameInfo {
   int LastFrameInst = -1;
   const FrameInfo *ChainedParent = nullptr;
   std::vector<Instruction> Instructions;
+  MapVector<MCSymbol*, std::vector<Instruction>> EpilogMap;
 
   FrameInfo() = default;
   FrameInfo(const MCSymbol *Function, const MCSymbol *BeginFuncEHLabel)
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index 54907cbca4c0e28eb1eda1a779eea740b4abb6e3..dff086078391a2cada4166de03648a58c667c78e 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -1021,6 +1021,8 @@ StringRef ELFObjectFile<ELFT>::getFileFormatName() const {
       return "ELF32-lanai";
     case ELF::EM_MIPS:
       return "ELF32-mips";
+    case ELF::EM_MSP430:
+      return "ELF32-msp430";
     case ELF::EM_PPC:
       return "ELF32-ppc";
     case ELF::EM_RISCV:
@@ -1091,6 +1093,8 @@ template <class ELFT> Triple::ArchType ELFObjectFile<ELFT>::getArch() const {
     default:
       report_fatal_error("Invalid ELFCLASS!");
     }
+  case ELF::EM_MSP430:
+    return Triple::msp430;
   case ELF::EM_PPC:
     return Triple::ppc;
   case ELF::EM_PPC64:
diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index 02d3dc324bce7353c6594de539357183b798de3d..22e5eb0caa0fa9ff5608a6b1439b17bbac0fd374 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include <vector>
@@ -32,10 +33,13 @@ class ModuleSummaryIndex;
 /// A struct capturing PGO tunables.
 struct PGOOptions {
   PGOOptions(std::string ProfileGenFile = "", std::string ProfileUseFile = "",
-             std::string SampleProfileFile = "", bool RunProfileGen = false,
-             bool SamplePGOSupport = false)
+             std::string SampleProfileFile = "",
+             std::string ProfileRemappingFile = "",
+             bool RunProfileGen = false, bool SamplePGOSupport = false)
       : ProfileGenFile(ProfileGenFile), ProfileUseFile(ProfileUseFile),
-        SampleProfileFile(SampleProfileFile), RunProfileGen(RunProfileGen),
+        SampleProfileFile(SampleProfileFile),
+        ProfileRemappingFile(ProfileRemappingFile),
+        RunProfileGen(RunProfileGen),
         SamplePGOSupport(SamplePGOSupport || !SampleProfileFile.empty()) {
     assert((RunProfileGen ||
             !SampleProfileFile.empty() ||
@@ -45,6 +49,7 @@ struct PGOOptions {
   std::string ProfileGenFile;
   std::string ProfileUseFile;
   std::string SampleProfileFile;
+  std::string ProfileRemappingFile;
   bool RunProfileGen;
   bool SamplePGOSupport;
 };
@@ -380,8 +385,9 @@ public:
   /// If the sequence of passes aren't all the exact same kind of pass, it will
   /// be an error. You cannot mix different levels implicitly, you must
   /// explicitly form a pass manager in which to nest passes.
-  bool parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
+  Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
 
   /// {{@ Parse a textual pass pipeline description into a specific PassManager
   ///
@@ -390,12 +396,15 @@ public:
   /// this is the valid pipeline text:
   ///
   ///   function(lpass)
-  bool parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
-  bool parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
-  bool parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
-                         bool VerifyEachPass = true, bool DebugLogging = false);
+  Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
+  Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
+  Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText,
+                          bool VerifyEachPass = true,
+                          bool DebugLogging = false);
   /// @}}
 
   /// Parse a textual alias analysis pipeline into the provided AA manager.
@@ -413,7 +422,7 @@ public:
   /// Returns false if the text cannot be parsed cleanly. The specific state of
   /// the \p AA manager is unspecified if such an error is encountered and this
   /// returns false.
-  bool parseAAPipeline(AAManager &AA, StringRef PipelineText);
+  Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
   /// Register a callback for a default optimizer pipeline extension
   /// point
@@ -561,33 +570,34 @@ private:
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
-  bool parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
+  Error parseModulePass(ModulePassManager &MPM, const PipelineElement &E,
+                        bool VerifyEachPass, bool DebugLogging);
+  Error parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
                        bool VerifyEachPass, bool DebugLogging);
-  bool parseCGSCCPass(CGSCCPassManager &CGPM, const PipelineElement &E,
+  Error parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
+                          bool VerifyEachPass, bool DebugLogging);
+  Error parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                       bool VerifyEachPass, bool DebugLogging);
-  bool parseFunctionPass(FunctionPassManager &FPM, const PipelineElement &E,
-                     bool VerifyEachPass, bool DebugLogging);
-  bool parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                     bool VerifyEachPass, bool DebugLogging);
   bool parseAAPassName(AAManager &AA, StringRef Name);
 
-  bool parseLoopPassPipeline(LoopPassManager &LPM,
-                             ArrayRef<PipelineElement> Pipeline,
-                             bool VerifyEachPass, bool DebugLogging);
-  bool parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                 ArrayRef<PipelineElement> Pipeline,
-                                 bool VerifyEachPass, bool DebugLogging);
-  bool parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+  Error parseLoopPassPipeline(LoopPassManager &LPM,
                               ArrayRef<PipelineElement> Pipeline,
                               bool VerifyEachPass, bool DebugLogging);
-  bool parseModulePassPipeline(ModulePassManager &MPM,
+  Error parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                  ArrayRef<PipelineElement> Pipeline,
+                                  bool VerifyEachPass, bool DebugLogging);
+  Error parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
                                ArrayRef<PipelineElement> Pipeline,
                                bool VerifyEachPass, bool DebugLogging);
+  Error parseModulePassPipeline(ModulePassManager &MPM,
+                                ArrayRef<PipelineElement> Pipeline,
+                                bool VerifyEachPass, bool DebugLogging);
 
   void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                          OptimizationLevel Level, bool RunProfileGen,
                          std::string ProfileGenFile,
-                         std::string ProfileUseFile);
+                         std::string ProfileUseFile,
+                         std::string ProfileRemappingFile);
 
   void invokePeepholeEPCallbacks(FunctionPassManager &, OptimizationLevel);
 
diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index efc22dcd0d9a6ab60809372d659641886e82d2d4..08d78227611708ef782f2869f170f0dc67c10c90 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -348,6 +348,9 @@ struct InstrProfReaderIndexBase {
 using OnDiskHashTableImplV3 =
     OnDiskIterableChainedHashTable<InstrProfLookupTrait>;
 
+template <typename HashTableImpl>
+class InstrProfReaderItaniumRemapper;
+
 template <typename HashTableImpl>
 class InstrProfReaderIndex : public InstrProfReaderIndexBase {
 private:
@@ -355,6 +358,8 @@ private:
   typename HashTableImpl::data_iterator RecordIterator;
   uint64_t FormatVersion;
 
+  friend class InstrProfReaderItaniumRemapper<HashTableImpl>;
+
 public:
   InstrProfReaderIndex(const unsigned char *Buckets,
                        const unsigned char *const Payload,
@@ -386,13 +391,26 @@ public:
   }
 };
 
+/// Name matcher supporting fuzzy matching of symbol names to names in profiles.
+class InstrProfReaderRemapper {
+public:
+  virtual ~InstrProfReaderRemapper() {}
+  virtual Error populateRemappings() { return Error::success(); }
+  virtual Error getRecords(StringRef FuncName,
+                           ArrayRef<NamedInstrProfRecord> &Data) = 0;
+};
+
 /// Reader for the indexed binary instrprof format.
 class IndexedInstrProfReader : public InstrProfReader {
 private:
   /// The profile data file contents.
   std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// The profile remapping file contents.
+  std::unique_ptr<MemoryBuffer> RemappingBuffer;
   /// The index into the profile data.
   std::unique_ptr<InstrProfReaderIndexBase> Index;
+  /// The profile remapping file contents.
+  std::unique_ptr<InstrProfReaderRemapper> Remapper;
   /// Profile summary data.
   std::unique_ptr<ProfileSummary> Summary;
   // Index to the current record in the record array.
@@ -404,8 +422,11 @@ private:
                                    const unsigned char *Cur);
 
 public:
-  IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)), RecordIndex(0) {}
+  IndexedInstrProfReader(
+      std::unique_ptr<MemoryBuffer> DataBuffer,
+      std::unique_ptr<MemoryBuffer> RemappingBuffer = nullptr)
+      : DataBuffer(std::move(DataBuffer)),
+        RemappingBuffer(std::move(RemappingBuffer)), RecordIndex(0) {}
   IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
   IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
 
@@ -434,10 +455,11 @@ public:
 
   /// Factory method to create an indexed reader.
   static Expected<std::unique_ptr<IndexedInstrProfReader>>
-  create(const Twine &Path);
+  create(const Twine &Path, const Twine &RemappingPath = "");
 
   static Expected<std::unique_ptr<IndexedInstrProfReader>>
-  create(std::unique_ptr<MemoryBuffer> Buffer);
+  create(std::unique_ptr<MemoryBuffer> Buffer,
+         std::unique_ptr<MemoryBuffer> RemappingBuffer = nullptr);
 
   // Used for testing purpose only.
   void setValueProfDataEndianness(support::endianness Endianness) {
diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h
index e632a1c955bb6e24c49de49200f574069b70bd5d..927dfd246878ebdf2346de3f582125c6e940e9cf 100644
--- a/include/llvm/ProfileData/SampleProf.h
+++ b/include/llvm/ProfileData/SampleProf.h
@@ -488,8 +488,6 @@ public:
   // If the format is SPF_Compact_Binary, the name is already a GUID and we
   // don't want to return the GUID of GUID.
   static uint64_t getGUID(StringRef Name) {
-    if (Format == SPF_Compact_Binary)
-      errs() << Name << '\n';
     return (Format == SPF_Compact_Binary) ? std::stoull(Name.data())
                                           : Function::getGUID(Name);
   }
diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
index c100e8004641622095e607a7edd9e765d40c2e2e..3c477cc347147ec6a62090604c2a9e37098e769f 100644
--- a/include/llvm/ProfileData/SampleProfReader.h
+++ b/include/llvm/ProfileData/SampleProfReader.h
@@ -222,6 +222,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SymbolRemappingReader.h"
 #include <algorithm>
 #include <cstdint>
 #include <memory>
@@ -289,11 +290,16 @@ public:
     // The function name may have been updated by adding suffix. In sample
     // profile, the function names are all stripped, so we need to strip
     // the function name suffix before matching with profile.
-    StringRef Fname = F.getName().split('.').first;
+    return getSamplesFor(F.getName().split('.').first);
+  }
+
+  /// Return the samples collected for function \p F.
+  virtual FunctionSamples *getSamplesFor(StringRef Fname) {
     std::string FGUID;
     Fname = getRepInFormat(Fname, getFormat(), FGUID);
-    if (Profiles.count(Fname))
-      return &Profiles[Fname];
+    auto It = Profiles.find(Fname);
+    if (It != Profiles.end())
+      return &It->second;
     return nullptr;
   }
 
@@ -337,6 +343,12 @@ protected:
   /// Profile summary information.
   std::unique_ptr<ProfileSummary> Summary;
 
+  /// Take ownership of the summary of this reader.
+  static std::unique_ptr<ProfileSummary>
+  takeSummary(SampleProfileReader &Reader) {
+    return std::move(Reader.Summary);
+  }
+
   /// Compute summary for this profile.
   void computeSummary();
 
@@ -525,6 +537,40 @@ protected:
   static const uint32_t GCOVTagAFDOFunction = 0xac000000;
 };
 
+/// A profile data reader proxy that remaps the profile data from another
+/// sample profile data reader, by applying a provided set of equivalences
+/// between components of the symbol names in the profile.
+class SampleProfileReaderItaniumRemapper : public SampleProfileReader {
+public:
+  SampleProfileReaderItaniumRemapper(
+      std::unique_ptr<MemoryBuffer> B, LLVMContext &C,
+      std::unique_ptr<SampleProfileReader> Underlying)
+      : SampleProfileReader(std::move(B), C, Underlying->getFormat()) {
+    Profiles = std::move(Underlying->getProfiles());
+    Summary = takeSummary(*Underlying);
+  }
+
+  /// Create a remapped sample profile from the given remapping file and
+  /// underlying samples.
+  static ErrorOr<std::unique_ptr<SampleProfileReader>>
+  create(const Twine &Filename, LLVMContext &C,
+         std::unique_ptr<SampleProfileReader> Underlying);
+
+  /// Read and validate the file header.
+  std::error_code readHeader() override { return sampleprof_error::success; }
+
+  /// Read remapping file and apply it to the sample profile.
+  std::error_code read() override;
+
+  /// Return the samples collected for function \p F.
+  FunctionSamples *getSamplesFor(StringRef FunctionName) override;
+  using SampleProfileReader::getSamplesFor;
+
+private:
+  SymbolRemappingReader Remappings;
+  DenseMap<SymbolRemappingReader::Key, FunctionSamples*> SampleMap;
+};
+
 } // end namespace sampleprof
 
 } // end namespace llvm
diff --git a/include/llvm/Support/ARMWinEH.h b/include/llvm/Support/ARMWinEH.h
index 1463629f45dc46f44bf3b16fd9e57a09ec218bfc..60174503ad4924e6b76a03846c38f3080573585c 100644
--- a/include/llvm/Support/ARMWinEH.h
+++ b/include/llvm/Support/ARMWinEH.h
@@ -207,6 +207,8 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 
 /// ExceptionDataRecord - An entry in the table of exception data (.xdata)
 ///
+/// The format on ARM is:
+///
 ///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 ///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 /// +-------+---------+-+-+-+---+-----------------------------------+
@@ -215,6 +217,16 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 /// |    Reserved    |Ex. Code Words|   (Extended Epilogue Count)   |
 /// +-------+--------+--------------+-------------------------------+
 ///
+/// The format on ARM64 is:
+///
+///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +---------+---------+-+-+---+-----------------------------------+
+/// |  C Wrd  | Epi Cnt |E|X|Ver|         Function Length           |
+/// +---------+------+--'-'-'---'---+-------------------------------+
+/// |    Reserved    |Ex. Code Words|   (Extended Epilogue Count)   |
+/// +-------+--------+--------------+-------------------------------+
+///
 /// Function Length : 18-bit field indicating the total length of the function
 ///                   in bytes divided by 2.  If a function is larger than
 ///                   512KB, then multiple pdata and xdata records must be used.
@@ -225,7 +237,7 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 ///     header
 /// F : 1-bit field indicating that the record describes a function fragment
 ///     (implies that no prologue is present, and prologue processing should be
-///     skipped)
+///     skipped) (ARM only)
 /// Epilogue Count : 5-bit field that differs in meaning based on the E field.
 ///
 ///                  If E is set, then this field specifies the index of the
@@ -235,33 +247,43 @@ std::pair<uint16_t, uint32_t> SavedRegisterMask(const RuntimeFunction &RF);
 ///                  scopes.  If more than 31 scopes exist, then this field and
 ///                  the Code Words field must both be set to 0 to indicate that
 ///                  an extension word is required.
-/// Code Words : 4-bit field that species the number of 32-bit words needed to
-///              contain all the unwind codes.  If more than 15 words (63 code
-///              bytes) are required, then this field and the Epilogue Count
-///              field must both be set to 0 to indicate that an extension word
-///              is required.
+/// Code Words : 4-bit (5-bit on ARM64) field that specifies the number of
+///              32-bit words needed to contain all the unwind codes.  If more
+///              than 15 words (31 words on ARM64) are required, then this field
+///              and the Epilogue Count field must both be set to 0 to indicate
+///              that an extension word is required.
 /// Extended Epilogue Count, Extended Code Words :
 ///                          Valid only if Epilog Count and Code Words are both
 ///                          set to 0.  Provides an 8-bit extended code word
 ///                          count and 16-bits for epilogue count
 ///
+/// The epilogue scope format on ARM is:
+///
 ///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
 ///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 /// +----------------+------+---+---+-------------------------------+
 /// |  Ep Start Idx  | Cond |Res|       Epilogue Start Offset       |
 /// +----------------+------+---+-----------------------------------+
 ///
+/// The epilogue scope format on ARM64 is:
+///
+///  3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+///  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+/// +-------------------+-------+---+-------------------------------+
+/// |  Ep Start Idx     |  Res  |   Epilogue Start Offset           |
+/// +-------------------+-------+-----------------------------------+
+///
 /// If the E bit is unset in the header, the header is followed by a series of
 /// epilogue scopes, which are sorted by their offset.
 ///
 /// Epilogue Start Offset: 18-bit field encoding the offset of epilogue relative
 ///                        to the start of the function in bytes divided by two
 /// Res : 2-bit field reserved for future expansion (must be set to 0)
-/// Condition : 4-bit field providing the condition under which the epilogue is
-///             executed.  Unconditional epilogues should set this field to 0xe.
-///             Epilogues must be entirely conditional or unconditional, and in
-///             Thumb-2 mode.  The epilogue beings with the first instruction
-///             after the IT opcode.
+/// Condition : (ARM only) 4-bit field providing the condition under which the
+///             epilogue is executed.  Unconditional epilogues should set this
+///             field to 0xe. Epilogues must be entirely conditional or
+///             unconditional, and in Thumb-2 mode.  The epilogue begins with
+///             the first instruction after the IT opcode.
 /// Epilogue Start Index : 8-bit field indicating the byte index of the first
 ///                        unwind code describing the epilogue
 ///
@@ -293,18 +315,33 @@ struct EpilogueScope {
   const support::ulittle32_t ES;
 
   EpilogueScope(const support::ulittle32_t Data) : ES(Data) {}
+  // Same for both ARM and AArch64.
   uint32_t EpilogueStartOffset() const {
     return (ES & 0x0003ffff);
   }
-  uint8_t Res() const {
+
+  // Different implementations for ARM and AArch64.
+  uint8_t ResARM() const {
     return ((ES & 0x000c0000) >> 18);
   }
+
+  uint8_t ResAArch64() const {
+    return ((ES & 0x000f0000) >> 18);
+  }
+
+  // Condition is only applicable to ARM.
   uint8_t Condition() const {
     return ((ES & 0x00f00000) >> 20);
   }
-  uint8_t EpilogueStartIndex() const {
+
+  // Different implementations for ARM and AArch64.
+  uint8_t EpilogueStartIndexARM() const {
     return ((ES & 0xff000000) >> 24);
   }
+
+  uint16_t EpilogueStartIndexAArch64() const {
+    return ((ES & 0xffc00000) >> 22);
+  }
 };
 
 struct ExceptionDataRecord;
@@ -312,13 +349,23 @@ inline size_t HeaderWords(const ExceptionDataRecord &XR);
 
 struct ExceptionDataRecord {
   const support::ulittle32_t *Data;
+  bool isAArch64;
 
-  ExceptionDataRecord(const support::ulittle32_t *Data) : Data(Data) {}
+  ExceptionDataRecord(const support::ulittle32_t *Data, bool isAArch64) :
+    Data(Data), isAArch64(isAArch64) {}
 
   uint32_t FunctionLength() const {
     return (Data[0] & 0x0003ffff);
   }
 
+  uint32_t FunctionLengthInBytesARM() const {
+    return FunctionLength() << 1;
+  }
+
+  uint32_t FunctionLengthInBytesAArch64() const {
+    return FunctionLength() << 2;
+  }
+
   uint8_t Vers() const {
     return (Data[0] & 0x000C0000) >> 18;
   }
@@ -332,18 +379,25 @@ struct ExceptionDataRecord {
   }
 
   bool F() const {
+    assert(!isAArch64 && "Fragments are only supported on ARMv7 WinEH");
     return ((Data[0] & 0x00400000) >> 22);
   }
 
   uint8_t EpilogueCount() const {
-    if (HeaderWords(*this) == 1)
+    if (HeaderWords(*this) == 1) {
+      if (isAArch64)
+        return (Data[0] & 0x07C00000) >> 22;
       return (Data[0] & 0x0f800000) >> 23;
+    }
     return Data[1] & 0x0000ffff;
   }
 
   uint8_t CodeWords() const {
-    if (HeaderWords(*this) == 1)
+    if (HeaderWords(*this) == 1) {
+      if (isAArch64)
+        return (Data[0] & 0xf8000000) >> 27;
       return (Data[0] & 0xf0000000) >> 28;
+    }
     return (Data[1] & 0x00ff0000) >> 16;
   }
 
@@ -373,6 +427,8 @@ struct ExceptionDataRecord {
 };
 
 inline size_t HeaderWords(const ExceptionDataRecord &XR) {
+  if (XR.isAArch64)
+    return (XR.Data[0] & 0xffc00000) ? 1 : 2;
   return (XR.Data[0] & 0xff800000) ? 1 : 2;
 }
 }
diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h
index d1571cb37fc693c8cc80f47def7ba296878172ab..7b8a95b45735b3226724f64b83c2eeda5063bd16 100644
--- a/include/llvm/Support/BinaryStreamArray.h
+++ b/include/llvm/Support/BinaryStreamArray.h
@@ -125,6 +125,8 @@ public:
   BinaryStreamRef getUnderlyingStream() const { return Stream; }
   void setUnderlyingStream(BinaryStreamRef S) { Stream = S; }
 
+  void drop_front() { Stream = Stream.drop_front(begin()->length()); }
+
 private:
   BinaryStreamRef Stream;
   Extractor E;
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index 799b41fbf8b0fcb2c1e9542f99ec8b4a64ee864c..cd3543c130e085970e4ccf780428f5b5048685c7 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -56,9 +56,18 @@ namespace cl {
 // Returns true on success. Otherwise, this will print the error message to
 // stderr and exit if \p Errs is not set (nullptr by default), or print the
 // error message to \p Errs and return false if \p Errs is provided.
+//
+// If EnvVar is not nullptr, command-line options are also parsed from the
+// environment variable named by EnvVar.  Precedence is given to occurrences
+// from argv.  This precedence is currently implemented by parsing argv after
+// the environment variable, so it is only implemented correctly for options
+// that give precedence to later occurrences.  If your program supports options
+// that give precedence to earlier occurrences, you will need to extend this
+// function to support it correctly.
 bool ParseCommandLineOptions(int argc, const char *const *argv,
                              StringRef Overview = "",
-                             raw_ostream *Errs = nullptr);
+                             raw_ostream *Errs = nullptr,
+                             const char *EnvVar = nullptr);
 
 //===----------------------------------------------------------------------===//
 // ParseEnvironmentOptions - Environment variable option processing alternate
diff --git a/include/llvm/Support/DebugCounter.h b/include/llvm/Support/DebugCounter.h
index 83bd5a06c94a175cdedd79b88d828d7d74b34cf4..6eadd5c6aefff6bbb77939dc2fe69585ad1446e3 100644
--- a/include/llvm/Support/DebugCounter.h
+++ b/include/llvm/Support/DebugCounter.h
@@ -55,6 +55,8 @@ namespace llvm {
 
 class DebugCounter {
 public:
+  ~DebugCounter();
+
   /// Returns a reference to the singleton instance.
   static DebugCounter &instance();
 
diff --git a/include/llvm/Support/ErrorHandling.h b/include/llvm/Support/ErrorHandling.h
index 39cbfed2436af16784ec51a4d509b97a8819d163..fec39e59a717071b00b2feb6aebcd7d0bbf5b4dc 100644
--- a/include/llvm/Support/ErrorHandling.h
+++ b/include/llvm/Support/ErrorHandling.h
@@ -112,8 +112,8 @@ void install_out_of_memory_new_handler();
 /// in the unwind chain.
 ///
 /// If no error handler is installed (default), then a bad_alloc exception
-/// is thrown, if LLVM is compiled with exception support, otherwise an assertion
-/// is called.
+/// is thrown, if LLVM is compiled with exception support, otherwise an
+/// assertion is called.
 void report_bad_alloc_error(const char *Reason, bool GenCrashDiag = true);
 
 /// This function calls abort(), and prints the optional message to stderr.
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 344484b285c2d96e298ad4f6e1a0897b0c25364b..971e8305a112c6c85e82af8ec92d48ea22b6fd80 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -1191,6 +1191,20 @@ struct SemiNCAInfo {
     });
     LLVM_DEBUG(dbgs() << "\n");
 
+    // Recalculate the DominatorTree when the number of updates
+    // exceeds a threshold, which usually makes direct updating slower than
+    // recalculation. We select this threshold proportional to the
+    // size of the DominatorTree. The constant is selected
+    // by choosing the one with an acceptable performance on some real-world
+    // inputs.
+
+    // Make unittests of the incremental algorithm work
+    if (DT.DomTreeNodes.size() <= 100) {
+      if (NumLegalized > DT.DomTreeNodes.size())
+        CalculateFromScratch(DT, &BUI);
+    } else if (NumLegalized > DT.DomTreeNodes.size() / 40)
+      CalculateFromScratch(DT, &BUI);
+
     // If the DominatorTree was recalculated at some point, stop the batch
     // updates. Full recalculations ignore batch updates and look at the actual
     // CFG.
diff --git a/include/llvm/Support/JSON.h b/include/llvm/Support/JSON.h
index 2fc0e7ddb90141e0226a4f940d0e684a720c80f9..7a04fd52bc50e4f052f791334823e4113fe0364f 100644
--- a/include/llvm/Support/JSON.h
+++ b/include/llvm/Support/JSON.h
@@ -294,9 +294,13 @@ public:
   Value(json::Array &&Elements) : Type(T_Array) {
     create<json::Array>(std::move(Elements));
   }
+  template <typename Elt>
+  Value(const std::vector<Elt> &C) : Value(json::Array(C)) {}
   Value(json::Object &&Properties) : Type(T_Object) {
     create<json::Object>(std::move(Properties));
   }
+  template <typename Elt>
+  Value(const std::map<std::string, Elt> &C) : Value(json::Object(C)) {}
   // Strings: types with value semantics. Must be valid UTF-8.
   Value(std::string V) : Type(T_String) {
     if (LLVM_UNLIKELY(!isUTF8(V))) {
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 01397e8ebb702ceb71d79b45c2a0d0d686e2c154..63241b52e1fa6b19eba020e51b728b825fe116d2 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -317,9 +317,10 @@ enum GPUKind : uint32_t {
   GK_GFX902 = 61,
   GK_GFX904 = 62,
   GK_GFX906 = 63,
+  GK_GFX909 = 65,
 
   GK_AMDGCN_FIRST = GK_GFX600,
-  GK_AMDGCN_LAST = GK_GFX906,
+  GK_AMDGCN_LAST = GK_GFX909,
 };
 
 /// Instruction set architecture version.
diff --git a/include/llvm/Support/VirtualFileSystem.h b/include/llvm/Support/VirtualFileSystem.h
index f2f8ffafc506479f8cccea04c03ed0be3deb2c07..b3326bbbe486b33322c04d68d48362cc780368eb 100644
--- a/include/llvm/Support/VirtualFileSystem.h
+++ b/include/llvm/Support/VirtualFileSystem.h
@@ -193,14 +193,22 @@ public:
 
 class FileSystem;
 
+namespace detail {
+
+/// Keeps state for the recursive_directory_iterator.
+struct RecDirIterState {
+  std::stack<directory_iterator, std::vector<directory_iterator>> Stack;
+  bool HasNoPushRequest = false;
+};
+
+} // end namespace detail
+
 /// An input iterator over the recursive contents of a virtual path,
 /// similar to llvm::sys::fs::recursive_directory_iterator.
 class recursive_directory_iterator {
-  using IterState =
-      std::stack<directory_iterator, std::vector<directory_iterator>>;
-
   FileSystem *FS;
-  std::shared_ptr<IterState> State; // Input iterator semantics on copy.
+  std::shared_ptr<detail::RecDirIterState>
+      State; // Input iterator semantics on copy.
 
 public:
   recursive_directory_iterator(FileSystem &FS, const Twine &Path,
@@ -212,8 +220,8 @@ public:
   /// Equivalent to operator++, with an error code.
   recursive_directory_iterator &increment(std::error_code &EC);
 
-  const directory_entry &operator*() const { return *State->top(); }
-  const directory_entry *operator->() const { return &*State->top(); }
+  const directory_entry &operator*() const { return *State->Stack.top(); }
+  const directory_entry *operator->() const { return &*State->Stack.top(); }
 
   bool operator==(const recursive_directory_iterator &Other) const {
     return State == Other.State; // identity
@@ -224,9 +232,12 @@ public:
 
   /// Gets the current level. Starting path is at level 0.
   int level() const {
-    assert(!State->empty() && "Cannot get level without any iteration state");
-    return State->size() - 1;
+    assert(!State->Stack.empty() &&
+           "Cannot get level without any iteration state");
+    return State->Stack.size() - 1;
   }
+
+  void no_push() { State->HasNoPushRequest = true; }
 };
 
 /// The virtual file system interface.
@@ -268,6 +279,9 @@ public:
   /// Check whether a file exists. Provided for convenience.
   bool exists(const Twine &Path);
 
+  /// Is the file mounted on a local filesystem?
+  virtual std::error_code isLocal(const Twine &Path, bool &Result);
+
   /// Make \a Path an absolute path.
   ///
   /// Makes \a Path absolute using the current directory if it is not already.
@@ -315,6 +329,7 @@ public:
   directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override;
   llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override;
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
+  std::error_code isLocal(const Twine &Path, bool &Result) override;
   std::error_code getRealPath(const Twine &Path,
                               SmallVectorImpl<char> &Output) const override;
 
@@ -452,7 +467,7 @@ public:
   /// system.
   std::error_code getRealPath(const Twine &Path,
                               SmallVectorImpl<char> &Output) const override;
-
+  std::error_code isLocal(const Twine &Path, bool &Result) override;
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
 };
 
@@ -490,7 +505,6 @@ class YAMLVFSWriter {
   Optional<bool> IsCaseSensitive;
   Optional<bool> IsOverlayRelative;
   Optional<bool> UseExternalNames;
-  Optional<bool> IgnoreNonExistentContents;
   std::string OverlayDir;
 
 public:
@@ -504,10 +518,6 @@ public:
 
   void setUseExternalNames(bool UseExtNames) { UseExternalNames = UseExtNames; }
 
-  void setIgnoreNonExistentContents(bool IgnoreContents) {
-    IgnoreNonExistentContents = IgnoreContents;
-  }
-
   void setOverlayDir(StringRef OverlayDirectory) {
     IsOverlayRelative = true;
     OverlayDir.assign(OverlayDirectory.str());
diff --git a/include/llvm/Support/Win64EH.h b/include/llvm/Support/Win64EH.h
index 928eb906de0c40ac50215ab39ed8df440b2fd789..e27bf1b3a1a5082744e21073d38becf356d013d1 100644
--- a/include/llvm/Support/Win64EH.h
+++ b/include/llvm/Support/Win64EH.h
@@ -33,7 +33,24 @@ enum UnwindOpcodes {
   UOP_SaveNonVolBig,
   UOP_SaveXMM128 = 8,
   UOP_SaveXMM128Big,
-  UOP_PushMachFrame
+  UOP_PushMachFrame,
+  // The following set of unwind opcodes is for ARM64.  They are documented at
+  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+  UOP_AllocMedium,
+  UOP_SaveFPLRX,
+  UOP_SaveFPLR,
+  UOP_SaveReg,
+  UOP_SaveRegX,
+  UOP_SaveRegP,
+  UOP_SaveRegPX,
+  UOP_SaveFReg,
+  UOP_SaveFRegX,
+  UOP_SaveFRegP,
+  UOP_SaveFRegPX,
+  UOP_SetFP,
+  UOP_AddFP,
+  UOP_Nop,
+  UOP_End
 };
 
 /// UnwindCode - This union describes a single operation in a function prolog,
diff --git a/include/llvm/Support/WithColor.h b/include/llvm/Support/WithColor.h
index 85fc5fa0cf14d06322fe47fba37eb8cf64a4ca4d..76842d1c3dc8ef6b07f041465527e2bbac2015d8 100644
--- a/include/llvm/Support/WithColor.h
+++ b/include/llvm/Support/WithColor.h
@@ -29,23 +29,49 @@ enum class HighlightColor {
   Macro,
   Error,
   Warning,
-  Note
+  Note,
+  Remark
 };
 
 /// An RAII object that temporarily switches an output stream to a specific
 /// color.
 class WithColor {
   raw_ostream &OS;
-  /// Determine whether colors should be displayed.
-  bool colorsEnabled(raw_ostream &OS);
+  bool DisableColors;
 
 public:
   /// To be used like this: WithColor(OS, HighlightColor::String) << "text";
-  WithColor(raw_ostream &OS, HighlightColor S);
+  /// @param OS The output stream
+  /// @param S Symbolic name for syntax element to color
+  /// @param DisableColors Whether to ignore color changes regardless of -color
+  /// and support in OS
+  WithColor(raw_ostream &OS, HighlightColor S, bool DisableColors = false);
+  /// To be used like this: WithColor(OS, raw_ostream::Black) << "text";
+  /// @param OS The output stream
+  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+  /// change only the bold attribute, and keep colors untouched
+  /// @param Bold Bold/brighter text, default false
+  /// @param BG If true, change the background, default: change foreground
+  /// @param DisableColors Whether to ignore color changes regardless of -color
+  /// and support in OS
+  WithColor(raw_ostream &OS,
+            raw_ostream::Colors Color = raw_ostream::SAVEDCOLOR,
+            bool Bold = false, bool BG = false, bool DisableColors = false)
+      : OS(OS), DisableColors(DisableColors) {
+    changeColor(Color, Bold, BG);
+  }
   ~WithColor();
 
   raw_ostream &get() { return OS; }
   operator raw_ostream &() { return OS; }
+  template <typename T> WithColor &operator<<(T &O) {
+    OS << O;
+    return *this;
+  }
+  template <typename T> WithColor &operator<<(const T &O) {
+    OS << O;
+    return *this;
+  }
 
   /// Convenience method for printing "error: " to stderr.
   static raw_ostream &error();
@@ -53,13 +79,36 @@ public:
   static raw_ostream &warning();
   /// Convenience method for printing "note: " to stderr.
   static raw_ostream &note();
+  /// Convenience method for printing "remark: " to stderr.
+  static raw_ostream &remark();
 
   /// Convenience method for printing "error: " to the given stream.
-  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &error(raw_ostream &OS, StringRef Prefix = "",
+                            bool DisableColors = false);
   /// Convenience method for printing "warning: " to the given stream.
-  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &warning(raw_ostream &OS, StringRef Prefix = "",
+                              bool DisableColors = false);
   /// Convenience method for printing "note: " to the given stream.
-  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "");
+  static raw_ostream &note(raw_ostream &OS, StringRef Prefix = "",
+                           bool DisableColors = false);
+  /// Convenience method for printing "remark: " to the given stream.
+  static raw_ostream &remark(raw_ostream &OS, StringRef Prefix = "",
+                             bool DisableColors = false);
+
+  /// Determine whether colors are displayed.
+  bool colorsEnabled();
+
+  /// Change the color of text that will be output from this point forward.
+  /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
+  /// change only the bold attribute, and keep colors untouched
+  /// @param Bold Bold/brighter text, default false
+  /// @param BG If true, change the background, default: change foreground
+  WithColor &changeColor(raw_ostream::Colors Color, bool Bold = false,
+                         bool BG = false);
+
+  /// Reset the colors to terminal defaults. Call this when you are done
+  /// outputting colored text, or before program exit.
+  WithColor &resetColor();
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Support/X86TargetParser.def b/include/llvm/Support/X86TargetParser.def
index e4af0657a35015c445518ccbbb75ac3472762352..eb45ed6a76afb1365ac909ff10181d8345469fcd 100644
--- a/include/llvm/Support/X86TargetParser.def
+++ b/include/llvm/Support/X86TargetParser.def
@@ -34,17 +34,20 @@ X86_VENDOR(VENDOR_AMD,   "amd")
 #ifndef X86_CPU_TYPE
 #define X86_CPU_TYPE(ARCHNAME, ENUM)
 #endif
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("bonnell",    INTEL_BONNELL,    "bonnell", "atom")
-X86_CPU_TYPE_COMPAT           ("core2",      INTEL_CORE2,      "core2")
-X86_CPU_TYPE_COMPAT           ("nehalem",    INTEL_COREI7,     "corei7")
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("amdfam10",   AMDFAM10H,        "amdfam10h", "amdfam10")
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("bdver1",     AMDFAM15H,        "amdfam15h", "amdfam15")
-X86_CPU_TYPE_COMPAT_WITH_ALIAS("silvermont", INTEL_SILVERMONT, "silvermont", "slm")
-X86_CPU_TYPE_COMPAT           ("knl",        INTEL_KNL,        "knl")
-X86_CPU_TYPE_COMPAT           ("btver1",     AMD_BTVER1,       "btver1")
-X86_CPU_TYPE_COMPAT           ("btver2",     AMD_BTVER2,       "btver2")
-X86_CPU_TYPE_COMPAT           ("znver1",     AMDFAM17H,        "amdfam17h")
-X86_CPU_TYPE_COMPAT           ("knm",        INTEL_KNM,        "knm")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("bonnell",       INTEL_BONNELL,       "bonnell", "atom")
+X86_CPU_TYPE_COMPAT           ("core2",         INTEL_CORE2,         "core2")
+X86_CPU_TYPE_COMPAT           ("nehalem",       INTEL_COREI7,        "corei7")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("amdfam10",      AMDFAM10H,           "amdfam10h", "amdfam10")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("bdver1",        AMDFAM15H,           "amdfam15h", "amdfam15")
+X86_CPU_TYPE_COMPAT_WITH_ALIAS("silvermont",    INTEL_SILVERMONT,    "silvermont", "slm")
+X86_CPU_TYPE_COMPAT           ("knl",           INTEL_KNL,           "knl")
+X86_CPU_TYPE_COMPAT           ("btver1",        AMD_BTVER1,          "btver1")
+X86_CPU_TYPE_COMPAT           ("btver2",        AMD_BTVER2,          "btver2")
+X86_CPU_TYPE_COMPAT           ("znver1",        AMDFAM17H,           "amdfam17h")
+X86_CPU_TYPE_COMPAT           ("knm",           INTEL_KNM,           "knm")
+X86_CPU_TYPE_COMPAT           ("goldmont",      INTEL_GOLDMONT,      "goldmont")
+X86_CPU_TYPE_COMPAT           ("goldmont-plus", INTEL_GOLDMONT_PLUS, "goldmont-plus")
+X86_CPU_TYPE_COMPAT           ("tremont",       INTEL_TREMONT,       "tremont")
 // Entries below this are not in libgcc/compiler-rt.
 X86_CPU_TYPE                  ("i386",        INTEL_i386)
 X86_CPU_TYPE                  ("i486",        INTEL_i486)
@@ -64,9 +67,6 @@ X86_CPU_TYPE                  ("athlon",      AMD_ATHLON)
 X86_CPU_TYPE                  ("athlon-xp",   AMD_ATHLON_XP)
 X86_CPU_TYPE                  ("k8",          AMD_K8)
 X86_CPU_TYPE                  ("k8-sse3",     AMD_K8SSE3)
-X86_CPU_TYPE                  ("goldmont",    INTEL_GOLDMONT)
-X86_CPU_TYPE                  ("goldmont-plus", INTEL_GOLDMONT_PLUS)
-X86_CPU_TYPE                  ("tremont",     INTEL_TREMONT)
 #undef X86_CPU_TYPE_COMPAT_WITH_ALIAS
 #undef X86_CPU_TYPE_COMPAT
 #undef X86_CPU_TYPE
@@ -97,6 +97,8 @@ X86_CPU_SUBTYPE_COMPAT("broadwell",      INTEL_COREI7_BROADWELL,      "broadwell
 X86_CPU_SUBTYPE_COMPAT("skylake",        INTEL_COREI7_SKYLAKE,        "skylake")
 X86_CPU_SUBTYPE_COMPAT("skylake-avx512", INTEL_COREI7_SKYLAKE_AVX512, "skylake-avx512")
 X86_CPU_SUBTYPE_COMPAT("cannonlake",     INTEL_COREI7_CANNONLAKE,     "cannonlake")
+X86_CPU_SUBTYPE_COMPAT("icelake-client", INTEL_COREI7_ICELAKE_CLIENT, "icelake-client")
+X86_CPU_SUBTYPE_COMPAT("icelake-server", INTEL_COREI7_ICELAKE_SERVER, "icelake-server")
 // Entries below this are not in libgcc/compiler-rt.
 X86_CPU_SUBTYPE       ("core2",          INTEL_CORE2_65)
 X86_CPU_SUBTYPE       ("penryn",         INTEL_CORE2_45)
@@ -147,11 +149,16 @@ X86_FEATURE_COMPAT(27, FEATURE_AVX512IFMA,      "avx512ifma")
 X86_FEATURE_COMPAT(28, FEATURE_AVX5124VNNIW,    "avx5124vnniw")
 X86_FEATURE_COMPAT(29, FEATURE_AVX5124FMAPS,    "avx5124fmaps")
 X86_FEATURE_COMPAT(30, FEATURE_AVX512VPOPCNTDQ, "avx512vpopcntdq")
+X86_FEATURE_COMPAT(31, FEATURE_AVX512VBMI2,     "avx512vbmi2")
+X86_FEATURE_COMPAT(32, FEATURE_GFNI,            "gfni")
+X86_FEATURE_COMPAT(33, FEATURE_VPCLMULQDQ,      "vpclmulqdq")
+X86_FEATURE_COMPAT(34, FEATURE_AVX512VNNI,      "avx512vnni")
+X86_FEATURE_COMPAT(35, FEATURE_AVX512BITALG,    "avx512bitalg")
 // Features below here are not in libgcc/compiler-rt.
-X86_FEATURE       (32, FEATURE_MOVBE)
-X86_FEATURE       (33, FEATURE_ADX)
-X86_FEATURE       (34, FEATURE_EM64T)
-X86_FEATURE       (35, FEATURE_CLFLUSHOPT)
-X86_FEATURE       (36, FEATURE_SHA)
+X86_FEATURE       (64, FEATURE_MOVBE)
+X86_FEATURE       (65, FEATURE_ADX)
+X86_FEATURE       (66, FEATURE_EM64T)
+X86_FEATURE       (67, FEATURE_CLFLUSHOPT)
+X86_FEATURE       (68, FEATURE_SHA)
 #undef X86_FEATURE_COMPAT
 #undef X86_FEATURE
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 6836aa2aa067b158125834eb5728fb5b80331304..6219755e83a2cf959c99338b7c1b9638f143904c 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -250,7 +250,6 @@ struct has_ScalarEnumerationTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
     (sizeof(test<ScalarEnumerationTraits<T>>(nullptr)) == 1);
 };
@@ -267,7 +266,6 @@ struct has_ScalarBitSetTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value = (sizeof(test<ScalarBitSetTraits<T>>(nullptr)) == 1);
 };
 
@@ -287,7 +285,6 @@ struct has_ScalarTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<ScalarTraits<T>>(nullptr, nullptr, nullptr)) == 1);
 };
@@ -306,7 +303,6 @@ struct has_BlockScalarTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<BlockScalarTraits<T>>(nullptr, nullptr)) == 1);
 };
@@ -321,7 +317,6 @@ template <class T, class Context> struct has_MappingTraits {
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<MappingContextTraits<T, Context>>(nullptr)) == 1);
 };
@@ -335,7 +330,6 @@ template <class T> struct has_MappingTraits<T, EmptyContext> {
 
   template <typename U> static double test(...);
 
-public:
   static bool const value = (sizeof(test<MappingTraits<T>>(nullptr)) == 1);
 };
 
@@ -349,7 +343,6 @@ template <class T, class Context> struct has_MappingValidateTraits {
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<MappingContextTraits<T, Context>>(nullptr)) == 1);
 };
@@ -363,7 +356,6 @@ template <class T> struct has_MappingValidateTraits<T, EmptyContext> {
 
   template <typename U> static double test(...);
 
-public:
   static bool const value = (sizeof(test<MappingTraits<T>>(nullptr)) == 1);
 };
 
@@ -379,7 +371,6 @@ struct has_SequenceMethodTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =  (sizeof(test<SequenceTraits<T>>(nullptr)) == 1);
 };
 
@@ -395,7 +386,6 @@ struct has_CustomMappingTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value =
       (sizeof(test<CustomMappingTraits<T>>(nullptr)) == 1);
 };
@@ -425,7 +415,6 @@ struct has_FlowTraits<T, true>
   template<typename C>
   static char (&f(...))[2];
 
-public:
   static bool const value = sizeof(f<Derived>(nullptr)) == 2;
 };
 
@@ -446,7 +435,6 @@ struct has_DocumentListTraits
   template <typename U>
   static double test(...);
 
-public:
   static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1);
 };
 
@@ -590,7 +578,6 @@ inline QuotingType needsQuotes(StringRef S) {
     // Safe scalar characters.
     case '_':
     case '-':
-    case '/':
     case '^':
     case '.':
     case ',':
@@ -607,6 +594,12 @@ inline QuotingType needsQuotes(StringRef S) {
     // DEL (0x7F) are excluded from the allowed character range.
     case 0x7F:
       return QuotingType::Double;
+    // Forward slash is allowed to be unquoted, but we quote it anyway.  We have
+    // many tests that use FileCheck against YAML output, and this output often
+    // contains paths.  If we quote backslashes but not forward slashes then
+    // paths will come out either quoted or unquoted depending on which platform
+    // the test is run on, making FileCheck comparisons difficult.
+    case '/':
     default: {
       // C0 control block (0x0 - 0x1F) is excluded from the allowed character
       // range.
diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td
index 399cea9510755d313c5e04dacd3b0f28b852865e..af4fa8a1f04a76ee2329f78e9ab74252ee4d2b90 100644
--- a/include/llvm/Target/GenericOpcodes.td
+++ b/include/llvm/Target/GenericOpcodes.td
@@ -649,6 +649,9 @@ def G_EXTRACT : GenericInstruction {
 // Extract multiple registers specified size, starting from blocks given by
 // indexes. This will almost certainly be mapped to sub-register COPYs after
 // register banks have been selected.
+// The output operands are always ordered from lowest bits to highest:
+//   %bits_0_7:(s8), %bits_8_15:(s8),
+//       %bits_16_23:(s8), %bits_24_31:(s8) = G_UNMERGE_VALUES %0:(s32)
 def G_UNMERGE_VALUES : GenericInstruction {
   let OutOperandList = (outs type0:$dst0, variable_ops);
   let InOperandList = (ins type1:$src);
@@ -662,7 +665,10 @@ def G_INSERT : GenericInstruction {
   let hasSideEffects = 0;
 }
 
-/// Concatenate multiple registers of the same size into a wider register.
+// Concatenate multiple registers of the same size into a wider register.
+// The input operands are always ordered from lowest bits to highest:
+//   %0:(s32) = G_MERGE_VALUES %bits_0_7:(s8), %bits_8_15:(s8),
+//                             %bits_16_23:(s8), %bits_24_31:(s8)
 def G_MERGE_VALUES : GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type1:$src0, variable_ops);
diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index a3d310cfe1cb7f03c112e898dd8313257a5e11ed..af26375802a250c21dee6cfbf0689ee3fc757963 100644
--- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -88,6 +88,7 @@ def : GINodeEquiv<G_CTTZ, cttz>;
 def : GINodeEquiv<G_CTLZ_ZERO_UNDEF, ctlz_zero_undef>;
 def : GINodeEquiv<G_CTTZ_ZERO_UNDEF, cttz_zero_undef>;
 def : GINodeEquiv<G_CTPOP, ctpop>;
+def : GINodeEquiv<G_EXTRACT_VECTOR_ELT, vector_extract>;
 
 // Broadly speaking G_LOAD is equivalent to ISD::LOAD but there are some
 // complications that tablegen must take care of. For example, Predicates such
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 538605a57ab4dc58fbe27b1025c41d94395f9846..abb1bb431f66865210ae60089a7d8a3fc2ea0a7d 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -1104,7 +1104,7 @@ def FAULTING_OP : StandardPseudoInstruction {
   let isBranch = 1;
 }
 def PATCHABLE_OP : StandardPseudoInstruction {
-  let OutOperandList = (outs unknown:$dst);
+  let OutOperandList = (outs);
   let InOperandList = (ins variable_ops);
   let usesCustomInserter = 1;
   let mayLoad = 1;
@@ -1164,8 +1164,8 @@ def PATCHABLE_TYPED_EVENT_CALL : StandardPseudoInstruction {
   let hasSideEffects = 1;
 }
 def FENTRY_CALL : StandardPseudoInstruction {
-  let OutOperandList = (outs unknown:$dst);
-  let InOperandList = (ins variable_ops);
+  let OutOperandList = (outs);
+  let InOperandList = (ins);
   let AsmString = "# FEntry call";
   let usesCustomInserter = 1;
   let mayLoad = 1;
@@ -1555,3 +1555,8 @@ include "llvm/Target/GlobalISel/Target.td"
 // Pull in the common support for the Global ISel DAG-based selector generation.
 //
 include "llvm/Target/GlobalISel/SelectionDAGCompat.td"
+
+//===----------------------------------------------------------------------===//
+// Pull in the common support for Pfm Counters generation.
+//
+include "llvm/Target/TargetPfmCounters.td"
diff --git a/include/llvm/Target/TargetInstrPredicate.td b/include/llvm/Target/TargetInstrPredicate.td
index c4b14eba77660d50a82c5e06837ab25da571e22f..e70da00979040ad76c03e2437130539db4cf8036 100644
--- a/include/llvm/Target/TargetInstrPredicate.td
+++ b/include/llvm/Target/TargetInstrPredicate.td
@@ -7,29 +7,39 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines MCInstPredicate classes and its subclasses.
+// This file defines class MCInstPredicate and its subclasses.
 //
-// MCInstPredicate is used to describe constraints on the opcode/operand(s) of
-// an instruction. Each MCInstPredicate class has a well-known semantic, and it
-// is used by a PredicateExpander to generate code for MachineInstr and/or
-// MCInst.
-//
-// MCInstPredicate definitions can be used to construct MCSchedPredicate
-// definitions. An MCSchedPredicate can be used in place of a SchedPredicate
-// when defining SchedReadVariant and SchedWriteVariant used by a processor
-// scheduling model.
+// MCInstPredicate definitions are used by target scheduling models to describe
+// constraints on instructions.
 //
-// Here is an example of MCInstPredicate definition:
+// Here is an example of an MCInstPredicate definition in tablegen:
 //
 // def MCInstPredicateExample : CheckAll<[
 //    CheckOpcode<[BLR]>,
 //    CheckIsRegOperand<0>,
 //    CheckNot<CheckRegOperand<0, LR>>]>;
 //
-// Predicate `MCInstPredicateExample` checks that the machine instruction in
-// input is a BLR, and that operand at index 0 is register `LR`.
+// The syntax for MCInstPredicate is declarative, and predicate definitions can
+// be composed together in order to generate more complex constraints.
+//
+// The `CheckAll` from the example defines a composition of three different
+// predicates.  Definition `MCInstPredicateExample` identifies instructions
+// whose opcode is BLR, and whose first operand is a register different from
+// register `LR`.
+//
+// Every MCInstPredicate class has a well-known semantic in tablegen. For
+// example, `CheckOpcode` is a special type of predicate used to describe a
+// constraint on the value of an instruction opcode.
 //
-// That predicate could be used to rewrite the following definition (from
+// MCInstPredicate definitions are typically used by scheduling models to
+// construct MCSchedPredicate definitions (see the definition of class
+// MCSchedPredicate in llvm/Target/TargetSchedule.td).
+// In particular, an MCSchedPredicate can be used instead of a SchedPredicate
+// when defining the set of SchedReadVariant and SchedWriteVariant of a
+// processor scheduling model.
+//
+// The `MCInstPredicateExample` definition above is equivalent (and therefore
+// could replace) the following definition from the ExynosM3 model (see
 // AArch64SchedExynosM3.td):
 //
 // def M3BranchLinkFastPred  : SchedPredicate<[{
@@ -37,22 +47,13 @@
 //    MI->getOperand(0).isReg() &&
 //    MI->getOperand(0).getReg() != AArch64::LR}]>;
 //
-// MCInstPredicate definitions are used to construct MCSchedPredicate (see the
-// definition of class MCSchedPredicate in llvm/Target/TargetSchedule.td).  An
-// MCSchedPredicate can be used by a `SchedVar` to associate a predicate with a
-// list of SchedReadWrites. Note that `SchedVar` are used to create SchedVariant
-// definitions.
-//
-// Each MCInstPredicate class has a well known semantic. For example,
-// `CheckOpcode` is only used to check the instruction opcode value.
-//
-// MCInstPredicate classes allow the definition of predicates in a declarative
-// way.  These predicates don't require a custom block of C++, and can be used
-// to define conditions on instructions without being bound to a particular
+// The main advantage of using MCInstPredicate instead of SchedPredicate is
+// portability: users don't need to specify predicates in C++. As a consequence
+// of this, MCInstPredicate definitions are not bound to a particular
 // representation (i.e. MachineInstr vs MCInst).
 //
-// It also means that tablegen backends must know how to parse and expand them
-// into code that works on MCInst (or MachineInst).
+// Tablegen backends know how to expand MCInstPredicate definitions into actual
+// C++ code that works on MachineInstr (and/or MCInst).
 //
 // Instances of class PredicateExpander (see utils/Tablegen/PredicateExpander.h)
 // know how to expand a predicate. For each MCInstPredicate class, there must be
@@ -105,28 +106,50 @@ class CheckSameRegOperand<int First, int Second> : MCInstPredicate {
   int SecondIndex = Second;
 }
 
+// Base class for checks on register/immediate operands.
+// It allows users to define checks like:
+//    MyFunction(MI->getOperand(Index).getImm()) == Val;
+//
+// In the example above, `MyFunction` is a function that takes as input an
+// immediate operand value, and returns another value. Field `FunctionMapper` is
+// the name of the function to call on the operand value.
+class CheckOperandBase<int Index, string Fn = ""> : MCOperandPredicate<Index> {
+  string FunctionMapper = Fn;
+}
+
 // Check that the machine register operand at position `Index` references
 // register R. This predicate assumes that we already checked that the machine
 // operand at position `Index` is a register operand.
-class CheckRegOperand<int Index, Register R> : MCOperandPredicate<Index> {
+class CheckRegOperand<int Index, Register R> : CheckOperandBase<Index> {
   Register Reg = R;
 }
 
 // Check if register operand at index `Index` is the invalid register.
-class CheckInvalidRegOperand<int Index> : MCOperandPredicate<Index>;
+class CheckInvalidRegOperand<int Index> : CheckOperandBase<Index>;
 
 // Check that the operand at position `Index` is immediate `Imm`.
-class CheckImmOperand<int Index, int Imm> : MCOperandPredicate<Index> {
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against `Imm`.
+class CheckImmOperand<int Index, int Imm> : CheckOperandBase<Index> {
   int ImmVal = Imm;
 }
 
 // Similar to CheckImmOperand, however the immediate is not a literal number.
 // This is useful when we want to compare the value of an operand against an
 // enum value, and we know the actual integer value of that enum.
-class CheckImmOperand_s<int Index, string Value> : MCOperandPredicate<Index> {
+class CheckImmOperand_s<int Index, string Value> : CheckOperandBase<Index> {
   string ImmVal = Value;
 }
 
+// Expands to a call to `FunctionMapper` if field `FunctionMapper` is set.
+// Otherwise, it expands to a CheckNot<CheckInvalidRegOperand<Index>>.
+class CheckRegOperandSimple<int Index> : CheckOperandBase<Index>;
+
+// Expands to a call to `FunctionMapper` if field `FunctionMapper` is set.
+// Otherwise, it simply evaluates to TruePred.
+class CheckImmOperandSimple<int Index> : CheckOperandBase<Index>;
+
 // Check that the operand at position `Index` is immediate value zero.
 class CheckZeroOperand<int Index> : CheckImmOperand<Index, 0>;
 
@@ -205,13 +228,13 @@ class FunctionPredicateBase<string name, MCStatement body> {
   MCStatement Body = body;
 }
 
-// Check that a call to method `Name` in class "XXXGenInstrInfo" (where XXX is
+// Check that a call to method `Name` in class "XXXInstrInfo" (where XXX is
 // the name of a target) returns true.
 //
 // TIIPredicate definitions are used to model calls to the target-specific
 // InstrInfo. A TIIPredicate is treated specially by the InstrInfoEmitter
 // tablegen backend, which will use it to automatically generate a definition in
-// the target specific `GenInstrInfo` class.
+// the target specific `InstrInfo` class.
 //
 // There cannot be multiple TIIPredicate definitions with the same name for the
 // same target.
@@ -313,7 +336,7 @@ class STIPredicate<STIPredicateDecl declaration,
 }
 
 // Convenience classes and definitions used by processor scheduling models to
-// describe dependency breaking instructions.
+// describe dependency breaking instructions and move elimination candidates.
 let UpdatesOpcodeMask = 1 in {
 
 def IsZeroIdiomDecl : STIPredicateDecl<"isZeroIdiom">;
@@ -323,8 +346,14 @@ def IsDepBreakingDecl : STIPredicateDecl<"isDependencyBreaking">;
 
 } // UpdatesOpcodeMask
 
+def IsOptimizableRegisterMoveDecl
+    : STIPredicateDecl<"isOptimizableRegisterMove">;
+
 class IsZeroIdiomFunction<list<DepBreakingClass> classes>
     : STIPredicate<IsZeroIdiomDecl, classes>;
 
 class IsDepBreakingFunction<list<DepBreakingClass> classes>
     : STIPredicate<IsDepBreakingDecl, classes>;
+
+class IsOptimizableRegisterMove<list<InstructionEquivalenceClass> classes>
+    : STIPredicate<IsOptimizableRegisterMoveDecl, classes>;
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index e743e9faa7efdd073891fa675ba14e6c4967cb5f..f968fa80d500f0c7c89d3706e4515396f5587bb3 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -284,18 +284,6 @@ public:
   void getNameWithPrefix(SmallVectorImpl<char> &Name, const GlobalValue *GV,
                          Mangler &Mang, bool MayAlwaysUsePrivate = false) const;
   MCSymbol *getSymbol(const GlobalValue *GV) const;
-
-  /// True if the target uses physical regs at Prolog/Epilog insertion
-  /// time. If true (most machines), all vregs must be allocated before
-  /// PEI. If false (virtual-register machines), then callee-save register
-  /// spilling and scavenging are not needed or used.
-  virtual bool usesPhysRegsForPEI() const { return true; }
-
-  /// True if the target wants to use interprocedural register allocation by
-  /// default. The -enable-ipra flag can be used to override this.
-  virtual bool useIPRA() const {
-    return false;
-  }
 };
 
 /// This class describes a target machine that is implemented with the LLVM
@@ -349,6 +337,18 @@ public:
   bool addAsmPrinter(PassManagerBase &PM, raw_pwrite_stream &Out,
                      raw_pwrite_stream *DwoOut, CodeGenFileType FileTYpe,
                      MCContext &Context);
+
+  /// True if the target uses physical regs at Prolog/Epilog insertion
+  /// time. If true (most machines), all vregs must be allocated before
+  /// PEI. If false (virtual-register machines), then callee-save register
+  /// spilling and scavenging are not needed or used.
+  virtual bool usesPhysRegsForPEI() const { return true; }
+
+  /// True if the target wants to use interprocedural register allocation by
+  /// default. The -enable-ipra flag can be used to override this.
+  virtual bool useIPRA() const {
+    return false;
+  }
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Target/TargetPfmCounters.td b/include/llvm/Target/TargetPfmCounters.td
new file mode 100644
index 0000000000000000000000000000000000000000..0a55a558f30a52405607ddfc8f3adb5acf9519d4
--- /dev/null
+++ b/include/llvm/Target/TargetPfmCounters.td
@@ -0,0 +1,46 @@
+//===- TargetPfmCounters.td - Target Pfm Counters -*- tablegen ----------*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent interfaces for performance counters.
+
+// Definition of a hardware counters from libpfm identifiers.
+class PfmCounter<string counter> {
+  // The name of the counter that measures events.
+  // The name can be "some_counter + some_other_counter", in which case the
+  // measured value is the sum of events on these counters.
+  string Counter = counter;
+}
+
+// Issue counters can be tied to a ProcResource
+class PfmIssueCounter<string resource_name, string counter>
+    : PfmCounter<counter> {
+  // The name of the ProcResource on which uops are issued. This is used by
+  // llvm-exegesis to compare measurements with values in the SchedModels.
+  // If the CPU has a sched model, this should correspond to the name of a
+  // ProcResource.
+  string ResourceName = resource_name;
+}
+
+def NoPfmCounter : PfmCounter <""> {}
+
+// Set of PfmCounters for measuring sched model characteristics.
+class ProcPfmCounters {
+  // Processors can define how to measure cycles by defining a CycleCounter.
+  PfmCounter CycleCounter = NoPfmCounter;
+  // Processors can define how to measure uops by defining a UopsCounter.
+  PfmCounter UopsCounter = NoPfmCounter;
+  // Processors can define how to measure issued uops by defining IssueCounters.
+  list<PfmIssueCounter> IssueCounters = [];
+}
+
+// A binding of a set of counters to a CPU.
+class PfmCountersBinding<string cpu_name, ProcPfmCounters counters> {
+  string CpuName = cpu_name;
+  ProcPfmCounters Counters = counters;
+}
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 7d7ce2dabe025d689aee41cfab11b372158065bc..141e06693887cb9a9c869488b6a0ac9b6344e5ce 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -460,6 +460,10 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
 //  - The number of physical registers which can be used for register renaming
 //    purpose.
 //  - The cost of a register rename.
+//  - The set of registers that allow move elimination.
+//  - The maximum number of moves that can be eliminated every cycle.
+//  - Whether move elimination is limited to register moves whose input
+//    is known to be zero.
 //
 // The cost of a rename is the number of physical registers allocated by the
 // register alias table to map the new definition. By default, register can be
@@ -506,11 +510,35 @@ class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
 // partial write is combined with the previous super-register definition.  We
 // should add support for these cases, and correctly model merge problems with
 // partial register accesses.
+//
+// Field MaxMovesEliminatedPerCycle specifies how many moves can be eliminated
+// every cycle. A default value of zero for that field means: there is no limit
+// to the number of moves that can be eliminated by this register file.
+//
+// An instruction MI is a candidate for move elimination if a call to
+// method TargetSubtargetInfo::isOptimizableRegisterMove(MI) returns true (see
+// llvm/CodeGen/TargetSubtargetInfo.h, and llvm/MC/MCInstrAnalysis.h).
+//
+// Subtargets can instantiate tablegen class IsOptimizableRegisterMove (see
+// llvm/Target/TargetInstrPredicate.td) to customize the set of move elimination
+// candidates. By default, no instruction is a valid move elimination candidate.
+//
+// A register move MI is eliminated only if:
+//  - MI is a move elimination candidate.
+//  - The destination register is from a register class that allows move
+//    elimination (see field `AllowMoveElimination` below).
+//  - Constraints on the move kind, and the maximum number of moves that can be
+//    eliminated per cycle are all met.
+
 class RegisterFile<int numPhysRegs, list<RegisterClass> Classes = [],
-                   list<int> Costs = []> {
+                   list<int> Costs = [], list<bit> AllowMoveElim = [],
+                   int MaxMoveElimPerCy = 0, bit AllowZeroMoveElimOnly = 0> {
   list<RegisterClass> RegClasses = Classes;
   list<int> RegCosts = Costs;
+  list<bit> AllowMoveElimination = AllowMoveElim;
   int NumPhysRegs = numPhysRegs;
+  int MaxMovesEliminatedPerCycle = MaxMoveElimPerCy;
+  bit AllowZeroMoveEliminationOnly = AllowZeroMoveElimOnly;
   SchedMachineModel SchedModel = ?;
 }
 
@@ -529,31 +557,3 @@ class RetireControlUnit<int bufferSize, int retirePerCycle> {
   int MaxRetirePerCycle = retirePerCycle;
   SchedMachineModel SchedModel = ?;
 }
-
-// Allow the definition of hardware counters.
-class PfmCounter {
-  SchedMachineModel SchedModel = ?;
-}
-
-// Each processor can define how to measure cycles by defining a
-// PfmCycleCounter.
-class PfmCycleCounter<string counter> : PfmCounter {
-  string Counter = counter;
-}
-
-// Each ProcResourceUnits can define how to measure issued uops by defining
-// a PfmIssueCounter.
-class PfmIssueCounter<ProcResourceUnits resource, list<string> counters>
-    : PfmCounter{
-  // The resource units on which uops are issued.
-  ProcResourceUnits Resource = resource;
-  // The list of counters that measure issue events.
-  list<string> Counters = counters;
-}
-
-// Each processor can define how to measure NumMicroOps by defining a
-// PfmUopsCounter.
-class PfmUopsCounter<string counter> : PfmCounter {
-  string Counter = counter;
-}
-
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 4e463b9281d2ab01a4645f55032e603511b533bc..532e866be55426a23a2277bd72bf60205e1b92a5 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -373,6 +373,11 @@ def umin       : SDNode<"ISD::UMIN"      , SDTIntBinOp,
 def umax       : SDNode<"ISD::UMAX"      , SDTIntBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
 
+def saddsat    : SDNode<"ISD::SADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
+def uaddsat    : SDNode<"ISD::UADDSAT"   , SDTIntBinOp, [SDNPCommutative]>;
+def ssubsat    : SDNode<"ISD::SSUBSAT"   , SDTIntBinOp>;
+def usubsat    : SDNode<"ISD::USUBSAT"   , SDTIntBinOp>;
+
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
 def zext_invec : SDNode<"ISD::ZERO_EXTEND_VECTOR_INREG", SDTExtInvec>;
@@ -406,8 +411,14 @@ def fminnum    : SDNode<"ISD::FMINNUM"    , SDTFPBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
 def fmaxnum    : SDNode<"ISD::FMAXNUM"    , SDTFPBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
-def fminnan    : SDNode<"ISD::FMINNAN"    , SDTFPBinOp>;
-def fmaxnan    : SDNode<"ISD::FMAXNAN"    , SDTFPBinOp>;
+def fminnum_ieee : SDNode<"ISD::FMINNUM_IEEE", SDTFPBinOp,
+                          [SDNPCommutative]>;
+def fmaxnum_ieee  : SDNode<"ISD::FMAXNUM_IEEE", SDTFPBinOp,
+                           [SDNPCommutative]>;
+def fminimum   : SDNode<"ISD::FMINIMUM"   , SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def fmaximum   : SDNode<"ISD::FMAXIMUM"   , SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
 def fgetsign   : SDNode<"ISD::FGETSIGN"   , SDTFPToIntOp>;
 def fcanonicalize : SDNode<"ISD::FCANONICALIZE", SDTFPUnaryOp>;
 def fneg       : SDNode<"ISD::FNEG"       , SDTFPUnaryOp>;
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index d427cb809bf01a7df1f86c8b4ccd84ec1e67075d..5ad880574ef7109637f2f7386b5b69d71ac1e648 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -56,10 +56,14 @@ public:
     // to find at least one summary for the GUID that is global or a local
     // in the referenced module for direct calls.
     LocalLinkageNotInModule,
-    // This corresponse to the NotEligibleToImport being set on the summary,
+    // This corresponds to the NotEligibleToImport being set on the summary,
     // which can happen in a few different cases (e.g. local that can't be
     // renamed or promoted because it is referenced on a llvm*.used variable).
-    NotEligible
+    NotEligible,
+    // This corresponds to NoInline being set on the function summary,
+    // which will happen if it is known that the inliner will not be able
+    // to inline the function (e.g. it is marked with a NoInline attribute).
+    NoInline
   };
 
   /// Information optionally tracked for candidates the importer decided
diff --git a/include/llvm/Transforms/IPO/SampleProfile.h b/include/llvm/Transforms/IPO/SampleProfile.h
index cd5a0563898e459015b8f42556ef9d85f732b854..af4a933ec1f66e0010be1b471ba1a836875dbb06 100644
--- a/include/llvm/Transforms/IPO/SampleProfile.h
+++ b/include/llvm/Transforms/IPO/SampleProfile.h
@@ -25,13 +25,16 @@ class Module;
 /// The sample profiler data loader pass.
 class SampleProfileLoaderPass : public PassInfoMixin<SampleProfileLoaderPass> {
 public:
-  SampleProfileLoaderPass(std::string File = "", bool IsThinLTOPreLink = false)
-      : ProfileFileName(File), IsThinLTOPreLink(IsThinLTOPreLink) {}
+  SampleProfileLoaderPass(std::string File = "", std::string RemappingFile = "",
+                          bool IsThinLTOPreLink = false)
+      : ProfileFileName(File), ProfileRemappingFileName(RemappingFile),
+        IsThinLTOPreLink(IsThinLTOPreLink) {}
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 
 private:
   std::string ProfileFileName;
+  std::string ProfileRemappingFileName;
   bool IsThinLTOPreLink;
 };
 
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index d6d9529ba9a22c4735100bffd22fdb75916153a4..81ed0a2237e97be93e548bc85dbace334a2b3e89 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -24,9 +24,11 @@
 
 namespace llvm {
 
+class Triple;
 class FunctionPass;
 class ModulePass;
 class OptimizationRemarkEmitter;
+class Comdat;
 
 /// Instrumentation passes often insert conditional checks into entry blocks.
 /// Call this function before splitting the entry block to move instructions
@@ -36,6 +38,17 @@ class OptimizationRemarkEmitter;
 BasicBlock::iterator PrepareToSplitEntryBlock(BasicBlock &BB,
                                               BasicBlock::iterator IP);
 
+// Create a constant for Str so that we can pass it to the run-time lib.
+GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
+                                             bool AllowMerging,
+                                             const char *NamePrefix = "");
+
+// Returns F.getComdat() if it exists.
+// Otherwise creates a new comdat, sets F's comdat, and returns it.
+// Returns nullptr on failure.
+Comdat *GetOrCreateFunctionComdat(Function &F, Triple &T,
+                                  const std::string &ModuleId);
+
 // Insert GCOV profiling instrumentation
 struct GCOVOptions {
   static GCOVOptions getDefault();
diff --git a/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h b/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
index c0b37c470b746cc8b417e6febaf57324a73db0b2..fdc5df68a669d551eba4d010b3e548fa2419cc9b 100644
--- a/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
+++ b/include/llvm/Transforms/Instrumentation/PGOInstrumentation.h
@@ -36,12 +36,14 @@ public:
 /// The profile annotation (profile-instr-use) pass for IR based PGO.
 class PGOInstrumentationUse : public PassInfoMixin<PGOInstrumentationUse> {
 public:
-  PGOInstrumentationUse(std::string Filename = "");
+  PGOInstrumentationUse(std::string Filename = "",
+                        std::string RemappingFilename = "");
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 
 private:
   std::string ProfileFileName;
+  std::string ProfileRemappingFileName;
 };
 
 /// The indirect function call promotion pass.
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 9491e1bbac9381469283579264230ae0ae0bfd45..fe4ff621c6f0eb66858dbb61dba404e5488df55f 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -26,7 +26,6 @@ class ModulePass;
 class Pass;
 class GetElementPtrInst;
 class PassInfo;
-class TerminatorInst;
 class TargetLowering;
 class TargetMachine;
 
diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h
index c01a1d77e96cabd93a35b71093a6f86606f3e354..784de7f9fe26bfbe63daaad0353d8a633844d132 100644
--- a/include/llvm/Transforms/Scalar/GVN.h
+++ b/include/llvm/Transforms/Scalar/GVN.h
@@ -237,7 +237,7 @@ private:
   }
 
   // List of critical edges to be split between iterations.
-  SmallVector<std::pair<TerminatorInst *, unsigned>, 4> toSplit;
+  SmallVector<std::pair<Instruction *, unsigned>, 4> toSplit;
 
   // Helper functions of redundant load elimination
   bool processLoad(LoadInst *L);
diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 9848e0d54f2bfed1dcc5630bf6f03bdcbcbdcbe9..20c9a26b98cf090caea2e5b452776b3e66457f63 100644
--- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/IR/PassManager.h"
 
@@ -30,16 +31,71 @@ public:
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
 
+/// A set of parameters used to control various transforms performed by the
+/// LoopUnroll pass. Each of the boolean parameters can be set to:
+///      true - enabling the transformation.
+///      false - disabling the transformation.
+///      None - relying on a global default.
+///
+/// There is also OptLevel parameter, which is used for additional loop unroll
+/// tuning.
+///
+/// Intended use is to create a default object, modify parameters with
+/// additional setters and then pass it to LoopUnrollPass.
+///
+struct LoopUnrollOptions {
+  Optional<bool> AllowPartial;
+  Optional<bool> AllowPeeling;
+  Optional<bool> AllowRuntime;
+  Optional<bool> AllowUpperBound;
+  int OptLevel;
+
+  LoopUnrollOptions(int OptLevel = 2) : OptLevel(OptLevel) {}
+
+  /// Enables or disables partial unrolling. When disabled only full unrolling
+  /// is allowed.
+  LoopUnrollOptions &setPartial(bool Partial) {
+    AllowPartial = Partial;
+    return *this;
+  }
+
+  /// Enables or disables unrolling of loops with runtime trip count.
+  LoopUnrollOptions &setRuntime(bool Runtime) {
+    AllowRuntime = Runtime;
+    return *this;
+  }
+
+  /// Enables or disables loop peeling.
+  LoopUnrollOptions &setPeeling(bool Peeling) {
+    AllowPeeling = Peeling;
+    return *this;
+  }
+
+  /// Enables or disables the use of trip count upper bound
+  /// in loop unrolling.
+  LoopUnrollOptions &setUpperBound(bool UpperBound) {
+    AllowUpperBound = UpperBound;
+    return *this;
+  }
+
+  // Sets "optimization level" tuning parameter for loop unrolling.
+  LoopUnrollOptions &setOptLevel(int O) {
+    OptLevel = O;
+    return *this;
+  }
+};
+
 /// Loop unroll pass that will support both full and partial unrolling.
 /// It is a function pass to have access to function and module analyses.
 /// It will also put loops into canonical form (simplified and LCSSA).
 class LoopUnrollPass : public PassInfoMixin<LoopUnrollPass> {
-  const int OptLevel;
+  LoopUnrollOptions UnrollOpts;
 
 public:
   /// This uses the target information (or flags) to control the thresholds for
   /// different unrolling stategies but supports all of them.
-  explicit LoopUnrollPass(int OptLevel = 2) : OptLevel(OptLevel) {}
+  explicit LoopUnrollPass(LoopUnrollOptions UnrollOpts = {})
+      : UnrollOpts(UnrollOpts) {}
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index dee1541f9d243e000e2330a84331b4cb651c8a0f..a0fc18825a5894e29ab8ddf12ca74199f435be75 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -128,7 +128,7 @@ struct CriticalEdgeSplittingOptions {
 /// IndirectBrInst.  Splitting these edges will almost always create an invalid
 /// program because the address of the new block won't be the one that is jumped
 /// to.
-BasicBlock *SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+BasicBlock *SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
                               const CriticalEdgeSplittingOptions &Options =
                                   CriticalEdgeSplittingOptions());
 
@@ -148,7 +148,7 @@ inline bool SplitCriticalEdge(BasicBlock *Succ, pred_iterator PI,
                               const CriticalEdgeSplittingOptions &Options =
                                   CriticalEdgeSplittingOptions()) {
   bool MadeChange = false;
-  TerminatorInst *TI = (*PI)->getTerminator();
+  Instruction *TI = (*PI)->getTerminator();
   for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
     if (TI->getSuccessor(i) == Succ)
       MadeChange |= !!SplitCriticalEdge(TI, i, Options);
@@ -162,7 +162,7 @@ inline BasicBlock *
 SplitCriticalEdge(BasicBlock *Src, BasicBlock *Dst,
                   const CriticalEdgeSplittingOptions &Options =
                       CriticalEdgeSplittingOptions()) {
-  TerminatorInst *TI = Src->getTerminator();
+  Instruction *TI = Src->getTerminator();
   unsigned i = 0;
   while (true) {
     assert(i != TI->getNumSuccessors() && "Edge doesn't exist!");
@@ -257,11 +257,11 @@ ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
 /// Returns the NewBasicBlock's terminator.
 ///
 /// Updates DT and LI if given.
-TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
-                                          bool Unreachable,
-                                          MDNode *BranchWeights = nullptr,
-                                          DominatorTree *DT = nullptr,
-                                          LoopInfo *LI = nullptr);
+Instruction *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
+                                       bool Unreachable,
+                                       MDNode *BranchWeights = nullptr,
+                                       DominatorTree *DT = nullptr,
+                                       LoopInfo *LI = nullptr);
 
 /// SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen,
 /// but also creates the ElseBlock.
@@ -278,8 +278,8 @@ TerminatorInst *SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
 ///   SplitBefore
 ///   Tail
 void SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
-                                   TerminatorInst **ThenTerm,
-                                   TerminatorInst **ElseTerm,
+                                   Instruction **ThenTerm,
+                                   Instruction **ElseTerm,
                                    MDNode *BranchWeights = nullptr);
 
 /// Check whether BB is the merge point of a if-region.
diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index bdcdf6f361f209b74e0cc730d5991d0a3ed54224..28efce6ac3fb9ba46d49b61aa77d0b8f9150574d 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -29,6 +29,7 @@ namespace llvm {
   ///
   /// Returns true if any attributes were set and false otherwise.
   bool inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI);
+  bool inferLibFuncAttributes(Module *M, StringRef Name, const TargetLibraryInfo &TLI);
 
   /// Check whether the overloaded unary floating point function
   /// corresponding to \a Ty is available.
@@ -36,6 +37,12 @@ namespace llvm {
                        LibFunc DoubleFn, LibFunc FloatFn,
                        LibFunc LongDoubleFn);
 
+  /// Get the name of the overloaded unary floating point function
+  /// corresponding to \a Ty.
+  StringRef getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+                            LibFunc DoubleFn, LibFunc FloatFn,
+                            LibFunc LongDoubleFn);
+
   /// Return V if it is an i8*, otherwise cast it to i8*.
   Value *castToCStr(Value *V, IRBuilder<> &B);
 
@@ -93,6 +100,13 @@ namespace llvm {
   Value *emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
                               const AttributeList &Attrs);
 
+  /// Emit a call to the unary function DoubleFn, FloatFn or LongDoubleFn,
+  /// depending of the type of Op.
+  Value *emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+                              LibFunc DoubleFn, LibFunc FloatFn,
+                              LibFunc LongDoubleFn, IRBuilder<> &B,
+                              const AttributeList &Attrs);
+
   /// Emit a call to the binary function named 'Name' (e.g. 'fmin'). This
   /// function is known to take type matching 'Op1' and 'Op2' and return one
   /// value with the same type. If 'Op1/Op2' are long double, 'l' is added as
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index e4d6053b70b610a787b2ec7ea41ef918007f5ebe..d7dce53fc76141caa29201a1fb0d056a55a40bdc 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -49,7 +49,6 @@ class ProfileSummaryInfo;
 class ReturnInst;
 
 /// Return an exact copy of the specified module
-///
 std::unique_ptr<Module> CloneModule(const Module &M);
 std::unique_ptr<Module> CloneModule(const Module &M, ValueToValueMapTy &VMap);
 
@@ -61,17 +60,15 @@ std::unique_ptr<Module>
 CloneModule(const Module &M, ValueToValueMapTy &VMap,
             function_ref<bool(const GlobalValue *)> ShouldCloneDefinition);
 
-/// ClonedCodeInfo - This struct can be used to capture information about code
+/// This struct can be used to capture information about code
 /// being cloned, while it is being cloned.
 struct ClonedCodeInfo {
-  /// ContainsCalls - This is set to true if the cloned code contains a normal
-  /// call instruction.
+  /// This is set to true if the cloned code contains a normal call instruction.
   bool ContainsCalls = false;
 
-  /// ContainsDynamicAllocas - This is set to true if the cloned code contains
-  /// a 'dynamic' alloca.  Dynamic allocas are allocas that are either not in
-  /// the entry block or they are in the entry block but are not a constant
-  /// size.
+  /// This is set to true if the cloned code contains a 'dynamic' alloca.
+  /// Dynamic allocas are allocas that are either not in the entry block or they
+  /// are in the entry block but are not a constant size.
   bool ContainsDynamicAllocas = false;
 
   /// All cloned call sites that have operand bundles attached are appended to
@@ -82,7 +79,7 @@ struct ClonedCodeInfo {
   ClonedCodeInfo() = default;
 };
 
-/// CloneBasicBlock - Return a copy of the specified basic block, but without
+/// Return a copy of the specified basic block, but without
 /// embedding the block into a particular function.  The block returned is an
 /// exact copy of the specified basic block, without any remapping having been
 /// performed.  Because of this, this is only suitable for applications where
@@ -109,13 +106,12 @@ struct ClonedCodeInfo {
 /// If you would like to collect additional information about the cloned
 /// function, you can specify a ClonedCodeInfo object with the optional fifth
 /// parameter.
-///
 BasicBlock *CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
                             const Twine &NameSuffix = "", Function *F = nullptr,
                             ClonedCodeInfo *CodeInfo = nullptr,
                             DebugInfoFinder *DIFinder = nullptr);
 
-/// CloneFunction - Return a copy of the specified function and add it to that
+/// Return a copy of the specified function and add it to that
 /// function's module.  Also, any references specified in the VMap are changed
 /// to refer to their mapped value instead of the original one.  If any of the
 /// arguments to the function are in the VMap, the arguments are deleted from
@@ -154,7 +150,7 @@ void CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
                                const char *NameSuffix = "",
                                ClonedCodeInfo *CodeInfo = nullptr);
 
-/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
+/// This works exactly like CloneFunctionInto,
 /// except that it does some simple constant prop and DCE on the fly.  The
 /// effect of this is to copy significantly less code in cases where (for
 /// example) a function call with constant arguments is inlined, and those
@@ -172,8 +168,8 @@ void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                                ClonedCodeInfo *CodeInfo = nullptr,
                                Instruction *TheCall = nullptr);
 
-/// InlineFunctionInfo - This class captures the data input to the
-/// InlineFunction call, and records the auxiliary results produced by it.
+/// This class captures the data input to the InlineFunction call, and records
+/// the auxiliary results produced by it.
 class InlineFunctionInfo {
 public:
   explicit InlineFunctionInfo(CallGraph *cg = nullptr,
@@ -185,19 +181,19 @@ public:
       : CG(cg), GetAssumptionCache(GetAssumptionCache), PSI(PSI),
         CallerBFI(CallerBFI), CalleeBFI(CalleeBFI) {}
 
-  /// CG - If non-null, InlineFunction will update the callgraph to reflect the
+  /// If non-null, InlineFunction will update the callgraph to reflect the
   /// changes it makes.
   CallGraph *CG;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
   ProfileSummaryInfo *PSI;
   BlockFrequencyInfo *CallerBFI, *CalleeBFI;
 
-  /// StaticAllocas - InlineFunction fills this in with all static allocas that
-  /// get copied into the caller.
+  /// InlineFunction fills this in with all static allocas that get copied into
+  /// the caller.
   SmallVector<AllocaInst *, 4> StaticAllocas;
 
-  /// InlinedCalls - InlineFunction fills this in with callsites that were
-  /// inlined from the callee.  This is only filled in if CG is non-null.
+  /// InlineFunction fills this in with callsites that were inlined from the
+  /// callee. This is only filled in if CG is non-null.
   SmallVector<WeakTrackingVH, 8> InlinedCalls;
 
   /// All of the new call sites inlined into the caller.
@@ -214,7 +210,7 @@ public:
   }
 };
 
-/// InlineFunction - This function inlines the called function into the basic
+/// This function inlines the called function into the basic
 /// block of the caller.  This returns false if it is not possible to inline
 /// this call.  The program is still in a well defined state if this occurs
 /// though.
diff --git a/include/llvm/Transforms/Utils/CodeExtractor.h b/include/llvm/Transforms/Utils/CodeExtractor.h
index 0e5254acb0d3777af4f7ecd59d62aea044aa5316..13bef8418057002230a375cdff52094265a0e223 100644
--- a/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -64,6 +64,11 @@ class Value;
     unsigned NumExitBlocks = std::numeric_limits<unsigned>::max();
     Type *RetTy;
 
+    // Suffix to use when creating extracted function (appended to the original
+    // function name + "."). If empty, the default is to use the entry block
+    // label, if non-empty, otherwise "extracted".
+    std::string Suffix;
+
   public:
     /// Create a code extractor for a sequence of blocks.
     ///
@@ -78,7 +83,8 @@ class Value;
     CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT = nullptr,
                   bool AggregateArgs = false, BlockFrequencyInfo *BFI = nullptr,
                   BranchProbabilityInfo *BPI = nullptr,
-                  bool AllowVarArgs = false, bool AllowAlloca = false);
+                  bool AllowVarArgs = false, bool AllowAlloca = false,
+                  std::string Suffix = "");
 
     /// Create a code extractor for a loop body.
     ///
@@ -86,7 +92,8 @@ class Value;
     /// block sequence of the loop.
     CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs = false,
                   BlockFrequencyInfo *BFI = nullptr,
-                  BranchProbabilityInfo *BPI = nullptr);
+                  BranchProbabilityInfo *BPI = nullptr,
+                  std::string Suffix = "");
 
     /// Perform the extraction, returning the new function.
     ///
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index f7da69644da8ace5b2ad8f57dbf5cb9edb9b88a0..86a32bb63005984407cc3b0ffc2592565626901c 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -446,6 +446,15 @@ void copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, MDNode *N,
 /// Remove the debug intrinsic instructions for the given instruction.
 void dropDebugUsers(Instruction &I);
 
+/// Hoist all of the instructions in the \p IfBlock to the dominant block
+/// \p DomBlock, by moving its instructions to the insertion point \p InsertPt.
+///
+/// The moved instructions receive the insertion point debug location values
+/// (DILocations) and their debug intrinsic instructions (dbg.values) are
+/// removed.
+void hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
+                              BasicBlock *BB);
+
 //===----------------------------------------------------------------------===//
 //  Intrinsic pattern matching
 //
diff --git a/include/llvm/Transforms/Utils/LoopRotationUtils.h b/include/llvm/Transforms/Utils/LoopRotationUtils.h
index 231e5bbb6deeeec46f90fa20786424e4c4729f46..cd5bc43010181538dc3e090c0c26d1e6dfae513a 100644
--- a/include/llvm/Transforms/Utils/LoopRotationUtils.h
+++ b/include/llvm/Transforms/Utils/LoopRotationUtils.h
@@ -20,6 +20,7 @@ class AssumptionCache;
 class DominatorTree;
 class Loop;
 class LoopInfo;
+class MemorySSAUpdater;
 class ScalarEvolution;
 struct SimplifyQuery;
 class TargetTransformInfo;
@@ -32,8 +33,8 @@ class TargetTransformInfo;
 /// LoopRotation. If it is true, the profitability heuristic will be ignored.
 bool LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
                   AssumptionCache *AC, DominatorTree *DT, ScalarEvolution *SE,
-                  const SimplifyQuery &SQ, bool RotationOnly,
-                  unsigned Threshold, bool IsUtilMode);
+                  MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ,
+                  bool RotationOnly, unsigned Threshold, bool IsUtilMode);
 
 } // namespace llvm
 
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index c75a1de11375c7796a9a5c0248497423029f99e5..f642852275c04fe066ab814019288045c905e361 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -109,7 +109,7 @@ bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
 /// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
 bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
                 TargetLibraryInfo *, TargetTransformInfo *, Loop *,
-                AliasSetTracker *, LoopSafetyInfo *,
+                AliasSetTracker *, ICFLoopSafetyInfo *,
                 OptimizationRemarkEmitter *ORE);
 
 /// Walk the specified region of the CFG (defined by all blocks
@@ -122,7 +122,7 @@ bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
 /// ORE. It returns changed status.
 bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
                  TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                 LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
+                 ICFLoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
 
 /// This function deletes dead loops. The caller of this function needs to
 /// guarantee that the loop is infact dead.
@@ -151,7 +151,8 @@ bool promoteLoopAccessesToScalars(const SmallSetVector<Value *, 8> &,
                                   SmallVectorImpl<Instruction *> &,
                                   PredIteratorCache &, LoopInfo *,
                                   DominatorTree *, const TargetLibraryInfo *,
-                                  Loop *, AliasSetTracker *, LoopSafetyInfo *,
+                                  Loop *, AliasSetTracker *,
+                                  ICFLoopSafetyInfo *,
                                   OptimizationRemarkEmitter *);
 
 /// Does a BFS from a given node to all of its children inside a given loop.
diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index 4a791166299024b92fb95158ac1d5d0f44ce44ab..d02607acbbb57909c3cabd5d81772eff20d179c1 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -76,6 +76,10 @@ public:
   /// block.
   bool HasValueForBlock(BasicBlock *BB) const;
 
+  /// Return the value for the specified block if the SSAUpdater has one,
+  /// otherwise return nullptr.
+  Value *FindValueForBlock(BasicBlock *BB) const;
+
   /// Construct SSA form, materializing a value that is live at the end
   /// of the specified block.
   Value *GetValueAtEndOfBlock(BasicBlock *BB);
diff --git a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
index b7649ba883344cfcca04b54dcc8d11d45952cc10..cab0f3e7157578ea05a7331faa6ec66cb2da3957 100644
--- a/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
+++ b/include/llvm/Transforms/Utils/SSAUpdaterImpl.h
@@ -357,10 +357,9 @@ public:
       BBInfo *Info = *I;
 
       if (Info->DefBB != Info) {
-        // Record the available value at join nodes to speed up subsequent
-        // uses of this SSAUpdater for the same value.
-        if (Info->NumPreds > 1)
-          (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal;
+        // Record the available value to speed up subsequent uses of this
+        // SSAUpdater for the same value.
+        (*AvailableVals)[Info->BB] = Info->DefBB->AvailableVal;
         continue;
       }
 
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 2b344f4410736b19fd66beeea4238895dd23ea2e..025bcd44e3107d1e3af3851b08ef97b5df396684 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -77,21 +77,34 @@ private:
   OptimizationRemarkEmitter &ORE;
   bool UnsafeFPShrink;
   function_ref<void(Instruction *, Value *)> Replacer;
+  function_ref<void(Instruction *)> Eraser;
 
   /// Internal wrapper for RAUW that is the default implementation.
   ///
   /// Other users may provide an alternate function with this signature instead
   /// of this one.
-  static void replaceAllUsesWithDefault(Instruction *I, Value *With);
+  static void replaceAllUsesWithDefault(Instruction *I, Value *With) {
+    I->replaceAllUsesWith(With);
+  }
+
+  /// Internal wrapper for eraseFromParent that is the default implementation.
+  static void eraseFromParentDefault(Instruction *I) { I->eraseFromParent(); }
 
   /// Replace an instruction's uses with a value using our replacer.
   void replaceAllUsesWith(Instruction *I, Value *With);
 
+  /// Erase an instruction from its parent with our eraser.
+  void eraseFromParent(Instruction *I);
+
+  Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B);
+
 public:
-  LibCallSimplifier(const DataLayout &DL, const TargetLibraryInfo *TLI,
-                    OptimizationRemarkEmitter &ORE,
-                    function_ref<void(Instruction *, Value *)> Replacer =
-                        &replaceAllUsesWithDefault);
+  LibCallSimplifier(
+      const DataLayout &DL, const TargetLibraryInfo *TLI,
+      OptimizationRemarkEmitter &ORE,
+      function_ref<void(Instruction *, Value *)> Replacer =
+          &replaceAllUsesWithDefault,
+      function_ref<void(Instruction *)> Eraser = &eraseFromParentDefault);
 
   /// optimizeCall - Take the given call instruction and return a more
   /// optimal value to replace the instruction with or 0 if a more
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 2a6242099b2fec7c5a20858b93d384614387d09f..ceb660daa28cb5414e22a39bae2ac3b198be7510 100644
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -241,6 +241,10 @@ public:
   /// If false, good old LV code.
   bool canVectorize(bool UseVPlanNativePath);
 
+  /// Return true if we can vectorize this loop while folding its tail by
+  /// masking.
+  bool canFoldTailByMasking();
+
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
 
diff --git a/include/llvm/XRay/BlockIndexer.h b/include/llvm/XRay/BlockIndexer.h
index 46a7243685fa358e3941d7c8a998aabe33d1b0d0..b42fa17f3fb7bd34e13071b8e43e7c5933787334 100644
--- a/include/llvm/XRay/BlockIndexer.h
+++ b/include/llvm/XRay/BlockIndexer.h
@@ -54,6 +54,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   /// The flush() function will clear out the current state of the visitor, to
   /// allow for explicitly flushing a block's records to the currently
diff --git a/include/llvm/XRay/BlockPrinter.h b/include/llvm/XRay/BlockPrinter.h
index 3a8f6e0d35ea3bc5a2822e9511c34312a94e5fcb..bfb21e2395172f693f3e29cf6480379603ac608b 100644
--- a/include/llvm/XRay/BlockPrinter.h
+++ b/include/llvm/XRay/BlockPrinter.h
@@ -50,6 +50,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   void reset() { CurrentState = State::Start; }
 };
diff --git a/include/llvm/XRay/BlockVerifier.h b/include/llvm/XRay/BlockVerifier.h
index b43a435e93bba563129597422c0121fedd3a01a4..46371c13891a7a79143f010527ab3bd80a68dbf7 100644
--- a/include/llvm/XRay/BlockVerifier.h
+++ b/include/llvm/XRay/BlockVerifier.h
@@ -33,6 +33,7 @@ public:
     NewCPUId,
     TSCWrap,
     CustomEvent,
+    TypedEvent,
     Function,
     CallArg,
     EndOfBuffer,
@@ -58,6 +59,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   Error verify();
   void reset();
diff --git a/include/llvm/XRay/FDRRecords.h b/include/llvm/XRay/FDRRecords.h
index c524dab2a33d308e024a90e57c8aa24ec71deedb..9d48332d5083d15e56b732937561c0e039486482 100644
--- a/include/llvm/XRay/FDRRecords.h
+++ b/include/llvm/XRay/FDRRecords.h
@@ -66,6 +66,7 @@ public:
     PIDEntry,
     NewBuffer,
     EndOfBuffer,
+    TypedEvent,
   };
 
   Type type() const override { return Type::Metadata; }
@@ -153,13 +154,14 @@ public:
 class CustomEventRecord : public MetadataRecord {
   int32_t Size = 0;
   uint64_t TSC = 0;
+  uint16_t CPU = 0;
   std::string Data{};
   friend class RecordInitializer;
 
 public:
   CustomEventRecord() = default;
-  explicit CustomEventRecord(uint64_t S, uint64_t T, std::string D)
-      : MetadataRecord(), Size(S), TSC(T), Data(std::move(D)) {}
+  explicit CustomEventRecord(uint64_t S, uint64_t T, uint16_t C, std::string D)
+      : MetadataRecord(), Size(S), TSC(T), CPU(C), Data(std::move(D)) {}
 
   MetadataType metadataType() const override {
     return MetadataType::CustomEvent;
@@ -167,6 +169,53 @@ public:
 
   int32_t size() const { return Size; }
   uint64_t tsc() const { return TSC; }
+  uint16_t cpu() const { return CPU; }
+  StringRef data() const { return Data; }
+
+  Error apply(RecordVisitor &V) override;
+};
+
+class CustomEventRecordV5 : public MetadataRecord {
+  int32_t Size = 0;
+  int32_t Delta = 0;
+  std::string Data{};
+  friend class RecordInitializer;
+
+public:
+  CustomEventRecordV5() = default;
+  explicit CustomEventRecordV5(int32_t S, int32_t D, std::string P)
+      : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {}
+
+  MetadataType metadataType() const override {
+    return MetadataType::CustomEvent;
+  }
+
+  int32_t size() const { return Size; }
+  int32_t delta() const { return Delta; }
+  StringRef data() const { return Data; }
+
+  Error apply(RecordVisitor &V) override;
+};
+
+class TypedEventRecord : public MetadataRecord {
+  int32_t Size = 0;
+  int32_t Delta = 0;
+  uint16_t EventType = 0;
+  std::string Data{};
+  friend class RecordInitializer;
+
+public:
+  TypedEventRecord() = default;
+  explicit TypedEventRecord(int32_t S, int32_t D, uint16_t E, std::string P)
+      : MetadataRecord(), Size(S), Delta(D), Data(std::move(P)) {}
+
+  MetadataType metadataType() const override {
+    return MetadataType::TypedEvent;
+  }
+
+  int32_t size() const { return Size; }
+  int32_t delta() const { return Delta; }
+  uint16_t eventType() const { return EventType; }
   StringRef data() const { return Data; }
 
   Error apply(RecordVisitor &V) override;
@@ -267,15 +316,23 @@ public:
   virtual Error visit(NewBufferRecord &) = 0;
   virtual Error visit(EndBufferRecord &) = 0;
   virtual Error visit(FunctionRecord &) = 0;
+  virtual Error visit(CustomEventRecordV5 &) = 0;
+  virtual Error visit(TypedEventRecord &) = 0;
 };
 
 class RecordInitializer : public RecordVisitor {
   DataExtractor &E;
   uint32_t &OffsetPtr;
+  uint16_t Version;
 
 public:
+  static constexpr uint16_t DefaultVersion = 5u;
+
+  explicit RecordInitializer(DataExtractor &DE, uint32_t &OP, uint16_t V)
+      : RecordVisitor(), E(DE), OffsetPtr(OP), Version(V) {}
+
   explicit RecordInitializer(DataExtractor &DE, uint32_t &OP)
-      : RecordVisitor(), E(DE), OffsetPtr(OP) {}
+      : RecordInitializer(DE, OP, DefaultVersion) {}
 
   Error visit(BufferExtents &) override;
   Error visit(WallclockRecord &) override;
@@ -287,6 +344,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 };
 
 } // namespace xray
diff --git a/include/llvm/XRay/FDRTraceExpander.h b/include/llvm/XRay/FDRTraceExpander.h
index 7f8236b82b59763d0ff5ef0b5375da8055db676f..02a21bed5ce916fabab76ee8cbe22dc408bacdb3 100644
--- a/include/llvm/XRay/FDRTraceExpander.h
+++ b/include/llvm/XRay/FDRTraceExpander.h
@@ -27,10 +27,10 @@ class TraceExpander : public RecordVisitor {
   int32_t PID = 0;
   int32_t TID = 0;
   uint64_t BaseTSC = 0;
-  XRayRecord CurrentRecord{0, 0, RecordTypes::ENTER, 0, 0, 0, 0, {}};
+  XRayRecord CurrentRecord{0, 0, RecordTypes::ENTER, 0, 0, 0, 0, {}, {}};
   uint16_t CPUId = 0;
   uint16_t LogVersion = 0;
-  bool BuildingFunction = false;
+  bool BuildingRecord = false;
   bool IgnoringRecords = false;
 
   void resetCurrentRecord();
@@ -49,6 +49,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
   // Must be called after all the records have been processed, to handle the
   // most recent record generated.
diff --git a/include/llvm/XRay/FDRTraceWriter.h b/include/llvm/XRay/FDRTraceWriter.h
index 91488f89ecc529c3c8a2f239b54a60936a040ba7..7b3b5fa25eff4b31dae47b45ddfb7b2058444839 100644
--- a/include/llvm/XRay/FDRTraceWriter.h
+++ b/include/llvm/XRay/FDRTraceWriter.h
@@ -43,6 +43,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 
 private:
   support::endian::Writer OS;
diff --git a/include/llvm/XRay/RecordPrinter.h b/include/llvm/XRay/RecordPrinter.h
index bad1a5742b4529b7bdef6d809dd630133ef3aa7c..649c64ab6f5cbfbee1af0418f795ef3a349ca69b 100644
--- a/include/llvm/XRay/RecordPrinter.h
+++ b/include/llvm/XRay/RecordPrinter.h
@@ -40,6 +40,8 @@ public:
   Error visit(NewBufferRecord &) override;
   Error visit(EndBufferRecord &) override;
   Error visit(FunctionRecord &) override;
+  Error visit(CustomEventRecordV5 &) override;
+  Error visit(TypedEventRecord &) override;
 };
 
 } // namespace xray
diff --git a/include/llvm/XRay/XRayRecord.h b/include/llvm/XRay/XRayRecord.h
index 76873447f170aff63c1f767f619e2b3fec8d0a81..7685ec95838a4ed67e1f2fd59cccd53cadac9c5a 100644
--- a/include/llvm/XRay/XRayRecord.h
+++ b/include/llvm/XRay/XRayRecord.h
@@ -17,6 +17,7 @@
 
 #include <cstdint>
 #include <vector>
+#include <string>
 
 namespace llvm {
 namespace xray {
@@ -54,10 +55,23 @@ struct XRayFileHeader {
 /// This may or may not correspond to actual record types in the raw trace (as
 /// the loader implementation may synthesize this information in the process of
 /// of loading).
-enum class RecordTypes { ENTER, EXIT, TAIL_EXIT, ENTER_ARG };
+enum class RecordTypes {
+  ENTER,
+  EXIT,
+  TAIL_EXIT,
+  ENTER_ARG,
+  CUSTOM_EVENT,
+  TYPED_EVENT
+};
 
+/// An XRayRecord is the denormalized view of data associated in a trace. These
+/// records may not correspond to actual entries in the raw traces, but they are
+/// the logical representation of records in a higher-level event log.
 struct XRayRecord {
-  /// The type of record.
+  /// RecordType values are used as "sub-types" which have meaning in the
+  /// context of the `Type` below. For function call and custom event records,
+  /// the RecordType is always 0, while for typed events we store the type in
+  /// the RecordType field.
   uint16_t RecordType;
 
   /// The CPU where the thread is running. We assume number of CPUs <= 65536.
@@ -66,7 +80,7 @@ struct XRayRecord {
   /// Identifies the type of record.
   RecordTypes Type;
 
-  /// The function ID for the record.
+  /// The function ID for the record, if this is a function call record.
   int32_t FuncId;
 
   /// Get the full 8 bytes of the TSC when we get the log record.
@@ -80,6 +94,9 @@ struct XRayRecord {
 
   /// The function call arguments.
   std::vector<uint64_t> CallArgs;
+
+  /// For custom and typed events, we provide the raw data from the trace.
+  std::string Data;
 };
 
 } // namespace xray
diff --git a/include/llvm/XRay/YAMLXRayRecord.h b/include/llvm/XRay/YAMLXRayRecord.h
index 0de9ea0968e69bb1e19836c3cae2619e90ffae2a..6150196ed98df939f1ac6354e8fc8965d872e1bb 100644
--- a/include/llvm/XRay/YAMLXRayRecord.h
+++ b/include/llvm/XRay/YAMLXRayRecord.h
@@ -39,6 +39,7 @@ struct YAMLXRayRecord {
   uint32_t TId;
   uint32_t PId;
   std::vector<uint64_t> CallArgs;
+  std::string Data;
 };
 
 struct YAMLXRayTrace {
@@ -58,6 +59,8 @@ template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
     IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
     IO.enumCase(Type, "function-tail-exit", xray::RecordTypes::TAIL_EXIT);
     IO.enumCase(Type, "function-enter-arg", xray::RecordTypes::ENTER_ARG);
+    IO.enumCase(Type, "custom-event", xray::RecordTypes::CUSTOM_EVENT);
+    IO.enumCase(Type, "typed-event", xray::RecordTypes::TYPED_EVENT);
   }
 };
 
@@ -73,16 +76,16 @@ template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
 
 template <> struct MappingTraits<xray::YAMLXRayRecord> {
   static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
-    // FIXME: Make this type actually be descriptive
     IO.mapRequired("type", Record.RecordType);
-    IO.mapRequired("func-id", Record.FuncId);
+    IO.mapOptional("func-id", Record.FuncId);
     IO.mapOptional("function", Record.Function);
     IO.mapOptional("args", Record.CallArgs);
     IO.mapRequired("cpu", Record.CPU);
-    IO.mapRequired("thread", Record.TId);
+    IO.mapOptional("thread", Record.TId, 0U);
     IO.mapOptional("process", Record.PId, 0U);
     IO.mapRequired("kind", Record.Type);
     IO.mapRequired("tsc", Record.TSC);
+    IO.mapOptional("data", Record.Data);
   }
 
   static constexpr bool flow = true;
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 9abbab87885c071b306c6a941c8c617911ad6ec1..c918eff2b9766db41f6415d09a5e18a5251b0c9b 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -52,6 +52,7 @@ module LLVM_BinaryFormat {
     textual header "BinaryFormat/ELFRelocs/i386.def"
     textual header "BinaryFormat/ELFRelocs/Lanai.def"
     textual header "BinaryFormat/ELFRelocs/Mips.def"
+    textual header "BinaryFormat/ELFRelocs/MSP430.def"
     textual header "BinaryFormat/ELFRelocs/PowerPC64.def"
     textual header "BinaryFormat/ELFRelocs/PowerPC.def"
     textual header "BinaryFormat/ELFRelocs/RISCV.def"
@@ -88,12 +89,14 @@ module LLVM_DebugInfo_PDB {
   // FIXME: There should be a better way to specify this.
   exclude header "DebugInfo/PDB/DIA/DIADataStream.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+  exclude header "DebugInfo/PDB/DIA/DIAEnumFrameData.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumSymbols.h"
   exclude header "DebugInfo/PDB/DIA/DIAEnumTables.h"
+  exclude header "DebugInfo/PDB/DIA/DIAFrameData.h"
   exclude header "DebugInfo/PDB/DIA/DIAInjectedSource.h"
   exclude header "DebugInfo/PDB/DIA/DIALineNumber.h"
   exclude header "DebugInfo/PDB/DIA/DIARawSymbol.h"
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 937437791d18d8fac87d51ba0b25723831c6964e..8ed48390818b988f001de71fd5f47f5760996a4b 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -640,28 +640,6 @@ AnalysisKey AAManager::Key;
 
 namespace {
 
-/// A wrapper pass for external alias analyses. This just squirrels away the
-/// callback used to run any analyses and register their results.
-struct ExternalAAWrapperPass : ImmutablePass {
-  using CallbackT = std::function<void(Pass &, Function &, AAResults &)>;
-
-  CallbackT CB;
-
-  static char ID;
-
-  ExternalAAWrapperPass() : ImmutablePass(ID) {
-    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  explicit ExternalAAWrapperPass(CallbackT CB)
-      : ImmutablePass(ID), CB(std::move(CB)) {
-    initializeExternalAAWrapperPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-  }
-};
 
 } // end anonymous namespace
 
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 0d0277e9c34e45673d621ee41f1a35d9b7e5cfc6..c152b0ddecae5e84c25990dad83f2ef565e66f03 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -114,10 +114,9 @@ void AliasSetTracker::removeAliasSet(AliasSet *AS) {
   if (AliasSet *Fwd = AS->Forward) {
     Fwd->dropRef(*this);
     AS->Forward = nullptr;
-  }
-
-  if (AS->Alias == AliasSet::SetMayAlias)
-    TotalMayAliasSetSize -= AS->size();
+  } else // Update TotalMayAliasSetSize only if not forwarding.
+      if (AS->Alias == AliasSet::SetMayAlias)
+        TotalMayAliasSetSize -= AS->size();
 
   AliasSets.erase(AS);
 }
@@ -232,8 +231,8 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst,
   if (AliasAny)
     return true;
 
-  if (!Inst->mayReadOrWriteMemory())
-    return false;
+  assert(Inst->mayReadOrWriteMemory() &&
+         "Instruction must either read or write memory.");
 
   for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
     if (auto *UnknownInst = getUnknownInst(i)) {
@@ -311,13 +310,6 @@ AliasSet *AliasSetTracker::mergeAliasSetsForPointer(const Value *Ptr,
   return FoundSet;
 }
 
-bool AliasSetTracker::containsUnknown(const Instruction *Inst) const {
-  for (const AliasSet &AS : *this)
-    if (!AS.Forward && AS.aliasesUnknownInst(Inst, AA))
-      return true;
-  return false;
-}
-
 AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
   AliasSet *FoundSet = nullptr;
   for (iterator I = begin(), E = end(); I != E;) {
@@ -326,7 +318,7 @@ AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
       continue;
     if (!FoundSet)            // If this is the first alias set ptr can go into.
       FoundSet = &*Cur;       // Remember it.
-    else if (!Cur->Forward)   // Otherwise, we must merge the sets.
+    else   // Otherwise, we must merge the sets.
       FoundSet->mergeSetIn(*Cur, *this);     // Merge in contents.
   }
   return FoundSet;
@@ -383,7 +375,7 @@ AliasSet &AliasSetTracker::getAliasSetFor(const MemoryLocation &MemLoc) {
 
 void AliasSetTracker::add(Value *Ptr, LocationSize Size,
                           const AAMDNodes &AAInfo) {
-  addPointer(Ptr, Size, AAInfo, AliasSet::NoAccess);
+  addPointer(MemoryLocation(Ptr, Size, AAInfo), AliasSet::NoAccess);
 }
 
 void AliasSetTracker::add(LoadInst *LI) {
@@ -518,8 +510,9 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
 
     // Loop over all of the pointers in this alias set.
     for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI)
-      addPointer(ASI.getPointer(), ASI.getSize(), ASI.getAAInfo(),
-                 (AliasSet::AccessLattice)AS.Access);
+      addPointer(
+          MemoryLocation(ASI.getPointer(), ASI.getSize(), ASI.getAAInfo()),
+          (AliasSet::AccessLattice)AS.Access);
   }
 }
 
@@ -612,10 +605,9 @@ AliasSet &AliasSetTracker::mergeAllAliasSets() {
   return *AliasAnyAS;
 }
 
-AliasSet &AliasSetTracker::addPointer(Value *P, LocationSize Size,
-                                      const AAMDNodes &AAInfo,
+AliasSet &AliasSetTracker::addPointer(MemoryLocation Loc,
                                       AliasSet::AccessLattice E) {
-  AliasSet &AS = getAliasSetFor(MemoryLocation(P, Size, AAInfo));
+  AliasSet &AS = getAliasSetFor(Loc);
   AS.Access |= E;
 
   if (!AliasAnyAS && (TotalMayAliasSetSize > SaturationThreshold)) {
@@ -649,7 +641,7 @@ void AliasSet::print(raw_ostream &OS) const {
     for (iterator I = begin(), E = end(); I != E; ++I) {
       if (I != begin()) OS << ", ";
       I.getPointer()->printAsOperand(OS << "(");
-      if (I.getSize() == MemoryLocation::UnknownSize)
+      if (I.getSize() == LocationSize::unknown())
         OS << ", unknown)";
       else 
         OS << ", " << I.getSize() << ")";
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 2f513004fe80c11f360adeabd28d6ac6dcefe7b3..b7aa395ab849cc0ab3de2b136ced39cd175c5dc8 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1019,8 +1019,8 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
 
   // If we don't know the size of the accesses through both GEPs, we can't
   // determine whether the struct fields accessed can't alias.
-  if (MaybeV1Size == MemoryLocation::UnknownSize ||
-      MaybeV2Size == MemoryLocation::UnknownSize)
+  if (MaybeV1Size == LocationSize::unknown() ||
+      MaybeV2Size == LocationSize::unknown())
     return MayAlias;
 
   const uint64_t V1Size = MaybeV1Size.getValue();
@@ -1184,8 +1184,7 @@ bool BasicAAResult::isGEPBaseAtNegativeOffset(const GEPOperator *GEPOp,
       const DecomposedGEP &DecompGEP, const DecomposedGEP &DecompObject,
       LocationSize MaybeObjectAccessSize) {
   // If the object access size is unknown, or the GEP isn't inbounds, bail.
-  if (MaybeObjectAccessSize == MemoryLocation::UnknownSize ||
-      !GEPOp->isInBounds())
+  if (MaybeObjectAccessSize == LocationSize::unknown() || !GEPOp->isInBounds())
     return false;
 
   const uint64_t ObjectAccessSize = MaybeObjectAccessSize.getValue();
@@ -1254,8 +1253,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
       return NoAlias;
     // Do the base pointers alias?
     AliasResult BaseAlias =
-        aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize, AAMDNodes(),
-                   UnderlyingV2, MemoryLocation::UnknownSize, AAMDNodes());
+        aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(),
+                   UnderlyingV2, LocationSize::unknown(), AAMDNodes());
 
     // Check for geps of non-aliasing underlying pointers where the offsets are
     // identical.
@@ -1314,13 +1313,12 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
     // pointer, we know they cannot alias.
 
     // If both accesses are unknown size, we can't do anything useful here.
-    if (V1Size == MemoryLocation::UnknownSize &&
-        V2Size == MemoryLocation::UnknownSize)
+    if (V1Size == LocationSize::unknown() && V2Size == LocationSize::unknown())
       return MayAlias;
 
-    AliasResult R = aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize,
-                               AAMDNodes(), V2, MemoryLocation::UnknownSize,
-                               V2AAInfo, nullptr, UnderlyingV2);
+    AliasResult R =
+        aliasCheck(UnderlyingV1, LocationSize::unknown(), AAMDNodes(), V2,
+                   LocationSize::unknown(), V2AAInfo, nullptr, UnderlyingV2);
     if (R != MustAlias) {
       // If V2 may alias GEP base pointer, conservatively returns MayAlias.
       // If V2 is known not to alias GEP base pointer, then the two values
@@ -1351,7 +1349,7 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
   // greater, we know they do not overlap.
   if (GEP1BaseOffset != 0 && DecompGEP1.VarIndices.empty()) {
     if (GEP1BaseOffset >= 0) {
-      if (V2Size != MemoryLocation::UnknownSize) {
+      if (V2Size != LocationSize::unknown()) {
         if ((uint64_t)GEP1BaseOffset < V2Size.getValue())
           return PartialAlias;
         return NoAlias;
@@ -1365,8 +1363,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
       // GEP1             V2
       // We need to know that V2Size is not unknown, otherwise we might have
       // stripped a gep with negative index ('gep <ptr>, -1, ...).
-      if (V1Size != MemoryLocation::UnknownSize &&
-          V2Size != MemoryLocation::UnknownSize) {
+      if (V1Size != LocationSize::unknown() &&
+          V2Size != LocationSize::unknown()) {
         if (-(uint64_t)GEP1BaseOffset < V1Size.getValue())
           return PartialAlias;
         return NoAlias;
@@ -1416,9 +1414,8 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
     // mod Modulo. Check whether that difference guarantees that the
     // two locations do not alias.
     uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1);
-    if (V1Size != MemoryLocation::UnknownSize &&
-        V2Size != MemoryLocation::UnknownSize &&
-        ModOffset >= V2Size.getValue() &&
+    if (V1Size != LocationSize::unknown() &&
+        V2Size != LocationSize::unknown() && ModOffset >= V2Size.getValue() &&
         V1Size.getValue() <= Modulo - ModOffset)
       return NoAlias;
 
@@ -1426,7 +1423,7 @@ BasicAAResult::aliasGEP(const GEPOperator *GEP1, LocationSize V1Size,
     // If GEP1BasePtr > V2 (GEP1BaseOffset > 0) then we know the pointers
     // don't alias if V2Size can fit in the gap between V2 and GEP1BasePtr.
     if (AllPositive && GEP1BaseOffset > 0 &&
-        V2Size != MemoryLocation::UnknownSize &&
+        V2Size != LocationSize::unknown() &&
         V2Size.getValue() <= (uint64_t)GEP1BaseOffset)
       return NoAlias;
 
@@ -1607,7 +1604,7 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
   // unknown to represent all the possible values the GEP could advance the
   // pointer to.
   if (isRecursive)
-    PNSize = MemoryLocation::UnknownSize;
+    PNSize = LocationSize::unknown();
 
   AliasResult Alias =
       aliasCheck(V2, V2Size, V2AAInfo, V1Srcs[0],
@@ -1864,8 +1861,8 @@ bool BasicAAResult::constantOffsetHeuristic(
     const SmallVectorImpl<VariableGEPIndex> &VarIndices,
     LocationSize MaybeV1Size, LocationSize MaybeV2Size, int64_t BaseOffset,
     AssumptionCache *AC, DominatorTree *DT) {
-  if (VarIndices.size() != 2 || MaybeV1Size == MemoryLocation::UnknownSize ||
-      MaybeV2Size == MemoryLocation::UnknownSize)
+  if (VarIndices.size() != 2 || MaybeV1Size == LocationSize::unknown() ||
+      MaybeV2Size == LocationSize::unknown())
     return false;
 
   const uint64_t V1Size = MaybeV1Size.getValue();
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 54a657073f0f7eb6483d273f8c4d47cd1d11d077..7f544b27fe9d6ab79f900b5baa3256e6af548e66 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -135,7 +135,7 @@ static const uint32_t IH_NONTAKEN_WEIGHT = 1;
 /// Add \p BB to PostDominatedByUnreachable set if applicable.
 void
 BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   if (TI->getNumSuccessors() == 0) {
     if (isa<UnreachableInst>(TI) ||
         // If this block is terminated by a call to
@@ -167,7 +167,7 @@ BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
 void
 BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
   assert(!PostDominatedByColdCall.count(BB));
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   if (TI->getNumSuccessors() == 0)
     return;
 
@@ -202,7 +202,7 @@ BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
 /// Predict that a successor which leads necessarily to an
 /// unreachable-terminated block as extremely unlikely.
 bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   (void) TI;
   assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
   assert(!isa<InvokeInst>(TI) &&
@@ -246,7 +246,7 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
 // heuristic. The probability of the edge coming to unreachable block is
 // set to min of metadata and unreachable heuristic.
 bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
   if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) || isa<IndirectBrInst>(TI)))
     return false;
@@ -348,7 +348,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
 /// Return true if we could compute the weights for cold edges.
 /// Return false, otherwise.
 bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   (void) TI;
   assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
   assert(!isa<InvokeInst>(TI) &&
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
index a319be8092f90dea91a35a2552bd0e65182e56b9..aa880a62b75464a8de4b77603aec0a7499e85c76 100644
--- a/lib/Analysis/CFG.cpp
+++ b/lib/Analysis/CFG.cpp
@@ -71,7 +71,7 @@ void llvm::FindFunctionBackedges(const Function &F,
 /// successor.
 unsigned llvm::GetSuccessorNumber(const BasicBlock *BB,
     const BasicBlock *Succ) {
-  const TerminatorInst *Term = BB->getTerminator();
+  const Instruction *Term = BB->getTerminator();
 #ifndef NDEBUG
   unsigned e = Term->getNumSuccessors();
 #endif
@@ -85,8 +85,9 @@ unsigned llvm::GetSuccessorNumber(const BasicBlock *BB,
 /// isCriticalEdge - Return true if the specified edge is a critical edge.
 /// Critical edges are edges from a block with multiple successors to a block
 /// with multiple predecessors.
-bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+bool llvm::isCriticalEdge(const Instruction *TI, unsigned SuccNum,
                           bool AllowIdenticalEdges) {
+  assert(TI->isTerminator() && "Must be a terminator to have successors!");
   assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
   if (TI->getNumSuccessors() == 1) return false;
 
diff --git a/lib/Analysis/CFLAndersAliasAnalysis.cpp b/lib/Analysis/CFLAndersAliasAnalysis.cpp
index b43b48eeef773c54243da3c5be924a1adb261798..1c61dd369a05fbe7b9a3081c57fd7f5c27b7301d 100644
--- a/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -556,9 +556,9 @@ bool CFLAndersAAResult::FunctionInfo::mayAlias(
                                       OffsetValue{RHS, 0}, Comparator);
 
     if (RangePair.first != RangePair.second) {
-      // Be conservative about UnknownSize
-      if (MaybeLHSSize == MemoryLocation::UnknownSize ||
-          MaybeRHSSize == MemoryLocation::UnknownSize)
+      // Be conservative about unknown sizes
+      if (MaybeLHSSize == LocationSize::unknown() ||
+          MaybeRHSSize == LocationSize::unknown())
         return true;
 
       const uint64_t LHSSize = MaybeLHSSize.getValue();
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 6fdbda4e03f9097e7670dcfd6fb630069f5306be..c33e2a88127261af7347223329d7f85cd6a9ed67 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis
   Delinearization.cpp
   DemandedBits.cpp
   DependenceAnalysis.cpp
+  DivergenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
   EHPersonalities.cpp
@@ -80,6 +81,7 @@ add_llvm_library(LLVMAnalysis
   ScalarEvolutionAliasAnalysis.cpp
   ScalarEvolutionExpander.cpp
   ScalarEvolutionNormalization.cpp
+  SyncDependenceAnalysis.cpp
   SyntheticCountsUtils.cpp
   TargetLibraryInfo.cpp
   TargetTransformInfo.cpp
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index c73250a384591256091772e8f099664eba690a9d..92b0555913758ff32837ba385c612686ef96f1f2 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1363,6 +1363,8 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
   case Intrinsic::fabs:
   case Intrinsic::minnum:
   case Intrinsic::maxnum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximum:
   case Intrinsic::log:
   case Intrinsic::log2:
   case Intrinsic::log10:
@@ -1424,6 +1426,7 @@ bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
   case Intrinsic::x86_avx512_vcvtsd2usi64:
   case Intrinsic::x86_avx512_cvttsd2usi:
   case Intrinsic::x86_avx512_cvttsd2usi64:
+  case Intrinsic::is_constant:
     return true;
   default:
     return false;
@@ -1598,11 +1601,32 @@ double getValueAsDouble(ConstantFP *Op) {
   return APF.convertToDouble();
 }
 
+static bool isManifestConstant(const Constant *c) {
+  if (isa<ConstantData>(c)) {
+    return true;
+  } else if (isa<ConstantAggregate>(c) || isa<ConstantExpr>(c)) {
+    for (const Value *subc : c->operand_values()) {
+      if (!isManifestConstant(cast<Constant>(subc)))
+        return false;
+    }
+    return true;
+  }
+  return false;
+}
+
 Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
                                  ArrayRef<Constant *> Operands,
                                  const TargetLibraryInfo *TLI,
                                  ImmutableCallSite CS) {
   if (Operands.size() == 1) {
+    if (IntrinsicID == Intrinsic::is_constant) {
+      // We know we have a "Constant" argument. But we want to only
+      // return true for manifest constants, not those that depend on
+      // constants with unknowable values, e.g. GlobalValue or BlockAddress.
+      if (isManifestConstant(Operands[0]))
+        return ConstantInt::getTrue(Ty->getContext());
+      return nullptr;
+    }
     if (isa<UndefValue>(Operands[0])) {
       // cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN
       if (IntrinsicID == Intrinsic::cos)
@@ -1912,6 +1936,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
           return ConstantFP::get(Ty->getContext(), maxnum(C1, C2));
         }
 
+        if (IntrinsicID == Intrinsic::minimum) {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          return ConstantFP::get(Ty->getContext(), minimum(C1, C2));
+        }
+
+        if (IntrinsicID == Intrinsic::maximum) {
+          const APFloat &C1 = Op1->getValueAPF();
+          const APFloat &C2 = Op2->getValueAPF();
+          return ConstantFP::get(Ty->getContext(), maximum(C1, C2));
+        }
+
         if (!TLI)
           return nullptr;
         if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index 79c2728d5620cb26123a1e663fcb5ebf2873268a..b544ae5f535d3abf27249771db6e8236a67bfc77 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -633,8 +633,8 @@ static AliasResult underlyingObjectsAlias(AliasAnalysis *AA,
                                           const MemoryLocation &LocB) {
   // Check the original locations (minus size) for noalias, which can happen for
   // tbaa, incompatible underlying object locations, etc.
-  MemoryLocation LocAS(LocA.Ptr, MemoryLocation::UnknownSize, LocA.AATags);
-  MemoryLocation LocBS(LocB.Ptr, MemoryLocation::UnknownSize, LocB.AATags);
+  MemoryLocation LocAS(LocA.Ptr, LocationSize::unknown(), LocA.AATags);
+  MemoryLocation LocBS(LocB.Ptr, LocationSize::unknown(), LocB.AATags);
   if (AA->alias(LocAS, LocBS) == NoAlias)
     return NoAlias;
 
diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..de47445c5e01286c39f0c101e88a516bc201b109
--- /dev/null
+++ b/lib/Analysis/DivergenceAnalysis.cpp
@@ -0,0 +1,424 @@
+//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a general divergence analysis for loop vectorization
+// and GPU programs. It determines which branches and values in a loop or GPU
+// program are divergent. It can help branch optimizations such as jump
+// threading and loop unswitching to make better decisions.
+//
+// GPU programs typically use the SIMD execution model, where multiple threads
+// in the same execution group have to execute in lock-step. Therefore, if the
+// code contains divergent branches (i.e., threads in a group do not agree on
+// which path of the branch to take), the group of threads has to execute all
+// the paths from that branch with different subsets of threads enabled until
+// they re-converge.
+//
+// Due to this execution model, some optimizations such as jump
+// threading and loop unswitching can interfere with thread re-convergence.
+// Therefore, an analysis that computes which branches in a GPU program are
+// divergent can help the compiler to selectively run these optimizations.
+//
+// This implementation is derived from the Vectorization Analysis of the
+// Region Vectorizer (RV). That implementation in turn is based on the approach
+// described in
+//
+//   Improving Performance of OpenCL on CPUs
+//   Ralf Karrenberg and Sebastian Hack
+//   CC '12
+//
+// This DivergenceAnalysis implementation is generic in the sense that it does
+// not itself identify original sources of divergence.
+// Instead specialized adapter classes, (LoopDivergenceAnalysis) for loops and
+// (GPUDivergenceAnalysis) for GPU programs, identify the sources of divergence
+// (e.g., special variables that hold the thread ID or the iteration variable).
+//
+// The generic implementation propagates divergence to variables that are data
+// or sync dependent on a source of divergence.
+//
+// While data dependency is a well-known concept, the notion of sync dependency
+// is worth more explanation. Sync dependence characterizes the control flow
+// aspect of the propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// The sync dependence detection (which branch induces divergence in which join
+// points) is implemented in the SyncDependenceAnalysis.
+//
+// The current DivergenceAnalysis implementation has the following limitations:
+// 1. intra-procedural. It conservatively considers the arguments of a
+//    non-kernel-entry function and the return value of a function call as
+//    divergent.
+// 2. memory as black box. It conservatively considers values loaded from
+//    generic or local address as divergent. This can be improved by leveraging
+//    pointer analysis and/or by modelling non-escaping memory objects in SSA
+//    as done in RV.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "divergence-analysis"
+
+// class DivergenceAnalysis
+DivergenceAnalysis::DivergenceAnalysis(
+    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
+    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
+    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
+      IsLCSSAForm(IsLCSSAForm) {}
+
+void DivergenceAnalysis::markDivergent(const Value &DivVal) {
+  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
+  assert(!isAlwaysUniform(DivVal) && "cannot be a divergent");
+  DivergentValues.insert(&DivVal);
+}
+
+void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
+  UniformOverrides.insert(&UniVal);
+}
+
+bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
+  if (Term.getNumSuccessors() <= 1)
+    return false;
+  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
+    assert(BranchTerm->isConditional());
+    return isDivergent(*BranchTerm->getCondition());
+  }
+  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
+    return isDivergent(*SwitchTerm->getCondition());
+  }
+  if (isa<InvokeInst>(Term)) {
+    return false; // ignore abnormal executions through landingpad
+  }
+
+  llvm_unreachable("unexpected terminator");
+}
+
+bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
+  // TODO function calls with side effects, etc
+  for (const auto &Op : I.operands()) {
+    if (isDivergent(*Op))
+      return true;
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
+                                             const Value &Val) const {
+  const auto *Inst = dyn_cast<const Instruction>(&Val);
+  if (!Inst)
+    return false;
+  // check whether any divergent loop carrying Val terminates before control
+  // proceeds to ObservingBlock
+  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
+       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
+       Loop = Loop->getParentLoop()) {
+    if (DivergentLoops.find(Loop) != DivergentLoops.end())
+      return true;
+  }
+
+  return false;
+}
+
+bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
+  // joining divergent disjoint path in Phi parent block
+  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
+    return true;
+  }
+
+  // An incoming value could be divergent by itself.
+  // Otherwise, an incoming value could be uniform within the loop
+  // that carries its definition but it may appear divergent
+  // from outside the loop. This happens when divergent loop exits
+  // drop definitions of that uniform value in different iterations.
+  //
+  // for (int i = 0; i < n; ++i) { // 'i' is uniform inside the loop
+  //   if (i % thread_id == 0) break;    // divergent loop exit
+  // }
+  // int divI = i;                 // divI is divergent
+  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
+    const auto *InVal = Phi.getIncomingValue(i);
+    if (isDivergent(*Phi.getIncomingValue(i)) ||
+        isTemporalDivergent(*Phi.getParent(), *InVal)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool DivergenceAnalysis::inRegion(const Instruction &I) const {
+  return I.getParent() && inRegion(*I.getParent());
+}
+
+bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
+  return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+}
+
+// marks all users of loop-carried values of the loop headed by LoopHeader as
+// divergent
+void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
+  auto *DivLoop = LI.getLoopFor(&LoopHeader);
+  assert(DivLoop && "loopHeader is not actually part of a loop");
+
+  SmallVector<BasicBlock *, 8> TaintStack;
+  DivLoop->getExitBlocks(TaintStack);
+
+  // Otherwise potential users of loop-carried values could be anywhere in the
+  // dominance region of DivLoop (including its fringes for phi nodes)
+  DenseSet<const BasicBlock *> Visited;
+  for (auto *Block : TaintStack) {
+    Visited.insert(Block);
+  }
+  Visited.insert(&LoopHeader);
+
+  while (!TaintStack.empty()) {
+    auto *UserBlock = TaintStack.back();
+    TaintStack.pop_back();
+
+    // don't spread divergence beyond the region
+    if (!inRegion(*UserBlock))
+      continue;
+
+    assert(!DivLoop->contains(UserBlock) &&
+           "irreducible control flow detected");
+
+    // phi nodes at the fringes of the dominance region
+    if (!DT.dominates(&LoopHeader, UserBlock)) {
+      // all PHI nodes of UserBlock become divergent
+      for (auto &Phi : UserBlock->phis()) {
+        Worklist.push_back(&Phi);
+      }
+      continue;
+    }
+
+    // taint outside users of values carried by DivLoop
+    for (auto &I : *UserBlock) {
+      if (isAlwaysUniform(I))
+        continue;
+      if (isDivergent(I))
+        continue;
+
+      for (auto &Op : I.operands()) {
+        auto *OpInst = dyn_cast<Instruction>(&Op);
+        if (!OpInst)
+          continue;
+        if (DivLoop->contains(OpInst->getParent())) {
+          markDivergent(I);
+          pushUsers(I);
+          break;
+        }
+      }
+    }
+
+    // visit all blocks in the dominance region
+    for (auto *SuccBlock : successors(UserBlock)) {
+      if (!Visited.insert(SuccBlock).second) {
+        continue;
+      }
+      TaintStack.push_back(SuccBlock);
+    }
+  }
+}
+
+void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
+  for (const auto &Phi : Block.phis()) {
+    if (isDivergent(Phi))
+      continue;
+    Worklist.push_back(&Phi);
+  }
+}
+
+void DivergenceAnalysis::pushUsers(const Value &V) {
+  for (const auto *User : V.users()) {
+    const auto *UserInst = dyn_cast<const Instruction>(User);
+    if (!UserInst)
+      continue;
+
+    if (isDivergent(*UserInst))
+      continue;
+
+    // only compute divergent inside loop
+    if (!inRegion(*UserInst))
+      continue;
+    Worklist.push_back(UserInst);
+  }
+}
+
+bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
+                                                 const Loop *BranchLoop) {
+  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");
+
+  // ignore divergence outside the region
+  if (!inRegion(JoinBlock)) {
+    return false;
+  }
+
+  // push non-divergent phi nodes in JoinBlock to the worklist
+  pushPHINodes(JoinBlock);
+
+  // JoinBlock is a divergent loop exit
+  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
+    return true;
+  }
+
+  // disjoint-paths divergent at JoinBlock
+  markBlockJoinDivergent(JoinBlock);
+  return false;
+}
+
+void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
+  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");
+
+  markDivergent(Term);
+
+  const auto *BranchLoop = LI.getLoopFor(Term.getParent());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint from Term within the loop
+  // also iterates over loop exits that become divergent due to Term.
+  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // Branch loop is a divergent loop due to the divergent branch in Term
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
+  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");
+
+  // don't propagate beyond region
+  if (!inRegion(*ExitingLoop.getHeader()))
+    return;
+
+  const auto *BranchLoop = ExitingLoop.getParentLoop();
+
+  // Uses of loop-carried values could occur anywhere
+  // within the dominance region of the definition. All loop-carried
+  // definitions are dominated by the loop header (reducible control).
+  // Thus all users have to be in the dominance region of the loop header,
+  // except PHI nodes that can also live at the fringes of the dom region
+  // (incoming defining value).
+  if (!IsLCSSAForm)
+    taintLoopLiveOuts(*ExitingLoop.getHeader());
+
+  // whether there is a divergent loop exit from BranchLoop (if any)
+  bool IsBranchLoopDivergent = false;
+
+  // iterate over all blocks reachable by disjoint paths from exits of
+  // ExitingLoop also iterates over loop exits (of BranchLoop) that in turn
+  // become divergent.
+  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
+    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
+  }
+
+  // Branch loop is a divergent due to divergent loop exit in ExitingLoop
+  if (IsBranchLoopDivergent) {
+    assert(BranchLoop);
+    if (!DivergentLoops.insert(BranchLoop).second) {
+      return;
+    }
+    propagateLoopDivergence(*BranchLoop);
+  }
+}
+
+void DivergenceAnalysis::compute() {
+  for (auto *DivVal : DivergentValues) {
+    pushUsers(*DivVal);
+  }
+
+  // propagate divergence
+  while (!Worklist.empty()) {
+    const Instruction &I = *Worklist.back();
+    Worklist.pop_back();
+
+    // maintain uniformity of overrides
+    if (isAlwaysUniform(I))
+      continue;
+
+    bool WasDivergent = isDivergent(I);
+    if (WasDivergent)
+      continue;
+
+    // propagate divergence caused by terminator
+    if (I.isTerminator()) {
+      if (updateTerminator(I)) {
+        // propagate control divergence to affected instructions
+        propagateBranchDivergence(I);
+        continue;
+      }
+    }
+
+    // update divergence of I due to divergent operands
+    bool DivergentUpd = false;
+    const auto *Phi = dyn_cast<const PHINode>(&I);
+    if (Phi) {
+      DivergentUpd = updatePHINode(*Phi);
+    } else {
+      DivergentUpd = updateNormalInstruction(I);
+    }
+
+    // propagate value divergence to users
+    if (DivergentUpd) {
+      markDivergent(I);
+      pushUsers(I);
+    }
+  }
+}
+
+bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
+  return UniformOverrides.find(&V) != UniformOverrides.end();
+}
+
+bool DivergenceAnalysis::isDivergent(const Value &V) const {
+  return DivergentValues.find(&V) != DivergentValues.end();
+}
+
+void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
+  if (DivergentValues.empty())
+    return;
+  // iterate instructions using instructions() to ensure a deterministic order.
+  for (auto &I : instructions(F)) {
+    if (isDivergent(I))
+      OS << "DIVERGENT:" << I << '\n';
+  }
+}
diff --git a/lib/Analysis/EHPersonalities.cpp b/lib/Analysis/EHPersonalities.cpp
index 2d35a3fa9118d8c4e16a4e5c989ee8ee6d8cda2c..0df73aeebbdc2f0eea5fef145a443ace1456bda2 100644
--- a/lib/Analysis/EHPersonalities.cpp
+++ b/lib/Analysis/EHPersonalities.cpp
@@ -120,7 +120,7 @@ DenseMap<BasicBlock *, ColorVector> llvm::colorEHFunclets(Function &F) {
                            << "\'.\n");
 
     BasicBlock *SuccColor = Color;
-    TerminatorInst *Terminator = Visiting->getTerminator();
+    Instruction *Terminator = Visiting->getTerminator();
     if (auto *CatchRet = dyn_cast<CatchReturnInst>(Terminator)) {
       Value *ParentPad = CatchRet->getCatchSwitchParentPad();
       if (isa<ConstantTokenNone>(ParentPad))
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index fb032e0404cb38ee0ccea7e22d0763d66af4e22b..a3347dbcb93350dbe9d6519bf4a96f900712e09c 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -23,6 +23,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -30,6 +31,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/InstVisitor.h"
@@ -720,6 +722,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
   case Instruction::FPToSI:
     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
       Cost += InlineConstants::CallPenalty;
+    break;
   default:
     break;
   }
@@ -1831,7 +1834,7 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
     if (!IR)
       return IR;
 
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
 
     // Add in the live successors by first checking whether we have terminator
     // that may be simplified based on the values simplified by this call.
@@ -1884,6 +1887,24 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
     return "noduplicate";
 
+  // Loops generally act a lot like calls in that they act like barriers to
+  // movement, require a certain amount of setup, etc. So when optimising for
+  // size, we penalise any call sites that perform loops. We do this after all
+  // other costs here, so will likely only be dealing with relatively small
+  // functions (and hence DT and LI will hopefully be cheap).
+  if (Caller->optForMinSize()) {
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+    int NumLoops = 0;
+    for (Loop *L : LI) {
+      // Ignore loops that will not be executed
+      if (DeadBlocks.count(L->getHeader()))
+        continue;
+      NumLoops++;
+    }
+    Cost += NumLoops * InlineConstants::CallPenalty;
+  }
+
   // We applied the maximum possible vector bonus at the beginning. Now,
   // subtract the excess bonus, if any, from the Threshold before
   // comparing against Cost.
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 86f5652f830648bc6e10e2700ddc9a3ffc2f357e..fd6f4ba476eedf329ab48c639309c2905e63c7c9 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -2996,6 +2996,44 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
   return nullptr;
 }
 
+static Value *simplifyICmpWithAbsNabs(CmpInst::Predicate Pred, Value *Op0,
+                                      Value *Op1) {
+  // We need a comparison with a constant.
+  const APInt *C;
+  if (!match(Op1, m_APInt(C)))
+    return nullptr;
+
+  // matchSelectPattern returns the negation part of an abs pattern in SP1.
+  // If the negate has an NSW flag, abs(INT_MIN) is undefined. Without that
+  // constraint, we can't make a contiguous range for the result of abs.
+  ICmpInst::Predicate AbsPred = ICmpInst::BAD_ICMP_PREDICATE;
+  Value *SP0, *SP1;
+  SelectPatternFlavor SPF = matchSelectPattern(Op0, SP0, SP1).Flavor;
+  if (SPF == SelectPatternFlavor::SPF_ABS &&
+      cast<Instruction>(SP1)->hasNoSignedWrap())
+    // The result of abs(X) is >= 0 (with nsw).
+    AbsPred = ICmpInst::ICMP_SGE;
+  if (SPF == SelectPatternFlavor::SPF_NABS)
+    // The result of -abs(X) is <= 0.
+    AbsPred = ICmpInst::ICMP_SLE;
+
+  if (AbsPred == ICmpInst::BAD_ICMP_PREDICATE)
+    return nullptr;
+
+  // If there is no intersection between abs/nabs and the range of this icmp,
+  // the icmp must be false. If the abs/nabs range is a subset of the icmp
+  // range, the icmp must be true.
+  APInt Zero = APInt::getNullValue(C->getBitWidth());
+  ConstantRange AbsRange = ConstantRange::makeExactICmpRegion(AbsPred, Zero);
+  ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(Pred, *C);
+  if (AbsRange.intersectWith(CmpRange).isEmptySet())
+    return getFalse(GetCompareTy(Op0));
+  if (CmpRange.contains(AbsRange))
+    return getTrue(GetCompareTy(Op0));
+
+  return nullptr;
+}
+
 /// Simplify integer comparisons where at least one operand of the compare
 /// matches an integer min/max idiom.
 static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
@@ -3427,6 +3465,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
   if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse))
     return V;
 
+  if (Value *V = simplifyICmpWithAbsNabs(Pred, LHS, RHS))
+    return V;
+
   // Simplify comparisons of related pointers using a powerful, recursive
   // GEP-walk when we have target data available..
   if (LHS->getType()->isPointerTy())
@@ -3570,12 +3611,19 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
     if (C->isZero()) {
       switch (Pred) {
+      case FCmpInst::FCMP_OGE:
+        if (FMF.noNaNs() && CannotBeOrderedLessThanZero(LHS, Q.TLI))
+          return getTrue(RetTy);
+        break;
       case FCmpInst::FCMP_UGE:
         if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
           return getTrue(RetTy);
         break;
+      case FCmpInst::FCMP_ULT:
+        if (FMF.noNaNs() && CannotBeOrderedLessThanZero(LHS, Q.TLI))
+          return getFalse(RetTy);
+        break;
       case FCmpInst::FCMP_OLT:
-        // X < 0
         if (CannotBeOrderedLessThanZero(LHS, Q.TLI))
           return getFalse(RetTy);
         break;
@@ -3826,6 +3874,34 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
   return nullptr;
 }
 
+/// Try to simplify a select instruction when its condition operand is a
+/// floating-point comparison.
+static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F) {
+  FCmpInst::Predicate Pred;
+  if (!match(Cond, m_FCmp(Pred, m_Specific(T), m_Specific(F))) &&
+      !match(Cond, m_FCmp(Pred, m_Specific(F), m_Specific(T))))
+    return nullptr;
+
+  // TODO: The transform may not be valid with -0.0. An incomplete way of
+  // testing for that possibility is to check if at least one operand is a
+  // non-zero constant.
+  const APFloat *C;
+  if ((match(T, m_APFloat(C)) && C->isNonZero()) ||
+      (match(F, m_APFloat(C)) && C->isNonZero())) {
+    // (T == F) ? T : F --> F
+    // (F == T) ? T : F --> F
+    if (Pred == FCmpInst::FCMP_OEQ)
+      return F;
+
+    // (T != F) ? T : F --> T
+    // (F != T) ? T : F --> T
+    if (Pred == FCmpInst::FCMP_UNE)
+      return T;
+  }
+
+  return nullptr;
+}
+
 /// Given operands for a SelectInst, see if we can fold the result.
 /// If not, this returns null.
 static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
@@ -3862,6 +3938,9 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
           simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse))
     return V;
 
+  if (Value *V = simplifySelectWithFCmp(Cond, TrueVal, FalseVal))
+    return V;
+
   if (Value *V = foldSelectWithBinaryOp(Cond, TrueVal, FalseVal))
     return V;
 
@@ -4827,13 +4906,24 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
     }
     break;
   case Intrinsic::maxnum:
-  case Intrinsic::minnum: {
+  case Intrinsic::minnum:
+  case Intrinsic::maximum:
+  case Intrinsic::minimum: {
     // If the arguments are the same, this is a no-op.
     if (Op0 == Op1) return Op0;
 
-    // If one argument is NaN or undef, return the other argument.
-    if (match(Op0, m_CombineOr(m_NaN(), m_Undef()))) return Op1;
-    if (match(Op1, m_CombineOr(m_NaN(), m_Undef()))) return Op0;
+    // If one argument is undef, return the other argument.
+    if (match(Op0, m_Undef()))
+      return Op1;
+    if (match(Op1, m_Undef()))
+      return Op0;
+
+    // If one argument is NaN, return other or NaN appropriately.
+    bool PropagateNaN = IID == Intrinsic::minimum || IID == Intrinsic::maximum;
+    if (match(Op0, m_NaN()))
+      return PropagateNaN ? Op0 : Op1;
+    if (match(Op1, m_NaN()))
+      return PropagateNaN ? Op1 : Op0;
 
     // Min/max of the same operation with common operand:
     // m(m(X, Y)), X --> m(X, Y) (4 commuted variants)
@@ -4846,9 +4936,9 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
           (M1->getOperand(0) == Op0 || M1->getOperand(1) == Op0))
         return Op1;
 
-    // minnum(X, -Inf) --> -Inf (and commuted variant)
-    // maxnum(X, +Inf) --> +Inf (and commuted variant)
-    bool UseNegInf = IID == Intrinsic::minnum;
+    // min(X, -Inf) --> -Inf (and commuted variant)
+    // max(X, +Inf) --> +Inf (and commuted variant)
+    bool UseNegInf = IID == Intrinsic::minnum || IID == Intrinsic::minimum;
     const APFloat *C;
     if ((match(Op0, m_APFloat(C)) && C->isInfinity() &&
          C->isNegative() == UseNegInf) ||
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index b1d585bfc683fdf4cc15beb8a9fa2c886aac8975..3f22ada803c9f404acce8709e32c92cc480de944 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -619,7 +619,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(
 
   // If the merge range is empty, then adding the edge didn't actually form any
   // new cycles. We're done.
-  if (MergeRange.begin() == MergeRange.end()) {
+  if (empty(MergeRange)) {
     // Now that the SCC structure is finalized, flip the kind to call.
     SourceN->setEdgeKind(TargetN, Edge::Call);
     return false; // No new cycle.
diff --git a/lib/Analysis/LegacyDivergenceAnalysis.cpp b/lib/Analysis/LegacyDivergenceAnalysis.cpp
index c417862524e4e0a144cbcd9c82e83ba988499a50..2089d1c53d0de298274ece2ac3549bf465ced6a9 100644
--- a/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ b/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -93,7 +93,7 @@ private:
   // A helper function that explores data dependents of V.
   void exploreDataDependency(Value *V);
   // A helper function that explores sync dependents of TI.
-  void exploreSyncDependency(TerminatorInst *TI);
+  void exploreSyncDependency(Instruction *TI);
   // Computes the influence region from Start to End. This region includes all
   // basic blocks on any simple path from Start to End.
   void computeInfluenceRegion(BasicBlock *Start, BasicBlock *End,
@@ -128,7 +128,7 @@ void DivergencePropagator::populateWithSourcesOfDivergence() {
   }
 }
 
-void DivergencePropagator::exploreSyncDependency(TerminatorInst *TI) {
+void DivergencePropagator::exploreSyncDependency(Instruction *TI) {
   // Propagation rule 1: if branch TI is divergent, all PHINodes in TI's
   // immediate post dominator are divergent. This rule handles if-then-else
   // patterns. For example,
@@ -252,11 +252,11 @@ void DivergencePropagator::propagate() {
   while (!Worklist.empty()) {
     Value *V = Worklist.back();
     Worklist.pop_back();
-    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(V)) {
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
       // Terminators with less than two successors won't introduce sync
       // dependency. Ignore them.
-      if (TI->getNumSuccessors() > 1)
-        exploreSyncDependency(TI);
+      if (I->isTerminator() && I->getNumSuccessors() > 1)
+        exploreSyncDependency(I);
     }
     exploreDataDependency(V);
   }
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index 8312a0d1cff5779daf05d2c71e27528a7f398055..4b8e8afdabbd437c64d058c08ac0ef8f4de5c62a 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -509,7 +509,7 @@ public:
   /// Register a load  and whether it is only read from.
   void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
     Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
+    AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
     Accesses.insert(MemAccessInfo(Ptr, false));
     if (IsReadOnly)
       ReadOnlyPtr.insert(Ptr);
@@ -518,7 +518,7 @@ public:
   /// Register a store.
   void addStore(MemoryLocation &Loc) {
     Value *Ptr = const_cast<Value*>(Loc.Ptr);
-    AST.add(Ptr, MemoryLocation::UnknownSize, Loc.AATags);
+    AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
     Accesses.insert(MemAccessInfo(Ptr, true));
   }
 
@@ -1869,13 +1869,9 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
   for (StoreInst *ST : Stores) {
     Value *Ptr = ST->getPointerOperand();
 
-    if (isUniform(Ptr)) {
-      // Consider multiple stores to the same uniform address as a store of a
-      // variant value.
-      bool MultipleStoresToUniformPtr = !UniformStores.insert(Ptr).second;
-      HasVariantStoreToLoopInvariantAddress |=
-          (!isUniform(ST->getValueOperand()) || MultipleStoresToUniformPtr);
-    }
+    if (isUniform(Ptr))
+      HasMultipleStoresToLoopInvariantAddress |=
+          !UniformStores.insert(Ptr).second;
 
     // If we did *not* see this pointer before, insert it to  the read-write
     // list. At this phase it is only a 'write' list.
@@ -2276,7 +2272,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
       PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
       NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
-      HasVariantStoreToLoopInvariantAddress(false) {
+      HasMultipleStoresToLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
     analyzeLoop(AA, LI, TLI, DT);
 }
@@ -2308,8 +2304,8 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
   PtrRtChecking->print(OS, Depth);
   OS << "\n";
 
-  OS.indent(Depth) << "Variant Store to invariant address was "
-                   << (HasVariantStoreToLoopInvariantAddress ? "" : "not ")
+  OS.indent(Depth) << "Multiple stores to invariant address were "
+                   << (HasMultipleStoresToLoopInvariantAddress ? "" : "not ")
                    << "found in loop.\n";
 
   OS.indent(Depth) << "SCEV assumptions:\n";
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 99ff25a3fd37d6791b88cd4ffb4c2a238d12fb98..4b174b66d1e12aa55122d17bfdae7286bcb1444b 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -219,7 +219,7 @@ MDNode *Loop::getLoopID() const {
   SmallVector<BasicBlock *, 4> LatchesBlocks;
   getLoopLatches(LatchesBlocks);
   for (BasicBlock *BB : LatchesBlocks) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     MDNode *MD = TI->getMetadata(LLVMContext::MD_loop);
 
     if (!MD)
@@ -250,7 +250,7 @@ void Loop::setLoopID(MDNode *LoopID) const {
          "The loop should have no single latch at this point");
   BasicBlock *H = getHeader();
   for (BasicBlock *BB : this->blocks()) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     for (BasicBlock *Successor : successors(TI)) {
       if (Successor == H)
         TI->setMetadata(LLVMContext::MD_loop, LoopID);
diff --git a/lib/Analysis/MemoryLocation.cpp b/lib/Analysis/MemoryLocation.cpp
index 3cd4b4475ef6fb2861e358c6413f39d234e99f02..c0605f6ad3797adce2f9552411b52932cf48193d 100644
--- a/lib/Analysis/MemoryLocation.cpp
+++ b/lib/Analysis/MemoryLocation.cpp
@@ -55,7 +55,8 @@ MemoryLocation MemoryLocation::get(const VAArgInst *VI) {
   AAMDNodes AATags;
   VI->getAAMetadata(AATags);
 
-  return MemoryLocation(VI->getPointerOperand(), UnknownSize, AATags);
+  return MemoryLocation(VI->getPointerOperand(), LocationSize::unknown(),
+                        AATags);
 }
 
 MemoryLocation MemoryLocation::get(const AtomicCmpXchgInst *CXI) {
@@ -87,7 +88,7 @@ MemoryLocation MemoryLocation::getForSource(const AtomicMemTransferInst *MTI) {
 }
 
 MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) {
-  uint64_t Size = UnknownSize;
+  uint64_t Size = MemoryLocation::UnknownSize;
   if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength()))
     Size = C->getValue().getZExtValue();
 
@@ -108,7 +109,7 @@ MemoryLocation MemoryLocation::getForDest(const AtomicMemIntrinsic *MI) {
 }
 
 MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) {
-  uint64_t Size = UnknownSize;
+  uint64_t Size = MemoryLocation::UnknownSize;
   if (ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength()))
     Size = C->getValue().getZExtValue();
 
@@ -189,5 +190,6 @@ MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
   }
   // FIXME: Handle memset_pattern4 and memset_pattern8 also.
 
-  return MemoryLocation(CS.getArgument(ArgIdx), UnknownSize, AATags);
+  return MemoryLocation(CS.getArgument(ArgIdx), LocationSize::unknown(),
+                        AATags);
 }
diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp
index 51a5733a3efd65d3f1303dfafeaa40d6ee9f312c..880dc2f278522688bc4e2f310685933b9b514c0d 100644
--- a/lib/Analysis/MemorySSAUpdater.cpp
+++ b/lib/Analysis/MemorySSAUpdater.cpp
@@ -1104,7 +1104,7 @@ void MemorySSAUpdater::removeBlocks(
     const SmallPtrSetImpl<BasicBlock *> &DeadBlocks) {
   // First delete all uses of BB in MemoryPhis.
   for (BasicBlock *BB : DeadBlocks) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     assert(TI && "Basic block expected to have a terminator instruction");
     for (BasicBlock *Succ : successors(TI))
       if (!DeadBlocks.count(Succ))
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index bca40043fd9f2b959f13dafad3d92e5bc5969682..29b96ac746b2007fabfbeee7824439a39b7d1e0d 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -74,9 +74,17 @@ cl::opt<FunctionSummary::ForceSummaryHotnessType, true> FSEC(
 // Walk through the operands of a given User via worklist iteration and populate
 // the set of GlobalValue references encountered. Invoked either on an
 // Instruction or a GlobalVariable (which walks its initializer).
-static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
+// Return true if any of the operands contains blockaddress. This is important
+// to know when computing summary for global var, because if global variable
+// references basic block address we can't import it separately from function
+// containing that basic block. For simplicity we currently don't import such
+// global vars at all. When importing function we aren't interested if any 
+// instruction in it takes an address of any basic block, because instruction
+// can only take an address of basic block located in the same function.
+static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
                          SetVector<ValueInfo> &RefEdges,
                          SmallPtrSet<const User *, 8> &Visited) {
+  bool HasBlockAddress = false;
   SmallVector<const User *, 32> Worklist;
   Worklist.push_back(CurUser);
 
@@ -92,8 +100,10 @@ static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
       const User *Operand = dyn_cast<User>(OI);
       if (!Operand)
         continue;
-      if (isa<BlockAddress>(Operand))
+      if (isa<BlockAddress>(Operand)) {
+        HasBlockAddress = true;
         continue;
+      }
       if (auto *GV = dyn_cast<GlobalValue>(Operand)) {
         // We have a reference to a global value. This should be added to
         // the reference set unless it is a callee. Callees are handled
@@ -105,6 +115,7 @@ static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
       Worklist.push_back(Operand);
     }
   }
+  return HasBlockAddress;
 }
 
 static CalleeInfo::HotnessType getHotness(uint64_t ProfileCount,
@@ -339,20 +350,18 @@ static void computeFunctionSummary(
 
   bool NonRenamableLocal = isNonRenamableLocal(F);
   bool NotEligibleForImport =
-      NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
-      // Inliner doesn't handle variadic functions.
-      // FIXME: refactor this to use the same code that inliner is using.
-      F.isVarArg() ||
-      // Don't try to import functions with noinline attribute.
-      F.getAttributes().hasFnAttribute(Attribute::NoInline);
+      NonRenamableLocal || HasInlineAsmMaybeReferencingInternal;
   GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
                                     /* Live = */ false, F.isDSOLocal());
   FunctionSummary::FFlags FunFlags{
       F.hasFnAttribute(Attribute::ReadNone),
       F.hasFnAttribute(Attribute::ReadOnly),
-      F.hasFnAttribute(Attribute::NoRecurse),
-      F.returnDoesNotAlias(),
-  };
+      F.hasFnAttribute(Attribute::NoRecurse), F.returnDoesNotAlias(),
+      // Inliner doesn't handle variadic functions.
+      // FIXME: refactor this to use the same code that inliner is using.
+      F.isVarArg() ||
+          // Don't try to import functions with noinline attribute.
+          F.getAttributes().hasFnAttribute(Attribute::NoInline)};
   auto FuncSummary = llvm::make_unique<FunctionSummary>(
       Flags, NumInsts, FunFlags, RefEdges.takeVector(),
       CallGraphEdges.takeVector(), TypeTests.takeVector(),
@@ -369,7 +378,7 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
                        DenseSet<GlobalValue::GUID> &CantBePromoted) {
   SetVector<ValueInfo> RefEdges;
   SmallPtrSet<const User *, 8> Visited;
-  findRefEdges(Index, &V, RefEdges, Visited);
+  bool HasBlockAddress = findRefEdges(Index, &V, RefEdges, Visited);
   bool NonRenamableLocal = isNonRenamableLocal(V);
   GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
                                     /* Live = */ false, V.isDSOLocal());
@@ -377,6 +386,8 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
       llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
   if (NonRenamableLocal)
     CantBePromoted.insert(V.getGUID());
+  if (HasBlockAddress)
+    GVarSummary->setNotEligibleToImport();
   Index.addGlobalValueSummary(V, std::move(GVarSummary));
 }
 
@@ -465,7 +476,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                         F->hasFnAttribute(Attribute::ReadNone),
                         F->hasFnAttribute(Attribute::ReadOnly),
                         F->hasFnAttribute(Attribute::NoRecurse),
-                        F->returnDoesNotAlias()},
+                        F->returnDoesNotAlias(),
+                        /* NoInline = */ false},
                     ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{},
                     ArrayRef<GlobalValue::GUID>{},
                     ArrayRef<FunctionSummary::VFuncId>{},
diff --git a/lib/Analysis/MustExecute.cpp b/lib/Analysis/MustExecute.cpp
index 79ec8e400c03d63a99fd7d8898dbebf1f4fcb921..23e012626e244df2957531a18510132acc6c4bce 100644
--- a/lib/Analysis/MustExecute.cpp
+++ b/lib/Analysis/MustExecute.cpp
@@ -22,15 +22,27 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-bool LoopSafetyInfo::headerMayThrow() const {
-  return HeaderMayThrow;
+const DenseMap<BasicBlock *, ColorVector> &
+LoopSafetyInfo::getBlockColors() const {
+  return BlockColors;
 }
 
-bool LoopSafetyInfo::anyBlockMayThrow() const {
+void LoopSafetyInfo::copyColors(BasicBlock *New, BasicBlock *Old) {
+  ColorVector &ColorsForNewBlock = BlockColors[New];
+  ColorVector &ColorsForOldBlock = BlockColors[Old];
+  ColorsForNewBlock = ColorsForOldBlock;
+}
+
+bool SimpleLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
+  (void)BB;
+  return anyBlockMayThrow();
+}
+
+bool SimpleLoopSafetyInfo::anyBlockMayThrow() const {
   return MayThrow;
 }
 
-void LoopSafetyInfo::computeLoopSafetyInfo(Loop *CurLoop) {
+void SimpleLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
   assert(CurLoop != nullptr && "CurLoop can't be null");
   BasicBlock *Header = CurLoop->getHeader();
   // Iterate over header and compute safety info.
@@ -46,6 +58,41 @@ void LoopSafetyInfo::computeLoopSafetyInfo(Loop *CurLoop) {
        (BB != BBE) && !MayThrow; ++BB)
     MayThrow |= !isGuaranteedToTransferExecutionToSuccessor(*BB);
 
+  computeBlockColors(CurLoop);
+}
+
+bool ICFLoopSafetyInfo::blockMayThrow(const BasicBlock *BB) const {
+  return ICF.hasICF(BB);
+}
+
+bool ICFLoopSafetyInfo::anyBlockMayThrow() const {
+  return MayThrow;
+}
+
+void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
+  assert(CurLoop != nullptr && "CurLoop can't be null");
+  ICF.clear();
+  MayThrow = false;
+  // Figure out the fact that at least one block may throw.
+  for (auto &BB : CurLoop->blocks())
+    if (ICF.hasICF(&*BB)) {
+      MayThrow = true;
+      break;
+    }
+  computeBlockColors(CurLoop);
+}
+
+void ICFLoopSafetyInfo::insertInstructionTo(const BasicBlock *BB) {
+  ICF.invalidateBlock(BB);
+}
+
+void ICFLoopSafetyInfo::removeInstruction(const Instruction *Inst) {
+  // TODO: So far we just conservatively drop cache, but maybe we can not do it
+  // when Inst is not an ICF instruction. Follow-up on that.
+  ICF.invalidateBlock(Inst->getParent());
+}
+
+void LoopSafetyInfo::computeBlockColors(const Loop *CurLoop) {
   // Compute funclet colors if we might sink/hoist in a function with a funclet
   // personality routine.
   Function *Fn = CurLoop->getHeader()->getParent();
@@ -98,9 +145,12 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock,
   return SimpleCst->isAllOnesValue();
 }
 
-void LoopSafetyInfo::collectTransitivePredecessors(
+/// Collect all blocks from \p CurLoop which lie on all possible paths from
+/// the header of \p CurLoop (inclusive) to BB (exclusive) into the set
+/// \p Predecessors. If \p BB is the header, \p Predecessors will be empty.
+static void collectTransitivePredecessors(
     const Loop *CurLoop, const BasicBlock *BB,
-    SmallPtrSetImpl<const BasicBlock *> &Predecessors) const {
+    SmallPtrSetImpl<const BasicBlock *> &Predecessors) {
   assert(Predecessors.empty() && "Garbage in predecessors set?");
   assert(CurLoop->contains(BB) && "Should only be called for loop blocks!");
   if (BB == CurLoop->getHeader())
@@ -148,7 +198,10 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
   // 3) Exit blocks which are not taken on 1st iteration.
   // Memoize blocks we've already checked.
   SmallPtrSet<const BasicBlock *, 4> CheckedSuccessors;
-  for (auto *Pred : Predecessors)
+  for (auto *Pred : Predecessors) {
+    // Predecessor block may throw, so it has a side exit.
+    if (blockMayThrow(Pred))
+      return false;
     for (auto *Succ : successors(Pred))
       if (CheckedSuccessors.insert(Succ).second &&
           Succ != BB && !Predecessors.count(Succ))
@@ -169,6 +222,7 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
         if (CurLoop->contains(Succ) ||
             !CanProveNotTakenFirstIteration(Succ, DT, CurLoop))
           return false;
+  }
 
   // All predecessors can only lead us to BB.
   return true;
@@ -176,13 +230,9 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
 
 /// Returns true if the instruction in a loop is guaranteed to execute at least
 /// once.
-bool llvm::isGuaranteedToExecute(const Instruction &Inst,
-                                 const DominatorTree *DT, const Loop *CurLoop,
-                                 const LoopSafetyInfo *SafetyInfo) {
-  // We have to check to make sure that the instruction dominates all
-  // of the exit blocks.  If it doesn't, then there is a path out of the loop
-  // which does not execute this instruction, so we can't hoist it.
-
+bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
+                                                 const DominatorTree *DT,
+                                                 const Loop *CurLoop) const {
   // If the instruction is in the header block for the loop (which is very
   // common), it is always guaranteed to dominate the exit blocks.  Since this
   // is a common case, and can save some work, check it now.
@@ -191,22 +241,20 @@ bool llvm::isGuaranteedToExecute(const Instruction &Inst,
     // Inst unless we can prove that Inst comes before the potential implicit
     // exit.  At the moment, we use a (cheap) hack for the common case where
     // the instruction of interest is the first one in the block.
-    return !SafetyInfo->headerMayThrow() ||
-      Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
-
-  // Somewhere in this loop there is an instruction which may throw and make us
-  // exit the loop.
-  if (SafetyInfo->anyBlockMayThrow())
-    return false;
+    return !HeaderMayThrow ||
+           Inst.getParent()->getFirstNonPHIOrDbg() == &Inst;
 
   // If there is a path from header to exit or latch that doesn't lead to our
   // instruction's block, return false.
-  if (!SafetyInfo->allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT))
-    return false;
-
-  return true;
+  return allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT);
 }
 
+bool ICFLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst,
+                                              const DominatorTree *DT,
+                                              const Loop *CurLoop) const {
+  return !ICF.isDominatedByICFIFromSameBlock(&Inst) &&
+         allLoopPathsLeadToBlock(CurLoop, Inst.getParent(), DT);
+}
 
 namespace {
   struct MustExecutePrinter : public FunctionPass {
@@ -240,9 +288,9 @@ static bool isMustExecuteIn(const Instruction &I, Loop *L, DominatorTree *DT) {
   // TODO: merge these two routines.  For the moment, we display the best
   // result obtained by *either* implementation.  This is a bit unfair since no
   // caller actually gets the full power at the moment.
-  LoopSafetyInfo LSI;
+  SimpleLoopSafetyInfo LSI;
   LSI.computeLoopSafetyInfo(L);
-  return isGuaranteedToExecute(I, DT, L, &LSI) ||
+  return LSI.isGuaranteedToExecute(I, DT, L) ||
     isGuaranteedToExecuteForEveryIteration(&I, L);
 }
 
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index aeaa5172b3e6bf848fc0b38eca093358a616e66c..7472b6201c2b3e36a08f087c64f95e30d0712392 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -51,6 +51,18 @@ static cl::opt<unsigned> ProfileSummaryHugeWorkingSetSizeThreshold(
              " blocks required to reach the -profile-summary-cutoff-hot"
              " percentile exceeds this count."));
 
+// The next two options override the counts derived from summary computation and
+// are useful for debugging purposes.
+static cl::opt<int> ProfileSummaryHotCount(
+    "profile-summary-hot-count", cl::ReallyHidden, cl::ZeroOrMore,
+    cl::desc("A fixed hot count that overrides the count derived from"
+             " profile-summary-cutoff-hot"));
+
+static cl::opt<int> ProfileSummaryColdCount(
+    "profile-summary-cold-count", cl::ReallyHidden, cl::ZeroOrMore,
+    cl::desc("A fixed cold count that overrides the count derived from"
+             " profile-summary-cutoff-cold"));
+
 // Find the summary entry for a desired percentile of counts.
 static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS,
                                                         uint64_t Percentile) {
@@ -198,9 +210,15 @@ void ProfileSummaryInfo::computeThresholds() {
   auto &HotEntry =
       getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffHot);
   HotCountThreshold = HotEntry.MinCount;
+  if (ProfileSummaryHotCount.getNumOccurrences() > 0)
+    HotCountThreshold = ProfileSummaryHotCount;
   auto &ColdEntry =
       getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffCold);
   ColdCountThreshold = ColdEntry.MinCount;
+  if (ProfileSummaryColdCount.getNumOccurrences() > 0)
+    ColdCountThreshold = ProfileSummaryColdCount;
+  assert(ColdCountThreshold <= HotCountThreshold &&
+         "Cold count threshold cannot exceed hot count threshold!");
   HasHugeWorkingSetSize =
       HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
 }
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index d99d4767366bbc2abdeffe567b005c2ef8019ad2..e5134f2eeda90e48dee2aa890a7c6ad7a6f0ee87 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -112,6 +112,7 @@
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -162,6 +163,11 @@ static cl::opt<bool>
                   cl::desc("Verify no dangling value in ScalarEvolution's "
                            "ExprValueMap (slow)"));
 
+static cl::opt<bool> VerifyIR(
+    "scev-verify-ir", cl::Hidden,
+    cl::desc("Verify IR correctness when making sensitive SCEV queries (slow)"),
+    cl::init(false));
+
 static cl::opt<unsigned> MulOpsInlineThreshold(
     "scev-mulops-inline-threshold", cl::Hidden,
     cl::desc("Threshold for inlining multiplication operands into a SCEV"),
@@ -204,7 +210,7 @@ static cl::opt<unsigned>
 static cl::opt<unsigned>
     MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden,
                   cl::desc("Max coefficients in AddRec during evolving"),
-                  cl::init(16));
+                  cl::init(8));
 
 //===----------------------------------------------------------------------===//
 //                           SCEV class definitions
@@ -2758,6 +2764,29 @@ ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
   return S;
 }
 
+const SCEV *
+ScalarEvolution::getOrCreateAddRecExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                       const Loop *L, SCEV::NoWrapFlags Flags) {
+  FoldingSetNodeID ID;
+  ID.AddInteger(scAddRecExpr);
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    ID.AddPointer(Ops[i]);
+  ID.AddPointer(L);
+  void *IP = nullptr;
+  SCEVAddRecExpr *S =
+      static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+  if (!S) {
+    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+    std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+    S = new (SCEVAllocator)
+        SCEVAddRecExpr(ID.Intern(SCEVAllocator), O, Ops.size(), L);
+    UniqueSCEVs.InsertNode(S, IP);
+    addToLoopUseLists(S);
+  }
+  S->setNoWrapFlags(Flags);
+  return S;
+}
+
 const SCEV *
 ScalarEvolution::getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
                                     SCEV::NoWrapFlags Flags) {
@@ -3037,7 +3066,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
       SmallVector<const SCEV*, 7> AddRecOps;
       for (int x = 0, xe = AddRec->getNumOperands() +
              OtherAddRec->getNumOperands() - 1; x != xe && !Overflow; ++x) {
-        const SCEV *Term = getZero(Ty);
+        SmallVector <const SCEV *, 7> SumOps;
         for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
           uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
           for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
@@ -3052,12 +3081,13 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
             const SCEV *CoeffTerm = getConstant(Ty, Coeff);
             const SCEV *Term1 = AddRec->getOperand(y-z);
             const SCEV *Term2 = OtherAddRec->getOperand(z);
-            Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1, Term2,
-                                               SCEV::FlagAnyWrap, Depth + 1),
-                              SCEV::FlagAnyWrap, Depth + 1);
+            SumOps.push_back(getMulExpr(CoeffTerm, Term1, Term2,
+                                        SCEV::FlagAnyWrap, Depth + 1));
           }
         }
-        AddRecOps.push_back(Term);
+        if (SumOps.empty())
+          SumOps.push_back(getZero(Ty));
+        AddRecOps.push_back(getAddExpr(SumOps, SCEV::FlagAnyWrap, Depth + 1));
       }
       if (!Overflow) {
         const SCEV *NewAddRec = getAddRecExpr(AddRecOps, AddRec->getLoop(),
@@ -3408,24 +3438,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
 
   // Okay, it looks like we really DO need an addrec expr.  Check to see if we
   // already have one, otherwise create a new one.
-  FoldingSetNodeID ID;
-  ID.AddInteger(scAddRecExpr);
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i)
-    ID.AddPointer(Operands[i]);
-  ID.AddPointer(L);
-  void *IP = nullptr;
-  SCEVAddRecExpr *S =
-    static_cast<SCEVAddRecExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
-  if (!S) {
-    const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Operands.size());
-    std::uninitialized_copy(Operands.begin(), Operands.end(), O);
-    S = new (SCEVAllocator) SCEVAddRecExpr(ID.Intern(SCEVAllocator),
-                                           O, Operands.size(), L);
-    UniqueSCEVs.InsertNode(S, IP);
-    addToLoopUseLists(S);
-  }
-  S->setNoWrapFlags(Flags);
-  return S;
+  return getOrCreateAddRecExpr(Operands, L, Flags);
 }
 
 const SCEV *
@@ -7072,7 +7085,7 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
     return getCouldNotCompute();
 
   bool IsOnlyExit = (L->getExitingBlock() != nullptr);
-  TerminatorInst *Term = ExitingBlock->getTerminator();
+  Instruction *Term = ExitingBlock->getTerminator();
   if (BranchInst *BI = dyn_cast<BranchInst>(Term)) {
     assert(BI->isConditional() && "If unconditional, it can't be in loop!");
     bool ExitIfTrue = !L->contains(BI->getSuccessor(0));
@@ -8802,7 +8815,13 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
                                            const SCEV *&LHS, const SCEV *&RHS,
                                            unsigned Depth) {
   bool Changed = false;
-
+  // Simplifies ICMP to trivial true or false by turning it into '0 == 0' or
+  // '0 != 0'.
+  auto TrivialCase = [&](bool TriviallyTrue) {
+    LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
+    Pred = TriviallyTrue ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+    return true;
+  };
   // If we hit the max recursion limit bail out.
   if (Depth >= 3)
     return false;
@@ -8814,9 +8833,9 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
       if (ConstantExpr::getICmp(Pred,
                                 LHSC->getValue(),
                                 RHSC->getValue())->isNullValue())
-        goto trivially_false;
+        return TrivialCase(false);
       else
-        goto trivially_true;
+        return TrivialCase(true);
     }
     // Otherwise swap the operands to put the constant on the right.
     std::swap(LHS, RHS);
@@ -8846,9 +8865,9 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
     if (!ICmpInst::isEquality(Pred)) {
       ConstantRange ExactCR = ConstantRange::makeExactICmpRegion(Pred, RA);
       if (ExactCR.isFullSet())
-        goto trivially_true;
+        return TrivialCase(true);
       else if (ExactCR.isEmptySet())
-        goto trivially_false;
+        return TrivialCase(false);
 
       APInt NewRHS;
       CmpInst::Predicate NewPred;
@@ -8884,7 +8903,7 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
         // The "Should have been caught earlier!" messages refer to the fact
         // that the ExactCR.isFullSet() or ExactCR.isEmptySet() check above
         // should have fired on the corresponding cases, and canonicalized the
-        // check to trivially_true or trivially_false.
+        // check to trivial case.
 
       case ICmpInst::ICMP_UGE:
         assert(!RA.isMinValue() && "Should have been caught earlier!");
@@ -8917,9 +8936,9 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
   // Check for obvious equality.
   if (HasSameValue(LHS, RHS)) {
     if (ICmpInst::isTrueWhenEqual(Pred))
-      goto trivially_true;
+      return TrivialCase(true);
     if (ICmpInst::isFalseWhenEqual(Pred))
-      goto trivially_false;
+      return TrivialCase(false);
   }
 
   // If possible, canonicalize GE/LE comparisons to GT/LT comparisons, by
@@ -8987,18 +9006,6 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
     return SimplifyICmpOperands(Pred, LHS, RHS, Depth+1);
 
   return Changed;
-
-trivially_true:
-  // Return 0 == 0.
-  LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
-  Pred = ICmpInst::ICMP_EQ;
-  return true;
-
-trivially_false:
-  // Return 0 != 0.
-  LHS = RHS = getConstant(ConstantInt::getFalse(getContext()));
-  Pred = ICmpInst::ICMP_NE;
-  return true;
 }
 
 bool ScalarEvolution::isKnownNegative(const SCEV *S) {
@@ -9369,6 +9376,11 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
   // (interprocedural conditions notwithstanding).
   if (!L) return true;
 
+  if (VerifyIR)
+    assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()) &&
+           "This cannot be done on broken IR!");
+
+
   if (isKnownViaNonRecursiveReasoning(Pred, LHS, RHS))
     return true;
 
@@ -9474,6 +9486,10 @@ ScalarEvolution::isLoopEntryGuardedByCond(const Loop *L,
   // (interprocedural conditions notwithstanding).
   if (!L) return false;
 
+  if (VerifyIR)
+    assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()) &&
+           "This cannot be done on broken IR!");
+
   // Both LHS and RHS must be available at loop entry.
   assert(isAvailableAtLoopEntry(LHS, L) &&
          "LHS is not available at Loop Entry");
diff --git a/lib/Analysis/SyncDependenceAnalysis.cpp b/lib/Analysis/SyncDependenceAnalysis.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1a7e4476d123de989357475ce37b4f9f8a4c4d1
--- /dev/null
+++ b/lib/Analysis/SyncDependenceAnalysis.cpp
@@ -0,0 +1,380 @@
+//===- SyncDependenceAnalysis.cpp - Divergent Branch Dependence Calculation
+//--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an algorithm that returns for a divergent branch
+// the set of basic blocks whose phi nodes become divergent due to divergent
+// control. These are the blocks that are reachable by two disjoint paths from
+// the branch or loop exits that have a reaching path that is disjoint from a
+// path to the loop latch.
+//
+// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
+// control-induced divergence in phi nodes.
+//
+// -- Summary --
+// The SyncDependenceAnalysis lazily computes sync dependences [3].
+// The analysis evaluates the disjoint path criterion [2] by a reduction
+// to SSA construction. The SSA construction algorithm is implemented as
+// a simple data-flow analysis [1].
+//
+// [1] "A Simple, Fast Dominance Algorithm", SPI '01, Cooper, Harvey and Kennedy
+// [2] "Efficiently Computing Static Single Assignment Form
+//     and the Control Dependence Graph", TOPLAS '91,
+//           Cytron, Ferrante, Rosen, Wegman and Zadeck
+// [3] "Improving Performance of OpenCL on CPUs", CC '12, Karrenberg and Hack
+// [4] "Divergence Analysis", TOPLAS '13, Sampaio, Souza, Collange and Pereira
+//
+// -- Sync dependence --
+// Sync dependence [4] characterizes the control flow aspect of the
+// propagation of branch divergence. For example,
+//
+//   %cond = icmp slt i32 %tid, 10
+//   br i1 %cond, label %then, label %else
+// then:
+//   br label %merge
+// else:
+//   br label %merge
+// merge:
+//   %a = phi i32 [ 0, %then ], [ 1, %else ]
+//
+// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
+// because %tid is not on its use-def chains, %a is sync dependent on %tid
+// because the branch "br i1 %cond" depends on %tid and affects which value %a
+// is assigned to.
+//
+// -- Reduction to SSA construction --
+// There are two disjoint paths from A to X, if a certain variant of SSA
+// construction places a phi node in X under the following set-up scheme [2].
+//
+// This variant of SSA construction ignores incoming undef values.
+// That is paths from the entry without a definition do not result in
+// phi nodes.
+//
+//       entry
+//     /      \
+//    A        \
+//  /   \       Y
+// B     C     /
+//  \   /  \  /
+//    D     E
+//     \   /
+//       F
+// Assume that A contains a divergent branch. We are interested
+// in the set of all blocks where each block is reachable from A
+// via two disjoint paths. This would be the set {D, F} in this
+// case.
+// To generally reduce this query to SSA construction we introduce
+// a virtual variable x and assign to x different values in each
+// successor block of A.
+//           entry
+//         /      \
+//        A        \
+//      /   \       Y
+// x = 0   x = 1   /
+//      \  /   \  /
+//        D     E
+//         \   /
+//           F
+// Our flavor of SSA construction for x will construct the following
+//            entry
+//          /      \
+//         A        \
+//       /   \       Y
+// x0 = 0   x1 = 1  /
+//       \   /   \ /
+//      x2=phi    E
+//         \     /
+//          x3=phi
+// The blocks D and F contain phi nodes and are thus each reachable
+// by two disjoins paths from A.
+//
+// -- Remarks --
+// In case of loop exits we need to check the disjoint path criterion for loops
+// [2]. To this end, we check whether the definition of x differs between the
+// loop exit and the loop header (_after_ SSA construction).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+
+#include <stack>
+#include <unordered_set>
+
+#define DEBUG_TYPE "sync-dependence"
+
+namespace llvm {
+
+ConstBlockSet SyncDependenceAnalysis::EmptyBlockSet;
+
+SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT,
+                                               const PostDominatorTree &PDT,
+                                               const LoopInfo &LI)
+    : FuncRPOT(DT.getRoot()->getParent()), DT(DT), PDT(PDT), LI(LI) {}
+
+SyncDependenceAnalysis::~SyncDependenceAnalysis() {}
+
+using FunctionRPOT = ReversePostOrderTraversal<const Function *>;
+
+// divergence propagator for reducible CFGs
+struct DivergencePropagator {
+  const FunctionRPOT &FuncRPOT;
+  const DominatorTree &DT;
+  const PostDominatorTree &PDT;
+  const LoopInfo &LI;
+
+  // identified join points
+  std::unique_ptr<ConstBlockSet> JoinBlocks;
+
+  // reached loop exits (by a path disjoint to a path to the loop header)
+  SmallPtrSet<const BasicBlock *, 4> ReachedLoopExits;
+
+  // if DefMap[B] == C then C is the dominating definition at block B
+  // if DefMap[B] ~ undef then we haven't seen B yet
+  // if DefMap[B] == B then B is a join point of disjoint paths from X or B is
+  // an immediate successor of X (initial value).
+  using DefiningBlockMap = std::map<const BasicBlock *, const BasicBlock *>;
+  DefiningBlockMap DefMap;
+
+  // all blocks with pending visits
+  std::unordered_set<const BasicBlock *> PendingUpdates;
+
+  DivergencePropagator(const FunctionRPOT &FuncRPOT, const DominatorTree &DT,
+                       const PostDominatorTree &PDT, const LoopInfo &LI)
+      : FuncRPOT(FuncRPOT), DT(DT), PDT(PDT), LI(LI),
+        JoinBlocks(new ConstBlockSet) {}
+
+  // set the definition at @block and mark @block as pending for a visit
+  void addPending(const BasicBlock &Block, const BasicBlock &DefBlock) {
+    bool WasAdded = DefMap.emplace(&Block, &DefBlock).second;
+    if (WasAdded)
+      PendingUpdates.insert(&Block);
+  }
+
+  void printDefs(raw_ostream &Out) {
+    Out << "Propagator::DefMap {\n";
+    for (const auto *Block : FuncRPOT) {
+      auto It = DefMap.find(Block);
+      Out << Block->getName() << " : ";
+      if (It == DefMap.end()) {
+        Out << "\n";
+      } else {
+        const auto *DefBlock = It->second;
+        Out << (DefBlock ? DefBlock->getName() : "<null>") << "\n";
+      }
+    }
+    Out << "}\n";
+  }
+
+  // process @succBlock with reaching definition @defBlock
+  // the original divergent branch was in @parentLoop (if any)
+  void visitSuccessor(const BasicBlock &SuccBlock, const Loop *ParentLoop,
+                      const BasicBlock &DefBlock) {
+
+    // @succBlock is a loop exit
+    if (ParentLoop && !ParentLoop->contains(&SuccBlock)) {
+      DefMap.emplace(&SuccBlock, &DefBlock);
+      ReachedLoopExits.insert(&SuccBlock);
+      return;
+    }
+
+    // first reaching def?
+    auto ItLastDef = DefMap.find(&SuccBlock);
+    if (ItLastDef == DefMap.end()) {
+      addPending(SuccBlock, DefBlock);
+      return;
+    }
+
+    // a join of at least two definitions
+    if (ItLastDef->second != &DefBlock) {
+      // do we know this join already?
+      if (!JoinBlocks->insert(&SuccBlock).second)
+        return;
+
+      // update the definition
+      addPending(SuccBlock, SuccBlock);
+    }
+  }
+
+  // find all blocks reachable by two disjoint paths from @rootTerm.
+  // This method works for both divergent terminators and loops with
+  // divergent exits.
+  // @rootBlock is either the block containing the branch or the header of the
+  // divergent loop.
+  // @nodeSuccessors is the set of successors of the node (Loop or Terminator)
+  // headed by @rootBlock.
+  // @parentLoop is the parent loop of the Loop or the loop that contains the
+  // Terminator.
+  template <typename SuccessorIterable>
+  std::unique_ptr<ConstBlockSet>
+  computeJoinPoints(const BasicBlock &RootBlock,
+                    SuccessorIterable NodeSuccessors, const Loop *ParentLoop) {
+    assert(JoinBlocks);
+
+    // immediate post dominator (no join block beyond that block)
+    const auto *PdNode = PDT.getNode(const_cast<BasicBlock *>(&RootBlock));
+    const auto *IpdNode = PdNode->getIDom();
+    const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
+
+    // bootstrap with branch targets
+    for (const auto *SuccBlock : NodeSuccessors) {
+      DefMap.emplace(SuccBlock, SuccBlock);
+
+      if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
+        // immediate loop exit from node.
+        ReachedLoopExits.insert(SuccBlock);
+        continue;
+      } else {
+        // regular successor
+        PendingUpdates.insert(SuccBlock);
+      }
+    }
+
+    auto ItBeginRPO = FuncRPOT.begin();
+
+    // skip until term (TODO RPOT won't let us start at @term directly)
+    for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
+
+    auto ItEndRPO = FuncRPOT.end();
+    assert(ItBeginRPO != ItEndRPO);
+
+    // propagate definitions at the immediate successors of the node in RPO
+    auto ItBlockRPO = ItBeginRPO;
+    while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
+      const auto *Block = *ItBlockRPO;
+
+      // skip @block if not pending update
+      auto ItPending = PendingUpdates.find(Block);
+      if (ItPending == PendingUpdates.end())
+        continue;
+      PendingUpdates.erase(ItPending);
+
+      // propagate definition at @block to its successors
+      auto ItDef = DefMap.find(Block);
+      const auto *DefBlock = ItDef->second;
+      assert(DefBlock);
+
+      auto *BlockLoop = LI.getLoopFor(Block);
+      if (ParentLoop &&
+          (ParentLoop != BlockLoop && ParentLoop->contains(BlockLoop))) {
+        // if the successor is the header of a nested loop pretend its a
+        // single node with the loop's exits as successors
+        SmallVector<BasicBlock *, 4> BlockLoopExits;
+        BlockLoop->getExitBlocks(BlockLoopExits);
+        for (const auto *BlockLoopExit : BlockLoopExits) {
+          visitSuccessor(*BlockLoopExit, ParentLoop, *DefBlock);
+        }
+
+      } else {
+        // the successors are either on the same loop level or loop exits
+        for (const auto *SuccBlock : successors(Block)) {
+          visitSuccessor(*SuccBlock, ParentLoop, *DefBlock);
+        }
+      }
+    }
+
+    // We need to know the definition at the parent loop header to decide
+    // whether the definition at the header is different from the definition at
+    // the loop exits, which would indicate a divergent loop exits.
+    //
+    // A // loop header
+    // |
+    // B // nested loop header
+    // |
+    // C -> X (exit from B loop) -..-> (A latch)
+    // |
+    // D -> back to B (B latch)
+    // |
+    // proper exit from both loops
+    //
+    // D post-dominates B as it is the only proper exit from the "A loop".
+    // If C has a divergent branch, propagation will therefore stop at D.
+    // That implies that B will never receive a definition.
+    // But that definition can only be the same as at D (D itself in thise case)
+    // because all paths to anywhere have to pass through D.
+    //
+    const BasicBlock *ParentLoopHeader =
+        ParentLoop ? ParentLoop->getHeader() : nullptr;
+    if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
+      DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
+    }
+
+    // analyze reached loop exits
+    if (!ReachedLoopExits.empty()) {
+      assert(ParentLoop);
+      const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
+      LLVM_DEBUG(printDefs(dbgs()));
+      assert(HeaderDefBlock && "no definition in header of carrying loop");
+
+      for (const auto *ExitBlock : ReachedLoopExits) {
+        auto ItExitDef = DefMap.find(ExitBlock);
+        assert((ItExitDef != DefMap.end()) &&
+               "no reaching def at reachable loop exit");
+        if (ItExitDef->second != HeaderDefBlock) {
+          JoinBlocks->insert(ExitBlock);
+        }
+      }
+    }
+
+    return std::move(JoinBlocks);
+  }
+};
+
+const ConstBlockSet &SyncDependenceAnalysis::join_blocks(const Loop &Loop) {
+  using LoopExitVec = SmallVector<BasicBlock *, 4>;
+  LoopExitVec LoopExits;
+  Loop.getExitBlocks(LoopExits);
+  if (LoopExits.size() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedLoopExitJoins.find(&Loop);
+  if (ItCached != CachedLoopExitJoins.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  auto JoinBlocks = Propagator.computeJoinPoints<const LoopExitVec &>(
+      *Loop.getHeader(), LoopExits, Loop.getParentLoop());
+
+  auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+const ConstBlockSet &
+SyncDependenceAnalysis::join_blocks(const Instruction &Term) {
+  // trivial case
+  if (Term.getNumSuccessors() < 1) {
+    return EmptyBlockSet;
+  }
+
+  // already available in cache?
+  auto ItCached = CachedBranchJoins.find(&Term);
+  if (ItCached != CachedBranchJoins.end())
+    return *ItCached->second;
+
+  // compute all join points
+  DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
+  const auto &TermBlock = *Term.getParent();
+  auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>(
+      TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));
+
+  auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
+  assert(ItInserted.second);
+  return *ItInserted.first->second;
+}
+
+} // namespace llvm
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index fb678febe23cc6a2afb2c7f51673fa722973286d..4643f75da42d14a10b39954a9e9d4061a39984e1 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -413,17 +413,17 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_flsll);
   }
 
-  // The following functions are available on Linux,
-  // but Android uses bionic instead of glibc.
-  if (!T.isOSLinux() || T.isAndroid()) {
+  // The following functions are only available on GNU/Linux (using glibc).
+  // Linux variants without glibc (eg: bionic, musl) may have some subset.
+  if (!T.isOSLinux() || !T.isGNUEnvironment()) {
     TLI.setUnavailable(LibFunc_dunder_strdup);
     TLI.setUnavailable(LibFunc_dunder_strtok_r);
     TLI.setUnavailable(LibFunc_dunder_isoc99_scanf);
     TLI.setUnavailable(LibFunc_dunder_isoc99_sscanf);
     TLI.setUnavailable(LibFunc_under_IO_getc);
     TLI.setUnavailable(LibFunc_under_IO_putc);
-    // But, Android has memalign.
-    if (!T.isAndroid())
+    // But, Android and musl have memalign.
+    if (!T.isAndroid() && !T.isMusl())
       TLI.setUnavailable(LibFunc_memalign);
     TLI.setUnavailable(LibFunc_fopen64);
     TLI.setUnavailable(LibFunc_fseeko64);
@@ -613,6 +613,24 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
   unsigned NumParams = FTy.getNumParams();
 
   switch (F) {
+  case LibFunc_execl:
+  case LibFunc_execlp:
+  case LibFunc_execle:
+    return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
+  case LibFunc_execv:
+  case LibFunc_execvp:
+    return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
+  case LibFunc_execvP:
+  case LibFunc_execvpe:
+  case LibFunc_execve:
+    return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+            FTy.getParamType(1)->isPointerTy() &&
+            FTy.getParamType(2)->isPointerTy() &&
+            FTy.getReturnType()->isIntegerTy(32));
   case LibFunc_strlen:
     return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
             FTy.getReturnType()->isIntegerTy());
@@ -863,6 +881,8 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
     return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
             FTy.getParamType(0)->isPointerTy() &&
             FTy.getParamType(1)->isPointerTy());
+  case LibFunc_fork:
+    return (NumParams == 0 && FTy.getReturnType()->isIntegerTy(32));
   case LibFunc_fdopen:
     return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
             FTy.getParamType(1)->isPointerTy());
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 4ad48e351a4a9981b22f8d840cad6d393aa19ea7..6e4eb8ff0cdf2a47cddd20a38e6c9cf3a568b027 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -268,6 +268,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
 
+bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const {
+  return TTIImpl->enableMaskedInterleavedAccessVectorization();
+}
+
 bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const {
   return TTIImpl->isFPVectorizationPotentiallyUnsafe();
 }
@@ -515,9 +519,12 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace) const {
+    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+    bool UseMaskForGaps) const {
   int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                                 Alignment, AddressSpace);
+                                                 Alignment, AddressSpace,
+                                                 UseMaskForCond,
+                                                 UseMaskForGaps);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 02d071717f046b9bd6b91a851fbcf806a4401e16..ed17441d1e404e2c70f5ea62b191987394ffcd3d 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2510,6 +2510,44 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
     // valid for all elements of the vector (for example if vector is sign
     // extended, shifted, etc).
     return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
+
+  case Instruction::ShuffleVector: {
+    // TODO: This is copied almost directly from the SelectionDAG version of
+    //       ComputeNumSignBits. It would be better if we could share common
+    //       code. If not, make sure that changes are translated to the DAG.
+
+    // Collect the minimum number of sign bits that are shared by every vector
+    // element referenced by the shuffle.
+    auto *Shuf = cast<ShuffleVectorInst>(U);
+    int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements();
+    int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements();
+    APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+    for (int i = 0; i != NumMaskElts; ++i) {
+      int M = Shuf->getMaskValue(i);
+      assert(M < NumElts * 2 && "Invalid shuffle mask constant");
+      // For undef elements, we don't know anything about the common state of
+      // the shuffle result.
+      if (M == -1)
+        return 1;
+      if (M < NumElts)
+        DemandedLHS.setBit(M % NumElts);
+      else
+        DemandedRHS.setBit(M % NumElts);
+    }
+    Tmp = std::numeric_limits<unsigned>::max();
+    if (!!DemandedLHS)
+      Tmp = ComputeNumSignBits(Shuf->getOperand(0), Depth + 1, Q);
+    if (!!DemandedRHS) {
+      Tmp2 = ComputeNumSignBits(Shuf->getOperand(1), Depth + 1, Q);
+      Tmp = std::min(Tmp, Tmp2);
+    }
+    // If we don't know anything, early out and try computeKnownBits fall-back.
+    if (Tmp == 1)
+      break;
+    assert(Tmp <= V->getType()->getScalarSizeInBits() &&
+           "Failed to determine minimum sign bits");
+    return Tmp;
+  }
   }
 
   // Finally, if we can prove that the top bits of the result are 0's or 1's,
@@ -2898,7 +2936,13 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
               cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI,
                                               SignBitOnly, Depth + 1));
 
+    case Intrinsic::maximum:
+      return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
+                                             Depth + 1) ||
+             cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
+                                             Depth + 1);
     case Intrinsic::minnum:
+    case Intrinsic::minimum:
       return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
                                              Depth + 1) &&
              cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
@@ -4716,6 +4760,27 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
                                               Value *TrueVal, Value *FalseVal,
                                               Value *&LHS, Value *&RHS,
                                               unsigned Depth) {
+  if (CmpInst::isFPPredicate(Pred)) {
+    // IEEE-754 ignores the sign of 0.0 in comparisons. So if the select has one
+    // 0.0 operand, set the compare's 0.0 operands to that same value for the
+    // purpose of identifying min/max. Disregard vector constants with undefined
+    // elements because those can not be back-propagated for analysis.
+    Value *OutputZeroVal = nullptr;
+    if (match(TrueVal, m_AnyZeroFP()) && !match(FalseVal, m_AnyZeroFP()) &&
+        !cast<Constant>(TrueVal)->containsUndefElement())
+      OutputZeroVal = TrueVal;
+    else if (match(FalseVal, m_AnyZeroFP()) && !match(TrueVal, m_AnyZeroFP()) &&
+             !cast<Constant>(FalseVal)->containsUndefElement())
+      OutputZeroVal = FalseVal;
+
+    if (OutputZeroVal) {
+      if (match(CmpLHS, m_AnyZeroFP()))
+        CmpLHS = OutputZeroVal;
+      if (match(CmpRHS, m_AnyZeroFP()))
+        CmpRHS = OutputZeroVal;
+    }
+  }
+
   LHS = CmpLHS;
   RHS = CmpRHS;
 
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 272c665ace1311a8b287107e08956971dde5d477..38dca50e82a523b9b503337690e3161c9e234810 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -54,6 +54,8 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::fabs:
   case Intrinsic::minnum:
   case Intrinsic::maxnum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximum:
   case Intrinsic::copysign:
   case Intrinsic::floor:
   case Intrinsic::ceil:
@@ -502,6 +504,35 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
+Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+                                           const InterleaveGroup &Group) {
+  // All 1's means mask is not needed.
+  if (Group.getNumMembers() == Group.getFactor())
+    return nullptr;
+
+  // TODO: support reversed access.
+  assert(!Group.isReverse() && "Reversed group not supported.");
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < VF; i++)
+    for (unsigned j = 0; j < Group.getFactor(); ++j) {
+      unsigned HasMember = Group.getMember(j) ? 1 : 0;
+      Mask.push_back(Builder.getInt1(HasMember));
+    }
+
+  return ConstantVector::get(Mask);
+}
+
+Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, 
+                                     unsigned ReplicationFactor, unsigned VF) {
+  SmallVector<Constant *, 16> MaskVec;
+  for (unsigned i = 0; i < VF; i++)
+    for (unsigned j = 0; j < ReplicationFactor; j++)
+      MaskVec.push_back(Builder.getInt32(i));
+
+  return ConstantVector::get(MaskVec);
+}
+
 Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
                                      unsigned NumVecs) {
   SmallVector<Constant *, 16> Mask;
@@ -672,7 +703,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
 // this group because it and (2) are dependent. However, (1) can be grouped
 // with other accesses that may precede it in program order. Note that a
 // bottom-up order does not imply that WAW dependences should not be checked.
-void InterleavedAccessInfo::analyzeInterleaving() {
+void InterleavedAccessInfo::analyzeInterleaving(
+                                 bool EnablePredicatedInterleavedMemAccesses) {
   LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
   const ValueToValueMap &Strides = LAI->getSymbolicStrides();
 
@@ -712,9 +744,8 @@ void InterleavedAccessInfo::analyzeInterleaving() {
     // create a group for B, we continue with the bottom-up algorithm to ensure
     // we don't break any of B's dependences.
     InterleaveGroup *Group = nullptr;
-    // TODO: Ignore B if it is in a predicated block. This restriction can be 
-    // relaxed in the future once we handle masked interleaved groups.
-    if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) {
+    if (isStrided(DesB.Stride) && 
+        (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
       Group = getInterleaveGroup(B);
       if (!Group) {
         LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
@@ -808,11 +839,12 @@ void InterleavedAccessInfo::analyzeInterleaving() {
       if (DistanceToB % static_cast<int64_t>(DesB.Size))
         continue;
 
-      // Ignore A if either A or B is in a predicated block. Although we
-      // currently prevent group formation for predicated accesses, we may be
-      // able to relax this limitation in the future once we handle more
-      // complicated blocks.
-      if (isPredicated(A->getParent()) || isPredicated(B->getParent()))
+      // All members of a predicated interleave-group must have the same predicate,
+      // and currently must reside in the same BB.
+      BasicBlock *BlockA = A->getParent();  
+      BasicBlock *BlockB = B->getParent();  
+      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
+          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
         continue;
 
       // The index of A is the index of B plus A's distance to B in multiples
@@ -906,3 +938,28 @@ void InterleavedAccessInfo::analyzeInterleaving() {
     }
   }
 }
+
+void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
+  // If no group had triggered the requirement to create an epilogue loop,
+  // there is nothing to do.
+  if (!requiresScalarEpilogue())
+    return;
+
+  // Avoid releasing a Group twice.
+  SmallPtrSet<InterleaveGroup *, 4> DelSet;
+  for (auto &I : InterleaveGroupMap) {
+    InterleaveGroup *Group = I.second;
+    if (Group->requiresScalarEpilogue())
+      DelSet.insert(Group);
+  }
+  for (auto *Ptr : DelSet) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Invalidate candidate interleaved group due to gaps that "
+           "require a scalar epilogue (not allowed under optsize) and cannot "
+           "be masked (not enabled). \n");
+    releaseGroup(Ptr);
+  }
+
+  RequiresScalarEpilogue = false;
+}
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 53787b25d0b7682dca2e1b0bfdf2813b2d9f2abc..af4f43986ef0632c50c7284fa14f4ad9ee156925 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -740,6 +740,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(readOnly);
   KEYWORD(noRecurse);
   KEYWORD(returnDoesNotAlias);
+  KEYWORD(noInline);
   KEYWORD(calls);
   KEYWORD(callee);
   KEYWORD(hotness);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index e83b9f80592809d35581fa4b50cbd0c21fe158c8..5fe1e125d48680db35067d3bf9e19275f65a991a 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -7714,6 +7714,7 @@ bool LLParser::ParseFlag(unsigned &Val) {
 ///   := 'funcFlags' ':' '(' ['readNone' ':' Flag]?
 ///        [',' 'readOnly' ':' Flag]? [',' 'noRecurse' ':' Flag]?
 ///        [',' 'returnDoesNotAlias' ':' Flag]? ')'
+///        [',' 'noInline' ':' Flag]? ')'
 bool LLParser::ParseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
   assert(Lex.getKind() == lltok::kw_funcFlags);
   Lex.Lex();
@@ -7749,6 +7750,12 @@ bool LLParser::ParseOptionalFFlags(FunctionSummary::FFlags &FFlags) {
         return true;
       FFlags.ReturnDoesNotAlias = Val;
       break;
+    case lltok::kw_noInline:
+      Lex.Lex();
+      if (ParseToken(lltok::colon, "expected ':'") || ParseFlag(Val))
+        return true;
+      FFlags.NoInline = Val;
+      break;
     default:
       return Error(Lex.getLoc(), "expected function flag type");
     }
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index a3a9930f9e3f51f0b7a78ff33c7600b4968079bc..f8f5955a16c89efd3281990f868f7e9ad847e41b 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -369,6 +369,7 @@ enum Kind {
   kw_readOnly,
   kw_noRecurse,
   kw_returnDoesNotAlias,
+  kw_noInline,
   kw_calls,
   kw_callee,
   kw_hotness,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index aa83955e646abde7d4422b63d373d12a29d421a5..56e05f8f085564153c86aa9c93dfb91eb5ca3ddb 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -876,6 +876,7 @@ static FunctionSummary::FFlags getDecodedFFlags(uint64_t RawFlags) {
   Flags.ReadOnly = (RawFlags >> 1) & 0x1;
   Flags.NoRecurse = (RawFlags >> 2) & 0x1;
   Flags.ReturnDoesNotAlias = (RawFlags >> 3) & 0x1;
+  Flags.NoInline = (RawFlags >> 4) & 0x1;
   return Flags;
 }
 
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index d59da255be4bfca439d4094cc2426937496cbfbf..f4634c9d3f4e39b9a5f7392f497115f55ea59167 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -971,6 +971,7 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) {
   RawFlags |= (Flags.ReadOnly << 1);
   RawFlags |= (Flags.NoRecurse << 2);
   RawFlags |= (Flags.ReturnDoesNotAlias << 3);
+  RawFlags |= (Flags.NoInline << 4);
   return RawFlags;
 }
 
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index ecf8b93d253218e7f46aac76c56daae45b2ab503..1f54c611bad66a74b5726f384b9c04967ef89416 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -15,6 +15,7 @@ add_subdirectory(MC)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
+add_subdirectory(OptRemarks)
 add_subdirectory(DebugInfo)
 add_subdirectory(ExecutionEngine)
 add_subdirectory(Target)
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index b769e92590f08b3ef5738863ca079c4dcbfa6f80..27dce7fd7b79981e882e1d56adfedd5fd7994256 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -471,7 +471,7 @@ static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
 bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
   const Instruction *I = CS.getInstruction();
   const BasicBlock *ExitBB = I->getParent();
-  const TerminatorInst *Term = ExitBB->getTerminator();
+  const Instruction *Term = ExitBB->getTerminator();
   const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
 
   // The block must end in a return statement or unreachable.
@@ -496,6 +496,10 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
       // Debug info intrinsics do not get in the way of tail call optimization.
       if (isa<DbgInfoIntrinsic>(BBI))
         continue;
+      // A lifetime end intrinsic should not stop tail call optimization.
+      if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
+        if (II->getIntrinsicID() == Intrinsic::lifetime_end)
+          continue;
       if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
           !isSafeToSpeculativelyExecute(&*BBI))
         return false;
diff --git a/lib/CodeGen/AsmPrinter/AddressPool.cpp b/lib/CodeGen/AsmPrinter/AddressPool.cpp
index c21616766fa55b255b4b5bd18462cc336a21762e..f8143b903d5157126906da63a62c50437944f9fc 100644
--- a/lib/CodeGen/AsmPrinter/AddressPool.cpp
+++ b/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -39,6 +39,9 @@ void AddressPool::emitHeader(AsmPrinter &Asm, MCSection *Section) {
 
 // Emit addresses into the section given.
 void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
+  if (isEmpty())
+    return;
+
   // Start the dwarf addr section.
   Asm.OutStreamer->SwitchSection(AddrSection);
 
@@ -49,9 +52,6 @@ void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
   // It is referenced via DW_AT_addr_base.
   Asm.OutStreamer->EmitLabel(AddressTableBaseSym);
 
-  if (Pool.empty())
-    return;
-
   // Order the address pool entries by ID
   SmallVector<const MCExpr *, 64> Entries(Pool.size());
 
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 63c5b262edc3e60a468f7bdb78f23e9728dd12d0..526f7ce30831518d2156b7aee4a4c2bf82d3f0d7 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -16,6 +16,7 @@
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
+#include "WasmException.h"
 #include "WinCFGuard.h"
 #include "WinException.h"
 #include "llvm/ADT/APFloat.h"
@@ -356,7 +357,7 @@ bool AsmPrinter::doInitialization(Module &M) {
     }
     break;
   case ExceptionHandling::Wasm:
-    // TODO to prevent warning
+    ES = new WasmException(this);
     break;
   }
   if (ES)
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 2920ac66290bc4ad5fca14d3d4f5ac5c2d234c10..62103e3107c008a749ca89db3bc6525cf84db9c4 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -156,9 +156,10 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
   Parser->setAssemblerDialect(Dialect);
   Parser->setTargetParser(*TAP.get());
   Parser->setEnablePrintSchedInfo(EnablePrintSchedInfo);
+  // Enable lexing Masm binary and hex integer literals in intel inline
+  // assembly.
   if (Dialect == InlineAsm::AD_Intel)
-    // We need this flag to be able to parse numbers like "0bH"
-    Parser->setParsingInlineAsm(true);
+    Parser->getLexer().setLexMasmIntegers(true);
   if (MF) {
     const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
     TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 6cba4a0d4b814819dfc0e2c54d98a0050776142e..3fb088ab6f0dd8f40e8dbf0fd90122a028cb573e 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -23,6 +23,7 @@ add_llvm_library(LLVMAsmPrinter
   WinCFGuard.cpp
   WinException.cpp
   CodeViewDebug.cpp
+  WasmException.cpp
 
   DEPENDS
   intrinsics_gen
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 4d45a103c5a3f0ab337fb3fd79c085079054625e..01d018fdde37bad3c844ca5f11e758e37866bcb7 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -73,6 +73,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -116,6 +117,7 @@ CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
   if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
       !AP->getObjFileLowering().getCOFFDebugSymbolsSection()) {
     Asm = nullptr;
+    MMI->setDebugInfoAvailability(false);
     return;
   }
   // Tell MMI that we have debug info.
@@ -134,7 +136,9 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
 
   // If this is a Unix-style path, just use it as is. Don't try to canonicalize
   // it textually because one of the path components could be a symlink.
-  if (!Dir.empty() && Dir[0] == '/') {
+  if (Dir.startswith("/") || Filename.startswith("/")) {
+    if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix))
+      return Filename;
     Filepath = Dir;
     if (Dir.back() != '/')
       Filepath += '/';
@@ -558,6 +562,11 @@ void CodeViewDebug::endModule() {
   OS.AddComment("String table");
   OS.EmitCVStringTableDirective();
 
+  // Emit S_BUILDINFO, which points to LF_BUILDINFO. Put this in its own symbol
+  // subsection in the generic .debug$S section at the end. There is no
+  // particular reason for this ordering other than to match MSVC.
+  emitBuildInfo();
+
   // Emit type information and hashes last, so that any types we translate while
   // emitting function info are included.
   emitTypeInformation();
@@ -769,6 +778,49 @@ void CodeViewDebug::emitCompilerInformation() {
   OS.EmitLabel(CompilerEnd);
 }
 
+static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable,
+                                    StringRef S) {
+  StringIdRecord SIR(TypeIndex(0x0), S);
+  return TypeTable.writeLeafType(SIR);
+}
+
+void CodeViewDebug::emitBuildInfo() {
+  // First, make LF_BUILDINFO. It's a sequence of strings with various bits of
+  // build info. The known prefix is:
+  // - Absolute path of current directory
+  // - Compiler path
+  // - Main source file path, relative to CWD or absolute
+  // - Type server PDB file
+  // - Canonical compiler command line
+  // If frontend and backend compilation are separated (think llc or LTO), it's
+  // not clear if the compiler path should refer to the executable for the
+  // frontend or the backend. Leave it blank for now.
+  TypeIndex BuildInfoArgs[BuildInfoRecord::MaxArgs] = {};
+  NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+  const MDNode *Node = *CUs->operands().begin(); // FIXME: Multiple CUs.
+  const auto *CU = cast<DICompileUnit>(Node);
+  const DIFile *MainSourceFile = CU->getFile();
+  BuildInfoArgs[BuildInfoRecord::CurrentDirectory] =
+      getStringIdTypeIdx(TypeTable, MainSourceFile->getDirectory());
+  BuildInfoArgs[BuildInfoRecord::SourceFile] =
+      getStringIdTypeIdx(TypeTable, MainSourceFile->getFilename());
+  // FIXME: Path to compiler and command line. PDB is intentionally blank unless
+  // we implement /Zi type servers.
+  BuildInfoRecord BIR(BuildInfoArgs);
+  TypeIndex BuildInfoIndex = TypeTable.writeLeafType(BIR);
+
+  // Make a new .debug$S subsection for the S_BUILDINFO record, which points
+  // from the module symbols into the type stream.
+  MCSymbol *BuildInfoEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
+  OS.AddComment("Record length");
+  OS.EmitIntValue(6, 2);
+  OS.AddComment("Record kind: S_BUILDINFO");
+  OS.EmitIntValue(unsigned(SymbolKind::S_BUILDINFO), 2);
+  OS.AddComment("LF_BUILDINFO index");
+  OS.EmitIntValue(BuildInfoIndex.getIndex(), 4);
+  endCVSubsection(BuildInfoEnd);
+}
+
 void CodeViewDebug::emitInlineeLinesSubsection() {
   if (InlinedSubprograms.empty())
     return;
@@ -1287,6 +1339,7 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
   // instruction (AArch64), this will be zero.
   CurFn->CSRSize = MFI.getCVBytesOfCalleeSavedRegisters();
   CurFn->FrameSize = MFI.getStackSize();
+  CurFn->OffsetAdjustment = MFI.getOffsetAdjustment();
   CurFn->HasStackRealignment = TRI->needsStackRealignment(*MF);
 
   // For this function S_FRAMEPROC record, figure out which codeview register
@@ -1464,6 +1517,8 @@ TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
   case dwarf::DW_TAG_union_type:
     return lowerTypeUnion(cast<DICompositeType>(Ty));
   case dwarf::DW_TAG_unspecified_type:
+    if (Ty->getName() == "decltype(nullptr)")
+      return TypeIndex::NullptrT();
     return TypeIndex::None();
   default:
     // Use the null type index.
@@ -2545,16 +2600,10 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
 
       // 32-bit x86 call sequences often use PUSH instructions, which disrupt
       // ESP-relative offsets. Use the virtual frame pointer, VFRAME or $T0,
-      // instead. In simple cases, $T0 will be the CFA.
+      // instead. In frames without stack realignment, $T0 will be the CFA.
       if (RegisterId(Reg) == RegisterId::ESP) {
         Reg = unsigned(RegisterId::VFRAME);
-        Offset -= FI.FrameSize;
-
-        // If the frame requires realignment, VFRAME will be ESP after it is
-        // aligned. We have to remove the ESP adjustments made to push CSRs and
-        // EBP. EBP is not included in CSRSize.
-        if (FI.HasStackRealignment)
-          Offset += FI.CSRSize + 4;
+        Offset += FI.OffsetAdjustment;
       }
 
       // If we can use the chosen frame pointer for the frame and this isn't a
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index b97092a642eed1506bc2d6e339a0ae01be9b01ae..ef0f0c3635e501b8b5e73df307ef0dd3bfc5eede 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -153,6 +153,9 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
     /// Number of bytes pushed to save CSRs.
     unsigned CSRSize = 0;
 
+    /// Adjustment to apply on x86 when using the VFRAME frame pointer.
+    int OffsetAdjustment = 0;
+
     /// Two-bit value indicating which register is the designated frame pointer
     /// register for local variables. Included in S_FRAMEPROC.
     codeview::EncodedFramePtrReg EncodedLocalFramePtrReg =
@@ -272,6 +275,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
 
   void emitCompilerInformation();
 
+  void emitBuildInfo();
+
   void emitInlineeLinesSubsection();
 
   void emitDebugInfoForThunk(const Function *GV,
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 570424a79c8169e01b975dee19049971ff71d0f2..301fd9ef81b3cfabb952c1deb76e003b0c47ba69 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -414,6 +414,8 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
   case dwarf::DW_FORM_GNU_addr_index:
   case dwarf::DW_FORM_ref_udata:
   case dwarf::DW_FORM_strx:
+  case dwarf::DW_FORM_addrx:
+  case dwarf::DW_FORM_rnglistx:
   case dwarf::DW_FORM_udata:
     Asm->EmitULEB128(Integer);
     return;
@@ -440,6 +442,8 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
   case dwarf::DW_FORM_GNU_addr_index:
   case dwarf::DW_FORM_ref_udata:
   case dwarf::DW_FORM_strx:
+  case dwarf::DW_FORM_addrx:
+  case dwarf::DW_FORM_rnglistx:
   case dwarf::DW_FORM_udata:
     return getULEB128Size(Integer);
   case dwarf::DW_FORM_sdata:
@@ -585,8 +589,7 @@ void DIEString::print(raw_ostream &O) const {
 //===----------------------------------------------------------------------===//
 void DIEInlineString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_string) {
-    for (char ch : S)
-      AP->emitInt8(ch);
+    AP->OutStreamer->EmitBytes(S);
     AP->emitInt8(0);
     return;
   }
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 580f682b9a60f1c4ecc54bd90d1827f933fb45a1..a362dd40e3b16121df0df7534fb5a74bb164e99b 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -125,6 +125,21 @@ MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
   return LabelsAfterInsn.lookup(MI);
 }
 
+// Return the function-local offset of an instruction.
+const MCExpr *
+DebugHandlerBase::getFunctionLocalOffsetAfterInsn(const MachineInstr *MI) {
+  MCContext &MC = Asm->OutContext;
+
+  MCSymbol *Start = Asm->getFunctionBegin();
+  const auto *StartRef = MCSymbolRefExpr::create(Start, MC);
+
+  MCSymbol *AfterInsn = getLabelAfterInsn(MI);
+  assert(AfterInsn && "Expected label after instruction");
+  const auto *AfterRef = MCSymbolRefExpr::create(AfterInsn, MC);
+
+  return MCBinaryExpr::createSub(AfterRef, StartRef, MC);
+}
+
 /// If this type is derived from a base type then return base type size.
 uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
   DIType *Ty = TyRef.resolve();
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index 4b0ce0e3f03e5128701575cab576e2c5e1083f9d..cdf8dc72b0753180cf82741cdda098ece7e3efa0 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -125,6 +125,10 @@ public:
   /// Return Label immediately following the instruction.
   MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
 
+  /// Return the function-local offset of an instruction. A label for the
+  /// instruction \p MI should exist (\ref getLabelAfterInsn).
+  const MCExpr *getFunctionLocalOffsetAfterInsn(const MachineInstr *MI);
+
   /// If this type is derived from a base type then return base type size.
   static uint64_t getBaseTypeSize(const DITypeRef TyRef);
 };
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 443c8879f1389847ad9ba7aa2988cb5e20d509f2..d93c7f6c8459ea038b083b5109c986b770b4989d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -69,14 +69,16 @@ void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
   // pool from the skeleton - maybe even in non-fission (possibly fewer
   // relocations by sharing them in the pool, but we have other ideas about how
   // to reduce the number of relocations as well/instead).
-  if (!DD->useSplitDwarf() || !Skeleton)
+  if ((!DD->useSplitDwarf() || !Skeleton) && DD->getDwarfVersion() < 5)
     return addLocalLabelAddress(Die, Attribute, Label);
 
   if (Label)
     DD->addArangeLabel(SymbolCU(this, Label));
 
   unsigned idx = DD->getAddressPool().getIndex(Label);
-  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_GNU_addr_index,
+  Die.addValue(DIEValueAllocator, Attribute,
+               DD->getDwarfVersion() >= 5 ? dwarf::DW_FORM_addrx
+                                          : dwarf::DW_FORM_GNU_addr_index,
                DIEInteger(idx));
 }
 
@@ -275,6 +277,7 @@ void DwarfCompileUnit::addRange(RangeSpan Range) {
       (&CURanges.back().getEnd()->getSection() !=
        &Range.getEnd()->getSection())) {
     CURanges.push_back(Range);
+    DD->addSectionLabel(Range.getStart());
     return;
   }
 
@@ -422,24 +425,29 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
           ? TLOF.getDwarfRnglistsSection()->getBeginSymbol()
           : TLOF.getDwarfRangesSection()->getBeginSymbol();
 
-  RangeSpanList List(Asm->createTempSymbol("debug_ranges"), std::move(Range));
+  HasRangeLists = true;
+
+  // Add the range list to the set of ranges to be emitted.
+  auto IndexAndList =
+      (DD->getDwarfVersion() < 5 && Skeleton ? Skeleton->DU : DU)
+          ->addRange(*(Skeleton ? Skeleton : this), std::move(Range));
+
+  uint32_t Index = IndexAndList.first;
+  auto &List = *IndexAndList.second;
 
   // Under fission, ranges are specified by constant offsets relative to the
   // CU's DW_AT_GNU_ranges_base.
   // FIXME: For DWARF v5, do not generate the DW_AT_ranges attribute under
   // fission until we support the forms using the .debug_addr section
   // (DW_RLE_startx_endx etc.).
-  if (isDwoUnit()) {
-    if (DD->getDwarfVersion() < 5)
-      addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
-                      RangeSectionSym);
-  } else {
+  if (DD->getDwarfVersion() >= 5)
+    addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index);
+  else if (isDwoUnit())
+    addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+                    RangeSectionSym);
+  else
     addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
                     RangeSectionSym);
-  }
-
-  // Add the range list to the set of ranges to be emitted.
-  (Skeleton ? Skeleton : this)->CURangeLists.push_back(std::move(List));
 }
 
 void DwarfCompileUnit::attachRangesOrLowHighPC(
@@ -813,7 +821,7 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
 DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
                                                  const DISubprogram &CalleeSP,
                                                  bool IsTail,
-                                                 const MCSymbol *ReturnPC) {
+                                                 const MCExpr *PCOffset) {
   // Insert a call site entry DIE within ScopeDIE.
   DIE &CallSiteDIE =
       createAndAddDIE(dwarf::DW_TAG_call_site, ScopeDIE, nullptr);
@@ -830,8 +838,8 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
   } else {
     // Attach the return PC to allow the debugger to disambiguate call paths
     // from one function to another.
-    assert(ReturnPC && "Missing return PC information for a call");
-    addLabelAddress(CallSiteDIE, dwarf::DW_AT_call_return_pc, ReturnPC);
+    assert(PCOffset && "Missing return PC information for a call");
+    addAddressExpr(CallSiteDIE, dwarf::DW_AT_call_return_pc, PCOffset);
   }
   return CallSiteDIE;
 }
@@ -1095,6 +1103,12 @@ void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
   Die.addValue(DIEValueAllocator, (dwarf::Attribute)0, Form, DIEExpr(Expr));
 }
 
+void DwarfCompileUnit::addAddressExpr(DIE &Die, dwarf::Attribute Attribute,
+                                      const MCExpr *Expr) {
+  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_addr,
+               DIEExpr(Expr));
+}
+
 void DwarfCompileUnit::applySubprogramAttributesToDefinition(
     const DISubprogram *SP, DIE &SPDie) {
   auto *SPDecl = SP->getDeclaration();
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 6389ccd686d6da03182727e80ff97c76723e69f5..13679c37fe547c45ac7dd833a91e09160305bae4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -44,6 +44,7 @@ class MDNode;
 class DwarfCompileUnit final : public DwarfUnit {
   /// A numeric ID unique among all CUs in the module
   unsigned UniqueID;
+  bool HasRangeLists = false;
 
   /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
   /// the need to search for it in applyStmtList.
@@ -69,10 +70,6 @@ class DwarfCompileUnit final : public DwarfUnit {
   /// GlobalTypes - A map of globally visible types for this unit.
   StringMap<const DIE *> GlobalTypes;
 
-  // List of range lists for a given compile unit, separate from the ranges for
-  // the CU itself.
-  SmallVector<RangeSpanList, 1> CURangeLists;
-
   // List of ranges for a given compile unit.
   SmallVector<RangeSpan, 2> CURanges;
 
@@ -108,6 +105,7 @@ public:
   DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
                    DwarfDebug *DW, DwarfFile *DWU);
 
+  bool hasRangeLists() const { return HasRangeLists; }
   unsigned getUniqueID() const { return UniqueID; }
 
   DwarfCompileUnit *getSkeleton() const {
@@ -212,10 +210,10 @@ public:
 
   /// Construct a call site entry DIE describing a call within \p Scope to a
   /// callee described by \p CalleeSP. \p IsTail specifies whether the call is
-  /// a tail call. \p ReturnPC must be non-null for non-tail calls and point
-  /// to the PC value after the call returns.
+  /// a tail call. \p PCOffset must be non-zero for non-tail calls or be the
+  /// function-local offset to PC value after the call instruction.
   DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram &CalleeSP,
-                                 bool IsTail, const MCSymbol *ReturnPC);
+                                 bool IsTail, const MCExpr *PCOffset);
 
   /// Construct import_module DIE.
   DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
@@ -294,16 +292,14 @@ public:
   /// Add a Dwarf expression attribute data and value.
   void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
 
+  /// Add an attribute containing an address expression to \p Die.
+  void addAddressExpr(DIE &Die, dwarf::Attribute Attribute, const MCExpr *Expr);
+
   void applySubprogramAttributesToDefinition(const DISubprogram *SP,
                                              DIE &SPDie);
 
   void applyLabelAttributes(const DbgLabel &Label, DIE &LabelDie);
 
-  /// getRangeLists - Get the vector of range lists.
-  const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
-    return (Skeleton ? Skeleton : this)->CURangeLists;
-  }
-
   /// getRanges - Get the list of ranges for this unit.
   const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; }
   SmallVector<RangeSpan, 2> takeRanges() { return std::move(CURanges); }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index ab3559d63cc7283697bbec5df2dc1fc66bb01afa..070b8fe4ec1c4a1b7b1eb6cb4f7539270ba4604d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -548,14 +548,15 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
       // For tail calls, no return PC information is needed. For regular calls,
       // the return PC is needed to disambiguate paths in the call graph which
       // could lead to some target function.
-      const MCSymbol *ReturnPC = IsTail ? nullptr : getLabelAfterInsn(&MI);
+      const MCExpr *PCOffset =
+          IsTail ? nullptr : getFunctionLocalOffsetAfterInsn(&MI);
 
-      assert((IsTail || ReturnPC) && "Call without return PC information");
+      assert((IsTail || PCOffset) && "Call without return PC information");
       LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> "
                         << CalleeDecl->getName() << (IsTail ? " [tail]" : "")
                         << "\n");
       CU.constructCallSiteEntryDIE(ScopeDIE, *CalleeDecl->getSubprogram(),
-                                   IsTail, ReturnPC);
+                                   IsTail, PCOffset);
     }
   }
 }
@@ -700,15 +701,18 @@ sortGlobalExprs(SmallVectorImpl<DwarfCompileUnit::GlobalExpr> &GVEs) {
 void DwarfDebug::beginModule() {
   NamedRegionTimer T(DbgTimerName, DbgTimerDescription, DWARFGroupName,
                      DWARFGroupDescription, TimePassesIsEnabled);
-  if (DisableDebugInfoPrinting)
+  if (DisableDebugInfoPrinting) {
+    MMI->setDebugInfoAvailability(false);
     return;
+  }
 
   const Module *M = MMI->getModule();
 
   unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
                                        M->debug_compile_units_end());
   // Tell MMI whether we have debug info.
-  MMI->setDebugInfoAvailability(NumDebugCUs > 0);
+  assert(MMI->hasDebugInfo() == (NumDebugCUs > 0) &&
+         "DebugInfoAvailabilty initialized unexpectedly");
   SingleCU = NumDebugCUs == 1;
   DenseMap<DIGlobalVariable *, SmallVector<DwarfCompileUnit::GlobalExpr, 1>>
       GVMap;
@@ -726,11 +730,20 @@ void DwarfDebug::beginModule() {
     (useSplitDwarf() ? SkeletonHolder : InfoHolder)
         .setStringOffsetsStartSym(Asm->createTempSymbol("str_offsets_base"));
 
-  // Create the symbol that designates the start of the DWARF v5 range list
-  // table. It is located past the header and before the offsets table.
-  if (getDwarfVersion() >= 5)
-    (useSplitDwarf() ? SkeletonHolder : InfoHolder)
-        .setRnglistsTableBaseSym(Asm->createTempSymbol("rnglists_table_base"));
+
+  // Create the symbols that designates the start of the DWARF v5 range list
+  // and locations list tables. They are located past the table headers.
+  if (getDwarfVersion() >= 5) {
+    DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+    Holder.setRnglistsTableBaseSym(
+        Asm->createTempSymbol("rnglists_table_base"));
+    Holder.setLoclistsTableBaseSym(
+        Asm->createTempSymbol("loclists_table_base"));
+
+    if (useSplitDwarf())
+      InfoHolder.setRnglistsTableBaseSym(
+          Asm->createTempSymbol("rnglists_dwo_table_base"));
+  }
 
   // Create the symbol that points to the first entry following the debug
   // address table (.debug_addr) header.
@@ -851,12 +864,8 @@ void DwarfDebug::finalizeModuleInfo() {
         SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
                       dwarf::DW_FORM_data8, ID);
       }
-      // We don't keep track of which addresses are used in which CU so this
-      // is a bit pessimistic under LTO.
-      if (!AddrPool.isEmpty())
-        SkCU->addAddrTableBase();
 
-      if (getDwarfVersion() < 5 && !SkCU->getRangeLists().empty()) {
+      if (getDwarfVersion() < 5 && !SkeletonHolder.getRangeLists().empty()) {
         const MCSymbol *Sym = TLOF.getDwarfRangesSection()->getBeginSymbol();
         SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
                               Sym, Sym);
@@ -870,6 +879,12 @@ void DwarfDebug::finalizeModuleInfo() {
     // .subsections_via_symbols in mach-o. This would mean turning on
     // ranges for all subprogram DIEs for mach-o.
     DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+
+    // We don't keep track of which addresses are used in which CU so this
+    // is a bit pessimistic under LTO.
+    if (!AddrPool.isEmpty())
+      U.addAddrTableBase();
+
     if (unsigned NumRanges = TheCU.getRanges().size()) {
       if (NumRanges > 1 && useRangesSection())
         // A DW_AT_low_pc attribute may also be specified in combination with
@@ -882,9 +897,13 @@ void DwarfDebug::finalizeModuleInfo() {
       U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
     }
 
-    if (getDwarfVersion() >= 5 && !useSplitDwarf() &&
-        !U.getRangeLists().empty())
-      U.addRnglistsBase();
+    if (getDwarfVersion() >= 5) {
+      if (U.hasRangeLists())
+        U.addRnglistsBase();
+
+      if (!DebugLocs.getLists().empty() && !useSplitDwarf())
+        U.addLoclistsBase();
+    }
 
     auto *CUNode = cast<DICompileUnit>(P.first);
     // If compile Unit has macros, emit "DW_AT_macro_info" attribute.
@@ -948,9 +967,11 @@ void DwarfDebug::endModule() {
     emitDebugInfoDWO();
     emitDebugAbbrevDWO();
     emitDebugLineDWO();
-    emitDebugAddr();
+    emitDebugRangesDWO();
   }
 
+  emitDebugAddr();
+
   // Emit info into the dwarf accelerator table sections.
   switch (getAccelTableKind()) {
   case AccelTableKind::Apple:
@@ -1371,49 +1392,6 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
   }
 }
 
-static const DebugLoc &
-findNextDebugLoc(MachineBasicBlock::const_iterator MBBI,
-                 MachineBasicBlock::const_iterator MBBE) {
-  static DebugLoc NoLocation;
-  for ( ; MBBI != MBBE; ++MBBI) {
-    if (MBBI->isDebugInstr())
-      continue;
-    const DebugLoc &DL = MBBI->getDebugLoc();
-    if (DL)
-      return DL;
-  }
-  return NoLocation;
-}
-
-void DwarfDebug::emitDebugLoc(const DebugLoc &DL) {
-  unsigned LastAsmLine =
-      Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
-
-  // We have an explicit location, different from the previous location.
-  // Don't repeat a line-0 record, but otherwise emit the new location.
-  // (The new location might be an explicit line 0, which we do emit.)
-  unsigned Line = DL.getLine();
-  if (PrevInstLoc && Line == 0 && LastAsmLine == 0)
-    return;
-  unsigned Flags = 0;
-  if (DL == PrologEndLoc) {
-    Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT;
-    PrologEndLoc = DebugLoc();
-  }
-  // If the line changed, we call that a new statement; unless we went to
-  // line 0 and came back, in which case it is not a new statement.
-  unsigned OldLine = PrevInstLoc ? PrevInstLoc.getLine() : LastAsmLine;
-  if (Line && Line != OldLine)
-    Flags |= DWARF2_FLAG_IS_STMT;
-
-  const MDNode *Scope = DL.getScope();
-  recordSourceLine(Line, DL.getCol(), Scope, Flags);
-
-  // If we're not at line 0, remember this location.
-  if (Line)
-    PrevInstLoc = DL;
-}
-
 // Process beginning of an instruction.
 void DwarfDebug::beginInstruction(const MachineInstr *MI) {
   DebugHandlerBase::beginInstruction(MI);
@@ -1458,41 +1436,54 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
     // If we have already emitted a line-0 record, don't repeat it.
     if (LastAsmLine == 0)
       return;
-    // By default we emit nothing to avoid line table bloat. However at the
-    // beginning of a basic block or after a label it is undesirable to let
-    // the previous location unchanged. In these cases do a forward search for
-    // the next valid debug location.
-    if (UnknownLocations == Default) {
-      const MachineBasicBlock &MBB = *MI->getParent();
-      if (!PrevLabel && PrevInstBB == &MBB)
-        return;
-
-      const DebugLoc &NextDL = findNextDebugLoc(MI->getIterator(), MBB.end());
-      if (NextDL) {
-        emitDebugLoc(NextDL);
-        return;
-      }
-    }
-
-    // We should emit a line-0 record.
     // If user said Don't Do That, don't do that.
     if (UnknownLocations == Disable)
       return;
-    // Emit a line-0 record now.
-    // Preserve the file and column numbers, if we can, to save space in
-    // the encoded line table.
-    // Do not update PrevInstLoc, it remembers the last non-0 line.
-    const MDNode *Scope = nullptr;
-    unsigned Column = 0;
-    if (PrevInstLoc) {
-      Scope = PrevInstLoc.getScope();
-      Column = PrevInstLoc.getCol();
+    // See if we have a reason to emit a line-0 record now.
+    // Reasons to emit a line-0 record include:
+    // - User asked for it (UnknownLocations).
+    // - Instruction has a label, so it's referenced from somewhere else,
+    //   possibly debug information; we want it to have a source location.
+    // - Instruction is at the top of a block; we don't want to inherit the
+    //   location from the physically previous (maybe unrelated) block.
+    if (UnknownLocations == Enable || PrevLabel ||
+        (PrevInstBB && PrevInstBB != MI->getParent())) {
+      // Preserve the file and column numbers, if we can, to save space in
+      // the encoded line table.
+      // Do not update PrevInstLoc, it remembers the last non-0 line.
+      const MDNode *Scope = nullptr;
+      unsigned Column = 0;
+      if (PrevInstLoc) {
+        Scope = PrevInstLoc.getScope();
+        Column = PrevInstLoc.getCol();
+      }
+      recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
     }
-    recordSourceLine(/*Line=*/0, Column, Scope, /*Flags=*/0);
     return;
   }
 
-  emitDebugLoc(DL);
+  // We have an explicit location, different from the previous location.
+  // Don't repeat a line-0 record, but otherwise emit the new location.
+  // (The new location might be an explicit line 0, which we do emit.)
+  if (PrevInstLoc && DL.getLine() == 0 && LastAsmLine == 0)
+    return;
+  unsigned Flags = 0;
+  if (DL == PrologEndLoc) {
+    Flags |= DWARF2_FLAG_PROLOGUE_END | DWARF2_FLAG_IS_STMT;
+    PrologEndLoc = DebugLoc();
+  }
+  // If the line changed, we call that a new statement; unless we went to
+  // line 0 and came back, in which case it is not a new statement.
+  unsigned OldLine = PrevInstLoc ? PrevInstLoc.getLine() : LastAsmLine;
+  if (DL.getLine() && DL.getLine() != OldLine)
+    Flags |= DWARF2_FLAG_IS_STMT;
+
+  const MDNode *Scope = DL.getScope();
+  recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+
+  // If we're not at line 0, remember this location.
+  if (DL.getLine())
+    PrevInstLoc = DL;
 }
 
 static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
@@ -1947,25 +1938,119 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) {
   emitDebugLocEntry(Streamer, Entry);
 }
 
-// Emit locations into the debug loc section.
+// Emit the common part of the DWARF 5 range/locations list tables header.
+static void emitListsTableHeaderStart(AsmPrinter *Asm, const DwarfFile &Holder,
+                                      MCSymbol *TableStart,
+                                      MCSymbol *TableEnd) {
+  // Build the table header, which starts with the length field.
+  Asm->OutStreamer->AddComment("Length");
+  Asm->EmitLabelDifference(TableEnd, TableStart, 4);
+  Asm->OutStreamer->EmitLabel(TableStart);
+  // Version number (DWARF v5 and later).
+  Asm->OutStreamer->AddComment("Version");
+  Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
+  // Address size.
+  Asm->OutStreamer->AddComment("Address size");
+  Asm->emitInt8(Asm->MAI->getCodePointerSize());
+  // Segment selector size.
+  Asm->OutStreamer->AddComment("Segment selector size");
+  Asm->emitInt8(0);
+}
+
+// Emit the header of a DWARF 5 range list table list table. Returns the symbol
+// that designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
+                                         const DwarfFile &Holder) {
+  MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
+  MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
+  emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd);
+
+  Asm->OutStreamer->AddComment("Offset entry count");
+  Asm->emitInt32(Holder.getRangeLists().size());
+  Asm->OutStreamer->EmitLabel(Holder.getRnglistsTableBaseSym());
+
+  for (const RangeSpanList &List : Holder.getRangeLists())
+    Asm->EmitLabelDifference(List.getSym(), Holder.getRnglistsTableBaseSym(),
+                             4);
+
+  return TableEnd;
+}
+
+// Emit the header of a DWARF 5 locations list table. Returns the symbol that
+// designates the end of the table for the caller to emit when the table is
+// complete.
+static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm,
+                                         const DwarfFile &Holder) {
+  MCSymbol *TableStart = Asm->createTempSymbol("debug_loclist_table_start");
+  MCSymbol *TableEnd = Asm->createTempSymbol("debug_loclist_table_end");
+  emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd);
+
+  // FIXME: Generate the offsets table and use DW_FORM_loclistx with the
+  // DW_AT_loclists_base attribute. Until then set the number of offsets to 0.
+  Asm->OutStreamer->AddComment("Offset entry count");
+  Asm->emitInt32(0);
+  Asm->OutStreamer->EmitLabel(Holder.getLoclistsTableBaseSym());
+
+  return TableEnd;
+}
+
+// Emit locations into the .debug_loc/.debug_rnglists section.
 void DwarfDebug::emitDebugLoc() {
   if (DebugLocs.getLists().empty())
     return;
 
-  // Start the dwarf loc section.
-  Asm->OutStreamer->SwitchSection(
-      Asm->getObjFileLowering().getDwarfLocSection());
+  bool IsLocLists = getDwarfVersion() >= 5;
+  MCSymbol *TableEnd = nullptr;
+  if (IsLocLists) {
+    Asm->OutStreamer->SwitchSection(
+        Asm->getObjFileLowering().getDwarfLoclistsSection());
+    TableEnd = emitLoclistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder
+                                                            : InfoHolder);
+  } else {
+    Asm->OutStreamer->SwitchSection(
+        Asm->getObjFileLowering().getDwarfLocSection());
+  }
+
   unsigned char Size = Asm->MAI->getCodePointerSize();
   for (const auto &List : DebugLocs.getLists()) {
     Asm->OutStreamer->EmitLabel(List.Label);
+
     const DwarfCompileUnit *CU = List.CU;
+    const MCSymbol *Base = CU->getBaseAddress();
     for (const auto &Entry : DebugLocs.getEntries(List)) {
-      // Set up the range. This range is relative to the entry point of the
-      // compile unit. This is a hard coded 0 for low_pc when we're emitting
-      // ranges, or the DW_AT_low_pc on the compile unit otherwise.
-      if (auto *Base = CU->getBaseAddress()) {
-        Asm->EmitLabelDifference(Entry.BeginSym, Base, Size);
-        Asm->EmitLabelDifference(Entry.EndSym, Base, Size);
+      if (Base) {
+        // Set up the range. This range is relative to the entry point of the
+        // compile unit. This is a hard coded 0 for low_pc when we're emitting
+        // ranges, or the DW_AT_low_pc on the compile unit otherwise.
+        if (IsLocLists) {
+          Asm->OutStreamer->AddComment("DW_LLE_offset_pair");
+          Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1);
+          Asm->OutStreamer->AddComment("  starting offset");
+          Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base);
+          Asm->OutStreamer->AddComment("  ending offset");
+          Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base);
+        } else {
+          Asm->EmitLabelDifference(Entry.BeginSym, Base, Size);
+          Asm->EmitLabelDifference(Entry.EndSym, Base, Size);
+        }
+
+        emitDebugLocEntryLocation(Entry);
+        continue;
+      }
+
+      // We have no base address.
+      if (IsLocLists) {
+        // TODO: Use DW_LLE_base_addressx + DW_LLE_offset_pair, or
+        // DW_LLE_startx_length in case if there is only a single range.
+        // That should reduce the size of the debug data emited.
+        // For now just use the DW_LLE_startx_length for all cases.
+        Asm->OutStreamer->AddComment("DW_LLE_startx_length");
+        Asm->emitInt8(dwarf::DW_LLE_startx_length);
+        Asm->OutStreamer->AddComment("  start idx");
+        Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym));
+        Asm->OutStreamer->AddComment("  length");
+        Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym);
       } else {
         Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size);
         Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size);
@@ -1973,9 +2058,20 @@ void DwarfDebug::emitDebugLoc() {
 
       emitDebugLocEntryLocation(Entry);
     }
-    Asm->OutStreamer->EmitIntValue(0, Size);
-    Asm->OutStreamer->EmitIntValue(0, Size);
+
+    if (IsLocLists) {
+      // .debug_loclists section ends with DW_LLE_end_of_list.
+      Asm->OutStreamer->AddComment("DW_LLE_end_of_list");
+      Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_end_of_list, 1);
+    } else {
+      // Terminate the .debug_loc list with two 0 values.
+      Asm->OutStreamer->EmitIntValue(0, Size);
+      Asm->OutStreamer->EmitIntValue(0, Size);
+    }
   }
+
+  if (TableEnd)
+    Asm->OutStreamer->EmitLabel(TableEnd);
 }
 
 void DwarfDebug::emitDebugLocDWO() {
@@ -1984,10 +2080,13 @@ void DwarfDebug::emitDebugLocDWO() {
   for (const auto &List : DebugLocs.getLists()) {
     Asm->OutStreamer->EmitLabel(List.Label);
     for (const auto &Entry : DebugLocs.getEntries(List)) {
-      // Just always use start_length for now - at least that's one address
-      // rather than two. We could get fancier and try to, say, reuse an
-      // address we know we've emitted elsewhere (the start of the function?
-      // The start of the CU or CU subrange that encloses this range?)
+      // GDB only supports startx_length in pre-standard split-DWARF.
+      // (in v5 standard loclists, it currently* /only/ supports base_address +
+      // offset_pair, so the implementations can't really share much since they
+      // need to use different representations)
+      // * as of October 2018, at least
+      // Ideally/in v5, this could use SectionLabels to reuse existing addresses
+      // in the address pool to minimize object size/relocations.
       Asm->emitInt8(dwarf::DW_LLE_startx_length);
       unsigned idx = AddrPool.getIndex(Entry.BeginSym);
       Asm->EmitULEB128(idx);
@@ -2161,10 +2260,10 @@ void DwarfDebug::emitDebugARanges() {
 }
 
 /// Emit a single range list. We handle both DWARF v5 and earlier.
-static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
+static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
                           const RangeSpanList &List) {
 
-  auto DwarfVersion = CU->getDwarfVersion();
+  auto DwarfVersion = DD.getDwarfVersion();
   // Emit our symbol so we can find the beginning of the range.
   Asm->OutStreamer->EmitLabel(List.getSym());
   // Gather all the ranges that apply to the same section so they can share
@@ -2176,7 +2275,8 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
   for (const RangeSpan &Range : List.getRanges())
     SectionRanges[&Range.getStart()->getSection()].push_back(&Range);
 
-  auto *CUBase = CU->getBaseAddress();
+  const DwarfCompileUnit &CU = List.getCU();
+  const MCSymbol *CUBase = CU.getBaseAddress();
   bool BaseIsSet = false;
   for (const auto &P : SectionRanges) {
     // Don't bother with a base address entry if there's only one range in
@@ -2186,19 +2286,23 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
     // or optnone where there may be holes in a single CU's section
     // contributions.
     auto *Base = CUBase;
-    if (!Base && P.second.size() > 1 &&
+    if (!Base && (P.second.size() > 1 || DwarfVersion < 5) &&
         (UseDwarfRangesBaseAddressSpecifier || DwarfVersion >= 5)) {
       BaseIsSet = true;
       // FIXME/use care: This may not be a useful base address if it's not
       // the lowest address/range in this object.
       Base = P.second.front()->getStart();
       if (DwarfVersion >= 5) {
-        Asm->OutStreamer->AddComment("DW_RLE_base_address");
-        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_address, 1);
-      } else
+        Base = DD.getSectionLabel(&Base->getSection());
+        Asm->OutStreamer->AddComment("DW_RLE_base_addressx");
+        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1);
+        Asm->OutStreamer->AddComment("  base address index");
+        Asm->EmitULEB128(DD.getAddressPool().getIndex(Base));
+      } else {
         Asm->OutStreamer->EmitIntValue(-1, Size);
-      Asm->OutStreamer->AddComment("  base address");
-      Asm->OutStreamer->EmitSymbolValue(Base, Size);
+        Asm->OutStreamer->AddComment("  base address");
+        Asm->OutStreamer->EmitSymbolValue(Base, Size);
+      }
     } else if (BaseIsSet && DwarfVersion < 5) {
       BaseIsSet = false;
       assert(!Base);
@@ -2225,10 +2329,10 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
           Asm->EmitLabelDifference(End, Base, Size);
         }
       } else if (DwarfVersion >= 5) {
-        Asm->OutStreamer->AddComment("DW_RLE_start_length");
-        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_start_length, 1);
-        Asm->OutStreamer->AddComment("  start");
-        Asm->OutStreamer->EmitSymbolValue(Begin, Size);
+        Asm->OutStreamer->AddComment("DW_RLE_startx_length");
+        Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_startx_length, 1);
+        Asm->OutStreamer->AddComment("  start index");
+        Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin));
         Asm->OutStreamer->AddComment("  length");
         Asm->EmitLabelDifferenceAsULEB128(End, Begin);
       } else {
@@ -2247,31 +2351,13 @@ static void emitRangeList(AsmPrinter *Asm, DwarfCompileUnit *CU,
   }
 }
 
-// Emit the header of a DWARF 5 range list table. Returns the symbol that
-// designates the end of the table for the caller to emit when the table is
-// complete.
-static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm, DwarfFile &Holder) {
-  // The length is described by a starting label right after the length field
-  // and an end label.
-  MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
-  MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
-  // Build the range table header, which starts with the length field.
-  Asm->EmitLabelDifference(TableEnd, TableStart, 4);
-  Asm->OutStreamer->EmitLabel(TableStart);
-  // Version number (DWARF v5 and later).
-  Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
-  // Address size.
-  Asm->emitInt8(Asm->MAI->getCodePointerSize());
-  // Segment selector size.
-  Asm->emitInt8(0);
-
-  MCSymbol *RnglistTableBaseSym = Holder.getRnglistsTableBaseSym();
+void emitDebugRangesImpl(DwarfDebug &DD, AsmPrinter *Asm,
+                         const DwarfFile &Holder, MCSymbol *TableEnd) {
+  for (const RangeSpanList &List : Holder.getRangeLists())
+    emitRangeList(DD, Asm, List);
 
-  // FIXME: Generate the offsets table and use DW_FORM_rnglistx with the
-  // DW_AT_ranges attribute. Until then set the number of offsets to 0.
-  Asm->emitInt32(0);
-  Asm->OutStreamer->EmitLabel(RnglistTableBaseSym);
-  return TableEnd;
+  if (TableEnd)
+    Asm->OutStreamer->EmitLabel(TableEnd);
 }
 
 /// Emit address ranges into the .debug_ranges section or into the DWARF v5
@@ -2280,55 +2366,52 @@ void DwarfDebug::emitDebugRanges() {
   if (CUMap.empty())
     return;
 
-  auto NoRangesPresent = [this]() {
-    return llvm::all_of(
-        CUMap, [](const decltype(CUMap)::value_type &Pair) {
-          return Pair.second->getRangeLists().empty();
-        });
-  };
-
-  if (llvm::all_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
-        return Pair.second->getCUNode()->isDebugDirectivesOnly();
-      })) {
-    assert(NoRangesPresent() && "No debug ranges expected.");
-    return;
-  }
+  const auto &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
 
-  if (!useRangesSection()) {
-    assert(NoRangesPresent() && "No debug ranges expected.");
+  if (Holder.getRangeLists().empty())
     return;
-  }
 
-  if (NoRangesPresent())
-    return;
+  assert(useRangesSection());
+  assert(llvm::none_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
+    return Pair.second->getCUNode()->isDebugDirectivesOnly();
+  }));
 
   // Start the dwarf ranges section.
   MCSymbol *TableEnd = nullptr;
   if (getDwarfVersion() >= 5) {
     Asm->OutStreamer->SwitchSection(
         Asm->getObjFileLowering().getDwarfRnglistsSection());
-    TableEnd = emitRnglistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder
-                                                            : InfoHolder);
+    TableEnd = emitRnglistsTableHeader(Asm, Holder);
   } else
     Asm->OutStreamer->SwitchSection(
         Asm->getObjFileLowering().getDwarfRangesSection());
 
-  // Grab the specific ranges for the compile units in the module.
-  for (const auto &I : CUMap) {
-    DwarfCompileUnit *TheCU = I.second;
-    if (TheCU->getCUNode()->isDebugDirectivesOnly())
-      continue;
+  emitDebugRangesImpl(*this, Asm, Holder, TableEnd);
+}
 
-    if (auto *Skel = TheCU->getSkeleton())
-      TheCU = Skel;
+void DwarfDebug::emitDebugRangesDWO() {
+  assert(useSplitDwarf());
 
-    // Iterate over the misc ranges for the compile units in the module.
-    for (const RangeSpanList &List : TheCU->getRangeLists())
-      emitRangeList(Asm, TheCU, List);
-  }
+  if (CUMap.empty())
+    return;
 
-  if (TableEnd)
-    Asm->OutStreamer->EmitLabel(TableEnd);
+  const auto &Holder = InfoHolder;
+
+  if (Holder.getRangeLists().empty())
+    return;
+
+  assert(getDwarfVersion() >= 5);
+  assert(useRangesSection());
+  assert(llvm::none_of(CUMap, [](const decltype(CUMap)::value_type &Pair) {
+    return Pair.second->getCUNode()->isDebugDirectivesOnly();
+  }));
+
+  // Start the dwarf ranges section.
+  Asm->OutStreamer->SwitchSection(
+      Asm->getObjFileLowering().getDwarfRnglistsDWOSection());
+  MCSymbol *TableEnd = emitRnglistsTableHeader(Asm, Holder);
+
+  emitDebugRangesImpl(*this, Asm, Holder, TableEnd);
 }
 
 void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
@@ -2469,9 +2552,8 @@ void DwarfDebug::emitDebugStrDWO() {
                          OffSec, /* UseRelativeOffsets = */ false);
 }
 
-// Emit DWO addresses.
+// Emit address pool.
 void DwarfDebug::emitDebugAddr() {
-  assert(useSplitDwarf() && "No split dwarf?");
   AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection());
 }
 
@@ -2631,3 +2713,11 @@ void DwarfDebug::addAccelType(const DICompileUnit &CU, StringRef Name,
 uint16_t DwarfDebug::getDwarfVersion() const {
   return Asm->OutStreamer->getContext().getDwarfVersion();
 }
+
+void DwarfDebug::addSectionLabel(const MCSymbol *Sym) {
+  SectionLabels.insert(std::make_pair(&Sym->getSection(), Sym));
+}
+
+const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) {
+  return SectionLabels.find(S)->second;
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index e115eb771fb75fb6808afc6b8d2e212c34ce6d21..c73d442af2fd6ea9aced119ce57a04d73482ec68 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -327,6 +327,8 @@ class DwarfDebug : public DebugHandlerBase {
   /// used to keep track of which types we have emitted type units for.
   DenseMap<const MDNode *, uint64_t> TypeSignatures;
 
+  DenseMap<const MCSection *, const MCSymbol *> SectionLabels;
+
   SmallVector<
       std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
       TypeUnitsUnderConstruction;
@@ -490,9 +492,7 @@ class DwarfDebug : public DebugHandlerBase {
 
   /// Emit address ranges into a debug ranges section.
   void emitDebugRanges();
-
-  /// Emit range lists into a DWARF v5 debug rnglists section.
-  void emitDebugRnglists();
+  void emitDebugRangesDWO();
 
   /// Emit macros into a debug macinfo section.
   void emitDebugMacinfo();
@@ -724,8 +724,8 @@ public:
   bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
   /// @}
 
-private:
-  void emitDebugLoc(const DebugLoc &DL);
+  void addSectionLabel(const MCSymbol *Sym);
+  const MCSymbol *getSectionLabel(const MCSection *S);
 };
 
 } // end namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index af51d27663449cb2dab3f4da3b0a26c83e3ad0e8..19c350afbf1776a5c42cd026143bf1dbe50ec95c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -254,10 +254,9 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
 
   // Don't emit locations that cannot be expressed without DW_OP_stack_value.
   if (DwarfVersion < 4)
-    if (std::any_of(ExprCursor.begin(), ExprCursor.end(),
-                    [](DIExpression::ExprOperand Op) -> bool {
-                      return Op.getOp() == dwarf::DW_OP_stack_value;
-                    })) {
+    if (any_of(ExprCursor, [](DIExpression::ExprOperand Op) -> bool {
+          return Op.getOp() == dwarf::DW_OP_stack_value;
+        })) {
       DwarfRegs.clear();
       LocationKind = Unknown;
       return false;
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 0ab9ea87c232bc3457a186312d39d5b07781abc5..4e410bb49beab02e459d05c33eacf058fc4405fb 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -109,3 +109,10 @@ void DwarfFile::addScopeLabel(LexicalScope *LS, DbgLabel *Label) {
   SmallVectorImpl<DbgLabel *> &Labels = ScopeLabels[LS];
   Labels.push_back(Label);
 }
+
+std::pair<uint32_t, RangeSpanList *>
+DwarfFile::addRange(const DwarfCompileUnit &CU, SmallVector<RangeSpan, 2> R) {
+  CURangeLists.push_back(
+      RangeSpanList(Asm->createTempSymbol("debug_ranges"), CU, std::move(R)));
+  return std::make_pair(CURangeLists.size() - 1, &CURangeLists.back());
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index c315f44a8d837bfe9d1239ad6516d252dc1821ce..51acca8c1e53424e5a710813c525be0d7359e618 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -32,6 +32,36 @@ class DwarfUnit;
 class LexicalScope;
 class MCSection;
 
+// Data structure to hold a range for range lists.
+class RangeSpan {
+public:
+  RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {}
+  const MCSymbol *getStart() const { return Start; }
+  const MCSymbol *getEnd() const { return End; }
+  void setEnd(const MCSymbol *E) { End = E; }
+
+private:
+  const MCSymbol *Start, *End;
+};
+
+class RangeSpanList {
+private:
+  // Index for locating within the debug_range section this particular span.
+  MCSymbol *RangeSym;
+  const DwarfCompileUnit *CU;
+  // List of ranges.
+  SmallVector<RangeSpan, 2> Ranges;
+
+public:
+  RangeSpanList(MCSymbol *Sym, const DwarfCompileUnit &CU,
+                SmallVector<RangeSpan, 2> Ranges)
+      : RangeSym(Sym), CU(&CU), Ranges(std::move(Ranges)) {}
+  MCSymbol *getSym() const { return RangeSym; }
+  const DwarfCompileUnit &getCU() const { return *CU; }
+  const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
+  void addRange(RangeSpan Range) { Ranges.push_back(Range); }
+};
+
 class DwarfFile {
   // Target of Dwarf emission, used for sizing of abbreviations.
   AsmPrinter *Asm;
@@ -46,6 +76,10 @@ class DwarfFile {
 
   DwarfStringPool StrPool;
 
+  // List of range lists for a given compile unit, separate from the ranges for
+  // the CU itself.
+  SmallVector<RangeSpanList, 1> CURangeLists;
+
   /// DWARF v5: The symbol that designates the start of the contribution to
   /// the string offsets table. The contribution is shared by all units.
   MCSymbol *StringOffsetsStartSym = nullptr;
@@ -54,6 +88,10 @@ class DwarfFile {
   /// The table is shared by all units.
   MCSymbol *RnglistsTableBaseSym = nullptr;
 
+  /// DWARF v5: The symbol that designates the base of the locations list table.
+  /// The table is shared by all units.
+  MCSymbol *LoclistsTableBaseSym = nullptr;
+
   /// The variables of a lexical scope.
   struct ScopeVars {
     /// We need to sort Args by ArgNo and check for duplicates. This could also
@@ -84,6 +122,14 @@ public:
     return CUs;
   }
 
+  std::pair<uint32_t, RangeSpanList *> addRange(const DwarfCompileUnit &CU,
+                                                SmallVector<RangeSpan, 2> R);
+
+  /// getRangeLists - Get the vector of range lists.
+  const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
+    return CURangeLists;
+  }
+
   /// Compute the size and offset of a DIE given an incoming Offset.
   unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
 
@@ -118,13 +164,14 @@ public:
   DwarfStringPool &getStringPool() { return StrPool; }
 
   MCSymbol *getStringOffsetsStartSym() const { return StringOffsetsStartSym; }
-
   void setStringOffsetsStartSym(MCSymbol *Sym) { StringOffsetsStartSym = Sym; }
 
   MCSymbol *getRnglistsTableBaseSym() const { return RnglistsTableBaseSym; }
-
   void setRnglistsTableBaseSym(MCSymbol *Sym) { RnglistsTableBaseSym = Sym; }
 
+  MCSymbol *getLoclistsTableBaseSym() const { return LoclistsTableBaseSym; }
+  void setLoclistsTableBaseSym(MCSymbol *Sym) { LoclistsTableBaseSym = Sym; }
+
   /// \returns false if the variable was merged with a previous one.
   bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 14e59c3df2701d246f45c8afe447d47acbf9e082..2053395808f1bcf1f16e5fb2d349a8eeb4d165ee 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -315,14 +315,21 @@ unsigned DwarfTypeUnit::getOrCreateSourceID(const DIFile *File) {
 }
 
 void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
-  if (!DD->useSplitDwarf()) {
-    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Die, dwarf::DW_FORM_udata, Sym);
-  } else {
+  if (DD->getDwarfVersion() >= 5) {
+    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addrx);
+    addUInt(Die, dwarf::DW_FORM_addrx, DD->getAddressPool().getIndex(Sym));
+    return;
+  }
+
+  if (DD->useSplitDwarf()) {
     addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
     addUInt(Die, dwarf::DW_FORM_GNU_addr_index,
             DD->getAddressPool().getIndex(Sym));
+    return;
   }
+
+  addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+  addLabel(Die, dwarf::DW_FORM_udata, Sym);
 }
 
 void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute,
@@ -1649,9 +1656,20 @@ void DwarfUnit::addRnglistsBase() {
                   TLOF.getDwarfRnglistsSection()->getBeginSymbol());
 }
 
+void DwarfUnit::addLoclistsBase() {
+  assert(DD->getDwarfVersion() >= 5 &&
+         "DW_AT_loclists_base requires DWARF version 5 or later");
+  const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+  addSectionLabel(getUnitDie(), dwarf::DW_AT_loclists_base,
+                  DU->getLoclistsTableBaseSym(),
+                  TLOF.getDwarfLoclistsSection()->getBeginSymbol());
+}
+
 void DwarfUnit::addAddrTableBase() {
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   MCSymbol *Label = DD->getAddressPool().getLabel();
-  addSectionLabel(getUnitDie(), dwarf::DW_AT_GNU_addr_base, Label,
-                  TLOF.getDwarfAddrSection()->getBeginSymbol());
+  addSectionLabel(getUnitDie(),
+                  getDwarfVersion() >= 5 ? dwarf::DW_AT_addr_base
+                                         : dwarf::DW_AT_GNU_addr_base,
+                  Label, TLOF.getDwarfAddrSection()->getBeginSymbol());
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 6e2bd273cb6d4823b29e304d7effea5d1fb62e0b..860d165318430a9611bd09a569b41fbc2c1a9498 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -35,33 +35,6 @@ class ConstantFP;
 class DbgVariable;
 class DwarfCompileUnit;
 
-// Data structure to hold a range for range lists.
-class RangeSpan {
-public:
-  RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {}
-  const MCSymbol *getStart() const { return Start; }
-  const MCSymbol *getEnd() const { return End; }
-  void setEnd(const MCSymbol *E) { End = E; }
-
-private:
-  const MCSymbol *Start, *End;
-};
-
-class RangeSpanList {
-private:
-  // Index for locating within the debug_range section this particular span.
-  MCSymbol *RangeSym;
-  // List of ranges.
-  SmallVector<RangeSpan, 2> Ranges;
-
-public:
-  RangeSpanList(MCSymbol *Sym, SmallVector<RangeSpan, 2> Ranges)
-      : RangeSym(Sym), Ranges(std::move(Ranges)) {}
-  MCSymbol *getSym() const { return RangeSym; }
-  const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
-  void addRange(RangeSpan Range) { Ranges.push_back(Range); }
-};
-
 //===----------------------------------------------------------------------===//
 /// This dwarf writer support class manages information associated with a
 /// source file.
@@ -299,6 +272,9 @@ public:
   /// Add the DW_AT_rnglists_base attribute to the unit DIE.
   void addRnglistsBase();
 
+  /// Add the DW_AT_loclists_base attribute to the unit DIE.
+  void addLoclistsBase();
+
   /// Add the DW_AT_addr_base attribute to the unit DIE.
   void addAddrTableBase();
 
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index be04b9a6e8c85d3cf2c4199c0a1ea323d3e49023..7599121de2b06c9c84c1c598b008b384e5aece63 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -345,7 +345,9 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
 ///     unwound and handling continues.
 ///  3. Type ID table contains references to all the C++ typeinfo for all
 ///     catches in the function.  This tables is reverse indexed base 1.
-void EHStreamer::emitExceptionTable() {
+///
+/// Returns the starting symbol of an exception table.
+MCSymbol *EHStreamer::emitExceptionTable() {
   const MachineFunction *MF = Asm->MF;
   const std::vector<const GlobalValue *> &TypeInfos = MF->getTypeInfos();
   const std::vector<unsigned> &FilterIds = MF->getFilterIds();
@@ -375,6 +377,7 @@ void EHStreamer::emitExceptionTable() {
   computeCallSiteTable(CallSites, LandingPads, FirstActions);
 
   bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+  bool IsWasm = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::Wasm;
   unsigned CallSiteEncoding =
       IsSJLJ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_uleb128;
   bool HaveTTData = !TypeInfos.empty() || !FilterIds.empty();
@@ -457,8 +460,8 @@ void EHStreamer::emitExceptionTable() {
   Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
   Asm->OutStreamer->EmitLabel(CstBeginLabel);
 
-  // SjLj Exception handling
-  if (IsSJLJ) {
+  // SjLj / Wasm Exception handling
+  if (IsSJLJ || IsWasm) {
     unsigned idx = 0;
     for (SmallVectorImpl<CallSiteEntry>::const_iterator
          I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
@@ -604,6 +607,7 @@ void EHStreamer::emitExceptionTable() {
   }
 
   Asm->EmitAlignment(2);
+  return GCCETSym;
 }
 
 void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index b89421a1e0675dbdbda18f0d0cee39dbcb047816..e3a6f8e9d587268e821f716a0ef2871ec04c5745 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -85,9 +85,10 @@ protected:
   /// zero for the landing pad and the action.  Calls marked 'nounwind' have
   /// no entry and must not be contained in the try-range of any entry - they
   /// form gaps in the table.  Entries must be ordered by try-range address.
-  void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                            const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
-                            const SmallVectorImpl<unsigned> &FirstActions);
+  virtual void computeCallSiteTable(
+      SmallVectorImpl<CallSiteEntry> &CallSites,
+      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+      const SmallVectorImpl<unsigned> &FirstActions);
 
   /// Emit landing pads and actions.
   ///
@@ -108,7 +109,9 @@ protected:
   ///     found the frame is unwound and handling continues.
   ///  3. Type id table contains references to all the C++ typeinfo for all
   ///     catches in the function.  This tables is reversed indexed base 1.
-  void emitExceptionTable();
+  ///
+  /// Returns the starting symbol of an exception table.
+  MCSymbol *emitExceptionTable();
 
   virtual void emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel);
 
diff --git a/lib/CodeGen/AsmPrinter/WasmException.cpp b/lib/CodeGen/AsmPrinter/WasmException.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..46745d08c9f3b7103de89ed30ddf05833c8c4cfc
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -0,0 +1,81 @@
+//===-- CodeGen/AsmPrinter/WasmException.cpp - Wasm Exception Impl --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WasmException.h"
+#include "llvm/MC/MCStreamer.h"
+using namespace llvm;
+
+void WasmException::markFunctionEnd() {
+  // Get rid of any dead landing pads.
+  if (!Asm->MF->getLandingPads().empty()) {
+    auto *NonConstMF = const_cast<MachineFunction *>(Asm->MF);
+    // Wasm does not set BeginLabel and EndLabel information for landing pads,
+    // so we should set the second argument false.
+    NonConstMF->tidyLandingPads(nullptr, /* TidyIfNoBeginLabels */ false);
+  }
+}
+
+void WasmException::endFunction(const MachineFunction *MF) {
+  bool ShouldEmitExceptionTable = false;
+  for (const LandingPadInfo &Info : MF->getLandingPads()) {
+    if (MF->hasWasmLandingPadIndex(Info.LandingPadBlock)) {
+      ShouldEmitExceptionTable = true;
+      break;
+    }
+  }
+  if (!ShouldEmitExceptionTable)
+    return;
+  MCSymbol *LSDALabel = emitExceptionTable();
+  assert(LSDALabel && ".GCC_exception_table has not been emitted!");
+
+  // Wasm requires every data section symbol to have a .size set. So we emit an
+  // end marker and set the size as the difference between the start end the end
+  // marker.
+  MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end");
+  Asm->OutStreamer->EmitLabel(LSDAEndLabel);
+  MCContext &OutContext = Asm->OutStreamer->getContext();
+  const MCExpr *SizeExp = MCBinaryExpr::createSub(
+      MCSymbolRefExpr::create(LSDAEndLabel, OutContext),
+      MCSymbolRefExpr::create(LSDALabel, OutContext), OutContext);
+  Asm->OutStreamer->emitELFSize(LSDALabel, SizeExp);
+}
+
+// Compute the call-site table for wasm EH. Even though we use the same function
+// name to share the common routines, a call site entry in the table corresponds
+// to not a call site for possibly-throwing functions but a landing pad. In wasm
+// EH the VM is responsible for stack unwinding. After an exception occurs and
+// the stack is unwound, the control flow is transferred to wasm 'catch'
+// instruction by the VM, after which the personality function is called from
+// the compiler-generated code. Refer to WasmEHPrepare pass for more
+// information.
+void WasmException::computeCallSiteTable(
+    SmallVectorImpl<CallSiteEntry> &CallSites,
+    const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+    const SmallVectorImpl<unsigned> &FirstActions) {
+  MachineFunction &MF = *Asm->MF;
+  for (unsigned I = 0, N = LandingPads.size(); I < N; ++I) {
+    const LandingPadInfo *Info = LandingPads[I];
+    MachineBasicBlock *LPad = Info->LandingPadBlock;
+    // We don't emit LSDA for single catch (...).
+    if (!MF.hasWasmLandingPadIndex(LPad))
+      continue;
+    // Wasm EH must maintain the EH pads in the order assigned to them by the
+    // WasmEHPrepare pass.
+    unsigned LPadIndex = MF.getWasmLandingPadIndex(LPad);
+    CallSiteEntry Site = {nullptr, nullptr, Info, FirstActions[I]};
+    if (CallSites.size() < LPadIndex + 1)
+      CallSites.resize(LPadIndex + 1);
+    CallSites[LPadIndex] = Site;
+  }
+}
diff --git a/lib/CodeGen/AsmPrinter/WasmException.h b/lib/CodeGen/AsmPrinter/WasmException.h
new file mode 100644
index 0000000000000000000000000000000000000000..09a9a25ce8d0027564e8867f3b6e55801014df68
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/WasmException.h
@@ -0,0 +1,42 @@
+//===-- WasmException.h - Wasm Exception Framework -------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing WebAssembly exception info into asm
+// files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WASMEXCEPTION_H
+
+#include "EHStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+
+namespace llvm {
+
+class LLVM_LIBRARY_VISIBILITY WasmException : public EHStreamer {
+public:
+  WasmException(AsmPrinter *A) : EHStreamer(A) {}
+
+  void endModule() override {}
+  void beginFunction(const MachineFunction *MF) override {}
+  virtual void markFunctionEnd() override;
+  void endFunction(const MachineFunction *MF) override;
+
+protected:
+  // Compute the call site table for wasm EH.
+  void computeCallSiteTable(
+      SmallVectorImpl<CallSiteEntry> &CallSites,
+      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+      const SmallVectorImpl<unsigned> &FirstActions) override;
+};
+
+} // End of namespace llvm
+
+#endif
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index eff73a58d8d2d6857345242edccb678331198908..2a97a2fde43d1520752c2fe132e25b004c538375 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -42,6 +42,7 @@ WinException::WinException(AsmPrinter *A) : EHStreamer(A) {
   // MSVC's EH tables are always composed of 32-bit words.  All known 64-bit
   // platforms use an imagerel32 relocation to refer to symbols.
   useImageRel32 = (A->getDataLayout().getPointerSizeInBits() == 64);
+  isAArch64 = Asm->TM.getTargetTriple().isAArch64();
 }
 
 WinException::~WinException() {}
@@ -242,6 +243,17 @@ void WinException::endFunclet() {
     if (F.hasPersonalityFn())
       Per = classifyEHPersonality(F.getPersonalityFn()->stripPointerCasts());
 
+    // On funclet exit, we emit a fake "function" end marker, so that the call
+    // to EmitWinEHHandlerData below can calculate the size of the funclet or
+    // function.
+    if (isAArch64) {
+      Asm->OutStreamer->SwitchSection(CurrentFuncletTextSection);
+      Asm->OutStreamer->EmitWinCFIFuncletOrFuncEnd();
+      MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection(
+          Asm->OutStreamer->getCurrentSectionOnly());
+      Asm->OutStreamer->SwitchSection(XData);
+    }
+
     // Emit an UNWIND_INFO struct describing the prologue.
     Asm->OutStreamer->EmitWinEHHandlerData();
 
@@ -286,7 +298,10 @@ const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
   return create32bitRef(Asm->getSymbol(GV));
 }
 
-const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
+const MCExpr *WinException::getLabel(const MCSymbol *Label) {
+  if (isAArch64)
+    return MCSymbolRefExpr::create(Label, MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                   Asm->OutContext);
   return MCBinaryExpr::createAdd(create32bitRef(Label),
                                  MCConstantExpr::create(1, Asm->OutContext),
                                  Asm->OutContext);
@@ -588,7 +603,6 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
                                           const MCSymbol *EndLabel, int State) {
   auto &OS = *Asm->OutStreamer;
   MCContext &Ctx = Asm->OutContext;
-
   bool VerboseAsm = OS.isVerboseAsm();
   auto AddComment = [&](const Twine &Comment) {
     if (VerboseAsm)
@@ -613,9 +627,9 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
     }
 
     AddComment("LabelStart");
-    OS.EmitValue(getLabelPlusOne(BeginLabel), 4);
+    OS.EmitValue(getLabel(BeginLabel), 4);
     AddComment("LabelEnd");
-    OS.EmitValue(getLabelPlusOne(EndLabel), 4);
+    OS.EmitValue(getLabel(EndLabel), 4);
     AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
                                                              : "CatchAll");
     OS.EmitValue(FilterOrFinally, 4);
@@ -799,7 +813,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
       //   TypeDescriptor *Type;
       //   int32_t         CatchObjOffset;
       //   void          (*Handler)();
-      //   int32_t         ParentFrameOffset; // x64 only
+      //   int32_t         ParentFrameOffset; // x64 and AArch64 only
       // };
       OS.EmitLabel(HandlerMapXData);
       for (const WinEHHandlerType &HT : TBME.HandlerArray) {
@@ -901,7 +915,7 @@ void WinException::computeIP2StateTable(
         ChangeLabel = StateChange.PreviousEndLabel;
       // Emit an entry indicating that PCs after 'Label' have this EH state.
       IPToStateTable.push_back(
-          std::make_pair(getLabelPlusOne(ChangeLabel), StateChange.NewState));
+          std::make_pair(getLabel(ChangeLabel), StateChange.NewState));
       // FIXME: assert that NewState is between CatchLow and CatchHigh.
     }
   }
diff --git a/lib/CodeGen/AsmPrinter/WinException.h b/lib/CodeGen/AsmPrinter/WinException.h
index eed3c4453ffc8a2425c3bdf45abc45c92b3a188a..728cde3b250279b11ca871b2e4db4cdfc7799a7e 100644
--- a/lib/CodeGen/AsmPrinter/WinException.h
+++ b/lib/CodeGen/AsmPrinter/WinException.h
@@ -38,6 +38,9 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
   /// True if this is a 64-bit target and we should use image relative offsets.
   bool useImageRel32 = false;
 
+  /// True if we are generating exception handling on Windows for ARM64.
+  bool isAArch64 = false;
+
   /// Pointer to the current funclet entry BB.
   const MachineBasicBlock *CurrentFuncletEntry = nullptr;
 
@@ -72,7 +75,7 @@ class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
 
   const MCExpr *create32bitRef(const MCSymbol *Value);
   const MCExpr *create32bitRef(const GlobalValue *GV);
-  const MCExpr *getLabelPlusOne(const MCSymbol *Label);
+  const MCExpr *getLabel(const MCSymbol *Label);
   const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
   const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
                                  const MCSymbol *OffsetFrom);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index dfbfae85a86376e6839537c20796a19c9cf5611a..651873bb91182a1636f9f7278f6828c995c47bc1 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -321,6 +321,24 @@ class TypePromotionTransaction;
     }
 
   private:
+    template <typename F>
+    void resetIteratorIfInvalidatedWhileCalling(BasicBlock *BB, F f) {
+      // Substituting can cause recursive simplifications, which can invalidate
+      // our iterator.  Use a WeakTrackingVH to hold onto it in case this
+      // happens.
+      Value *CurValue = &*CurInstIterator;
+      WeakTrackingVH IterHandle(CurValue);
+
+      f();
+
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      if (IterHandle != CurValue) {
+        CurInstIterator = BB->begin();
+        SunkAddrs.clear();
+      }
+    }
+
     bool eliminateFallThrough(Function &F);
     bool eliminateMostlyEmptyBlocks(Function &F);
     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
@@ -436,11 +454,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
-    SeenChainsForSExt.clear();
-    ValToSExtendedUses.clear();
-    RemovedInsts.clear();
-    LargeOffsetGEPMap.clear();
-    LargeOffsetGEPID.clear();
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = &*I++;
       bool ModifiedDTOnIteration = false;
@@ -460,6 +473,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
       I->deleteValue();
 
     EverMadeChange |= MadeChange;
+    SeenChainsForSExt.clear();
+    ValToSExtendedUses.clear();
+    RemovedInsts.clear();
+    LargeOffsetGEPMap.clear();
+    LargeOffsetGEPID.clear();
   }
 
   SunkAddrs.clear();
@@ -1690,21 +1708,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
       // Lower all uses of llvm.objectsize.*
       ConstantInt *RetVal =
           lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);
-      // Substituting this can cause recursive simplifications, which can
-      // invalidate our iterator.  Use a WeakTrackingVH to hold onto it in case
-      // this
-      // happens.
-      Value *CurValue = &*CurInstIterator;
-      WeakTrackingVH IterHandle(CurValue);
-
-      replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
 
-      // If the iterator instruction was recursively deleted, start over at the
-      // start of the block.
-      if (IterHandle != CurValue) {
-        CurInstIterator = BB->begin();
-        SunkAddrs.clear();
-      }
+      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
+        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
+      });
+      return true;
+    }
+    case Intrinsic::is_constant: {
+      // If is_constant hasn't folded away yet, lower it to false now.
+      Constant *RetVal = ConstantInt::get(II->getType(), 0);
+      resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
+        replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
+      });
       return true;
     }
     case Intrinsic::aarch64_stlxr:
@@ -1721,11 +1736,22 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
       return true;
     }
     case Intrinsic::launder_invariant_group:
-    case Intrinsic::strip_invariant_group:
-      II->replaceAllUsesWith(II->getArgOperand(0));
+    case Intrinsic::strip_invariant_group: {
+      Value *ArgVal = II->getArgOperand(0);
+      auto it = LargeOffsetGEPMap.find(II);
+      if (it != LargeOffsetGEPMap.end()) {
+          // Merge entries in LargeOffsetGEPMap to reflect the RAUW.
+          // Make sure not to have to deal with iterator invalidation
+          // after possibly adding ArgVal to LargeOffsetGEPMap.
+          auto GEPs = std::move(it->second);
+          LargeOffsetGEPMap[ArgVal].append(GEPs.begin(), GEPs.end());
+          LargeOffsetGEPMap.erase(II);
+      }
+
+      II->replaceAllUsesWith(ArgVal);
       II->eraseFromParent();
       return true;
-
+    }
     case Intrinsic::cttz:
     case Intrinsic::ctlz:
       // If counting zeros is expensive, try to avoid it.
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 709965ba1515fe66bf15d3551037476940ccd1b2..ef090777726c52908e1047cdb9e31d307a18b508 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -104,6 +104,36 @@ IRTranslator::IRTranslator() : MachineFunctionPass(ID) {
   initializeIRTranslatorPass(*PassRegistry::getPassRegistry());
 }
 
+#ifndef NDEBUG
+/// Verify that every instruction created has the same DILocation as the
+/// instruction being translated.
+class DILocationVerifier : MachineFunction::Delegate {
+  MachineFunction &MF;
+  const Instruction *CurrInst = nullptr;
+
+public:
+  DILocationVerifier(MachineFunction &MF) : MF(MF) { MF.setDelegate(this); }
+  ~DILocationVerifier() { MF.resetDelegate(this); }
+
+  const Instruction *getCurrentInst() const { return CurrInst; }
+  void setCurrentInst(const Instruction *Inst) { CurrInst = Inst; }
+
+  void MF_HandleInsertion(const MachineInstr &MI) override {
+    assert(getCurrentInst() && "Inserted instruction without a current MI");
+
+    // Only print the check message if we're actually checking it.
+#ifndef NDEBUG
+    LLVM_DEBUG(dbgs() << "Checking DILocation from " << *CurrInst
+                      << " was copied to " << MI);
+#endif
+    assert(CurrInst->getDebugLoc() == MI.getDebugLoc() &&
+           "Line info was not transferred to all instructions");
+  }
+  void MF_HandleRemoval(const MachineInstr &MI) override {}
+};
+#endif // ifndef NDEBUG
+
+
 void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<StackProtector>();
   AU.addRequired<TargetPassConfig>();
@@ -917,6 +947,11 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0);
     return true;
   }
+  case Intrinsic::is_constant:
+    // If this wasn't constant-folded away by now, then it's not a
+    // constant.
+    MIRBuilder.buildConstant(getOrCreateVReg(CI), 0);
+    return true;
   case Intrinsic::stackguard:
     getStackGuard(getOrCreateVReg(CI), MIRBuilder);
     return true;
@@ -1330,7 +1365,22 @@ bool IRTranslator::translateExtractElement(const User &U,
   }
   unsigned Res = getOrCreateVReg(U);
   unsigned Val = getOrCreateVReg(*U.getOperand(0));
-  unsigned Idx = getOrCreateVReg(*U.getOperand(1));
+  const auto &TLI = *MF->getSubtarget().getTargetLowering();
+  unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits();
+  unsigned Idx = 0;
+  if (auto *CI = dyn_cast<ConstantInt>(U.getOperand(1))) {
+    if (CI->getBitWidth() != PreferredVecIdxWidth) {
+      APInt NewIdx = CI->getValue().sextOrTrunc(PreferredVecIdxWidth);
+      auto *NewIdxCI = ConstantInt::get(CI->getContext(), NewIdx);
+      Idx = getOrCreateVReg(*NewIdxCI);
+    }
+  }
+  if (!Idx)
+    Idx = getOrCreateVReg(*U.getOperand(1));
+  if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
+    const LLT &VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+    Idx = MIRBuilder.buildSExtOrTrunc(VecIdxTy, Idx)->getOperand(0).getReg();
+  }
   MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
   return true;
 }
@@ -1453,9 +1503,16 @@ bool IRTranslator::translateAtomicRMW(const User &U,
 }
 
 void IRTranslator::finishPendingPhis() {
+#ifndef NDEBUG
+  DILocationVerifier Verifier(*MF);
+#endif // ifndef NDEBUG
   for (auto &Phi : PendingPHIs) {
     const PHINode *PI = Phi.first;
     ArrayRef<MachineInstr *> ComponentPHIs = Phi.second;
+    EntryBuilder.setDebugLoc(PI->getDebugLoc());
+#ifndef NDEBUG
+    Verifier.setCurrentInst(PI);
+#endif // ifndef NDEBUG
 
     // All MachineBasicBlocks exist, add them to the PHI. We assume IRTranslator
     // won't create extra control flow here, otherwise we need to find the
@@ -1494,6 +1551,7 @@ bool IRTranslator::valueIsSplit(const Value &V,
 
 bool IRTranslator::translate(const Instruction &Inst) {
   CurBuilder.setDebugLoc(Inst.getDebugLoc());
+  EntryBuilder.setDebugLoc(Inst.getDebugLoc());
   switch(Inst.getOpcode()) {
 #define HANDLE_INST(NUM, OPCODE, CLASS) \
     case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);
@@ -1669,31 +1727,39 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   }
 
   // Need to visit defs before uses when translating instructions.
-  ReversePostOrderTraversal<const Function *> RPOT(&F);
-  for (const BasicBlock *BB : RPOT) {
-    MachineBasicBlock &MBB = getMBB(*BB);
-    // Set the insertion point of all the following translations to
-    // the end of this basic block.
-    CurBuilder.setMBB(MBB);
-
-    for (const Instruction &Inst : *BB) {
-      if (translate(Inst))
-        continue;
-
-      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
-                                 Inst.getDebugLoc(), BB);
-      R << "unable to translate instruction: " << ore::NV("Opcode", &Inst);
-
-      if (ORE->allowExtraAnalysis("gisel-irtranslator")) {
-        std::string InstStrStorage;
-        raw_string_ostream InstStr(InstStrStorage);
-        InstStr << Inst;
+  {
+    ReversePostOrderTraversal<const Function *> RPOT(&F);
+#ifndef NDEBUG
+    DILocationVerifier Verifier(*MF);
+#endif // ifndef NDEBUG
+    for (const BasicBlock *BB : RPOT) {
+      MachineBasicBlock &MBB = getMBB(*BB);
+      // Set the insertion point of all the following translations to
+      // the end of this basic block.
+      CurBuilder.setMBB(MBB);
+
+      for (const Instruction &Inst : *BB) {
+#ifndef NDEBUG
+        Verifier.setCurrentInst(&Inst);
+#endif // ifndef NDEBUG
+        if (translate(Inst))
+          continue;
+
+        OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                   Inst.getDebugLoc(), BB);
+        R << "unable to translate instruction: " << ore::NV("Opcode", &Inst);
+
+        if (ORE->allowExtraAnalysis("gisel-irtranslator")) {
+          std::string InstStrStorage;
+          raw_string_ostream InstStr(InstStrStorage);
+          InstStr << Inst;
+
+          R << ": '" << InstStr.str() << "'";
+        }
 
-        R << ": '" << InstStr.str() << "'";
+        reportTranslationError(*MF, *TPC, *ORE, R);
+        return false;
       }
-
-      reportTranslationError(*MF, *TPC, *ORE, R);
-      return false;
     }
   }
 
diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 5e77fcbb0ed9337dd57464db82ea3c49c7ad73af..38913e4afcba30228438017f879983489490abf5 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -80,5 +80,5 @@ bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI,
     return true;
 
   return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() &&
-         MI.implicit_operands().begin() == MI.implicit_operands().end();
+         empty(MI.implicit_operands());
 }
diff --git a/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 344f573a67f5e25c20eb75ea9e2c4f02fa08415c..94eab9ae00c87c8e074bdfc29146ea00433f5993 100644
--- a/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -45,7 +45,7 @@ LegalityPredicate LegalityPredicates::typePairAndMemSizeInSet(
   SmallVector<TypePairAndMemSize, 4> TypesAndMemSize = TypesAndMemSizeInit;
   return [=](const LegalityQuery &Query) {
     TypePairAndMemSize Match = {Query.Types[TypeIdx0], Query.Types[TypeIdx1],
-                                Query.MMODescrs[MMOIdx].Size};
+                                Query.MMODescrs[MMOIdx].SizeInBits};
     return std::find(TypesAndMemSize.begin(), TypesAndMemSize.end(), Match) !=
            TypesAndMemSize.end();
   };
@@ -82,7 +82,7 @@ LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
 
 LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
   return [=](const LegalityQuery &Query) {
-    return !isPowerOf2_32(Query.MMODescrs[MMOIdx].Size /* In Bytes */);
+    return !isPowerOf2_32(Query.MMODescrs[MMOIdx].SizeInBits / 8);
   };
 }
 
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ff2e61c03b47e03c50e237347737d30be153dee7..516f5ce4343e7c34efb2279e7fdef00491cd0417 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -467,12 +467,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
       unsigned SrcReg = 0;
       unsigned Adjustment = i * NarrowSize / 8;
+      unsigned Alignment = MinAlign(MMO.getAlignment(), Adjustment);
 
       MachineMemOperand *SplitMMO = MIRBuilder.getMF().getMachineMemOperand(
           MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
-          NarrowSize / 8, i == 0 ? MMO.getAlignment() : NarrowSize / 8,
-          MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
-          MMO.getOrdering(), MMO.getFailureOrdering());
+          NarrowSize / 8, Alignment, MMO.getAAInfo(), MMO.getRanges(),
+          MMO.getSyncScopeID(), MMO.getOrdering(), MMO.getFailureOrdering());
 
       MIRBuilder.materializeGEP(SrcReg, MI.getOperand(1).getReg(), OffsetTy,
                                 Adjustment);
@@ -509,12 +509,12 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     for (int i = 0; i < NumParts; ++i) {
       unsigned DstReg = 0;
       unsigned Adjustment = i * NarrowSize / 8;
+      unsigned Alignment = MinAlign(MMO.getAlignment(), Adjustment);
 
       MachineMemOperand *SplitMMO = MIRBuilder.getMF().getMachineMemOperand(
           MMO.getPointerInfo().getWithOffset(Adjustment), MMO.getFlags(),
-          NarrowSize / 8, i == 0 ? MMO.getAlignment() : NarrowSize / 8,
-          MMO.getAAInfo(), MMO.getRanges(), MMO.getSyncScopeID(),
-          MMO.getOrdering(), MMO.getFailureOrdering());
+          NarrowSize / 8, Alignment, MMO.getAAInfo(), MMO.getRanges(),
+          MMO.getSyncScopeID(), MMO.getOrdering(), MMO.getFailureOrdering());
 
       MIRBuilder.materializeGEP(DstReg, MI.getOperand(1).getReg(), OffsetTy,
                                 Adjustment);
@@ -878,6 +878,12 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     MIRBuilder.recordInsertion(&MI);
     return Legalized;
   }
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
+    if (TypeIdx != 2)
+      return UnableToLegalize;
+    widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
+    MIRBuilder.recordInsertion(&MI);
+    return Legalized;
   }
 }
 
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 1bfede097bd111be77f39595ee1b67e3f060e6c2..ca776de0a0fe0d9e69ec90307fb580cfe60465e7 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -51,7 +51,7 @@ raw_ostream &LegalityQuery::print(raw_ostream &OS) const {
 
   OS << Opcode << ", MMOs={";
   for (const auto &MMODescr : MMODescrs) {
-    OS << MMODescr.Size << ", ";
+    OS << MMODescr.SizeInBits << ", ";
   }
   OS << "}";
 
@@ -298,8 +298,7 @@ LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(
     std::initializer_list<unsigned> Opcodes) {
   unsigned Representative = *Opcodes.begin();
 
-  assert(Opcodes.begin() != Opcodes.end() &&
-         Opcodes.begin() + 1 != Opcodes.end() &&
+  assert(!empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() &&
          "Initializer list must have at least two opcodes");
 
   for (auto I = Opcodes.begin() + 1, E = Opcodes.end(); I != E; ++I)
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 9e2d48d1dc424fe09c0c8ca9215cb2baa83a2982..6bb48dc2e8aac8cef90628f54a512bd99bf7fdb1 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -140,7 +140,7 @@ bool RegBankSelect::repairReg(
     return false;
   assert(ValMapping.NumBreakDowns == 1 && "Not yet implemented");
   // An empty range of new register means no repairing.
-  assert(NewVRegs.begin() != NewVRegs.end() && "We should not have to repair");
+  assert(!empty(NewVRegs) && "We should not have to repair");
 
   // Assume we are repairing a use and thus, the original reg will be
   // the source of the repairing.
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index dd15567ef1c1bcf31433f99aea1538c8368bece6..28404e52d6ea9954bbb08ad7457df69c26968e5f 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -426,7 +426,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
            "This mapping is too complex for this function");
     iterator_range<SmallVectorImpl<unsigned>::const_iterator> NewRegs =
         OpdMapper.getVRegs(OpIdx);
-    if (NewRegs.begin() == NewRegs.end()) {
+    if (empty(NewRegs)) {
       LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
       continue;
     }
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index 1a5f88743d5f097cdabf2653f260c03eb2326f51..4d3a375355919833b45968145fdbcd05dd2ccadc 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -137,7 +137,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
   // If we can move an instruction, we can remove it.  Otherwise, it has
   // a side-effect of some sort.
   bool SawStore = false;
-  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore))
+  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore) && !MI.isPHI())
     return false;
 
   // Instructions without side-effects are dead iff they only define dead vregs.
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index f12d00071b2417310c5b085a25305f124216d448..5666626ab311f8837973cb892af2bbe2c96c6ebb 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -273,7 +273,7 @@ namespace {
     void PredicateBlock(BBInfo &BBI,
                         MachineBasicBlock::iterator E,
                         SmallVectorImpl<MachineOperand> &Cond,
-                        SmallSet<unsigned, 4> *LaterRedefs = nullptr);
+                        SmallSet<MCPhysReg, 4> *LaterRedefs = nullptr);
     void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool IgnoreBr = false);
@@ -1366,12 +1366,12 @@ static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) {
   // Before stepping forward past MI, remember which regs were live
   // before MI. This is needed to set the Undef flag only when reg is
   // dead.
-  SparseSet<unsigned> LiveBeforeMI;
+  SparseSet<MCPhysReg, identity<MCPhysReg>> LiveBeforeMI;
   LiveBeforeMI.setUniverse(TRI->getNumRegs());
   for (unsigned Reg : Redefs)
     LiveBeforeMI.insert(Reg);
 
-  SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Clobbers;
+  SmallVector<std::pair<MCPhysReg, const MachineOperand*>, 4> Clobbers;
   Redefs.stepForward(MI, Clobbers);
 
   // Now add the implicit uses for each of the clobbered values.
@@ -1740,7 +1740,7 @@ bool IfConverter::IfConvertDiamondCommon(
 
   if (MRI->tracksLiveness()) {
     for (const MachineInstr &MI : make_range(MBB1.begin(), DI1)) {
-      SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Dummy;
+      SmallVector<std::pair<MCPhysReg, const MachineOperand*>, 4> Dummy;
       Redefs.stepForward(MI, Dummy);
     }
   }
@@ -1806,13 +1806,13 @@ bool IfConverter::IfConvertDiamondCommon(
   // generate:
   //   sub    r0, r1, #1
   //   addne  r0, r1, #1
-  SmallSet<unsigned, 4> RedefsByFalse;
-  SmallSet<unsigned, 4> ExtUses;
+  SmallSet<MCPhysReg, 4> RedefsByFalse;
+  SmallSet<MCPhysReg, 4> ExtUses;
   if (TII->isProfitableToUnpredicate(MBB1, MBB2)) {
     for (const MachineInstr &FI : make_range(MBB2.begin(), DI2)) {
       if (FI.isDebugInstr())
         continue;
-      SmallVector<unsigned, 4> Defs;
+      SmallVector<MCPhysReg, 4> Defs;
       for (const MachineOperand &MO : FI.operands()) {
         if (!MO.isReg())
           continue;
@@ -1830,7 +1830,7 @@ bool IfConverter::IfConvertDiamondCommon(
         }
       }
 
-      for (unsigned Reg : Defs) {
+      for (MCPhysReg Reg : Defs) {
         if (!ExtUses.count(Reg)) {
           for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
                SubRegs.isValid(); ++SubRegs)
@@ -1976,7 +1976,7 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 }
 
 static bool MaySpeculate(const MachineInstr &MI,
-                         SmallSet<unsigned, 4> &LaterRedefs) {
+                         SmallSet<MCPhysReg, 4> &LaterRedefs) {
   bool SawStore = true;
   if (!MI.isSafeToMove(nullptr, SawStore))
     return false;
@@ -1999,7 +1999,7 @@ static bool MaySpeculate(const MachineInstr &MI,
 void IfConverter::PredicateBlock(BBInfo &BBI,
                                  MachineBasicBlock::iterator E,
                                  SmallVectorImpl<MachineOperand> &Cond,
-                                 SmallSet<unsigned, 4> *LaterRedefs) {
+                                 SmallSet<MCPhysReg, 4> *LaterRedefs) {
   bool AnyUnpred = false;
   bool MaySpec = LaterRedefs != nullptr;
   for (MachineInstr &I : make_range(BBI.BB->begin(), E)) {
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 034692de92da3806651a580ab03e205c46d64830..deb49a1ea4826acaacf55a12f05c56003d787258 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -344,11 +344,11 @@ ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
           return AR_MayAlias;
         continue;
       }
-      llvm::AliasResult AAResult = AA->alias(
-          MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
-                         MMO1->getAAInfo()),
-          MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
-                         MMO2->getAAInfo()));
+      llvm::AliasResult AAResult =
+          AA->alias(MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
+                                   MMO1->getAAInfo()),
+                    MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
+                                   MMO2->getAAInfo()));
       if (AAResult != NoAlias)
         return AR_MayAlias;
     }
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 90337903008e23ced5092a7b8b48b63b8af93af9..52e832cc38c1cd3e819d68915a341da2391c4bfe 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -95,29 +95,22 @@ LLVMTargetMachine::getTargetTransformInfo(const Function &F) {
 }
 
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
-static MCContext *
-addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
-                        bool DisableVerify, bool &WillCompleteCodeGenPipeline,
-                        raw_pwrite_stream &Out, MachineModuleInfo *MMI) {
+static TargetPassConfig *
+addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM,
+                        bool DisableVerify, MachineModuleInfo &MMI) {
   // Targets may override createPassConfig to provide a target-specific
   // subclass.
-  TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+  TargetPassConfig *PassConfig = TM.createPassConfig(PM);
   // Set PassConfig options provided by TargetMachine.
   PassConfig->setDisableVerify(DisableVerify);
-  WillCompleteCodeGenPipeline = PassConfig->willCompleteCodeGenPipeline();
   PM.add(PassConfig);
-  if (!MMI)
-    MMI = new MachineModuleInfo(TM);
-  PM.add(MMI);
+  PM.add(&MMI);
 
   if (PassConfig->addISelPasses())
     return nullptr;
   PassConfig->addMachinePasses();
   PassConfig->setInitialized();
-  if (!WillCompleteCodeGenPipeline)
-    PM.add(createPrintMIRPass(Out));
-
-  return &MMI->getContext();
+  return PassConfig;
 }
 
 bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
@@ -201,14 +194,16 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
                                             bool DisableVerify,
                                             MachineModuleInfo *MMI) {
   // Add common CodeGen passes.
-  bool WillCompleteCodeGenPipeline = true;
-  MCContext *Context = addPassesToGenerateCode(
-      this, PM, DisableVerify, WillCompleteCodeGenPipeline, Out, MMI);
-  if (!Context)
+  if (!MMI)
+    MMI = new MachineModuleInfo(this);
+  TargetPassConfig *PassConfig =
+      addPassesToGenerateCode(*this, PM, DisableVerify, *MMI);
+  if (!PassConfig)
     return true;
 
-  if (WillCompleteCodeGenPipeline &&
-      addAsmPrinter(PM, Out, DwoOut, FileType, *Context))
+  if (!TargetPassConfig::willCompleteCodeGenPipeline()) {
+    PM.add(createPrintMIRPass(Out));
+  } else if (addAsmPrinter(PM, Out, DwoOut, FileType, MMI->getContext()))
     return true;
 
   PM.add(createFreeMachineFunctionPass());
@@ -224,14 +219,15 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
                                           raw_pwrite_stream &Out,
                                           bool DisableVerify) {
   // Add common CodeGen passes.
-  bool WillCompleteCodeGenPipeline = true;
-  Ctx = addPassesToGenerateCode(this, PM, DisableVerify,
-                                WillCompleteCodeGenPipeline, Out,
-                                /*MachineModuleInfo*/ nullptr);
-  if (!Ctx)
+  MachineModuleInfo *MMI = new MachineModuleInfo(this);
+  TargetPassConfig *PassConfig =
+      addPassesToGenerateCode(*this, PM, DisableVerify, *MMI);
+  if (!PassConfig)
     return true;
-  assert(WillCompleteCodeGenPipeline && "CodeGen pipeline has been altered");
+  assert(TargetPassConfig::willCompleteCodeGenPipeline() &&
+         "Cannot emit MC with limited codegen pipeline");
 
+  Ctx = &MMI->getContext();
   if (Options.MCOptions.MCSaveTempLabels)
     Ctx->setAllowTemporaryLabels(false);
 
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index aa35880b063a7e90c9c3e3ed2fa9711b07dfeb93..0060399c2b04526f837a7fca1b2ab63778a198cd 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -39,13 +39,6 @@ public:
   LiveDebugVariables();
   ~LiveDebugVariables() override;
 
-  /// renameRegister - Move any user variables in OldReg to NewReg:SubIdx.
-  /// @param OldReg Old virtual register that is going away.
-  /// @param NewReg New register holding the user variables.
-  /// @param SubIdx If NewReg is a virtual register, SubIdx may indicate a sub-
-  ///               register.
-  void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
-
   /// splitRegister - Move any user variables in OldReg to the live ranges in
   /// NewRegs where they are live. Mark the values as unavailable where no new
   /// register is live.
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index 86c6c8e29f9a444ffa54a3c4ed51008e4199f957..619643acb6d37853988ef386997b18045ee74520 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -29,8 +29,8 @@ using namespace llvm;
 /// The clobbers set will be the list of live registers clobbered
 /// by the regmask.
 void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers) {
-  SparseSet<unsigned>::iterator LRI = LiveRegs.begin();
+    SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> *Clobbers) {
+  RegisterSet::iterator LRI = LiveRegs.begin();
   while (LRI != LiveRegs.end()) {
     if (MO.clobbersPhysReg(*LRI)) {
       if (Clobbers)
@@ -83,7 +83,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) {
 /// on accurate kill flags. If possible use stepBackward() instead of this
 /// function.
 void LivePhysRegs::stepForward(const MachineInstr &MI,
-        SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) {
+    SmallVectorImpl<std::pair<MCPhysReg, const MachineOperand*>> &Clobbers) {
   // Remove killed registers from the set.
   for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
     if (O->isReg() && !O->isDebug()) {
@@ -142,7 +142,7 @@ LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
 #endif
 
 bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
-                             unsigned Reg) const {
+                             MCPhysReg Reg) const {
   if (LiveRegs.count(Reg))
     return false;
   if (MRI.isReserved(Reg))
@@ -157,7 +157,7 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
 /// Add live-in registers of basic block \p MBB to \p LiveRegs.
 void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
   for (const auto &LI : MBB.liveins()) {
-    unsigned Reg = LI.PhysReg;
+    MCPhysReg Reg = LI.PhysReg;
     LaneBitmask Mask = LI.LaneMask;
     MCSubRegIndexIterator S(Reg, TRI);
     assert(Mask.any() && "Invalid livein mask");
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index 04324943dfad814d0b220da9dacfa7c24b26d957..70e135ab1aff98dfe631a2d2b62d61da1b39279d 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -364,7 +364,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
 #ifndef NDEBUG
     if (MBB->pred_empty()) {
       MBB->getParent()->verify();
-      errs() << "Use of " << printReg(PhysReg)
+      errs() << "Use of " << printReg(PhysReg, MRI->getTargetRegisterInfo())
              << " does not have a corresponding definition on every path:\n";
       const MachineInstr *MI = Indexes->getInstructionFromIndex(Use);
       if (MI != nullptr)
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index da758da873c81ff5a7b42d13db1194b21aefe794..1a6174bf9ee23a316ed4bd3670df27b6d806fa5c 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -752,6 +752,8 @@ bool MIParser::parse(MachineInstr *&MI) {
     Optional<unsigned> TiedDefIdx;
     if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx))
       return true;
+    if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg())
+      MO.setIsDebug();
     Operands.push_back(
         ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
     if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index 0102f1240a887cbc90f0b1170a97713386b0c5f1..00da92a92ec62326093102cd50b966ded9a426fc 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -355,6 +355,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
   if (YamlMF.Alignment)
     MF.setAlignment(YamlMF.Alignment);
   MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
+  MF.setHasWinCFI(YamlMF.HasWinCFI);
 
   if (YamlMF.Legalized)
     MF.getProperties().set(MachineFunctionProperties::Property::Legalized);
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 88e2f16d3fd3bbd4a0f6436aa8ff52f59a0c7481..8012946371531d5446b0e3b95259b2042a0385c9 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -196,6 +196,7 @@ void MIRPrinter::print(const MachineFunction &MF) {
   YamlMF.Name = MF.getName();
   YamlMF.Alignment = MF.getAlignment();
   YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
+  YamlMF.HasWinCFI = MF.hasWinCFI();
 
   YamlMF.Legalized = MF.getProperties().hasProperty(
       MachineFunctionProperties::Property::Legalized);
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index dcb6f7cca4f5892dccb86fe62787b3c33a76c4ba..6ee8571c28aab6453aa7570081cb44473958b520 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -235,6 +235,21 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
   return false;
 }
 
+static bool isCallerPreservedOrConstPhysReg(unsigned Reg,
+                                            const MachineFunction &MF,
+                                            const TargetRegisterInfo &TRI) {
+  // MachineRegisterInfo::isConstantPhysReg directly called by
+  // MachineRegisterInfo::isCallerPreservedOrConstPhysReg expects the
+  // reserved registers to be frozen. That doesn't cause a problem  post-ISel as
+  // most (if not all) targets freeze reserved registers right after ISel.
+  //
+  // It does cause issues mid-GlobalISel, however, hence the additional
+  // reservedRegsFrozen check.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  return TRI.isCallerPreservedPhysReg(Reg, MF) ||
+         (MRI.reservedRegsFrozen() && MRI.isConstantPhysReg(Reg));
+}
+
 /// hasLivePhysRegDefUses - Return true if the specified instruction read/write
 /// physical registers (except for dead defs of physical registers). It also
 /// returns the physical register def by reference if it's the only one and the
@@ -254,7 +269,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
     if (TargetRegisterInfo::isVirtualRegister(Reg))
       continue;
     // Reading either caller preserved or constant physregs is ok.
-    if (!MRI->isCallerPreservedOrConstPhysReg(Reg))
+    if (!isCallerPreservedOrConstPhysReg(Reg, *MI->getMF(), *TRI))
       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
         PhysRefs.insert(*AI);
   }
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index dfaa946c9134a2c3fe1aeac95b1e8e00930747fc..19879fe890078c51a6881c677b1823d2ce0425de 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -75,98 +75,109 @@ DEBUG_COUNTER(FwdCounter, "machine-cp-fwd",
 namespace {
 
 class CopyTracker {
-  using RegList = SmallVector<unsigned, 4>;
-  using SourceMap = DenseMap<unsigned, RegList>;
-  using Reg2MIMap = DenseMap<unsigned, MachineInstr *>;
+  struct CopyInfo {
+    MachineInstr *MI;
+    SmallVector<unsigned, 4> DefRegs;
+    bool Avail;
+  };
 
-  /// Def -> available copies map.
-  Reg2MIMap AvailCopyMap;
-
-  /// Def -> copies map.
-  Reg2MIMap CopyMap;
-
-  /// Src -> Def map
-  SourceMap SrcMap;
+  DenseMap<unsigned, CopyInfo> Copies;
 
 public:
   /// Mark all of the given registers and their subregisters as unavailable for
   /// copying.
-  void markRegsUnavailable(const RegList &Regs, const TargetRegisterInfo &TRI) {
+  void markRegsUnavailable(ArrayRef<unsigned> Regs,
+                           const TargetRegisterInfo &TRI) {
     for (unsigned Reg : Regs) {
       // Source of copy is no longer available for propagation.
-      for (MCSubRegIterator SR(Reg, &TRI, true); SR.isValid(); ++SR)
-        AvailCopyMap.erase(*SR);
+      for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) {
+        auto CI = Copies.find(*RUI);
+        if (CI != Copies.end())
+          CI->second.Avail = false;
+      }
     }
   }
 
   /// Clobber a single register, removing it from the tracker's copy maps.
   void clobberRegister(unsigned Reg, const TargetRegisterInfo &TRI) {
-    for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) {
-      CopyMap.erase(*AI);
-      AvailCopyMap.erase(*AI);
-
-      SourceMap::iterator SI = SrcMap.find(*AI);
-      if (SI != SrcMap.end()) {
-        markRegsUnavailable(SI->second, TRI);
-        SrcMap.erase(SI);
+    for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) {
+      auto I = Copies.find(*RUI);
+      if (I != Copies.end()) {
+        // When we clobber the source of a copy, we need to clobber everything
+        // it defined.
+        markRegsUnavailable(I->second.DefRegs, TRI);
+        // When we clobber the destination of a copy, we need to clobber the
+        // whole register it defined.
+        if (MachineInstr *MI = I->second.MI)
+          markRegsUnavailable({MI->getOperand(0).getReg()}, TRI);
+        // Now we can erase the copy.
+        Copies.erase(I);
       }
     }
   }
 
   /// Add this copy's registers into the tracker's copy maps.
-  void trackCopy(MachineInstr *Copy, const TargetRegisterInfo &TRI) {
-    assert(Copy->isCopy() && "Tracking non-copy?");
+  void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI) {
+    assert(MI->isCopy() && "Tracking non-copy?");
 
-    unsigned Def = Copy->getOperand(0).getReg();
-    unsigned Src = Copy->getOperand(1).getReg();
+    unsigned Def = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
 
     // Remember Def is defined by the copy.
-    for (MCSubRegIterator SR(Def, &TRI, /*IncludeSelf=*/true); SR.isValid();
-         ++SR) {
-      CopyMap[*SR] = Copy;
-      AvailCopyMap[*SR] = Copy;
-    }
+    for (MCRegUnitIterator RUI(Def, &TRI); RUI.isValid(); ++RUI)
+      Copies[*RUI] = {MI, {}, true};
 
     // Remember source that's copied to Def. Once it's clobbered, then
     // it's no longer available for copy propagation.
-    RegList &DestList = SrcMap[Src];
-    if (!is_contained(DestList, Def))
-      DestList.push_back(Def);
+    for (MCRegUnitIterator RUI(Src, &TRI); RUI.isValid(); ++RUI) {
+      auto I = Copies.insert({*RUI, {nullptr, {}, false}});
+      auto &Copy = I.first->second;
+      if (!is_contained(Copy.DefRegs, Def))
+        Copy.DefRegs.push_back(Def);
+    }
+  }
+
+  bool hasAnyCopies() {
+    return !Copies.empty();
   }
 
-  bool hasAvailableCopies() { return !AvailCopyMap.empty(); }
+  MachineInstr *findCopyForUnit(unsigned RegUnit, const TargetRegisterInfo &TRI,
+                         bool MustBeAvailable = false) {
+    auto CI = Copies.find(RegUnit);
+    if (CI == Copies.end())
+      return nullptr;
+    if (MustBeAvailable && !CI->second.Avail)
+      return nullptr;
+    return CI->second.MI;
+  }
 
-  MachineInstr *findAvailCopy(MachineInstr &DestCopy, unsigned Reg) {
-    auto CI = AvailCopyMap.find(Reg);
-    if (CI == AvailCopyMap.end())
+  MachineInstr *findAvailCopy(MachineInstr &DestCopy, unsigned Reg,
+                              const TargetRegisterInfo &TRI) {
+    // We check the first RegUnit here, since we'll only be interested in the
+    // copy if it copies the entire register anyway.
+    MCRegUnitIterator RUI(Reg, &TRI);
+    MachineInstr *AvailCopy =
+        findCopyForUnit(*RUI, TRI, /*MustBeAvailable=*/true);
+    if (!AvailCopy ||
+        !TRI.isSubRegisterEq(AvailCopy->getOperand(0).getReg(), Reg))
       return nullptr;
-    MachineInstr &AvailCopy = *CI->second;
 
     // Check that the available copy isn't clobbered by any regmasks between
     // itself and the destination.
-    unsigned AvailSrc = AvailCopy.getOperand(1).getReg();
-    unsigned AvailDef = AvailCopy.getOperand(0).getReg();
+    unsigned AvailSrc = AvailCopy->getOperand(1).getReg();
+    unsigned AvailDef = AvailCopy->getOperand(0).getReg();
     for (const MachineInstr &MI :
-         make_range(AvailCopy.getIterator(), DestCopy.getIterator()))
+         make_range(AvailCopy->getIterator(), DestCopy.getIterator()))
       for (const MachineOperand &MO : MI.operands())
         if (MO.isRegMask())
           if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef))
             return nullptr;
 
-    return &AvailCopy;
-  }
-
-  MachineInstr *findCopy(unsigned Reg) {
-    auto CI = CopyMap.find(Reg);
-    if (CI != CopyMap.end())
-      return CI->second;
-    return nullptr;
+    return AvailCopy;
   }
 
   void clear() {
-    AvailCopyMap.clear();
-    CopyMap.clear();
-    SrcMap.clear();
+    Copies.clear();
   }
 };
 
@@ -224,8 +235,8 @@ INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
 void MachineCopyPropagation::ReadRegister(unsigned Reg) {
   // If 'Reg' is defined by a copy, the copy is no longer a candidate
   // for elimination.
-  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
-    if (MachineInstr *Copy = Tracker.findCopy(*AI)) {
+  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+    if (MachineInstr *Copy = Tracker.findCopyForUnit(*RUI, *TRI)) {
       LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump());
       MaybeDeadCopies.remove(Copy);
     }
@@ -263,7 +274,7 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
     return false;
 
   // Search for an existing copy.
-  MachineInstr *PrevCopy = Tracker.findAvailCopy(Copy, Def);
+  MachineInstr *PrevCopy = Tracker.findAvailCopy(Copy, Def, *TRI);
   if (!PrevCopy)
     return false;
 
@@ -357,7 +368,7 @@ bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI,
 /// Look for available copies whose destination register is used by \p MI and
 /// replace the use in \p MI with the copy's source register.
 void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
-  if (!Tracker.hasAvailableCopies())
+  if (!Tracker.hasAnyCopies())
     return;
 
   // Look for non-tied explicit vreg uses that have an active COPY
@@ -384,7 +395,7 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
     if (!MOUse.isRenamable())
       continue;
 
-    MachineInstr *Copy = Tracker.findAvailCopy(MI, MOUse.getReg());
+    MachineInstr *Copy = Tracker.findAvailCopy(MI, MOUse.getReg(), *TRI);
     if (!Copy)
       continue;
 
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 431484f078b85a758526cb4892a7ec59c0160969..488481cec37c4cd16b3b59b4daff2ad4b0c1b164 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -130,7 +130,8 @@ static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
   return STI->getFrameLowering()->getStackAlignment();
 }
 
-MachineFunction::MachineFunction(const Function &F, const TargetMachine &Target,
+MachineFunction::MachineFunction(const Function &F,
+                                 const LLVMTargetMachine &Target,
                                  const TargetSubtargetInfo &STI,
                                  unsigned FunctionNum, MachineModuleInfo &mmi)
     : F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
@@ -661,8 +662,11 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
       }
     }
 
-  } else if (isa<CatchPadInst>(FirstI)) {
-    // TODO
+  } else if (const auto *CPI = dyn_cast<CatchPadInst>(FirstI)) {
+    for (unsigned I = CPI->getNumArgOperands(); I != 0; --I) {
+      Value *TypeInfo = CPI->getArgOperand(I - 1)->stripPointerCasts();
+      addCatchTypeInfo(LandingPad, dyn_cast<GlobalValue>(TypeInfo));
+    }
 
   } else {
     assert(isa<CleanupPadInst>(FirstI) && "Invalid landingpad!");
@@ -687,7 +691,8 @@ void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad,
   LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
 }
 
-void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
+void MachineFunction::tidyLandingPads(DenseMap<MCSymbol *, uintptr_t> *LPMap,
+                                      bool TidyIfNoBeginLabels) {
   for (unsigned i = 0; i != LandingPads.size(); ) {
     LandingPadInfo &LandingPad = LandingPads[i];
     if (LandingPad.LandingPadLabel &&
@@ -702,24 +707,25 @@ void MachineFunction::tidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
       continue;
     }
 
-    for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
-      MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
-      MCSymbol *EndLabel = LandingPad.EndLabels[j];
-      if ((BeginLabel->isDefined() ||
-           (LPMap && (*LPMap)[BeginLabel] != 0)) &&
-          (EndLabel->isDefined() ||
-           (LPMap && (*LPMap)[EndLabel] != 0))) continue;
-
-      LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
-      LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
-      --j;
-      --e;
-    }
+    if (TidyIfNoBeginLabels) {
+      for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
+        MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
+        MCSymbol *EndLabel = LandingPad.EndLabels[j];
+        if ((BeginLabel->isDefined() || (LPMap && (*LPMap)[BeginLabel] != 0)) &&
+            (EndLabel->isDefined() || (LPMap && (*LPMap)[EndLabel] != 0)))
+          continue;
+
+        LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+        LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+        --j;
+        --e;
+      }
 
-    // Remove landing pads with no try-ranges.
-    if (LandingPads[i].BeginLabels.empty()) {
-      LandingPads.erase(LandingPads.begin() + i);
-      continue;
+      // Remove landing pads with no try-ranges.
+      if (LandingPads[i].BeginLabels.empty()) {
+        LandingPads.erase(LandingPads.begin() + i);
+        continue;
+      }
     }
 
     // If there is no landing pad, ensure that the list of typeids is empty.
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 639cd80768fc24f6e1400728261228056a2a7b83..6ef8de88f8b117484dcaf0d6fd2df4864c4cd7cc 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -194,7 +194,7 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
   Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
 }
 
-MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM)
+MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
   : ImmutablePass(ID), TM(*TM),
     Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
             TM->getObjFileLowering(), nullptr, false) {
@@ -206,10 +206,11 @@ MachineModuleInfo::~MachineModuleInfo() = default;
 bool MachineModuleInfo::doInitialization(Module &M) {
   ObjFileMMI = nullptr;
   CurCallSite = 0;
-  DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false;
+  UsesVAFloatArgument = UsesMorestackAddr = false;
   HasSplitStack = HasNosplitStack = false;
   AddrLabelSymbols = nullptr;
   TheModule = &M;
+  DbgInfoAvailable = !empty(M.debug_compile_units());
   return false;
 }
 
diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp
index a116d8fe877d4b785c51fc426a6de0feaee006f3..4fe51f6624816bd24443d4884de662f8d891b2ee 100644
--- a/lib/CodeGen/MachineOperand.cpp
+++ b/lib/CodeGen/MachineOperand.cpp
@@ -461,7 +461,8 @@ static void printIRValueReference(raw_ostream &OS, const Value &V,
     printLLVMNameWithoutPrefix(OS, V.getName());
     return;
   }
-  MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V));
+  int Slot = MST.getCurrentFunction() ? MST.getLocalSlot(&V) : -1;
+  MachineOperand::printIRSlotNumber(OS, Slot);
 }
 
 static void printSyncScope(raw_ostream &OS, const LLVMContext &Context,
@@ -743,10 +744,10 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
       OS << "undef ";
     if (isEarlyClobber())
       OS << "early-clobber ";
-    if (isDebug())
-      OS << "debug-use ";
     if (TargetRegisterInfo::isPhysicalRegister(getReg()) && isRenamable())
       OS << "renamable ";
+    // isDebug() is exactly true for register operands of a DBG_VALUE. So we
+    // simply infer it when parsing and do not need to print it.
 
     const MachineRegisterInfo *MRI = nullptr;
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 4b65d971a78b0f26841e282d470dc7d473d3f490..c69bfecd8dcd414db1115147eb630f52eef640a0 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -128,9 +128,6 @@ struct SuffixTreeNode {
   /// mapping by tacking that character on the end of the current string.
   DenseMap<unsigned, SuffixTreeNode *> Children;
 
-  /// A flag set to false if the node has been pruned from the tree.
-  bool IsInTree = true;
-
   /// The start index of this node's substring in the main string.
   unsigned StartIdx = EmptyIdx;
 
@@ -167,15 +164,6 @@ struct SuffixTreeNode {
   /// construction algorithm O(N^2) rather than O(N).
   SuffixTreeNode *Link = nullptr;
 
-  /// The parent of this node. Every node except for the root has a parent.
-  SuffixTreeNode *Parent = nullptr;
-
-  /// The number of times this node's string appears in the tree.
-  ///
-  /// This is equal to the number of leaf children of the string. It represents
-  /// the number of suffixes that the node's string is a prefix of.
-  unsigned OccurrenceCount = 0;
-
   /// The length of the string formed by concatenating the edge labels from the
   /// root to this node.
   unsigned ConcatLen = 0;
@@ -200,9 +188,8 @@ struct SuffixTreeNode {
     return *EndIdx - StartIdx + 1;
   }
 
-  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link,
-                 SuffixTreeNode *Parent)
-      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {}
+  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
+      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
 
   SuffixTreeNode() {}
 };
@@ -231,14 +218,18 @@ struct SuffixTreeNode {
 /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
 class SuffixTree {
 public:
-  /// Stores each leaf node in the tree.
-  ///
-  /// This is used for finding outlining candidates.
-  std::vector<SuffixTreeNode *> LeafVector;
-
   /// Each element is an integer representing an instruction in the module.
   ArrayRef<unsigned> Str;
 
+  /// A repeated substring in the tree.
+  struct RepeatedSubstring {
+    /// The length of the string.
+    unsigned Length;
+
+    /// The start indices of each occurrence.
+    std::vector<unsigned> StartIndices;
+  };
+
 private:
   /// Maintains each node in the tree.
   SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
@@ -291,7 +282,7 @@ private:
     assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
 
     SuffixTreeNode *N = new (NodeAllocator.Allocate())
-        SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr, &Parent);
+        SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
     Parent.Children[Edge] = N;
 
     return N;
@@ -314,7 +305,7 @@ private:
 
     unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
     SuffixTreeNode *N = new (NodeAllocator.Allocate())
-        SuffixTreeNode(StartIdx, E, Root, Parent);
+        SuffixTreeNode(StartIdx, E, Root);
     if (Parent)
       Parent->Children[Edge] = N;
 
@@ -322,41 +313,27 @@ private:
   }
 
   /// Set the suffix indices of the leaves to the start indices of their
-  /// respective suffixes. Also stores each leaf in \p LeafVector at its
-  /// respective suffix index.
+  /// respective suffixes.
   ///
   /// \param[in] CurrNode The node currently being visited.
-  /// \param CurrIdx The current index of the string being visited.
-  void setSuffixIndices(SuffixTreeNode &CurrNode, unsigned CurrIdx) {
+  /// \param CurrNodeLen The concatenation of all node sizes from the root to
+  /// this node. Used to produce suffix indices.
+  void setSuffixIndices(SuffixTreeNode &CurrNode, unsigned CurrNodeLen) {
 
     bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot();
 
-    // Store the length of the concatenation of all strings from the root to
-    // this node.
-    if (!CurrNode.isRoot()) {
-      if (CurrNode.ConcatLen == 0)
-        CurrNode.ConcatLen = CurrNode.size();
-
-      if (CurrNode.Parent)
-        CurrNode.ConcatLen += CurrNode.Parent->ConcatLen;
-    }
-
+    // Store the concatenation of lengths down from the root.
+    CurrNode.ConcatLen = CurrNodeLen;
     // Traverse the tree depth-first.
     for (auto &ChildPair : CurrNode.Children) {
       assert(ChildPair.second && "Node had a null child!");
-      setSuffixIndices(*ChildPair.second, CurrIdx + ChildPair.second->size());
+      setSuffixIndices(*ChildPair.second,
+                       CurrNodeLen + ChildPair.second->size());
     }
 
-    // Is this node a leaf?
-    if (IsLeaf) {
-      // If yes, give it a suffix index and bump its parent's occurrence count.
-      CurrNode.SuffixIdx = Str.size() - CurrIdx;
-      assert(CurrNode.Parent && "CurrNode had no parent!");
-      CurrNode.Parent->OccurrenceCount++;
-
-      // Store the leaf in the leaf vector for pruning later.
-      LeafVector[CurrNode.SuffixIdx] = &CurrNode;
-    }
+    // Is this node a leaf? If it is, give it a suffix index.
+    if (IsLeaf)
+      CurrNode.SuffixIdx = Str.size() - CurrNodeLen;
   }
 
   /// Construct the suffix tree for the prefix of the input ending at
@@ -461,7 +438,6 @@ private:
         // Make the old node a child of the split node and update its start
         // index. This is the node n from the diagram.
         NextNode->StartIdx += Active.Len;
-        NextNode->Parent = SplitNode;
         SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
 
         // SplitNode is an internal node, update the suffix link.
@@ -495,9 +471,7 @@ public:
   /// \param Str The string to construct the suffix tree for.
   SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
     Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
-    Root->IsInTree = true;
     Active.Node = Root;
-    LeafVector = std::vector<SuffixTreeNode *>(Str.size());
 
     // Keep track of the number of suffixes we have to add of the current
     // prefix.
@@ -518,6 +492,116 @@ public:
     assert(Root && "Root node can't be nullptr!");
     setSuffixIndices(*Root, 0);
   }
+
+
+  /// Iterator for finding all repeated substrings in the suffix tree.
+  struct RepeatedSubstringIterator {
+    private:
+    /// The current node we're visiting.
+    SuffixTreeNode *N = nullptr;
+
+    /// The repeated substring associated with this node.
+    RepeatedSubstring RS;
+
+    /// The nodes left to visit.
+    std::vector<SuffixTreeNode *> ToVisit;
+
+    /// The minimum length of a repeated substring to find.
+    /// Since we're outlining, we want at least two instructions in the range.
+    /// FIXME: This may not be true for targets like X86 which support many
+    /// instruction lengths.
+    const unsigned MinLength = 2;
+
+    /// Move the iterator to the next repeated substring.
+    void advance() {
+      // Clear the current state. If we're at the end of the range, then this
+      // is the state we want to be in.
+      RS = RepeatedSubstring();
+      N = nullptr;
+
+      // Continue visiting nodes until we find one which repeats more than once.
+      while (!ToVisit.empty()) {
+        SuffixTreeNode *Curr = ToVisit.back();
+        ToVisit.pop_back();
+
+        // Keep track of the length of the string associated with the node. If
+        // it's too short, we'll quit.
+        unsigned Length = Curr->ConcatLen;
+
+        // Each leaf node represents a repeat of a string.
+        std::vector<SuffixTreeNode *> LeafChildren;
+
+        // Iterate over each child, saving internal nodes for visiting, and
+        // leaf nodes in LeafChildren. Internal nodes represent individual
+        // strings, which may repeat.
+        for (auto &ChildPair : Curr->Children) {
+          // Save all of this node's children for processing.
+          if (!ChildPair.second->isLeaf())
+            ToVisit.push_back(ChildPair.second);
+
+          // It's not an internal node, so it must be a leaf. If we have a
+          // long enough string, then save the leaf children.
+          else if (Length >= MinLength)
+            LeafChildren.push_back(ChildPair.second);
+        }
+
+        // The root never represents a repeated substring. If we're looking at
+        // that, then skip it.
+        if (Curr->isRoot())
+          continue;
+
+        // Do we have any repeated substrings?
+        if (LeafChildren.size() >= 2) {
+          // Yes. Update the state to reflect this, and then bail out.
+          N = Curr;
+          RS.Length = Length;
+          for (SuffixTreeNode *Leaf : LeafChildren)
+            RS.StartIndices.push_back(Leaf->SuffixIdx);
+          break;
+        }
+      }
+
+      // At this point, either NewRS is an empty RepeatedSubstring, or it was
+      // set in the above loop. Similarly, N is either nullptr, or the node
+      // associated with NewRS.
+    }
+
+  public:
+    /// Return the current repeated substring.
+    RepeatedSubstring &operator*() { return RS; }
+
+    RepeatedSubstringIterator &operator++() {
+      advance();
+      return *this;
+    }
+
+    RepeatedSubstringIterator operator++(int I) {
+      RepeatedSubstringIterator It(*this);
+      advance();
+      return It;
+    }
+
+    bool operator==(const RepeatedSubstringIterator &Other) {
+      return N == Other.N;
+    }
+    bool operator!=(const RepeatedSubstringIterator &Other) {
+      return !(*this == Other);
+    }
+
+    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
+      // Do we have a non-null node?
+      if (N) {
+        // Yes. At the first step, we need to visit all of N's children.
+        // Note: This means that we visit N last.
+        ToVisit.push_back(N);
+        advance();
+      }
+    }
+};
+
+  typedef RepeatedSubstringIterator iterator;
+  iterator begin() { return iterator(Root); }
+  iterator end() { return iterator(nullptr); }
 };
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -548,17 +632,40 @@ struct InstructionMapper {
   /// at index i in \p UnsignedVec for each index i.
   std::vector<MachineBasicBlock::iterator> InstrList;
 
+  // Set if we added an illegal number in the previous step.
+  // Since each illegal number is unique, we only need one of them between
+  // each range of legal numbers. This lets us make sure we don't add more
+  // than one illegal number per range.
+  bool AddedIllegalLastTime = false;
+
   /// Maps \p *It to a legal integer.
   ///
-  /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
-  /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+  /// Updates \p CanOutlineWithPrevInstr, \p HaveLegalRange, \p InstrListForMBB,
+  /// \p UnsignedVecForMBB, \p InstructionIntegerMap, \p IntegerInstructionMap,
+  /// and \p LegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
-  unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+  unsigned mapToLegalUnsigned(
+      MachineBasicBlock::iterator &It, bool &CanOutlineWithPrevInstr,
+      bool &HaveLegalRange, unsigned &NumLegalInBlock,
+      std::vector<unsigned> &UnsignedVecForMBB,
+      std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
+    // We added something legal, so we should unset the AddedLegalLastTime
+    // flag.
+    AddedIllegalLastTime = false;
+
+    // If we have at least two adjacent legal instructions (which may have
+    // invisible instructions in between), remember that.
+    if (CanOutlineWithPrevInstr)
+      HaveLegalRange = true;
+    CanOutlineWithPrevInstr = true;
+
+    // Keep track of the number of legal instructions we insert.
+    NumLegalInBlock++;
 
     // Get the integer for this instruction or give it the current
     // LegalInstrNumber.
-    InstrList.push_back(It);
+    InstrListForMBB.push_back(It);
     MachineInstr &MI = *It;
     bool WasInserted;
     DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
@@ -573,7 +680,7 @@ struct InstructionMapper {
       IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
     }
 
-    UnsignedVec.push_back(MINumber);
+    UnsignedVecForMBB.push_back(MINumber);
 
     // Make sure we don't overflow or use any integers reserved by the DenseMap.
     if (LegalInstrNumber >= IllegalInstrNumber)
@@ -589,14 +696,26 @@ struct InstructionMapper {
 
   /// Maps \p *It to an illegal integer.
   ///
-  /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber.
+  /// Updates \p InstrListForMBB, \p UnsignedVecForMBB, and \p
+  /// IllegalInstrNumber.
   ///
   /// \returns The integer that \p *It was mapped to.
-  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It,
+  bool &CanOutlineWithPrevInstr, std::vector<unsigned> &UnsignedVecForMBB,
+  std::vector<MachineBasicBlock::iterator> &InstrListForMBB) {
+    // Can't outline an illegal instruction. Set the flag.
+    CanOutlineWithPrevInstr = false;
+
+    // Only add one illegal number per range of legal numbers.
+    if (AddedIllegalLastTime)
+      return IllegalInstrNumber;
+
+    // Remember that we added an illegal number last time.
+    AddedIllegalLastTime = true;
     unsigned MINumber = IllegalInstrNumber;
 
-    InstrList.push_back(It);
-    UnsignedVec.push_back(IllegalInstrNumber);
+    InstrListForMBB.push_back(It);
+    UnsignedVecForMBB.push_back(IllegalInstrNumber);
     IllegalInstrNumber--;
 
     assert(LegalInstrNumber < IllegalInstrNumber &&
@@ -624,50 +743,69 @@ struct InstructionMapper {
   void convertToUnsignedVec(MachineBasicBlock &MBB,
                             const TargetInstrInfo &TII) {
     unsigned Flags = TII.getMachineOutlinerMBBFlags(MBB);
+    MachineBasicBlock::iterator It = MBB.begin();
+
+    // The number of instructions in this block that will be considered for
+    // outlining.
+    unsigned NumLegalInBlock = 0;
+
+    // True if we have at least two legal instructions which aren't separated
+    // by an illegal instruction.
+    bool HaveLegalRange = false;
 
-    // Set to true whenever we map an illegal number.
-    bool AddedIllegalLastTime = false;
-    for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
-         It++) {
+    // True if we can perform outlining given the last mapped (non-invisible)
+    // instruction. This lets us know if we have a legal range.
+    bool CanOutlineWithPrevInstr = false;
 
+    // FIXME: Should this all just be handled in the target, rather than using
+    // repeated calls to getOutliningType?
+    std::vector<unsigned> UnsignedVecForMBB;
+    std::vector<MachineBasicBlock::iterator> InstrListForMBB;
+
+    for (MachineBasicBlock::iterator Et = MBB.end(); It != Et; It++) {
       // Keep track of where this instruction is in the module.
       switch (TII.getOutliningType(It, Flags)) {
       case InstrType::Illegal:
-        // If we added an illegal number last time, then don't add more of them.
-        // One number is all that is necessary to prevent matches on illegal
-        // instructions.
-        if (AddedIllegalLastTime)
-          break;
-        AddedIllegalLastTime = true;
-        mapToIllegalUnsigned(It);
+        mapToIllegalUnsigned(It, CanOutlineWithPrevInstr,
+                             UnsignedVecForMBB, InstrListForMBB);
         break;
 
       case InstrType::Legal:
-        AddedIllegalLastTime = false;
-        mapToLegalUnsigned(It);
+        mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange,
+                           NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB);
         break;
 
       case InstrType::LegalTerminator:
-        mapToLegalUnsigned(It);
-        InstrList.push_back(It);
-        AddedIllegalLastTime = true;
-        UnsignedVec.push_back(IllegalInstrNumber);
-        IllegalInstrNumber--;
+        mapToLegalUnsigned(It, CanOutlineWithPrevInstr, HaveLegalRange,
+                           NumLegalInBlock, UnsignedVecForMBB, InstrListForMBB);
+        // The instruction also acts as a terminator, so we have to record that
+        // in the string.
+        mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+        InstrListForMBB);
         break;
 
       case InstrType::Invisible:
+        // Normally this is set by mapTo(Blah)Unsigned, but we just want to
+        // skip this instruction. So, unset the flag here.
         AddedIllegalLastTime = false;
         break;
       }
     }
 
-    // After we're done every insertion, uniquely terminate this part of the
-    // "string". This makes sure we won't match across basic block or function
-    // boundaries since the "end" is encoded uniquely and thus appears in no
-    // repeated substring.
-    InstrList.push_back(MBB.end());
-    UnsignedVec.push_back(IllegalInstrNumber);
-    IllegalInstrNumber--;
+    // Are there enough legal instructions in the block for outlining to be
+    // possible?
+    if (HaveLegalRange) {
+      // After we're done every insertion, uniquely terminate this part of the
+      // "string". This makes sure we won't match across basic block or function
+      // boundaries since the "end" is encoded uniquely and thus appears in no
+      // repeated substring.
+      mapToIllegalUnsigned(It, CanOutlineWithPrevInstr, UnsignedVecForMBB,
+      InstrListForMBB);
+      InstrList.insert(InstrList.end(), InstrListForMBB.begin(),
+                       InstrListForMBB.end());
+      UnsignedVec.insert(UnsignedVec.end(), UnsignedVecForMBB.begin(),
+                         UnsignedVecForMBB.end());
+    }
   }
 
   InstructionMapper() {
@@ -763,7 +901,8 @@ struct MachineOutliner : public ModulePass {
 
   /// Creates a function for \p OF and inserts it into the module.
   MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
-                                          InstructionMapper &Mapper);
+                                          InstructionMapper &Mapper,
+                                          unsigned Name);
 
   /// Find potential outlining candidates and store them in \p CandidateList.
   ///
@@ -854,6 +993,10 @@ INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
 void MachineOutliner::emitNotOutliningCheaperRemark(
     unsigned StringLen, std::vector<Candidate> &CandidatesForRepeatedSeq,
     OutlinedFunction &OF) {
+  // FIXME: Right now, we arbitrarily choose some Candidate from the
+  // OutlinedFunction. This isn't necessarily fixed, nor does it have to be.
+  // We should probably sort these by function name or something to make sure
+  // the remarks are stable.
   Candidate &C = CandidatesForRepeatedSeq.front();
   MachineOptimizationRemarkEmitter MORE(*(C.getMF()), nullptr);
   MORE.emit([&]() {
@@ -918,80 +1061,50 @@ unsigned MachineOutliner::findCandidates(
   FunctionList.clear();
   unsigned MaxLen = 0;
 
-  // FIXME: Visit internal nodes instead of leaves.
-  for (SuffixTreeNode *Leaf : ST.LeafVector) {
-    assert(Leaf && "Leaves in LeafVector cannot be null!");
-    if (!Leaf->IsInTree)
-      continue;
-
-    assert(Leaf->Parent && "All leaves must have parents!");
-    SuffixTreeNode &Parent = *(Leaf->Parent);
-
-    // If it doesn't appear enough, or we already outlined from it, skip it.
-    if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
-      continue;
-
-    // Figure out if this candidate is beneficial.
-    unsigned StringLen = Leaf->ConcatLen - (unsigned)Leaf->size();
-
-    // Too short to be beneficial; skip it.
-    // FIXME: This isn't necessarily true for, say, X86. If we factor in
-    // instruction lengths we need more information than this.
-    if (StringLen < 2)
-      continue;
-
-    // If this is a beneficial class of candidate, then every one is stored in
-    // this vector.
+  // First, find dall of the repeated substrings in the tree of minimum length
+  // 2.
+  for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
+    SuffixTree::RepeatedSubstring RS = *It;
     std::vector<Candidate> CandidatesForRepeatedSeq;
-
-    // Figure out the call overhead for each instance of the sequence.
-    for (auto &ChildPair : Parent.Children) {
-      SuffixTreeNode *M = ChildPair.second;
-
-      if (M && M->IsInTree && M->isLeaf()) {
-        // Never visit this leaf again.
-        M->IsInTree = false;
-        unsigned StartIdx = M->SuffixIdx;
-        unsigned EndIdx = StartIdx + StringLen - 1;
-
-        // Trick: Discard some candidates that would be incompatible with the
-        // ones we've already found for this sequence. This will save us some
-        // work in candidate selection.
-        //
-        // If two candidates overlap, then we can't outline them both. This
-        // happens when we have candidates that look like, say
-        //
-        // AA (where each "A" is an instruction).
-        //
-        // We might have some portion of the module that looks like this:
-        // AAAAAA (6 A's)
-        //
-        // In this case, there are 5 different copies of "AA" in this range, but
-        // at most 3 can be outlined. If only outlining 3 of these is going to
-        // be unbeneficial, then we ought to not bother.
-        //
-        // Note that two things DON'T overlap when they look like this:
-        // start1...end1 .... start2...end2
-        // That is, one must either
-        // * End before the other starts
-        // * Start after the other ends
-        if (std::all_of(CandidatesForRepeatedSeq.begin(),
-                        CandidatesForRepeatedSeq.end(),
-                        [&StartIdx, &EndIdx](const Candidate &C) {
-                          return (EndIdx < C.getStartIdx() ||
-                                  StartIdx > C.getEndIdx());
-                        })) {
-          // It doesn't overlap with anything, so we can outline it.
-          // Each sequence is over [StartIt, EndIt].
-          // Save the candidate and its location.
-
-          MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
-          MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
-
-          CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
-                                                EndIt, StartIt->getParent(),
-                                                FunctionList.size());
-        }
+    unsigned StringLen = RS.Length;
+    for (const unsigned &StartIdx : RS.StartIndices) {
+      unsigned EndIdx = StartIdx + StringLen - 1;
+      // Trick: Discard some candidates that would be incompatible with the
+      // ones we've already found for this sequence. This will save us some
+      // work in candidate selection.
+      //
+      // If two candidates overlap, then we can't outline them both. This
+      // happens when we have candidates that look like, say
+      //
+      // AA (where each "A" is an instruction).
+      //
+      // We might have some portion of the module that looks like this:
+      // AAAAAA (6 A's)
+      //
+      // In this case, there are 5 different copies of "AA" in this range, but
+      // at most 3 can be outlined. If only outlining 3 of these is going to
+      // be unbeneficial, then we ought to not bother.
+      //
+      // Note that two things DON'T overlap when they look like this:
+      // start1...end1 .... start2...end2
+      // That is, one must either
+      // * End before the other starts
+      // * Start after the other ends
+      if (std::all_of(
+              CandidatesForRepeatedSeq.begin(), CandidatesForRepeatedSeq.end(),
+              [&StartIdx, &EndIdx](const Candidate &C) {
+                return (EndIdx < C.getStartIdx() || StartIdx > C.getEndIdx());
+              })) {
+        // It doesn't overlap with anything, so we can outline it.
+        // Each sequence is over [StartIt, EndIt].
+        // Save the candidate and its location.
+
+        MachineBasicBlock::iterator StartIt = Mapper.InstrList[StartIdx];
+        MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+
+        CandidatesForRepeatedSeq.emplace_back(StartIdx, StringLen, StartIt,
+                                              EndIt, StartIt->getParent(),
+                                              FunctionList.size());
       }
     }
 
@@ -1014,10 +1127,10 @@ unsigned MachineOutliner::findCandidates(
       continue;
 
     std::vector<unsigned> Seq;
-    for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+    unsigned StartIdx = RS.StartIndices[0]; // Grab any start index.
+    for (unsigned i = StartIdx; i < StartIdx + StringLen; i++)
       Seq.push_back(ST.Str[i]);
     OF.Sequence = Seq;
-    OF.Name = FunctionList.size();
 
     // Is it better to outline this candidate than not?
     if (OF.getBenefit() < 1) {
@@ -1033,9 +1146,6 @@ unsigned MachineOutliner::findCandidates(
     for (std::shared_ptr<Candidate> &C : OF.Candidates)
       CandidateList.push_back(C);
     FunctionList.push_back(OF);
-
-    // Move to the next function.
-    Parent.IsInTree = false;
   }
 
   return MaxLen;
@@ -1175,13 +1285,16 @@ unsigned MachineOutliner::buildCandidateList(
 
 MachineFunction *
 MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
-                                        InstructionMapper &Mapper) {
+                                        InstructionMapper &Mapper,
+                                        unsigned Name) {
 
   // Create the function name. This should be unique. For now, just hash the
   // module name and include it in the function name plus the number of this
   // function.
   std::ostringstream NameStream;
-  NameStream << "OUTLINED_FUNCTION_" << OF.Name;
+  // FIXME: We should have a better naming scheme. This should be stable,
+  // regardless of changes to the outliner's cost model/traversal order.
+  NameStream << "OUTLINED_FUNCTION_" << Name;
 
   // Create the function using an IR-level function.
   LLVMContext &C = M.getContext();
@@ -1202,6 +1315,14 @@ MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
   F->addFnAttr(Attribute::OptimizeForSize);
   F->addFnAttr(Attribute::MinSize);
 
+  // Include target features from an arbitrary candidate for the outlined
+  // function. This makes sure the outlined function knows what kinds of
+  // instructions are going into it. This is fine, since all parent functions
+  // must necessarily support the instructions that are in the outlined region.
+  const Function &ParentFn = OF.Candidates.front()->getMF()->getFunction();
+  if (ParentFn.hasFnAttribute("target-features"))
+    F->addFnAttr(ParentFn.getFnAttribute("target-features"));
+
   BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
   IRBuilder<> Builder(EntryBB);
   Builder.CreateRetVoid();
@@ -1272,6 +1393,10 @@ bool MachineOutliner::outline(
     std::vector<OutlinedFunction> &FunctionList, InstructionMapper &Mapper) {
 
   bool OutlinedSomething = false;
+
+  // Number to append to the current outlined function.
+  unsigned OutlinedFunctionNum = 0;
+
   // Replace the candidates with calls to their respective outlined functions.
   for (const std::shared_ptr<Candidate> &Cptr : CandidateList) {
     Candidate &C = *Cptr;
@@ -1288,9 +1413,10 @@ bool MachineOutliner::outline(
 
     // Does this candidate have a function yet?
     if (!OF.MF) {
-      OF.MF = createOutlinedFunction(M, OF, Mapper);
+      OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
       emitOutlinedFunctionRemark(OF);
       FunctionsCreated++;
+      OutlinedFunctionNum++; // Created a function, move to the next name.
     }
 
     MachineFunction *MF = OF.MF;
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 5f6f0cf96a5811a3beb9c18852c0639da8eb646c..bb5fc664c5f593e9469459bfcbb4346633901cf1 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -171,6 +171,12 @@ static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
                                      cl::ReallyHidden, cl::init(false),
                                      cl::ZeroOrMore, cl::desc("Ignore RecMII"));
 
+// A command line option to enable the CopyToPhi DAG mutation.
+static cl::opt<bool>
+    SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+                       cl::init(true), cl::ZeroOrMore,
+                       cl::desc("Enable CopyToPhi DAG Mutation"));
+
 namespace {
 
 class NodeSet;
@@ -278,12 +284,21 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     BitVector Blocked;
     SmallVector<SmallPtrSet<SUnit *, 4>, 10> B;
     SmallVector<SmallVector<int, 4>, 16> AdjK;
+    // Node to Index from ScheduleDAGTopologicalSort
+    std::vector<int> *Node2Idx;
     unsigned NumPaths;
     static unsigned MaxPaths;
 
   public:
-    Circuits(std::vector<SUnit> &SUs)
-        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {}
+    Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
+        : SUnits(SUs), Blocked(SUs.size()), B(SUs.size()), AdjK(SUs.size()) {
+      Node2Idx = new std::vector<int>(SUs.size());
+      unsigned Idx = 0;
+      for (const auto &NodeNum : Topo)
+        Node2Idx->at(NodeNum) = Idx++;
+    }
+
+    ~Circuits() { delete Node2Idx; }
 
     /// Reset the data structures used in the circuit algorithm.
     void reset() {
@@ -298,12 +313,18 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     void unblock(int U);
   };
 
+  struct CopyToPhiMutation : public ScheduleDAGMutation {
+    void apply(ScheduleDAGInstrs *DAG) override;
+  };
+
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
                     const RegisterClassInfo &rci)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
         RegClassInfo(rci), Topo(SUnits, &ExitSU) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
+    if (SwpEnableCopyToPhi)
+      Mutations.push_back(llvm::make_unique<CopyToPhiMutation>());
   }
 
   void schedule() override;
@@ -382,6 +403,8 @@ public:
     Mutations.push_back(std::move(Mutation));
   }
 
+  static bool classof(const ScheduleDAGInstrs *DAG) { return true; }
+
 private:
   void addLoopCarriedDependences(AliasAnalysis *AA);
   void updatePhiDependences();
@@ -884,8 +907,8 @@ void SwingSchedulerDAG::schedule() {
   addLoopCarriedDependences(AA);
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
-  postprocessDAG();
   changeDependences();
+  postprocessDAG();
   LLVM_DEBUG(dump());
 
   NodeSetType NodeSets;
@@ -1136,9 +1159,9 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
             continue;
           }
           AliasResult AAResult = AA->alias(
-              MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+              MemoryLocation(MMO1->getValue(), LocationSize::unknown(),
                              MMO1->getAAInfo()),
-              MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+              MemoryLocation(MMO2->getValue(), LocationSize::unknown(),
                              MMO2->getAAInfo()));
 
           if (AAResult != NoAlias) {
@@ -1295,6 +1318,7 @@ void SwingSchedulerDAG::changeDependences() {
     // Add a dependence between the new instruction and the instruction
     // that defines the new base.
     SDep Dep(&I, SDep::Anti, NewBase);
+    Topo.AddPred(LastSU, &I);
     LastSU->addPred(Dep);
 
     // Remember the base and offset information so that we can update the
@@ -1506,9 +1530,9 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
         }
         OutputDeps[N] = BackEdge;
       }
-      // Do not process a boundary node and a back-edge is processed only
-      // if it goes to a Phi.
-      if (SI.getSUnit()->isBoundaryNode() ||
+      // Do not process a boundary node, an artificial node.
+      // A back-edge is processed only if it goes to a Phi.
+      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
           (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
         continue;
       int N = SI.getSUnit()->NodeNum;
@@ -1561,7 +1585,8 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
       ++NumPaths;
       break;
     } else if (!Blocked.test(W)) {
-      if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
+      if (circuit(W, S, NodeSets,
+                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
         F = true;
     }
   }
@@ -1601,7 +1626,7 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   // but we do this to find the circuits, and then change them back.
   swapAntiDependences(SUnits);
 
-  Circuits Cir(SUnits);
+  Circuits Cir(SUnits, Topo);
   // Create the adjacency structure.
   Cir.createAdjacencyStructure(this);
   for (int i = 0, e = SUnits.size(); i != e; ++i) {
@@ -1613,6 +1638,85 @@ void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   swapAntiDependences(SUnits);
 }
 
+// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
+// is loop-carried to the USE in next iteration. This will help pipeliner avoid
+// additional copies that are needed across iterations. An artificial dependence
+// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
+
+// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
+// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
+// PHI-------True-Dep------> USEOfPhi
+
+// The mutation creates
+// USEOfPHI -------Artificial-Dep---> SRCOfCopy
+
+// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
+// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
+// late  to avoid additional copies across iterations. The possible scheduling
+// order would be
+// USEOfPHI --- SRCOfCopy---  COPY/REG_SEQUENCE.
+
+void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
+  for (SUnit &SU : DAG->SUnits) {
+    // Find the COPY/REG_SEQUENCE instruction.
+    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
+      continue;
+
+    // Record the loop carried PHIs.
+    SmallVector<SUnit *, 4> PHISUs;
+    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
+    SmallVector<SUnit *, 4> SrcSUs;
+
+    for (auto &Dep : SU.Preds) {
+      SUnit *TmpSU = Dep.getSUnit();
+      MachineInstr *TmpMI = TmpSU->getInstr();
+      SDep::Kind DepKind = Dep.getKind();
+      // Save the loop carried PHI.
+      if (DepKind == SDep::Anti && TmpMI->isPHI())
+        PHISUs.push_back(TmpSU);
+      // Save the source of COPY/REG_SEQUENCE.
+      // If the source has no pre-decessors, we will end up creating cycles.
+      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
+        SrcSUs.push_back(TmpSU);
+    }
+
+    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
+      continue;
+
+    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
+    // SUnit to the container.
+    SmallVector<SUnit *, 8> UseSUs;
+    for (auto I = PHISUs.begin(); I != PHISUs.end(); ++I) {
+      for (auto &Dep : (*I)->Succs) {
+        if (Dep.getKind() != SDep::Data)
+          continue;
+
+        SUnit *TmpSU = Dep.getSUnit();
+        MachineInstr *TmpMI = TmpSU->getInstr();
+        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
+          PHISUs.push_back(TmpSU);
+          continue;
+        }
+        UseSUs.push_back(TmpSU);
+      }
+    }
+
+    if (UseSUs.size() == 0)
+      continue;
+
+    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
+    // Add the artificial dependencies if it does not form a cycle.
+    for (auto I : UseSUs) {
+      for (auto Src : SrcSUs) {
+        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
+          Src->addPred(SDep(I, SDep::Artificial));
+          SDAG->Topo.AddPred(Src, I);
+        }
+      }
+    }
+  }
+}
+
 /// Return true for DAG nodes that we ignore when computing the cost functions.
 /// We ignore the back-edge recurrence in order to avoid unbounded recursion
 /// in the calculation of the ASAP, ALAP, etc functions.
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 1da99d91760bd6deef55ce2635ef17e318e8ba25..6e5ca45d5e5eaef127c22361a9238b61e7a202f1 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -93,36 +93,29 @@ bool
 MachineRegisterInfo::constrainRegAttrs(unsigned Reg,
                                        unsigned ConstrainingReg,
                                        unsigned MinNumRegs) {
-  auto const *OldRC = getRegClassOrNull(Reg);
-  auto const *RC = getRegClassOrNull(ConstrainingReg);
-  // A virtual register at any point must have either a low-level type
-  // or a class assigned, but not both. The only exception is the internals of
-  // GlobalISel's instruction selection pass, which is allowed to temporarily
-  // introduce registers with types and classes both.
-  assert((OldRC || getType(Reg).isValid()) && "Reg has neither class nor type");
-  assert((!OldRC || !getType(Reg).isValid()) && "Reg has class and type both");
-  assert((RC || getType(ConstrainingReg).isValid()) &&
-         "ConstrainingReg has neither class nor type");
-  assert((!RC || !getType(ConstrainingReg).isValid()) &&
-         "ConstrainingReg has class and type both");
-  if (OldRC && RC)
-    return ::constrainRegClass(*this, Reg, OldRC, RC, MinNumRegs);
-  // If one of the virtual registers is generic (used in generic machine
-  // instructions, has a low-level type, doesn't have a class), and the other is
-  // concrete (used in target specific instructions, doesn't have a low-level
-  // type, has a class), we can not unify them.
-  if (OldRC || RC)
+  const LLT RegTy = getType(Reg);
+  const LLT ConstrainingRegTy = getType(ConstrainingReg);
+  if (RegTy.isValid() && ConstrainingRegTy.isValid() &&
+      RegTy != ConstrainingRegTy)
     return false;
-  // At this point, both registers are guaranteed to have a valid low-level
-  // type, and they must agree.
-  if (getType(Reg) != getType(ConstrainingReg))
-    return false;
-  auto const *OldRB = getRegBankOrNull(Reg);
-  auto const *RB = getRegBankOrNull(ConstrainingReg);
-  if (OldRB)
-    return !RB || RB == OldRB;
-  if (RB)
-    setRegBank(Reg, *RB);
+  const auto ConstrainingRegCB = getRegClassOrRegBank(ConstrainingReg);
+  if (!ConstrainingRegCB.isNull()) {
+    const auto RegCB = getRegClassOrRegBank(Reg);
+    if (RegCB.isNull())
+      setRegClassOrRegBank(Reg, ConstrainingRegCB);
+    else if (RegCB.is<const TargetRegisterClass *>() !=
+             ConstrainingRegCB.is<const TargetRegisterClass *>())
+      return false;
+    else if (RegCB.is<const TargetRegisterClass *>()) {
+      if (!::constrainRegClass(
+              *this, Reg, RegCB.get<const TargetRegisterClass *>(),
+              ConstrainingRegCB.get<const TargetRegisterClass *>(), MinNumRegs))
+        return false;
+    } else if (RegCB != ConstrainingRegCB)
+      return false;
+  }
+  if (ConstrainingRegTy.isValid())
+    setType(Reg, ConstrainingRegTy);
   return true;
 }
 
@@ -188,10 +181,6 @@ unsigned MachineRegisterInfo::cloneVirtualRegister(unsigned VReg,
 }
 
 void MachineRegisterInfo::setType(unsigned VReg, LLT Ty) {
-  // Check that VReg doesn't have a class.
-  assert((getRegClassOrRegBank(VReg).isNull() ||
-         !getRegClassOrRegBank(VReg).is<const TargetRegisterClass *>()) &&
-         "Can't set the size of a non-generic virtual register");
   VRegToType.grow(VReg);
   VRegToType[VReg] = Ty;
 }
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 1d2e85accbc19f927aa32c47dcb71c84f8ed7439..d45855407f28f26dc75357657c2c2d9499a3ec1e 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -734,12 +734,18 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
          MBP.LHS.getReg() == BaseReg;
 }
 
-/// Sink an instruction and its associated debug instructions.
+/// Sink an instruction and its associated debug instructions. If the debug
+/// instructions to be sunk are already known, they can be provided in DbgVals.
 static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
-                        MachineBasicBlock::iterator InsertPos) {
-  // Collect matching debug values.
+                        MachineBasicBlock::iterator InsertPos,
+                        SmallVectorImpl<MachineInstr *> *DbgVals = nullptr) {
+  // If debug values are provided use those, otherwise call collectDebugValues.
   SmallVector<MachineInstr *, 2> DbgValuesToSink;
-  MI.collectDebugValues(DbgValuesToSink);
+  if (DbgVals)
+    DbgValuesToSink.insert(DbgValuesToSink.begin(),
+                           DbgVals->begin(), DbgVals->end());
+  else
+    MI.collectDebugValues(DbgValuesToSink);
 
   // If we cannot find a location to use (merge with), then we erase the debug
   // location to prevent debug-info driven tools from potentially reporting
@@ -951,6 +957,9 @@ private:
   /// Track which register units have been modified and used.
   LiveRegUnits ModifiedRegUnits, UsedRegUnits;
 
+  /// Track DBG_VALUEs of (unmodified) register units.
+  DenseMap<unsigned, TinyPtrVector<MachineInstr*>> SeenDbgInstrs;
+
   /// Sink Copy instructions unused in the same block close to their uses in
   /// successors.
   bool tryToSinkCopy(MachineBasicBlock &BB, MachineFunction &MF,
@@ -1105,11 +1114,34 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
   // block and the current instruction.
   ModifiedRegUnits.clear();
   UsedRegUnits.clear();
+  SeenDbgInstrs.clear();
 
   for (auto I = CurBB.rbegin(), E = CurBB.rend(); I != E;) {
     MachineInstr *MI = &*I;
     ++I;
 
+    // Track the operand index for use in Copy.
+    SmallVector<unsigned, 2> UsedOpsInCopy;
+    // Track the register number defed in Copy.
+    SmallVector<unsigned, 2> DefedRegsInCopy;
+
+    // We must sink this DBG_VALUE if its operand is sunk. To avoid searching
+    // for DBG_VALUEs later, record them when they're encountered.
+    if (MI->isDebugValue()) {
+      auto &MO = MI->getOperand(0);
+      if (MO.isReg() && TRI->isPhysicalRegister(MO.getReg())) {
+        // Bail if we can already tell the sink would be rejected, rather
+        // than needlessly accumulating lots of DBG_VALUEs.
+        if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
+                                  ModifiedRegUnits, UsedRegUnits))
+          continue;
+
+        // Record debug use of this register.
+        SeenDbgInstrs[MO.getReg()].push_back(MI);
+      }
+      continue;
+    }
+
     if (MI->isDebugInstr())
       continue;
 
@@ -1123,11 +1155,6 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
       continue;
     }
 
-    // Track the operand index for use in Copy.
-    SmallVector<unsigned, 2> UsedOpsInCopy;
-    // Track the register number defed in Copy.
-    SmallVector<unsigned, 2> DefedRegsInCopy;
-
     // Don't sink the COPY if it would violate a register dependency.
     if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
                               ModifiedRegUnits, UsedRegUnits)) {
@@ -1149,11 +1176,21 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
     assert((SuccBB->pred_size() == 1 && *SuccBB->pred_begin() == &CurBB) &&
            "Unexpected predecessor");
 
+    // Collect DBG_VALUEs that must sink with this copy.
+    SmallVector<MachineInstr *, 4> DbgValsToSink;
+    for (auto &MO : MI->operands()) {
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned reg = MO.getReg();
+      for (auto *MI : SeenDbgInstrs.lookup(reg))
+        DbgValsToSink.push_back(MI);
+    }
+
     // Clear the kill flag if SrcReg is killed between MI and the end of the
     // block.
     clearKillFlags(MI, CurBB, UsedOpsInCopy, UsedRegUnits, TRI);
     MachineBasicBlock::iterator InsertPos = SuccBB->getFirstNonPHI();
-    performSink(*MI, *SuccBB, InsertPos);
+    performSink(*MI, *SuccBB, InsertPos, &DbgValsToSink);
     updateLiveIn(MI, SuccBB, UsedOpsInCopy, DefedRegsInCopy);
 
     Changed = true;
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index a19c2ef800272d255c5a5973ce7b8c0653566e0b..b37c421596b757f8f78aa84aff8bd7c8cd6274c4 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -778,7 +778,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
                "isn't a terminator instruction!", MBB);
       }
       if (Cond.empty()) {
-        report("MBB exits via conditinal branch/branch but there's no "
+        report("MBB exits via conditional branch/branch but there's no "
                "condition!", MBB);
       }
     } else {
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 7b57c6cbcdb83ede88b3f5e70469190d1aba94a1..ea7f247214de3ea98dc3d6821914598d1281a711 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -54,7 +54,7 @@ using namespace llvm;
 
 STATISTIC(NumStores, "Number of stores added");
 STATISTIC(NumLoads , "Number of loads added");
-STATISTIC(NumCopies, "Number of copies coalesced");
+STATISTIC(NumCoalesced, "Number of copies coalesced");
 
 static RegisterRegAlloc
   fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator);
@@ -88,7 +88,7 @@ namespace {
       unsigned short LastOpNum = 0;    ///< OpNum on LastUse.
       bool Dirty = false;              ///< Register needs spill.
 
-      explicit LiveReg(unsigned v) : VirtReg(v) {}
+      explicit LiveReg(unsigned VirtReg) : VirtReg(VirtReg) {}
 
       unsigned getSparseSetIndex() const {
         return TargetRegisterInfo::virtReg2Index(VirtReg);
@@ -96,14 +96,13 @@ namespace {
     };
 
     using LiveRegMap = SparseSet<LiveReg>;
-
     /// This map contains entries for each virtual register that is currently
     /// available in a physical register.
     LiveRegMap LiveVirtRegs;
 
-    DenseMap<unsigned, SmallVector<MachineInstr *, 4>> LiveDbgValueMap;
+    DenseMap<unsigned, SmallVector<MachineInstr *, 2>> LiveDbgValueMap;
 
-    /// Track the state of a physical register.
+    /// State of a physical register.
     enum RegState {
       /// A disabled register is not available for allocation, but an alias may
       /// be in use. A register can only be moved out of the disabled state if
@@ -123,18 +122,18 @@ namespace {
       /// register. In that case, LiveVirtRegs contains the inverse mapping.
     };
 
-    /// One of the RegState enums, or a virtreg.
+    /// Maps each physical register to a RegState enum or a virtual register.
     std::vector<unsigned> PhysRegState;
 
     SmallVector<unsigned, 16> VirtDead;
     SmallVector<MachineInstr *, 32> Coalesced;
 
-    /// Set of register units.
-    using UsedInInstrSet = SparseSet<unsigned>;
-
+    using RegUnitSet = SparseSet<uint16_t, identity<uint16_t>>;
     /// Set of register units that are used in the current instruction, and so
     /// cannot be allocated.
-    UsedInInstrSet UsedInInstr;
+    RegUnitSet UsedInInstr;
+
+    void setPhysRegState(MCPhysReg PhysReg, unsigned NewState);
 
     /// Mark a physreg as used in this instruction.
     void markRegUsedInInstr(MCPhysReg PhysReg) {
@@ -150,12 +149,8 @@ namespace {
       return false;
     }
 
-    /// This flag is set when LiveRegMap will be cleared completely after
-    /// spilling all live registers. LiveRegMap entries should not be erased.
-    bool isBulkSpilling = false;
-
     enum : unsigned {
-      spillClean = 1,
+      spillClean = 50,
       spillDirty = 100,
       spillImpossible = ~0u
     };
@@ -180,16 +175,16 @@ namespace {
 
   private:
     bool runOnMachineFunction(MachineFunction &MF) override;
+
     void allocateBasicBlock(MachineBasicBlock &MBB);
     void handleThroughOperands(MachineInstr &MI,
                                SmallVectorImpl<unsigned> &VirtDead);
-    int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass &RC);
     bool isLastUseOfLocalReg(const MachineOperand &MO) const;
 
     void addKillFlag(const LiveReg &LRI);
-    void killVirtReg(LiveRegMap::iterator LRI);
+    void killVirtReg(LiveReg &LR);
     void killVirtReg(unsigned VirtReg);
-    void spillVirtReg(MachineBasicBlock::iterator MI, LiveRegMap::iterator);
+    void spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR);
     void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);
 
     void usePhysReg(MachineOperand &MO);
@@ -206,16 +201,20 @@ namespace {
       return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
     }
 
-    LiveRegMap::iterator assignVirtToPhysReg(unsigned VirtReg, MCPhysReg PhysReg);
-    LiveRegMap::iterator allocVirtReg(MachineInstr &MI, LiveRegMap::iterator,
-                                      unsigned Hint);
-    LiveRegMap::iterator defineVirtReg(MachineInstr &MI, unsigned OpNum,
-                                       unsigned VirtReg, unsigned Hint);
-    LiveRegMap::iterator reloadVirtReg(MachineInstr &MI, unsigned OpNum,
-                                       unsigned VirtReg, unsigned Hint);
+    void allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint);
+    MCPhysReg defineVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
+                            unsigned Hint);
+    LiveReg &reloadVirtReg(MachineInstr &MI, unsigned OpNum, unsigned VirtReg,
+                           unsigned Hint);
     void spillAll(MachineBasicBlock::iterator MI);
     bool setPhysReg(MachineInstr &MI, unsigned OpNum, MCPhysReg PhysReg);
 
+    int getStackSpaceFor(unsigned VirtReg);
+    void spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
+               MCPhysReg AssignedReg, bool Kill);
+    void reload(MachineBasicBlock::iterator Before, unsigned VirtReg,
+                MCPhysReg PhysReg);
+
     void dumpState();
   };
 
@@ -226,10 +225,13 @@ char RegAllocFast::ID = 0;
 INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
                 false)
 
+void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
+  PhysRegState[PhysReg] = NewState;
+}
+
 /// This allocates space for the specified virtual register to be held on the
 /// stack.
-int RegAllocFast::getStackSpaceFor(unsigned VirtReg,
-                                   const TargetRegisterClass &RC) {
+int RegAllocFast::getStackSpaceFor(unsigned VirtReg) {
   // Find the location Reg would belong...
   int SS = StackSlotForVirtReg[VirtReg];
   // Already has space allocated?
@@ -237,6 +239,7 @@ int RegAllocFast::getStackSpaceFor(unsigned VirtReg,
     return SS;
 
   // Allocate a new stack object for this spill location...
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
   unsigned Size = TRI->getSpillSize(RC);
   unsigned Align = TRI->getSpillAlignment(RC);
   int FrameIdx = MFI->CreateSpillStackObject(Size, Align);
@@ -246,6 +249,46 @@ int RegAllocFast::getStackSpaceFor(unsigned VirtReg,
   return FrameIdx;
 }
 
+/// Insert spill instruction for \p AssignedReg before \p Before. Update
+/// DBG_VALUEs with \p VirtReg operands with the stack slot.
+void RegAllocFast::spill(MachineBasicBlock::iterator Before, unsigned VirtReg,
+                         MCPhysReg AssignedReg, bool Kill) {
+  LLVM_DEBUG(dbgs() << "Spilling " << printReg(VirtReg, TRI)
+                    << " in " << printReg(AssignedReg, TRI));
+  int FI = getStackSpaceFor(VirtReg);
+  LLVM_DEBUG(dbgs() << " to stack slot #" << FI << '\n');
+
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  TII->storeRegToStackSlot(*MBB, Before, AssignedReg, Kill, FI, &RC, TRI);
+  ++NumStores;
+
+  // If this register is used by DBG_VALUE then insert new DBG_VALUE to
+  // identify spilled location as the place to find corresponding variable's
+  // value.
+  SmallVectorImpl<MachineInstr *> &LRIDbgValues = LiveDbgValueMap[VirtReg];
+  for (MachineInstr *DBG : LRIDbgValues) {
+    MachineInstr *NewDV = buildDbgValueForSpill(*MBB, Before, *DBG, FI);
+    assert(NewDV->getParent() == MBB && "dangling parent pointer");
+    (void)NewDV;
+    LLVM_DEBUG(dbgs() << "Inserting debug info due to spill:\n" << *NewDV);
+  }
+  // Now this register is spilled there is should not be any DBG_VALUE
+  // pointing to this register because they are all pointing to spilled value
+  // now.
+  LRIDbgValues.clear();
+}
+
+/// Insert reload instruction for \p PhysReg before \p Before.
+void RegAllocFast::reload(MachineBasicBlock::iterator Before, unsigned VirtReg,
+                          MCPhysReg PhysReg) {
+  LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
+                    << printReg(PhysReg, TRI) << '\n');
+  int FI = getStackSpaceFor(VirtReg);
+  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  TII->loadRegFromStackSlot(*MBB, Before, PhysReg, FI, &RC, TRI);
+  ++NumLoads;
+}
+
 /// Return true if MO is the only remaining reference to its virtual register,
 /// and it is guaranteed to be a block-local register.
 bool RegAllocFast::isLastUseOfLocalReg(const MachineOperand &MO) const {
@@ -281,14 +324,12 @@ void RegAllocFast::addKillFlag(const LiveReg &LR) {
 }
 
 /// Mark virtreg as no longer available.
-void RegAllocFast::killVirtReg(LiveRegMap::iterator LRI) {
-  addKillFlag(*LRI);
-  assert(PhysRegState[LRI->PhysReg] == LRI->VirtReg &&
+void RegAllocFast::killVirtReg(LiveReg &LR) {
+  addKillFlag(LR);
+  assert(PhysRegState[LR.PhysReg] == LR.VirtReg &&
          "Broken RegState mapping");
-  PhysRegState[LRI->PhysReg] = regFree;
-  // Erase from LiveVirtRegs unless we're spilling in bulk.
-  if (!isBulkSpilling)
-    LiveVirtRegs.erase(LRI);
+  setPhysRegState(LR.PhysReg, regFree);
+  LR.PhysReg = 0;
 }
 
 /// Mark virtreg as no longer available.
@@ -296,8 +337,8 @@ void RegAllocFast::killVirtReg(unsigned VirtReg) {
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "killVirtReg needs a virtual register");
   LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
-  if (LRI != LiveVirtRegs.end())
-    killVirtReg(LRI);
+  if (LRI != LiveVirtRegs.end() && LRI->PhysReg)
+    killVirtReg(*LRI);
 }
 
 /// This method spills the value specified by VirtReg into the corresponding
@@ -307,63 +348,40 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Spilling a physical register is illegal!");
   LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
-  assert(LRI != LiveVirtRegs.end() && "Spilling unmapped virtual register");
-  spillVirtReg(MI, LRI);
+  assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+         "Spilling unmapped virtual register");
+  spillVirtReg(MI, *LRI);
 }
 
 /// Do the actual work of spilling.
-void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
-                                LiveRegMap::iterator LRI) {
-  LiveReg &LR = *LRI;
-  assert(PhysRegState[LR.PhysReg] == LRI->VirtReg && "Broken RegState mapping");
+void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) {
+  assert(PhysRegState[LR.PhysReg] == LR.VirtReg && "Broken RegState mapping");
 
   if (LR.Dirty) {
     // If this physreg is used by the instruction, we want to kill it on the
     // instruction, not on the spill.
     bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
     LR.Dirty = false;
-    LLVM_DEBUG(dbgs() << "Spilling " << printReg(LRI->VirtReg, TRI) << " in "
-                      << printReg(LR.PhysReg, TRI));
-    const TargetRegisterClass &RC = *MRI->getRegClass(LRI->VirtReg);
-    int FI = getStackSpaceFor(LRI->VirtReg, RC);
-    LLVM_DEBUG(dbgs() << " to stack slot #" << FI << "\n");
-    TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, &RC, TRI);
-    ++NumStores;   // Update statistics
-
-    // If this register is used by DBG_VALUE then insert new DBG_VALUE to
-    // identify spilled location as the place to find corresponding variable's
-    // value.
-    SmallVectorImpl<MachineInstr *> &LRIDbgValues =
-      LiveDbgValueMap[LRI->VirtReg];
-    for (MachineInstr *DBG : LRIDbgValues) {
-      MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI);
-      assert(NewDV->getParent() == MBB && "dangling parent pointer");
-      (void)NewDV;
-      LLVM_DEBUG(dbgs() << "Inserting debug info due to spill:"
-                        << "\n"
-                        << *NewDV);
-    }
-    // Now this register is spilled there is should not be any DBG_VALUE
-    // pointing to this register because they are all pointing to spilled value
-    // now.
-    LRIDbgValues.clear();
+
+    spill(MI, LR.VirtReg, LR.PhysReg, SpillKill);
+
     if (SpillKill)
       LR.LastUse = nullptr; // Don't kill register again
   }
-  killVirtReg(LRI);
+  killVirtReg(LR);
 }
 
 /// Spill all dirty virtregs without killing them.
 void RegAllocFast::spillAll(MachineBasicBlock::iterator MI) {
   if (LiveVirtRegs.empty()) return;
-  isBulkSpilling = true;
   // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
   // of spilling here is deterministic, if arbitrary.
-  for (LiveRegMap::iterator I = LiveVirtRegs.begin(), E = LiveVirtRegs.end();
-       I != E; ++I)
-    spillVirtReg(MI, I);
+  for (LiveReg &LR : LiveVirtRegs) {
+    if (!LR.PhysReg)
+      continue;
+    spillVirtReg(MI, LR);
+  }
   LiveVirtRegs.clear();
-  isBulkSpilling = false;
 }
 
 /// Handle the direct use of a physical register.  Check that the register is
@@ -417,12 +435,12 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
     case regFree:
       if (TRI->isSuperRegister(PhysReg, Alias)) {
         // Leave the superregister in the working set.
-        PhysRegState[Alias] = regFree;
+        setPhysRegState(Alias, regFree);
         MO.getParent()->addRegisterKilled(Alias, TRI, true);
         return;
       }
       // Some other alias was in the working set - clear it.
-      PhysRegState[Alias] = regDisabled;
+      setPhysRegState(Alias, regDisabled);
       break;
     default:
       llvm_unreachable("Instruction uses an alias of an allocated register");
@@ -430,7 +448,7 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
   }
 
   // All aliases are disabled, bring register into working set.
-  PhysRegState[PhysReg] = regFree;
+  setPhysRegState(PhysReg, regFree);
   MO.setIsKill();
 }
 
@@ -448,12 +466,12 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
     LLVM_FALLTHROUGH;
   case regFree:
   case regReserved:
-    PhysRegState[PhysReg] = NewState;
+    setPhysRegState(PhysReg, NewState);
     return;
   }
 
   // This is a disabled register, disable all aliases.
-  PhysRegState[PhysReg] = NewState;
+  setPhysRegState(PhysReg, NewState);
   for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
     MCPhysReg Alias = *AI;
     switch (unsigned VirtReg = PhysRegState[Alias]) {
@@ -464,7 +482,7 @@ void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
       LLVM_FALLTHROUGH;
     case regFree:
     case regReserved:
-      PhysRegState[Alias] = regDisabled;
+      setPhysRegState(Alias, regDisabled);
       if (TRI->isSuperRegister(PhysReg, Alias))
         return;
       break;
@@ -492,9 +510,10 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
                       << printReg(PhysReg, TRI) << " is reserved already.\n");
     return spillImpossible;
   default: {
-    LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
-    assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
-    return I->Dirty ? spillDirty : spillClean;
+    LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
+    assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+           "Missing VirtReg entry");
+    return LRI->Dirty ? spillDirty : spillClean;
   }
   }
 
@@ -512,9 +531,10 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
     case regReserved:
       return spillImpossible;
     default: {
-      LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
-      assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
-      Cost += I->Dirty ? spillDirty : spillClean;
+      LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
+      assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+             "Missing VirtReg entry");
+      Cost += LRI->Dirty ? spillDirty : spillClean;
       break;
     }
     }
@@ -526,31 +546,27 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
 /// proper container for VirtReg now.  The physical register must not be used
 /// for anything else when this is called.
 void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) {
-  LLVM_DEBUG(dbgs() << "Assigning " << printReg(LR.VirtReg, TRI) << " to "
-                    << printReg(PhysReg, TRI) << "\n");
-  PhysRegState[PhysReg] = LR.VirtReg;
-  assert(!LR.PhysReg && "Already assigned a physreg");
+  unsigned VirtReg = LR.VirtReg;
+  LLVM_DEBUG(dbgs() << "Assigning " << printReg(VirtReg, TRI) << " to "
+                    << printReg(PhysReg, TRI) << '\n');
+  assert(LR.PhysReg == 0 && "Already assigned a physreg");
+  assert(PhysReg != 0 && "Trying to assign no register");
   LR.PhysReg = PhysReg;
-}
-
-RegAllocFast::LiveRegMap::iterator
-RegAllocFast::assignVirtToPhysReg(unsigned VirtReg, MCPhysReg PhysReg) {
-  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
-  assert(LRI != LiveVirtRegs.end() && "VirtReg disappeared");
-  assignVirtToPhysReg(*LRI, PhysReg);
-  return LRI;
+  setPhysRegState(PhysReg, VirtReg);
 }
 
 /// Allocates a physical register for VirtReg.
-RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
-    LiveRegMap::iterator LRI, unsigned Hint) {
-  const unsigned VirtReg = LRI->VirtReg;
+void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint) {
+  const unsigned VirtReg = LR.VirtReg;
 
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Can only allocate virtual registers");
 
-  // Take hint when possible.
   const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
+  LLVM_DEBUG(dbgs() << "Search register for " << printReg(VirtReg)
+                    << " in class " << TRI->getRegClassName(&RC) << '\n');
+
+  // Take hint when possible.
   if (TargetRegisterInfo::isPhysicalRegister(Hint) &&
       MRI->isAllocatable(Hint) && RC.contains(Hint)) {
     // Ignore the hint if we would have to spill a dirty register.
@@ -558,67 +574,64 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::allocVirtReg(MachineInstr &MI,
     if (Cost < spillDirty) {
       if (Cost)
         definePhysReg(MI, Hint, regFree);
-      // definePhysReg may kill virtual registers and modify LiveVirtRegs.
-      // That invalidates LRI, so run a new lookup for VirtReg.
-      return assignVirtToPhysReg(VirtReg, Hint);
+      assignVirtToPhysReg(LR, Hint);
+      return;
     }
   }
 
   // First try to find a completely free register.
-  ArrayRef<MCPhysReg> AO = RegClassInfo.getOrder(&RC);
-  for (MCPhysReg PhysReg : AO) {
+  ArrayRef<MCPhysReg> AllocationOrder = RegClassInfo.getOrder(&RC);
+  for (MCPhysReg PhysReg : AllocationOrder) {
     if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) {
-      assignVirtToPhysReg(*LRI, PhysReg);
-      return LRI;
+      assignVirtToPhysReg(LR, PhysReg);
+      return;
     }
   }
 
   LLVM_DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from "
-                    << TRI->getRegClassName(&RC) << "\n");
+                    << TRI->getRegClassName(&RC) << '\n');
 
   unsigned BestReg = 0;
   unsigned BestCost = spillImpossible;
-  for (MCPhysReg PhysReg : AO) {
+  for (MCPhysReg PhysReg : AllocationOrder) {
+    LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << ' ');
     unsigned Cost = calcSpillCost(PhysReg);
-    LLVM_DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << "\n");
-    LLVM_DEBUG(dbgs() << "\tCost: " << Cost << "\n");
-    LLVM_DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
+    LLVM_DEBUG(dbgs() << "Cost: " << Cost << " BestCost: " << BestCost << '\n');
     // Cost is 0 when all aliases are already disabled.
     if (Cost == 0) {
-      assignVirtToPhysReg(*LRI, PhysReg);
-      return LRI;
+      assignVirtToPhysReg(LR, PhysReg);
+      return;
+    }
+    if (Cost < BestCost) {
+      BestReg = PhysReg;
+      BestCost = Cost;
     }
-    if (Cost < BestCost)
-      BestReg = PhysReg, BestCost = Cost;
   }
 
-  if (BestReg) {
-    definePhysReg(MI, BestReg, regFree);
-    // definePhysReg may kill virtual registers and modify LiveVirtRegs.
-    // That invalidates LRI, so run a new lookup for VirtReg.
-    return assignVirtToPhysReg(VirtReg, BestReg);
+  if (!BestReg) {
+    // Nothing we can do. Report an error and keep going with a bad allocation.
+    if (MI.isInlineAsm())
+      MI.emitError("inline assembly requires more registers than available");
+    else
+      MI.emitError("ran out of registers during register allocation");
+    definePhysReg(MI, *AllocationOrder.begin(), regFree);
+    assignVirtToPhysReg(LR, *AllocationOrder.begin());
+    return;
   }
 
-  // Nothing we can do. Report an error and keep going with a bad allocation.
-  if (MI.isInlineAsm())
-    MI.emitError("inline assembly requires more registers than available");
-  else
-    MI.emitError("ran out of registers during register allocation");
-  definePhysReg(MI, *AO.begin(), regFree);
-  return assignVirtToPhysReg(VirtReg, *AO.begin());
+  definePhysReg(MI, BestReg, regFree);
+  assignVirtToPhysReg(LR, BestReg);
 }
 
 /// Allocates a register for VirtReg and mark it as dirty.
-RegAllocFast::LiveRegMap::iterator RegAllocFast::defineVirtReg(MachineInstr &MI,
-                                                               unsigned OpNum,
-                                                               unsigned VirtReg,
-                                                               unsigned Hint) {
+MCPhysReg RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
+                                      unsigned VirtReg, unsigned Hint) {
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Not a virtual register");
   LiveRegMap::iterator LRI;
   bool New;
   std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
-  if (New) {
+  if (!LRI->PhysReg) {
     // If there is no hint, peek at the only use of this register.
     if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
         MRI->hasOneNonDBGUse(VirtReg)) {
@@ -627,7 +640,7 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::defineVirtReg(MachineInstr &MI,
       if (UseMI.isCopyLike())
         Hint = UseMI.getOperand(0).getReg();
     }
-    LRI = allocVirtReg(MI, LRI, Hint);
+    allocVirtReg(MI, *LRI, Hint);
   } else if (LRI->LastUse) {
     // Redefining a live register - kill at the last use, unless it is this
     // instruction defining VirtReg multiple times.
@@ -639,40 +652,35 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::defineVirtReg(MachineInstr &MI,
   LRI->LastOpNum = OpNum;
   LRI->Dirty = true;
   markRegUsedInInstr(LRI->PhysReg);
-  return LRI;
+  return LRI->PhysReg;
 }
 
 /// Make sure VirtReg is available in a physreg and return it.
-RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
-                                                               unsigned OpNum,
-                                                               unsigned VirtReg,
-                                                               unsigned Hint) {
+RegAllocFast::LiveReg &RegAllocFast::reloadVirtReg(MachineInstr &MI,
+                                                   unsigned OpNum,
+                                                   unsigned VirtReg,
+                                                   unsigned Hint) {
   assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
          "Not a virtual register");
   LiveRegMap::iterator LRI;
   bool New;
   std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
   MachineOperand &MO = MI.getOperand(OpNum);
-  if (New) {
-    LRI = allocVirtReg(MI, LRI, Hint);
-    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
-    int FrameIndex = getStackSpaceFor(VirtReg, RC);
-    LLVM_DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
-                      << printReg(LRI->PhysReg, TRI) << "\n");
-    TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, &RC, TRI);
-    ++NumLoads;
+  if (!LRI->PhysReg) {
+    allocVirtReg(MI, *LRI, Hint);
+    reload(MI, VirtReg, LRI->PhysReg);
   } else if (LRI->Dirty) {
     if (isLastUseOfLocalReg(MO)) {
-      LLVM_DEBUG(dbgs() << "Killing last use: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Killing last use: " << MO << '\n');
       if (MO.isUse())
         MO.setIsKill();
       else
         MO.setIsDead();
     } else if (MO.isKill()) {
-      LLVM_DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Clearing dubious kill: " << MO << '\n');
       MO.setIsKill(false);
     } else if (MO.isDead()) {
-      LLVM_DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Clearing dubious dead: " << MO << '\n');
       MO.setIsDead(false);
     }
   } else if (MO.isKill()) {
@@ -680,17 +688,17 @@ RegAllocFast::LiveRegMap::iterator RegAllocFast::reloadVirtReg(MachineInstr &MI,
     // register would be killed immediately, and there might be a second use:
     //   %foo = OR killed %x, %x
     // This would cause a second reload of %x into a different register.
-    LLVM_DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
+    LLVM_DEBUG(dbgs() << "Clearing clean kill: " << MO << '\n');
     MO.setIsKill(false);
   } else if (MO.isDead()) {
-    LLVM_DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
+    LLVM_DEBUG(dbgs() << "Clearing clean dead: " << MO << '\n');
     MO.setIsDead(false);
   }
   assert(LRI->PhysReg && "Register not assigned");
   LRI->LastUse = &MI;
   LRI->LastOpNum = OpNum;
   markRegUsedInInstr(LRI->PhysReg);
-  return LRI;
+  return *LRI;
 }
 
 /// Changes operand OpNum in MI the refer the PhysReg, considering subregs. This
@@ -770,17 +778,17 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
       LLVM_DEBUG(dbgs() << "Operand " << I << "(" << MO
                         << ") is tied to operand " << MI.findTiedOperandIdx(I)
                         << ".\n");
-      LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
-      MCPhysReg PhysReg = LRI->PhysReg;
+      LiveReg &LR = reloadVirtReg(MI, I, Reg, 0);
+      MCPhysReg PhysReg = LR.PhysReg;
       setPhysReg(MI, I, PhysReg);
       // Note: we don't update the def operand yet. That would cause the normal
       // def-scan to attempt spilling.
     } else if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) {
-      LLVM_DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
+      LLVM_DEBUG(dbgs() << "Partial redefine: " << MO << '\n');
       // Reload the register, but don't assign to the operand just yet.
       // That would confuse the later phys-def processing pass.
-      LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
-      PartialDefs.push_back(LRI->PhysReg);
+      LiveReg &LR = reloadVirtReg(MI, I, Reg, 0);
+      PartialDefs.push_back(LR.PhysReg);
     }
   }
 
@@ -793,8 +801,7 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
     if (!MO.isEarlyClobber())
       continue;
     // Note: defineVirtReg may invalidate MO.
-    LiveRegMap::iterator LRI = defineVirtReg(MI, I, Reg, 0);
-    MCPhysReg PhysReg = LRI->PhysReg;
+    MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, 0);
     if (setPhysReg(MI, I, PhysReg))
       VirtDead.push_back(Reg);
   }
@@ -828,11 +835,12 @@ void RegAllocFast::dumpState() {
       break;
     default: {
       dbgs() << '=' << printReg(PhysRegState[Reg]);
-      LiveRegMap::iterator I = findLiveVirtReg(PhysRegState[Reg]);
-      assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
-      if (I->Dirty)
+      LiveRegMap::iterator LRI = findLiveVirtReg(PhysRegState[Reg]);
+      assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
+             "Missing VirtReg entry");
+      if (LRI->Dirty)
         dbgs() << "*";
-      assert(I->PhysReg == Reg && "Bad inverse map");
+      assert(LRI->PhysReg == Reg && "Bad inverse map");
       break;
     }
     }
@@ -841,6 +849,8 @@ void RegAllocFast::dumpState() {
   // Check that LiveVirtRegs is the inverse.
   for (LiveRegMap::iterator i = LiveVirtRegs.begin(),
        e = LiveVirtRegs.end(); i != e; ++i) {
+    if (!i->PhysReg)
+      continue;
     assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) &&
            "Bad map key");
     assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) &&
@@ -888,7 +898,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
       // See if this virtual register has already been allocated to a physical
       // register or spilled to a stack slot.
       LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
-      if (LRI != LiveVirtRegs.end())
+      if (LRI != LiveVirtRegs.end() && LRI->PhysReg)
         setPhysReg(*DebugMI, 0, LRI->PhysReg);
       else {
         int SS = StackSlotForVirtReg[Reg];
@@ -998,11 +1008,11 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
       unsigned Reg = MO.getReg();
       if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
       if (MO.isUse()) {
-        LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, CopyDstReg);
-        MCPhysReg PhysReg = LRI->PhysReg;
+        LiveReg &LR = reloadVirtReg(MI, I, Reg, CopyDstReg);
+        MCPhysReg PhysReg = LR.PhysReg;
         CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0;
         if (setPhysReg(MI, I, PhysReg))
-          killVirtReg(LRI);
+          killVirtReg(LR);
       }
     }
 
@@ -1046,8 +1056,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
         definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
         continue;
       }
-      LiveRegMap::iterator LRI = defineVirtReg(MI, I, Reg, CopySrcReg);
-      MCPhysReg PhysReg = LRI->PhysReg;
+      MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg);
       if (setPhysReg(MI, I, PhysReg)) {
         VirtDead.push_back(Reg);
         CopyDstReg = 0; // cancel coalescing;
@@ -1079,12 +1088,11 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
   // LiveVirtRegs might refer to the instrs.
   for (MachineInstr *MI : Coalesced)
     MBB.erase(MI);
-  NumCopies += Coalesced.size();
+  NumCoalesced += Coalesced.size();
 
   LLVM_DEBUG(MBB.dump());
 }
 
-/// Allocates registers for a function.
 bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
                     << "********** Function: " << MF.getName() << '\n');
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index 9db2af9f9623c9a38f6a5d76d8cdb881d188dafd..66c7c5cd7dbf7137a8775973c8d7b35f416c01ef 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -81,7 +81,7 @@ FunctionPass *llvm::createRegUsageInfoCollector() {
 bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo *MRI = &MF.getRegInfo();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
-  const TargetMachine &TM = MF.getTarget();
+  const LLVMTargetMachine &TM = MF.getTarget();
 
   LLVM_DEBUG(dbgs() << " -------------------- " << getPassName()
                     << " -------------------- \n");
diff --git a/lib/CodeGen/RegisterUsageInfo.cpp b/lib/CodeGen/RegisterUsageInfo.cpp
index 1b3fbc25b6ee9b9d8cc8eefb74817cdb5bf787a7..6b9880a8913f9f135c1e0590175d2e41a94e5819 100644
--- a/lib/CodeGen/RegisterUsageInfo.cpp
+++ b/lib/CodeGen/RegisterUsageInfo.cpp
@@ -40,7 +40,7 @@ INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
 
 char PhysicalRegisterUsageInfo::ID = 0;
 
-void PhysicalRegisterUsageInfo::setTargetMachine(const TargetMachine &TM) {
+void PhysicalRegisterUsageInfo::setTargetMachine(const LLVMTargetMachine &TM) {
   this->TM = &TM;
 }
 
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index 1fb116e9b4805f24a5fdb27353593ed9959bb191..2684f92b3a9318192860d36142cef1e697b68d6f 100644
--- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -138,8 +138,6 @@ static void scalarizeMaskedLoad(CallInst *CI) {
   IRBuilder<> Builder(CI->getContext());
   Instruction *InsertPt = CI;
   BasicBlock *IfBlock = CI->getParent();
-  BasicBlock *CondBlock = nullptr;
-  BasicBlock *PrevIfBlock = CI->getParent();
 
   Builder.SetInsertPoint(InsertPt);
   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
@@ -195,7 +193,8 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     //  %Elt = load i32* %EltAddr
     //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
     //
-    CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+                                                     "cond.load");
     Builder.SetInsertPoint(InsertPt);
 
     Value *Gep =
@@ -211,7 +210,7 @@ static void scalarizeMaskedLoad(CallInst *CI) {
     Instruction *OldBr = IfBlock->getTerminator();
     BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
     OldBr->eraseFromParent();
-    PrevIfBlock = IfBlock;
+    BasicBlock *PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
 
     // Create the phi to join the new and previous value.
@@ -372,8 +371,6 @@ static void scalarizeMaskedGather(CallInst *CI) {
   IRBuilder<> Builder(CI->getContext());
   Instruction *InsertPt = CI;
   BasicBlock *IfBlock = CI->getParent();
-  BasicBlock *CondBlock = nullptr;
-  BasicBlock *PrevIfBlock = CI->getParent();
   Builder.SetInsertPoint(InsertPt);
   unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
 
@@ -416,7 +413,7 @@ static void scalarizeMaskedGather(CallInst *CI) {
     //  %Elt = load i32* %EltAddr
     //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
     //
-    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
     Builder.SetInsertPoint(InsertPt);
 
     Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
@@ -433,7 +430,7 @@ static void scalarizeMaskedGather(CallInst *CI) {
     Instruction *OldBr = IfBlock->getTerminator();
     BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
     OldBr->eraseFromParent();
-    PrevIfBlock = IfBlock;
+    BasicBlock *PrevIfBlock = IfBlock;
     IfBlock = NewIfBlock;
 
     PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 346f82ff95fff4d5387c232b0f8791be0f180aeb..99406ed1496a7cb49806d7d4c7fa836756d53f5f 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -234,6 +234,11 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
   // Ask the target if address-backscheduling is desirable, and if so how much.
   const TargetSubtargetInfo &ST = MF.getSubtarget();
 
+  // Only use any non-zero latency for real defs/uses, in contrast to
+  // "fake" operands added by regalloc.
+  const MCInstrDesc *DefMIDesc = &SU->getInstr()->getDesc();
+  bool ImplicitPseudoDef = (OperIdx >= DefMIDesc->getNumOperands() &&
+                            !DefMIDesc->hasImplicitDefOfPhysReg(MO.getReg()));
   for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
        Alias.isValid(); ++Alias) {
     if (!Uses.contains(*Alias))
@@ -257,11 +262,18 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
         Dep = SDep(SU, SDep::Data, *Alias);
         RegUse = UseSU->getInstr();
       }
-      Dep.setLatency(
-        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse,
-                                         UseOp));
+      const MCInstrDesc *UseMIDesc =
+          (RegUse ? &UseSU->getInstr()->getDesc() : nullptr);
+      bool ImplicitPseudoUse =
+          (UseMIDesc && UseOp >= ((int)UseMIDesc->getNumOperands()) &&
+           !UseMIDesc->hasImplicitUseOfPhysReg(*Alias));
+      if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
+        Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+                                                        RegUse, UseOp));
+        ST.adjustSchedDependency(SU, UseSU, Dep);
+      } else
+        Dep.setLatency(0);
 
-      ST.adjustSchedDependency(SU, UseSU, Dep);
       UseSU->addPred(Dep);
     }
   }
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 29adcad22e127431091a10a5af087471e8b36448..fc0e8efebdcbdc84f8c739500ee0255eb7553794 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -250,6 +250,11 @@ namespace {
     SDValue SplitIndexingFromLoad(LoadSDNode *LD);
     bool SliceUpLoad(SDNode *N);
 
+    // Scalars have size 0 to distinguish from singleton vectors.
+    SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
+    bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
+    bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
+
     /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
     ///   load.
     ///
@@ -366,6 +371,8 @@ namespace {
     SDValue visitFFLOOR(SDNode *N);
     SDValue visitFMINNUM(SDNode *N);
     SDValue visitFMAXNUM(SDNode *N);
+    SDValue visitFMINIMUM(SDNode *N);
+    SDValue visitFMAXIMUM(SDNode *N);
     SDValue visitBRCOND(SDNode *N);
     SDValue visitBR_CC(SDNode *N);
     SDValue visitLOAD(SDNode *N);
@@ -457,7 +464,6 @@ namespace {
     SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue convertBuildVecZextToZext(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
-    SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
@@ -1577,6 +1583,8 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::FFLOOR:             return visitFFLOOR(N);
   case ISD::FMINNUM:            return visitFMINNUM(N);
   case ISD::FMAXNUM:            return visitFMAXNUM(N);
+  case ISD::FMINIMUM:           return visitFMINIMUM(N);
+  case ISD::FMAXIMUM:           return visitFMAXIMUM(N);
   case ISD::FCEIL:              return visitFCEIL(N);
   case ISD::FTRUNC:             return visitFTRUNC(N);
   case ISD::BRCOND:             return visitBRCOND(N);
@@ -2532,8 +2540,7 @@ SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
 // Since it may not be valid to emit a fold to zero for vector initializers
 // check if we can before folding.
 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
-                             SelectionDAG &DAG, bool LegalOperations,
-                             bool LegalTypes) {
+                             SelectionDAG &DAG, bool LegalOperations) {
   if (!VT.isVector())
     return DAG.getConstant(0, DL, VT);
   if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
@@ -2560,7 +2567,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   // fold (sub x, x) -> 0
   // FIXME: Refactor this and xor and other similar operations together.
   if (N0 == N1)
-    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes);
+    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
     // fold (sub c1, c2) -> c1-c2
@@ -3119,8 +3126,11 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
   if (N0.isUndef())
     return DAG.getConstant(0, DL, VT);
 
-  // TODO: 0 / X -> 0
-  // TODO: 0 % X -> 0
+  // 0 / X -> 0
+  // 0 % X -> 0
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  if (N0C && N0C->isNullValue())
+    return N0;
 
   // X / X -> 1
   // X % X -> 0
@@ -3129,11 +3139,12 @@ static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
 
   // X / 1 -> X
   // X % 1 -> 0
-  if (N1C && N1C->isOne())
-    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
   // If this is a boolean op (single-bit element type), we can't have
   // division-by-zero or remainder-by-zero, so assume the divisor is 1.
-  // Similarly, if we're zero-extending a boolean divisor, then assume it's a 1.
+  // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
+  // it's a 1.
+  if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
+    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
 
   return SDValue();
 }
@@ -3828,10 +3839,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
       // Don't try to fold this node if it requires introducing a
       // build vector of all zeros that might be illegal at this stage.
       if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
-        if (!LegalTypes)
-          ShOp = DAG.getConstant(0, SDLoc(N), VT);
-        else
-          ShOp = SDValue();
+        ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
       }
 
       // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C)
@@ -3849,10 +3857,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
       // build vector of all zeros that might be illegal at this stage.
       ShOp = N0->getOperand(0);
       if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
-        if (!LegalTypes)
-          ShOp = DAG.getConstant(0, SDLoc(N), VT);
-        else
-          ShOp = SDValue();
+        ShOp = tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
       }
 
       // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B))
@@ -6142,7 +6147,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
 
   // fold (xor x, x) -> 0
   if (N0 == N1)
-    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
+    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations);
 
   // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
   // Here is a concrete example of this equivalence:
@@ -7092,6 +7097,13 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
   case ISD::SETLE:
   case ISD::SETULT:
   case ISD::SETULE: {
+    // Since it's known never nan to get here already, either fminnum or
+    // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is
+    // expanded in terms of it.
+    unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
     unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
     if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
       return DAG.getNode(Opcode, DL, VT, LHS, RHS);
@@ -7103,6 +7115,10 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
   case ISD::SETGE:
   case ISD::SETUGT:
   case ISD::SETUGE: {
+    unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
+      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
+
     unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
     if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
       return DAG.getNode(Opcode, DL, VT, LHS, RHS);
@@ -8368,7 +8384,7 @@ static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
 
   LoadSDNode *LN0 = cast<LoadSDNode>(N0);
   EVT MemVT = LN0->getMemoryVT();
-  if ((LegalOperations || LN0->isVolatile()) &&
+  if ((LegalOperations || LN0->isVolatile() || VT.isVector()) &&
       !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
     return {};
 
@@ -8672,27 +8688,25 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
     return true;
   }
 
-  if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 ||
-      cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
+  if (N.getOpcode() != ISD::SETCC ||
+      N.getValueType().getScalarType() != MVT::i1 ||
+      cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
     return false;
 
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   assert(Op0.getValueType() == Op1.getValueType());
 
-  if (isNullConstant(Op0))
+  if (isNullConstantOrNullSplatConstant(Op0))
     Op = Op1;
-  else if (isNullConstant(Op1))
+  else if (isNullConstantOrNullSplatConstant(Op1))
     Op = Op0;
   else
     return false;
 
   DAG.computeKnownBits(Op, Known);
 
-  if (!(Known.Zero | 1).isAllOnesValue())
-    return false;
-
-  return true;
+  return (Known.Zero | 1).isAllOnesValue();
 }
 
 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
@@ -8712,17 +8726,16 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   // fold (zext (truncate x)) -> (zext x) or
   //      (zext (truncate x)) -> (truncate x)
   // This is valid when the truncated bits of x are already zero.
-  // FIXME: We should extend this to work for vectors too.
   SDValue Op;
   KnownBits Known;
-  if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) {
+  if (isTruncateOf(DAG, N0, Op, Known)) {
     APInt TruncatedBits =
-      (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
-      APInt(Op.getValueSizeInBits(), 0) :
-      APInt::getBitsSet(Op.getValueSizeInBits(),
-                        N0.getValueSizeInBits(),
-                        std::min(Op.getValueSizeInBits(),
-                                 VT.getSizeInBits()));
+      (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
+      APInt(Op.getScalarValueSizeInBits(), 0) :
+      APInt::getBitsSet(Op.getScalarValueSizeInBits(),
+                        N0.getScalarValueSizeInBits(),
+                        std::min(Op.getScalarValueSizeInBits(),
+                                 VT.getScalarSizeInBits()));
     if (TruncatedBits.isSubsetOf(Known.Zero))
       return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
   }
@@ -9067,17 +9080,16 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
         return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                              N0.getOperand(1),
                              cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
       // If the desired elements are smaller or larger than the source
       // elements we can use a matching integer vector type and then
       // truncate/any extend
-      else {
-        EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
-        SDValue VsetCC =
-          DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
-                        N0.getOperand(1),
-                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
-      }
+      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
+      SDValue VsetCC =
+        DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
+                      N0.getOperand(1),
+                      cast<CondCodeSDNode>(N0.getOperand(2))->get());
+      return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
     }
 
     // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
@@ -9382,7 +9394,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
     if (!LegalOperations ||
         TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
-      return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
+      return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
+                         N0.getOperand(0));
   }
 
   // fold (sext_in_reg (zext x)) -> (sext x)
@@ -10789,17 +10802,18 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
 
-  // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
-  // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
+  // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
+  // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
   auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
     if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
-      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
-      if (XC1 && XC1->isExactlyValue(+1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           Y, Flags);
-      if (XC1 && XC1->isExactlyValue(-1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+      if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
+        if (C->isExactlyValue(+1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             Y, Flags);
+        if (C->isExactlyValue(-1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+      }
     }
     return SDValue();
   };
@@ -10809,29 +10823,30 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   if (SDValue FMA = FuseFADD(N1, N0, Flags))
     return FMA;
 
-  // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
-  // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
-  // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
-  // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
+  // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
+  // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
+  // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
+  // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
   auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) {
     if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
-      auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
-      if (XC0 && XC0->isExactlyValue(+1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
-                           Y, Flags);
-      if (XC0 && XC0->isExactlyValue(-1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT,
-                           DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
-                           DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
-
-      auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
-      if (XC1 && XC1->isExactlyValue(+1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
-      if (XC1 && XC1->isExactlyValue(-1.0))
-        return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
-                           Y, Flags);
+      if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
+        if (C0->isExactlyValue(+1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                             Y, Flags);
+        if (C0->isExactlyValue(-1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT,
+                             DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+      }
+      if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
+        if (C1->isExactlyValue(+1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             DAG.getNode(ISD::FNEG, SL, VT, Y), Flags);
+        if (C1->isExactlyValue(-1.0))
+          return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+                             Y, Flags);
+      }
     }
     return SDValue();
   };
@@ -10844,14 +10859,6 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   return SDValue();
 }
 
-static bool isFMulNegTwo(SDValue &N) {
-  if (N.getOpcode() != ISD::FMUL)
-    return false;
-  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1)))
-    return CFP->isExactlyValue(-2.0);
-  return false;
-}
-
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -10896,14 +10903,24 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     return DAG.getNode(ISD::FSUB, DL, VT, N1,
                        GetNegatedExpression(N0, DAG, LegalOperations), Flags);
 
-  // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B))
-  // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B))
-  if ((isFMulNegTwo(N0) && N0.hasOneUse()) ||
-      (isFMulNegTwo(N1) && N1.hasOneUse())) {
-    bool N1IsFMul = isFMulNegTwo(N1);
-    SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0);
-    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags);
-    return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags);
+  auto isFMulNegTwo = [](SDValue FMul) {
+    if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
+      return false;
+    auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
+    return C && C->isExactlyValue(-2.0);
+  };
+
+  // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
+  if (isFMulNegTwo(N0)) {
+    SDValue B = N0.getOperand(0);
+    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
+    return DAG.getNode(ISD::FSUB, DL, VT, N1, Add, Flags);
+  }
+  // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
+  if (isFMulNegTwo(N1)) {
+    SDValue B = N1.getOperand(0);
+    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B, Flags);
+    return DAG.getNode(ISD::FSUB, DL, VT, N0, Add, Flags);
   }
 
   // No FP constant should be created after legalization as Instruction
@@ -11566,15 +11583,15 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
-  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
+  bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
   EVT VT = N->getValueType(0);
 
   if (N0CFP && N1CFP) // Constant fold
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
 
-  if (N1CFP) {
-    const APFloat &V = N1CFP->getValueAPF();
+  if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
+    const APFloat &V = N1C->getValueAPF();
     // copysign(x, c1) -> fabs(x)       iff ispos(c1)
     // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
     if (!V.isNegative()) {
@@ -12104,7 +12121,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
   return SDValue();
 }
 
-SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
+static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N,
+                            APFloat (*Op)(const APFloat &, const APFloat &)) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
@@ -12114,36 +12132,31 @@ SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
   if (N0CFP && N1CFP) {
     const APFloat &C0 = N0CFP->getValueAPF();
     const APFloat &C1 = N1CFP->getValueAPF();
-    return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT);
+    return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT);
   }
 
   // Canonicalize to constant on RHS.
   if (isConstantFPBuildVectorOrConstantFP(N0) &&
-     !isConstantFPBuildVectorOrConstantFP(N1))
-    return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0);
+      !isConstantFPBuildVectorOrConstantFP(N1))
+    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
 
   return SDValue();
 }
 
-SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  EVT VT = N->getValueType(0);
-  const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
-  const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
+SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
+  return visitFMinMax(DAG, N, minnum);
+}
 
-  if (N0CFP && N1CFP) {
-    const APFloat &C0 = N0CFP->getValueAPF();
-    const APFloat &C1 = N1CFP->getValueAPF();
-    return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT);
-  }
+SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
+  return visitFMinMax(DAG, N, maxnum);
+}
 
-  // Canonicalize to constant on RHS.
-  if (isConstantFPBuildVectorOrConstantFP(N0) &&
-     !isConstantFPBuildVectorOrConstantFP(N1))
-    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0);
+SDValue DAGCombiner::visitFMINIMUM(SDNode *N) {
+  return visitFMinMax(DAG, N, minimum);
+}
 
-  return SDValue();
+SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) {
+  return visitFMinMax(DAG, N, maximum);
 }
 
 SDValue DAGCombiner::visitFABS(SDNode *N) {
@@ -12762,6 +12775,143 @@ SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
+static inline int numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorNumElements() : 0;
+}
+
+bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
+  Val = ST->getValue();
+  EVT STType = Val.getValueType();
+  EVT STMemType = ST->getMemoryVT();
+  if (STType == STMemType)
+    return true;
+  if (isTypeLegal(STMemType))
+    return false; // fail.
+  if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
+      TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
+    Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
+    return true;
+  }
+  if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
+      STType.isInteger() && STMemType.isInteger()) {
+    Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
+    return true;
+  }
+  if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
+    Val = DAG.getBitcast(STMemType, Val);
+    return true;
+  }
+  return false; // fail.
+}
+
+bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
+  EVT LDMemType = LD->getMemoryVT();
+  EVT LDType = LD->getValueType(0);
+  assert(Val.getValueType() == LDMemType &&
+         "Attempting to extend value of non-matching type");
+  if (LDType == LDMemType)
+    return true;
+  if (LDMemType.isInteger() && LDType.isInteger()) {
+    switch (LD->getExtensionType()) {
+    case ISD::NON_EXTLOAD:
+      Val = DAG.getBitcast(LDType, Val);
+      return true;
+    case ISD::EXTLOAD:
+      Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
+      return true;
+    case ISD::SEXTLOAD:
+      Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
+      return true;
+    case ISD::ZEXTLOAD:
+      Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
+      return true;
+    }
+  }
+  return false;
+}
+
+SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
+  if (OptLevel == CodeGenOpt::None || LD->isVolatile())
+    return SDValue();
+  SDValue Chain = LD->getOperand(0);
+  StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
+  if (!ST || ST->isVolatile())
+    return SDValue();
+
+  EVT LDType = LD->getValueType(0);
+  EVT LDMemType = LD->getMemoryVT();
+  EVT STMemType = ST->getMemoryVT();
+  EVT STType = ST->getValue().getValueType();
+
+  BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
+  BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
+  int64_t Offset;
+  if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
+    return SDValue();
+
+  // Normalize for Endianness. After this Offset=0 will denote that the least
+  // significant bit in the loaded value maps to the least significant bit in
+  // the stored value). With Offset=n (for n > 0) the loaded value starts at the
+  // n:th least significant byte of the stored value.
+  if (DAG.getDataLayout().isBigEndian())
+    Offset = (STMemType.getStoreSizeInBits() -
+              LDMemType.getStoreSizeInBits()) / 8 - Offset;
+
+  // Check that the stored value cover all bits that are loaded.
+  bool STCoversLD =
+      (Offset >= 0) &&
+      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  if (!STCoversLD)
+    return SDValue();
+
+  // Memory as copy space (potentially masked).
+  if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
+    // Simple case: Direct non-truncating forwarding
+    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+      return CombineTo(LD, ST->getValue(), Chain);
+    // Can we model the truncate and extension with an and mask?
+    if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
+        !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
+      // Mask to size of LDMemType
+      auto Mask =
+          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
+                                               STMemType.getSizeInBits()),
+                          SDLoc(ST), STType);
+      auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
+      return CombineTo(LD, Val, Chain);
+    }
+  }
+
+  // TODO: Deal with nonzero offset.
+  if (LD->getBasePtr().isUndef() || Offset != 0)
+    return SDValue();
+  // Model necessary truncations / extenstions.
+  SDValue Val;
+  // Truncate Value To Stored Memory Size.
+  do {
+    if (!getTruncatedStoreValue(ST, Val))
+      continue;
+    if (!isTypeLegal(LDMemType))
+      continue;
+    if (STMemType != LDMemType) {
+      // TODO: Support vectors? This requires extract_subvector/bitcast.
+      if (!STMemType.isVector() && !LDMemType.isVector() &&
+          STMemType.isInteger() && LDMemType.isInteger())
+        Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
+      else
+        continue;
+    }
+    if (!extendLoadedValueToExtension(LD, Val))
+      continue;
+    return CombineTo(LD, Val, Chain);
+  } while (false);
+
+  // On failure, cleanup dead nodes we may have created.
+  if (Val->use_empty())
+    deleteAndRecombine(Val.getNode());
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitLOAD(SDNode *N) {
   LoadSDNode *LD  = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
@@ -12828,17 +12978,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 
   // If this load is directly stored, replace the load value with the stored
   // value.
-  // TODO: Handle store large -> read small portion.
-  // TODO: Handle TRUNCSTORE/LOADEXT
-  if (OptLevel != CodeGenOpt::None &&
-      ISD::isNormalLoad(N) && !LD->isVolatile()) {
-    if (ISD::isNON_TRUNCStore(Chain.getNode())) {
-      StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
-      if (PrevST->getBasePtr() == Ptr &&
-          PrevST->getValue().getValueType() == N->getValueType(0))
-        return CombineTo(N, PrevST->getOperand(1), Chain);
-    }
-  }
+  if (auto V = ForwardStoreValueToDirectLoad(LD))
+    return V;
 
   // Try to infer better alignment information than the load already has.
   if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
@@ -14172,14 +14313,14 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
     //                    in candidate selection and can be
     //                    safely ignored
     //   * Value (Op 1) -> Cycles may happen (e.g. through load chains)
-    //   * Address (Op 2) -> Merged addresses may only vary by a fixed constant
-    //                      and so no cycles are possible.
-    //   * (Op 3) -> appears to always be undef. Cannot be source of cycle.
-    //
-    // Thus we need only check predecessors of the value operands.
-    auto *Op = N->getOperand(1).getNode();
-    if (Visited.insert(Op).second)
-      Worklist.push_back(Op);
+    //   * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
+    //                       but aren't necessarily fromt the same base node, so
+    //                       cycles possible (e.g. via indexed store).
+    //   * (Op 3) -> Represents the pre or post-indexing offset (or undef for
+    //               non-indexed stores). Not constant on all targets (e.g. ARM)
+    //               and so can participate in a cycle.
+    for (unsigned j = 1; j < N->getNumOperands(); ++j)
+      Worklist.push_back(N->getOperand(j).getNode());
   }
   // Search through DAG. We can stop early if we find a store node.
   for (unsigned i = 0; i < NumStores; ++i)
@@ -14885,7 +15026,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
   // FIXME: is there such a thing as a truncating indexed store?
   if (ST->isTruncatingStore() && ST->isUnindexed() &&
-      Value.getValueType().isInteger()) {
+      Value.getValueType().isInteger() &&
+      (!isa<ConstantSDNode>(Value) ||
+       !cast<ConstantSDNode>(Value)->isOpaque())) {
     // See if we can simplify the input to this truncstore with knowledge that
     // only the low bits are being used.  For example:
     // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
@@ -15328,14 +15471,13 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
 }
 
 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
-  // (vextract (scalar_to_vector val, 0) -> val
   SDValue InVec = N->getOperand(0);
   EVT VT = InVec.getValueType();
   EVT NVT = N->getValueType(0);
-
   if (InVec.isUndef())
     return DAG.getUNDEF(NVT);
 
+  // (vextract (scalar_to_vector val, 0) -> val
   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
     // Check if the result type doesn't match the inserted element type. A
     // SCALAR_TO_VECTOR may truncate the inserted element and the
@@ -15372,14 +15514,41 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     // converts.
   }
 
-  // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
-  bool isLE = DAG.getDataLayout().isLittleEndian();
-  unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
-  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
-      ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
+  // TODO: These transforms should not require the 'hasOneUse' restriction, but
+  // there are regressions on multiple targets without it. We can end up with a
+  // mess of scalar and vector code if we reduce only part of the DAG to scalar.
+  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && VT.isInteger() &&
+      InVec.hasOneUse()) {
+    // The vector index of the LSBs of the source depend on the endian-ness.
+    bool IsLE = DAG.getDataLayout().isLittleEndian();
+    unsigned ExtractIndex = ConstEltNo->getZExtValue();
+    // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
+    unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1;
     SDValue BCSrc = InVec.getOperand(0);
-    if (BCSrc.getValueType().isScalarInteger())
+    if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
+
+    if (LegalTypes && BCSrc.getValueType().isInteger() &&
+        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
+      // trunc i64 X to i32
+      SDValue X = BCSrc.getOperand(0);
+      assert(X.getValueType().isScalarInteger() && NVT.isScalarInteger() &&
+             "Extract element and scalar to vector can't change element type "
+             "from FP to integer.");
+      unsigned XBitWidth = X.getValueSizeInBits();
+      unsigned VecEltBitWidth = VT.getScalarSizeInBits();
+      BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
+
+      // An extract element return value type can be wider than its vector
+      // operand element type. In that case, the high bits are undefined, so
+      // it's possible that we may need to extend rather than truncate.
+      if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
+        assert(XBitWidth % VecEltBitWidth == 0 &&
+               "Scalar bitwidth must be a multiple of vector element bitwidth");
+        return DAG.getAnyExtOrTrunc(X, SDLoc(N), NVT);
+      }
+    }
   }
 
   // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
@@ -15677,77 +15846,6 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
   return DAG.getBitcast(VT, BV);
 }
 
-SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
-  EVT VT = N->getValueType(0);
-
-  unsigned NumInScalars = N->getNumOperands();
-  SDLoc DL(N);
-
-  EVT SrcVT = MVT::Other;
-  unsigned Opcode = ISD::DELETED_NODE;
-  unsigned NumDefs = 0;
-
-  for (unsigned i = 0; i != NumInScalars; ++i) {
-    SDValue In = N->getOperand(i);
-    unsigned Opc = In.getOpcode();
-
-    if (Opc == ISD::UNDEF)
-      continue;
-
-    // If all scalar values are floats and converted from integers.
-    if (Opcode == ISD::DELETED_NODE &&
-        (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
-      Opcode = Opc;
-    }
-
-    if (Opc != Opcode)
-      return SDValue();
-
-    EVT InVT = In.getOperand(0).getValueType();
-
-    // If all scalar values are typed differently, bail out. It's chosen to
-    // simplify BUILD_VECTOR of integer types.
-    if (SrcVT == MVT::Other)
-      SrcVT = InVT;
-    if (SrcVT != InVT)
-      return SDValue();
-    NumDefs++;
-  }
-
-  // If the vector has just one element defined, it's not worth to fold it into
-  // a vectorized one.
-  if (NumDefs < 2)
-    return SDValue();
-
-  assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
-         && "Should only handle conversion from integer to float.");
-  assert(SrcVT != MVT::Other && "Cannot determine source type!");
-
-  EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
-
-  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
-    return SDValue();
-
-  // Just because the floating-point vector type is legal does not necessarily
-  // mean that the corresponding integer vector type is.
-  if (!isTypeLegal(NVT))
-    return SDValue();
-
-  SmallVector<SDValue, 8> Opnds;
-  for (unsigned i = 0; i != NumInScalars; ++i) {
-    SDValue In = N->getOperand(i);
-
-    if (In.isUndef())
-      Opnds.push_back(DAG.getUNDEF(SrcVT));
-    else
-      Opnds.push_back(In.getOperand(0));
-  }
-  SDValue BV = DAG.getBuildVector(NVT, DL, Opnds);
-  AddToWorklist(BV.getNode());
-
-  return DAG.getNode(Opcode, DL, VT, BV);
-}
-
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
@@ -16194,9 +16292,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
-  if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
-    return V;
-
   if (SDValue V = reduceBuildVecToShuffle(N))
     return V;
 
@@ -16500,39 +16595,21 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   return SDValue();
 }
 
-/// If we are extracting a subvector produced by a wide binary operator with at
-/// at least one operand that was the result of a vector concatenation, then try
-/// to use the narrow vector operands directly to avoid the concatenation and
-/// extraction.
+/// If we are extracting a subvector produced by a wide binary operator try
+/// to use a narrow binary operator and/or avoid concatenation and extraction.
 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
   // some of these bailouts with other transforms.
 
   // The extract index must be a constant, so we can map it to a concat operand.
-  auto *ExtractIndex = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!ExtractIndex)
-    return SDValue();
-
-  // Only handle the case where we are doubling and then halving. A larger ratio
-  // may require more than two narrow binops to replace the wide binop.
-  EVT VT = Extract->getValueType(0);
-  unsigned NumElems = VT.getVectorNumElements();
-  assert((ExtractIndex->getZExtValue() % NumElems) == 0 &&
-         "Extract index is not a multiple of the vector length.");
-  if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+  auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+  if (!ExtractIndexC)
     return SDValue();
 
   // We are looking for an optionally bitcasted wide vector binary operator
   // feeding an extract subvector.
   SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
-
-  // TODO: The motivating case for this transform is an x86 AVX1 target. That
-  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
-  // flavors, but no other 256-bit integer support. This could be extended to
-  // handle any binop, but that may require fixing/adding other folds to avoid
-  // codegen regressions.
-  unsigned BOpcode = BinOp.getOpcode();
-  if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+  if (!ISD::isBinaryOp(BinOp.getNode()))
     return SDValue();
 
   // The binop must be a vector type, so we can chop it in half.
@@ -16540,19 +16617,62 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   if (!WideBVT.isVector())
     return SDValue();
 
+  EVT VT = Extract->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned ExtractIndex = ExtractIndexC->getZExtValue();
+  assert(ExtractIndex % NumElems == 0 &&
+         "Extract index is not a multiple of the vector length.");
+  EVT SrcVT = Extract->getOperand(0).getValueType();
+
+  // Bail out if this is not a proper multiple width extraction.
+  unsigned NumSrcElems = SrcVT.getVectorNumElements();
+  if (NumSrcElems % NumElems != 0)
+    return SDValue();
+
   // Bail out if the target does not support a narrower version of the binop.
+  unsigned NarrowingRatio = NumSrcElems / NumElems;
+  unsigned BOpcode = BinOp.getOpcode();
+  unsigned WideNumElts = WideBVT.getVectorNumElements();
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
-                                   WideBVT.getVectorNumElements() / 2);
+                                   WideNumElts / NarrowingRatio);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
     return SDValue();
 
-  SDValue LHS = peekThroughBitcasts(BinOp.getOperand(0));
-  SDValue RHS = peekThroughBitcasts(BinOp.getOperand(1));
+  // If extraction is cheap, we don't need to look at the binop operands
+  // for concat ops. The narrow binop alone makes this transform profitable.
+  // TODO: We're not dealing with the bitcasted pattern here. That limitation
+  // should be lifted.
+  if (Extract->getOperand(0) == BinOp && BinOp.hasOneUse() &&
+      TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtractIndex)) {
+    // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
+    SDLoc DL(Extract);
+    SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                            BinOp.getOperand(0), Extract->getOperand(1));
+    SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+                            BinOp.getOperand(1), Extract->getOperand(1));
+    return DAG.getNode(BOpcode, DL, NarrowBVT, X, Y,
+                       BinOp.getNode()->getFlags());
+  }
+
+  // Only handle the case where we are doubling and then halving. A larger ratio
+  // may require more than two narrow binops to replace the wide binop.
+  if (NarrowingRatio != 2)
+    return SDValue();
+
+  // TODO: The motivating case for this transform is an x86 AVX1 target. That
+  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
+  // flavors, but no other 256-bit integer support. This could be extended to
+  // handle any binop, but that may require fixing/adding other folds to avoid
+  // codegen regressions.
+  if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+    return SDValue();
 
   // We need at least one concatenation operation of a binop operand to make
   // this transform worthwhile. The concat must double the input vector sizes.
   // TODO: Should we also handle INSERT_SUBVECTOR patterns?
+  SDValue LHS = peekThroughBitcasts(BinOp.getOperand(0));
+  SDValue RHS = peekThroughBitcasts(BinOp.getOperand(1));
   bool ConcatL =
       LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2;
   bool ConcatR =
@@ -16563,7 +16683,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // If one of the binop operands was not the result of a concat, we must
   // extract a half-sized operand for our new narrow binop. We can't just reuse
   // the original extract index operand because we may have bitcasted.
-  unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems;
+  unsigned ConcatOpNum = ExtractIndex / NumElems;
   unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
   EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
   SDLoc DL(Extract);
@@ -16922,7 +17042,8 @@ static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
       if (!LegalOperations ||
           TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
         return DAG.getBitcast(VT,
-                            DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
+                              DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG,
+                                          SDLoc(SVN), OutVT, N0));
   }
 
   return SDValue();
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index ad416017470d192f76aa76256b319195f78e79ed..d5f066c2423eb989420af4df1e8f6b8727520053 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -89,6 +89,7 @@
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
@@ -110,6 +111,7 @@
 #include <utility>
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "isel"
 
@@ -1448,6 +1450,14 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, ResultReg);
     return true;
   }
+  case Intrinsic::is_constant: {
+    Constant *ResCI = ConstantInt::get(II->getType(), 0);
+    unsigned ResultReg = getRegForValue(ResCI);
+    if (!ResultReg)
+      return false;
+    updateValueMap(II, ResultReg);
+    return true;
+  }
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group:
   case Intrinsic::expect: {
@@ -1692,7 +1702,10 @@ void FastISel::finishCondBranch(const BasicBlock *BranchBB,
 
 /// Emit an FNeg operation.
 bool FastISel::selectFNeg(const User *I) {
-  unsigned OpReg = getRegForValue(BinaryOperator::getFNegArgument(I));
+  Value *X;
+  if (!match(I, m_FNeg(m_Value(X))))
+    return false;
+  unsigned OpReg = getRegForValue(X);
   if (!OpReg)
     return false;
   bool OpRegIsKill = hasTrivialKill(I);
@@ -1782,11 +1795,9 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
     return selectBinaryOp(I, ISD::FADD);
   case Instruction::Sub:
     return selectBinaryOp(I, ISD::SUB);
-  case Instruction::FSub:
+  case Instruction::FSub: 
     // FNeg is currently represented in LLVM IR as a special case of FSub.
-    if (BinaryOperator::isFNeg(I))
-      return selectFNeg(I);
-    return selectBinaryOp(I, ISD::FSUB);
+    return selectFNeg(I) || selectBinaryOp(I, ISD::FSUB);
   case Instruction::Mul:
     return selectBinaryOp(I, ISD::MUL);
   case Instruction::FMul:
@@ -2223,7 +2234,7 @@ unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
 /// might result in multiple MBB's for one BB.  As such, the start of the
 /// BB might correspond to a different MBB than the end.
 bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
-  const TerminatorInst *TI = LLVMBB->getTerminator();
+  const Instruction *TI = LLVMBB->getTerminator();
 
   SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
   FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 9f57df87fb2ac83ecf78756a306ac58d76cc2be3..fc9c227e4dfacd3a1958a2404ad75221ce389c7b 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -959,7 +959,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
   }
 
   // Finally mark unused registers as dead.
-  if (!UsedRegs.empty() || II.getImplicitDefs())
+  if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef())
     MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
 
   // Run post-isel target hook to adjust this instruction if needed.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 27875c11909b43c24c28e9662ff10096c1c8343c..dcb479e4ce1fb85b136a1e3f8fd214ac73090e83 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -176,7 +176,6 @@ private:
 
   SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
   SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
-  SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl);
 
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@@ -239,7 +238,7 @@ public:
 } // end anonymous namespace
 
 /// Return a vector shuffle operation which
-/// performs the same shuffe in terms of order or result bytes, but on a type
+/// performs the same shuffle in terms of order or result bytes, but on a type
 /// whose vector element type is narrower than the original shuffle type.
 /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
 SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
@@ -1060,6 +1059,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::ADDROFRETURNADDR:
+  case ISD::SPONENTRY:
     // These operations lie about being legal: when they claim to be legal,
     // they should actually be custom-lowered.
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
@@ -1108,6 +1108,12 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     // These pseudo-ops get legalized as if they were their non-strict
     // equivalent.  For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
     // is also legal, but if ISD::FSQRT requires expansion then so does
@@ -1115,6 +1121,13 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
                                             Node->getValueType(0));
     break;
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT: {
+    Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    break;
+  }
   case ISD::MSCATTER:
     Action = TLI.getOperationAction(Node->getOpcode(),
                     cast<MaskedScatterSDNode>(Node)->getValue().getValueType());
@@ -2364,94 +2377,6 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
   assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
   // Code below here assumes !isSigned without checking again.
 
-  // Implementation of unsigned i64 to f64 following the algorithm in
-  // __floatundidf in compiler_rt. This implementation has the advantage
-  // of performing rounding correctly, both in the default rounding mode
-  // and in all alternate rounding modes.
-  // TODO: Generalize this for use with other types.
-  if (SrcVT == MVT::i64 && DestVT == MVT::f64) {
-    LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f64\n");
-    SDValue TwoP52 =
-      DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64);
-    SDValue TwoP84PlusTwoP52 =
-      DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl,
-                        MVT::f64);
-    SDValue TwoP84 =
-      DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64);
-
-    SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
-    SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
-                             DAG.getConstant(32, dl, MVT::i64));
-    SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
-    SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
-    SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr);
-    SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr);
-    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
-                                TwoP84PlusTwoP52);
-    return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
-  }
-
-  // TODO: Generalize this for use with other types.
-  if (SrcVT == MVT::i64 && DestVT == MVT::f32) {
-    LLVM_DEBUG(dbgs() << "Converting unsigned i64 to f32\n");
-    // For unsigned conversions, convert them to signed conversions using the
-    // algorithm from the x86_64 __floatundidf in compiler_rt.
-    if (!isSigned) {
-      SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);
-
-      SDValue ShiftConst = DAG.getConstant(
-          1, dl, TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout()));
-      SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
-      SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
-      SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
-      SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr);
-
-      SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or);
-      SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt);
-
-      // TODO: This really should be implemented using a branch rather than a
-      // select.  We happen to get lucky and machinesink does the right
-      // thing most of the time.  This would be a good candidate for a
-      //pseudo-op, or, even better, for whole-function isel.
-      SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
-        Op0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
-      return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast);
-    }
-
-    // Otherwise, implement the fully general conversion.
-
-    SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
-         DAG.getConstant(UINT64_C(0xfffffffffffff800), dl, MVT::i64));
-    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
-         DAG.getConstant(UINT64_C(0x800), dl, MVT::i64));
-    SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
-         DAG.getConstant(UINT64_C(0x7ff), dl, MVT::i64));
-    SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), And2,
-                              DAG.getConstant(UINT64_C(0), dl, MVT::i64),
-                              ISD::SETNE);
-    SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0);
-    SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0,
-                              DAG.getConstant(UINT64_C(0x0020000000000000), dl,
-                                              MVT::i64),
-                              ISD::SETUGE);
-    SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
-    EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout());
-
-    SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
-                             DAG.getConstant(32, dl, SHVT));
-    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
-    SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
-    SDValue TwoP32 =
-      DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), dl,
-                        MVT::f64);
-    SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
-    SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
-    SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
-    SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
-    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
-                       DAG.getIntPtrConstant(0, dl));
-  }
-
   SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
 
   SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(SrcVT), Op0,
@@ -2614,22 +2539,22 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
     // swap i4: ((V & 0xF0) >> 4) | ((V & 0x0F) << 4)
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi4, dl, VT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo4, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, VT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(4, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(4, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
 
     // swap i2: ((V & 0xCC) >> 2) | ((V & 0x33) << 2)
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi2, dl, VT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo2, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, VT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(2, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(2, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
 
     // swap i1: ((V & 0xAA) >> 1) | ((V & 0x55) << 1)
     Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskHi1, dl, VT));
     Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp, DAG.getConstant(MaskLo1, dl, VT));
-    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, VT));
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, VT));
+    Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Tmp2, DAG.getConstant(1, dl, SHVT));
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Tmp3, DAG.getConstant(1, dl, SHVT));
     Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
     return Tmp;
   }
@@ -2705,126 +2630,6 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
   }
 }
 
-/// Expand the specified bitcount instruction into operations.
-SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
-                                             const SDLoc &dl) {
-  switch (Opc) {
-  default: llvm_unreachable("Cannot expand this yet!");
-  case ISD::CTPOP: {
-    EVT VT = Op.getValueType();
-    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-    unsigned Len = VT.getSizeInBits();
-
-    assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
-           "CTPOP not implemented for this type.");
-
-    // This is the "best" algorithm from
-    // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-
-    SDValue Mask55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)),
-                                     dl, VT);
-    SDValue Mask33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)),
-                                     dl, VT);
-    SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)),
-                                     dl, VT);
-    SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)),
-                                     dl, VT);
-
-    // v = v - ((v >> 1) & 0x55555555...)
-    Op = DAG.getNode(ISD::SUB, dl, VT, Op,
-                     DAG.getNode(ISD::AND, dl, VT,
-                                 DAG.getNode(ISD::SRL, dl, VT, Op,
-                                             DAG.getConstant(1, dl, ShVT)),
-                                 Mask55));
-    // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-    Op = DAG.getNode(ISD::ADD, dl, VT,
-                     DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
-                     DAG.getNode(ISD::AND, dl, VT,
-                                 DAG.getNode(ISD::SRL, dl, VT, Op,
-                                             DAG.getConstant(2, dl, ShVT)),
-                                 Mask33));
-    // v = (v + (v >> 4)) & 0x0F0F0F0F...
-    Op = DAG.getNode(ISD::AND, dl, VT,
-                     DAG.getNode(ISD::ADD, dl, VT, Op,
-                                 DAG.getNode(ISD::SRL, dl, VT, Op,
-                                             DAG.getConstant(4, dl, ShVT))),
-                     Mask0F);
-    // v = (v * 0x01010101...) >> (Len - 8)
-    Op = DAG.getNode(ISD::SRL, dl, VT,
-                     DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
-                     DAG.getConstant(Len - 8, dl, ShVT));
-
-    return Op;
-  }
-  case ISD::CTLZ_ZERO_UNDEF:
-    // This trivially expands to CTLZ.
-    return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
-  case ISD::CTLZ: {
-    EVT VT = Op.getValueType();
-    unsigned Len = VT.getSizeInBits();
-
-    if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
-      EVT SetCCVT = getSetCCResultType(VT);
-      SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
-      SDValue Zero = DAG.getConstant(0, dl, VT);
-      SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
-      return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
-                         DAG.getConstant(Len, dl, VT), CTLZ);
-    }
-
-    // for now, we do this:
-    // x = x | (x >> 1);
-    // x = x | (x >> 2);
-    // ...
-    // x = x | (x >>16);
-    // x = x | (x >>32); // for 64-bit input
-    // return popcount(~x);
-    //
-    // Ref: "Hacker's Delight" by Henry Warren
-    EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-    for (unsigned i = 0; (1U << i) <= (Len / 2); ++i) {
-      SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT);
-      Op = DAG.getNode(ISD::OR, dl, VT, Op,
-                       DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
-    }
-    Op = DAG.getNOT(dl, Op, VT);
-    return DAG.getNode(ISD::CTPOP, dl, VT, Op);
-  }
-  case ISD::CTTZ_ZERO_UNDEF:
-    // This trivially expands to CTTZ.
-    return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
-  case ISD::CTTZ: {
-    EVT VT = Op.getValueType();
-    unsigned Len = VT.getSizeInBits();
-
-    if (TLI.isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
-      EVT SetCCVT = getSetCCResultType(VT);
-      SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
-      SDValue Zero = DAG.getConstant(0, dl, VT);
-      SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
-      return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
-                         DAG.getConstant(Len, dl, VT), CTTZ);
-    }
-
-    // for now, we use: { return popcount(~x & (x - 1)); }
-    // unless the target has ctlz but not ctpop, in which case we use:
-    // { return 32 - nlz(~x & (x-1)); }
-    // Ref: "Hacker's Delight" by Henry Warren
-    SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
-                               DAG.getNOT(dl, Op, VT),
-                               DAG.getNode(ISD::SUB, dl, VT, Op,
-                                           DAG.getConstant(1, dl, VT)));
-    // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
-    if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
-        TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
-      return DAG.getNode(ISD::SUB, dl, VT,
-                         DAG.getConstant(VT.getSizeInBits(), dl, VT),
-                         DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
-    return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
-  }
-  }
-}
-
 bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   LLVM_DEBUG(dbgs() << "Trying to expand node\n");
   SmallVector<SDValue, 8> Results;
@@ -2833,12 +2638,18 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   bool NeedInvert;
   switch (Node->getOpcode()) {
   case ISD::CTPOP:
+    if (TLI.expandCTPOP(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
+    if (TLI.expandCTLZ(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
-    Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
-    Results.push_back(Tmp1);
+    if (TLI.expandCTTZ(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
     break;
   case ISD::BITREVERSE:
     Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl));
@@ -3033,8 +2844,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(Tmp1);
     break;
   }
-  case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
+    if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) {
+      Results.push_back(Tmp1);
+      break;
+    }
+    LLVM_FALLTHROUGH;
+  case ISD::SINT_TO_FP:
     Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
                                 Node->getOperand(0), Node->getValueType(0), dl);
     Results.push_back(Tmp1);
@@ -3043,29 +2859,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
       Results.push_back(Tmp1);
     break;
-  case ISD::FP_TO_UINT: {
-    SDValue True, False;
-    EVT VT =  Node->getOperand(0).getValueType();
-    EVT NVT = Node->getValueType(0);
-    APFloat apf(DAG.EVTToAPFloatSemantics(VT),
-                APInt::getNullValue(VT.getSizeInBits()));
-    APInt x = APInt::getSignMask(NVT.getSizeInBits());
-    (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
-    Tmp1 = DAG.getConstantFP(apf, dl, VT);
-    Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
-                        Node->getOperand(0),
-                        Tmp1, ISD::SETLT);
-    True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
-    // TODO: Should any fast-math-flags be set for the FSUB?
-    False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
-                        DAG.getNode(ISD::FSUB, dl, VT,
-                                    Node->getOperand(0), Tmp1));
-    False = DAG.getNode(ISD::XOR, dl, NVT, False,
-                        DAG.getConstant(x, dl, NVT));
-    Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
-    Results.push_back(Tmp1);
+  case ISD::FP_TO_UINT:
+    if (TLI.expandFP_TO_UINT(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
     break;
-  }
   case ISD::VAARG:
     Results.push_back(DAG.expandVAArg(Node));
     Results.push_back(Results[0].getValue(1));
@@ -3252,7 +3049,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(Tmp1);
     break;
   }
-
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Node, DAG))
+      Results.push_back(Expanded);
+    break;
+  }
   case ISD::FSIN:
   case ISD::FCOS: {
     EVT VT = Node->getValueType(0);
@@ -3460,6 +3262,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     break;
   }
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT: {
+    Results.push_back(TLI.getExpandedSaturationAdditionSubtraction(Node, DAG));
+    break;
+  }
   case ISD::SADDO:
   case ISD::SSUBO: {
     SDValue LHS = Node->getOperand(0);
@@ -4031,11 +3840,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   }
   case ISD::FMINNUM:
+  case ISD::STRICT_FMINNUM:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64,
                                       RTLIB::FMIN_F80, RTLIB::FMIN_F128,
                                       RTLIB::FMIN_PPCF128));
     break;
   case ISD::FMAXNUM:
+  case ISD::STRICT_FMAXNUM:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64,
                                       RTLIB::FMAX_F80, RTLIB::FMAX_F128,
                                       RTLIB::FMAX_PPCF128));
@@ -4133,16 +3944,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                                         RTLIB::EXP2_PPCF128));
     break;
   case ISD::FTRUNC:
+  case ISD::STRICT_FTRUNC:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
                                       RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
                                       RTLIB::TRUNC_PPCF128));
     break;
   case ISD::FFLOOR:
+  case ISD::STRICT_FFLOOR:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
                                       RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
                                       RTLIB::FLOOR_PPCF128));
     break;
   case ISD::FCEIL:
+  case ISD::STRICT_FCEIL:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
                                       RTLIB::CEIL_F80, RTLIB::CEIL_F128,
                                       RTLIB::CEIL_PPCF128));
@@ -4162,6 +3976,7 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                                       RTLIB::NEARBYINT_PPCF128));
     break;
   case ISD::FROUND:
+  case ISD::STRICT_FROUND:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32,
                                       RTLIB::ROUND_F64,
                                       RTLIB::ROUND_F80,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index b6cce9102286597cda76f05a56cd67741636db9b..866744c397b03ac2ea7e41059dd5fb7e6bf8c0dd 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1910,8 +1910,8 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
     // Binary FP Operations
     case ISD::FADD:
     case ISD::FDIV:
-    case ISD::FMAXNAN:
-    case ISD::FMINNAN:
+    case ISD::FMAXIMUM:
+    case ISD::FMINIMUM:
     case ISD::FMAXNUM:
     case ISD::FMINNUM:
     case ISD::FMUL:
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e11a18fd0c4b25811fbd5a10de906249060c68f5..2b1df0165d336198eaad7ea1d208b83de62fea81 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -141,6 +141,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ADDCARRY:
   case ISD::SUBCARRY:    Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break;
 
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:     Res = PromoteIntRes_ADDSUBSAT(N); break;
+
   case ISD::ATOMIC_LOAD:
     Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
 
@@ -305,12 +310,45 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
     // make us bitcast between two vectors which are legalized in different ways.
     if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector())
       return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp));
+    // If the output type is also a vector and widening it to the same size
+    // as the widened input type would be a legal type, we can widen the bitcast
+    // and handle the promotion after.
+    if (NOutVT.isVector()) {
+      unsigned WidenInSize = NInVT.getSizeInBits();
+      unsigned OutSize = OutVT.getSizeInBits();
+      if (WidenInSize % OutSize == 0) {
+        unsigned Scale = WidenInSize / OutSize;
+        EVT WideOutVT = EVT::getVectorVT(*DAG.getContext(),
+                                         OutVT.getVectorElementType(),
+                                         OutVT.getVectorNumElements() * Scale);
+        if (isTypeLegal(WideOutVT)) {
+          InOp = DAG.getBitcast(WideOutVT, GetWidenedVector(InOp));
+          MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+          InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, InOp,
+                             DAG.getConstant(0, dl, IdxTy));
+          return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp);
+        }
+      }
+    }
   }
 
   return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
                      CreateStackStoreLoad(InOp, OutVT));
 }
 
+// Helper for BSWAP/BITREVERSE promotion to ensure we can fit the shift amount
+// in the VT returned by getShiftAmountTy and to return a safe VT if we can't.
+static EVT getShiftAmountTyForConstant(unsigned Val, EVT VT,
+                                       const TargetLowering &TLI,
+                                       SelectionDAG &DAG) {
+  EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  // If the value won't fit in the prefered type, just use something safe. It
+  // will be legalized when the shift is expanded.
+  if ((Log2_32(Val) + 1) > ShiftVT.getScalarSizeInBits())
+    ShiftVT = MVT::i32;
+  return ShiftVT;
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   EVT OVT = N->getValueType(0);
@@ -318,10 +356,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
   SDLoc dl(N);
 
   unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
-  return DAG.getNode(
-      ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
-      DAG.getConstant(DiffBits, dl,
-                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+  EVT ShiftVT = getShiftAmountTyForConstant(DiffBits, NVT, TLI, DAG);
+  return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+                     DAG.getConstant(DiffBits, dl, ShiftVT));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
@@ -331,10 +368,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
   SDLoc dl(N);
 
   unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
-  return DAG.getNode(
-      ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
-      DAG.getConstant(DiffBits, dl,
-                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+  EVT ShiftVT = getShiftAmountTyForConstant(DiffBits, NVT, TLI, DAG);
+  return DAG.getNode(ISD::SRL, dl, NVT,
+                     DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
+                     DAG.getConstant(DiffBits, dl, ShiftVT));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
@@ -534,6 +571,51 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   return SDValue(Res.getNode(), 1);
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
+  // For promoting iN -> iM, this can be expanded by
+  // 1. ANY_EXTEND iN to iM
+  // 2. SHL by M-N
+  // 3. [US][ADD|SUB]SAT
+  // 4. L/ASHR by M-N
+  SDLoc dl(N);
+  SDValue Op1 = N->getOperand(0);
+  SDValue Op2 = N->getOperand(1);
+  unsigned OldBits = Op1.getValueSizeInBits();
+
+  unsigned Opcode = N->getOpcode();
+  unsigned ShiftOp;
+  switch (Opcode) {
+  case ISD::SADDSAT:
+  case ISD::SSUBSAT:
+    ShiftOp = ISD::SRA;
+    break;
+  case ISD::UADDSAT:
+  case ISD::USUBSAT:
+    ShiftOp = ISD::SRL;
+    break;
+  default:
+    llvm_unreachable("Expected opcode to be signed or unsigned saturation "
+                     "addition or subtraction");
+  }
+
+  SDValue Op1Promoted = GetPromotedInteger(Op1);
+  SDValue Op2Promoted = GetPromotedInteger(Op2);
+
+  EVT PromotedType = Op1Promoted.getValueType();
+  unsigned NewBits = Op1Promoted.getValueSizeInBits();
+  unsigned SHLAmount = NewBits - OldBits;
+  EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+  SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT);
+  Op1Promoted =
+      DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
+  Op2Promoted =
+      DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
+
+  SDValue Result =
+      DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+  return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
   if (ResNo == 1)
     return PromoteIntRes_Overflow(N);
@@ -1454,6 +1536,11 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
   case ISD::UMULO:
   case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
+
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
   }
 
   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -2416,6 +2503,12 @@ void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
   ReplaceValueWith(SDValue(N, 1), R.getValue(2));
 }
 
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBSAT(SDNode *N, SDValue &Lo,
+                                              SDValue &Hi) {
+  SDValue Result = TLI.getExpandedSaturationAdditionSubtraction(N, DAG);
+  SplitInteger(Result, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
                                              SDValue &Lo, SDValue &Hi) {
   SDValue LHS = Node->getOperand(0);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 3c93563440b4722c4245e72e6f0f27cb2dc68077..8b7c57cbb3b54460f37ff745af157caa109dcf44 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -330,6 +330,7 @@ private:
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+  SDValue PromoteIntRes_ADDSUBSAT(SDNode *N);
 
   // Integer Operand Promotion.
   bool PromoteIntegerOperand(SDNode *N, unsigned OpNo);
@@ -414,6 +415,7 @@ private:
   void ExpandIntRes_SADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_UADDSUBO          (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandIntRes_XMULO             (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandIntRes_ADDSUBSAT         (SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void ExpandIntRes_ATOMIC_LOAD       (SDNode *N, SDValue &Lo, SDValue &Hi);
 
@@ -847,9 +849,6 @@ private:
   /// MaskVT to ToMaskVT if needed with vector extension or truncation.
   SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);
 
-  /// Get the target mask VT, and widen if needed.
-  EVT getSETCCWidenedResultTy(SDValue SetCC);
-
   //===--------------------------------------------------------------------===//
   // Generic Splitting: LegalizeTypesGeneric.cpp
   //===--------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 3f38ed8a03cc4d344131ea512d8f15efd8507e67..17f05c3ba9775daf35f686a834b0ba4c50de2089 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -86,9 +86,10 @@ class VectorLegalizer {
   /// operations to legalize them.
   SDValue Expand(SDValue Op);
 
-  /// Implements expansion for FNEG; falls back to UnrollVectorOp if
-  /// FSUB isn't legal.
-  ///
+  /// Implements expansion for FP_TO_UINT; falls back to UnrollVectorOp if
+  /// FP_TO_SINT isn't legal.
+  SDValue ExpandFP_TO_UINT(SDValue Op);
+
   /// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
   /// SINT_TO_FLOAT and SHR on vectors isn't legal.
   SDValue ExpandUINT_TO_FLOAT(SDValue Op);
@@ -128,8 +129,10 @@ class VectorLegalizer {
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandFSUB(SDValue Op);
   SDValue ExpandBITREVERSE(SDValue Op);
+  SDValue ExpandCTPOP(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
-  SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
+  SDValue ExpandCTTZ(SDValue Op);
+  SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
   SDValue ExpandStrictFPOp(SDValue Op);
 
   /// Implements vector promotion.
@@ -303,6 +306,12 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     // These pseudo-ops get legalized as if they were their non-strict
     // equivalent.  For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
     // is also legal, but if ISD::FSQRT requires expansion then so does
@@ -353,8 +362,10 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FABS:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::FCOPYSIGN:
   case ISD::FSQRT:
   case ISD::FSIN:
@@ -386,6 +397,10 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::SMUL_LOHI:
   case ISD::UMUL_LOHI:
   case ISD::FCANONICALIZE:
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
     Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
     break;
   case ISD::FP_ROUND_INREG:
@@ -704,6 +719,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
     return ExpandVSELECT(Op);
   case ISD::SELECT:
     return ExpandSELECT(Op);
+  case ISD::FP_TO_UINT:
+    return ExpandFP_TO_UINT(Op);
   case ISD::UINT_TO_FP:
     return ExpandUINT_TO_FLOAT(Op);
   case ISD::FNEG:
@@ -714,11 +731,17 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
     return UnrollVSETCC(Op);
   case ISD::BITREVERSE:
     return ExpandBITREVERSE(Op);
+  case ISD::CTPOP:
+    return ExpandCTPOP(Op);
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
     return ExpandCTLZ(Op);
+  case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF:
-    return ExpandCTTZ_ZERO_UNDEF(Op);
+    return ExpandCTTZ(Op);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return ExpandFMINNUM_FMAXNUM(Op);
   case ISD::STRICT_FADD:
   case ISD::STRICT_FSUB:
   case ISD::STRICT_FMUL:
@@ -737,6 +760,12 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     return ExpandStrictFPOp(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
@@ -851,7 +880,7 @@ SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op) {
 
   // First build an any-extend node which can be legalized above when we
   // recurse through it.
-  Op = DAG.getAnyExtendVectorInReg(Src, DL, VT);
+  Op = DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Src);
 
   // Now we need sign extend. Do this by shifting the elements. Even if these
   // aren't legal operations, they have a better chance of being legalized
@@ -1009,10 +1038,25 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
 }
 
+SDValue VectorLegalizer::ExpandFP_TO_UINT(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandFP_TO_UINT(Op.getNode(), Result, DAG))
+    return Result;
+
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
   EVT VT = Op.getOperand(0).getValueType();
   SDLoc DL(Op);
 
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandUINT_TO_FP(Op.getNode(), Result, DAG))
+    return Result;
+
   // Make sure that the SINT_TO_FP and SRL instructions are available.
   if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand ||
       TLI.getOperationAction(ISD::SRL,        VT) == TargetLowering::Expand)
@@ -1071,59 +1115,42 @@ SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
-SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
-  EVT VT = Op.getValueType();
-  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
-  // If the non-ZERO_UNDEF version is supported we can use that instead.
-  if (Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
-      TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) {
-    SDLoc DL(Op);
-    return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0));
-  }
-
-  // If CTPOP is available we can lower with a CTPOP based method:
-  // u16 ctlz(u16 x) {
-  //   x |= (x >> 1);
-  //   x |= (x >> 2);
-  //   x |= (x >> 4);
-  //   x |= (x >> 8);
-  //   return ctpop(~x);
-  // }
-  // Ref: "Hacker's Delight" by Henry Warren
-  if (isPowerOf2_32(NumBitsPerElt) &&
-      TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
-      TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) &&
-      TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT)) {
-    SDLoc DL(Op);
-    SDValue Res = Op.getOperand(0);
-    EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+SDValue VectorLegalizer::ExpandCTPOP(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTPOP(Op.getNode(), Result, DAG))
+    return Result;
 
-    for (unsigned i = 1; i != NumBitsPerElt; i *= 2)
-      Res = DAG.getNode(
-          ISD::OR, DL, VT, Res,
-          DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(i, DL, ShiftTy)));
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
 
-    Res = DAG.getNOT(DL, Res, VT);
-    return DAG.getNode(ISD::CTPOP, DL, VT, Res);
-  }
+SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTLZ(Op.getNode(), Result, DAG))
+    return Result;
 
   // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
-SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
-  // If the non-ZERO_UNDEF version is supported we can use that instead.
-  if (TLI.isOperationLegalOrCustom(ISD::CTTZ, Op.getValueType())) {
-    SDLoc DL(Op);
-    return DAG.getNode(ISD::CTTZ, DL, Op.getValueType(), Op.getOperand(0));
-  }
+SDValue VectorLegalizer::ExpandCTTZ(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandCTTZ(Op.getNode(), Result, DAG))
+    return Result;
 
   // Otherwise go ahead and unroll.
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandFMINNUM_FMAXNUM(SDValue Op) {
+  if (SDValue Expanded = TLI.expandFMINNUM_FMAXNUM(Op.getNode(), DAG))
+    return Expanded;
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
   EVT VT = Op.getValueType();
   EVT EltVT = VT.getVectorElementType();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1b07358561a35f1ca0f0d335d3aecb3284cf6069..88abd84366a9b2aa2041074f57ed286149ea1768 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -113,13 +113,20 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMUL:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
 
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
+
   case ISD::FPOW:
   case ISD::FREM:
   case ISD::FSUB:
@@ -157,6 +164,12 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     R = ScalarizeVecRes_StrictFPOp(N);
     break;
   }
@@ -781,8 +794,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FMUL:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::FDIV:
@@ -800,6 +813,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
+  case ISD::SADDSAT:
+  case ISD::UADDSAT:
+  case ISD::SSUBSAT:
+  case ISD::USUBSAT:
     SplitVecRes_BinOp(N, Lo, Hi);
     break;
   case ISD::FMA:
@@ -823,6 +840,12 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     SplitVecRes_StrictFPOp(N, Lo, Hi);
     break;
   }
@@ -1797,10 +1820,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
   case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
   case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
   case ISD::VECREDUCE_FMAX:
-    CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+    CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXIMUM;
     break;
   case ISD::VECREDUCE_FMIN:
-    CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+    CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINIMUM;
     break;
   default:
     llvm_unreachable("Unexpected reduce ISD node");
@@ -2349,8 +2372,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::XOR:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
-  case ISD::FMINNAN:
-  case ISD::FMAXNAN:
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM:
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
@@ -2389,6 +2412,12 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::STRICT_FLOG2:
   case ISD::STRICT_FRINT:
   case ISD::STRICT_FNEARBYINT:
+  case ISD::STRICT_FMAXNUM:
+  case ISD::STRICT_FMINNUM:
+  case ISD::STRICT_FCEIL:
+  case ISD::STRICT_FFLOOR:
+  case ISD::STRICT_FROUND:
+  case ISD::STRICT_FTRUNC:
     Res = WidenVecRes_StrictFP(N);
     break;
 
@@ -2794,9 +2823,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
       // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
       // accepts fewer elements in the result than in the input.
       if (Opcode == ISD::SIGN_EXTEND)
-        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+        return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
       if (Opcode == ISD::ZERO_EXTEND)
-        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+        return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, WidenVT, InOp);
     }
   }
 
@@ -2809,11 +2838,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
     if (WidenNumElts % InVTNumElts == 0) {
       // Widen the input and call convert on the widened input vector.
       unsigned NumConcat = WidenNumElts/InVTNumElts;
-      SmallVector<SDValue, 16> Ops(NumConcat);
+      SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
       Ops[0] = InOp;
-      SDValue UndefVal = DAG.getUNDEF(InVT);
-      for (unsigned i = 1; i != NumConcat; ++i)
-        Ops[i] = UndefVal;
       SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
       if (N->getNumOperands() == 1)
         return DAG.getNode(Opcode, DL, WidenVT, InVec);
@@ -2832,11 +2858,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   }
 
   // Otherwise unroll into some nasty scalar code and rebuild the vector.
-  SmallVector<SDValue, 16> Ops(WidenNumElts);
   EVT EltVT = WidenVT.getVectorElementType();
-  unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
-  unsigned i;
-  for (i=0; i < MinElts; ++i) {
+  SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
+  // Use the original element count so we don't do more scalar opts than
+  // necessary.
+  unsigned MinElts = N->getValueType(0).getVectorNumElements();
+  for (unsigned i=0; i < MinElts; ++i) {
     SDValue Val = DAG.getNode(
         ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
         DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
@@ -2846,10 +2873,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
       Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
   }
 
-  SDValue UndefVal = DAG.getUNDEF(EltVT);
-  for (; i < WidenNumElts; ++i)
-    Ops[i] = UndefVal;
-
   return DAG.getBuildVector(WidenVT, DL, Ops);
 }
 
@@ -2872,11 +2895,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
     if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
       switch (Opcode) {
       case ISD::ANY_EXTEND_VECTOR_INREG:
-        return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
       case ISD::SIGN_EXTEND_VECTOR_INREG:
-        return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
       case ISD::ZERO_EXTEND_VECTOR_INREG:
-        return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+        return DAG.getNode(Opcode, DL, WidenVT, InOp);
       }
     }
   }
@@ -3028,22 +3049,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
     }
 
     if (TLI.isTypeLegal(NewInVT)) {
-      // Because the result and the input are different vector types, widening
-      // the result could create a legal type but widening the input might make
-      // it an illegal type that might lead to repeatedly splitting the input
-      // and then widening it. To avoid this, we widen the input only if
-      // it results in a legal type.
-      SmallVector<SDValue, 16> Ops(NewNumElts);
-      SDValue UndefVal = DAG.getUNDEF(InVT);
-      Ops[0] = InOp;
-      for (unsigned i = 1; i < NewNumElts; ++i)
-        Ops[i] = UndefVal;
-
       SDValue NewVec;
-      if (InVT.isVector())
+      if (InVT.isVector()) {
+        // Because the result and the input are different vector types, widening
+        // the result could create a legal type but widening the input might make
+        // it an illegal type that might lead to repeatedly splitting the input
+        // and then widening it. To avoid this, we widen the input only if
+        // it results in a legal type.
+        SmallVector<SDValue, 16> Ops(NewNumElts, DAG.getUNDEF(InVT));
+        Ops[0] = InOp;
+
         NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
-      else
-        NewVec = DAG.getBuildVector(NewInVT, dl, Ops);
+      } else {
+        NewVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewInVT, InOp);
+      }
       return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
     }
   }
@@ -3374,16 +3393,6 @@ SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
   return Mask;
 }
 
-// Get the target mask VT, and widen if needed.
-EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
-  assert(SetCC->getOpcode() == ISD::SETCC);
-  LLVMContext &Ctx = *DAG.getContext();
-  EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
-  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
-    MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
-  return MaskVT;
-}
-
 // This method tries to handle VSELECT and its mask by legalizing operands
 // (which may require widening) and if needed adjusting the mask vector type
 // to match that of the VSELECT. Without it, many cases end up with
@@ -3451,7 +3460,7 @@ SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
 
   SDValue Mask;
   if (Cond->getOpcode() == ISD::SETCC) {
-    EVT MaskVT = getSETCCWidenedResultTy(Cond);
+    EVT MaskVT = getSetCCResultType(Cond.getOperand(0).getValueType());
     Mask = convertMask(Cond, MaskVT, ToMaskVT);
   } else if (isLogicalMaskOp(Cond->getOpcode()) &&
              Cond->getOperand(0).getOpcode() == ISD::SETCC &&
@@ -3459,8 +3468,8 @@ SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
     // Cond is (AND/OR/XOR (SETCC, SETCC))
     SDValue SETCC0 = Cond->getOperand(0);
     SDValue SETCC1 = Cond->getOperand(1);
-    EVT VT0 = getSETCCWidenedResultTy(SETCC0);
-    EVT VT1 = getSETCCWidenedResultTy(SETCC1);
+    EVT VT0 = getSetCCResultType(SETCC0.getOperand(0).getValueType());
+    EVT VT1 = getSetCCResultType(SETCC1.getOperand(0).getValueType());
     unsigned ScalarBits0 = VT0.getScalarSizeInBits();
     unsigned ScalarBits1 = VT1.getScalarSizeInBits();
     unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
@@ -3723,11 +3732,11 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
   default:
     llvm_unreachable("Extend legalization on extend operation!");
   case ISD::ANY_EXTEND:
-    return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
+    return DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, InOp);
   case ISD::SIGN_EXTEND:
-    return DAG.getSignExtendVectorInReg(InOp, DL, VT);
+    return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, InOp);
   case ISD::ZERO_EXTEND:
-    return DAG.getZeroExtendVectorInReg(InOp, DL, VT);
+    return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, InOp);
   }
 }
 
@@ -3757,8 +3766,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
                                 InVT.getVectorNumElements());
   if (TLI.isTypeLegal(WideVT)) {
     SDValue Res = DAG.getNode(Opcode, dl, WideVT, InOp);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
-                       DAG.getIntPtrConstant(0, dl));
+    return DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+        DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
   }
 
   EVT InEltVT = InVT.getVectorElementType();
@@ -3800,20 +3810,31 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
-  // If the input vector is not legal, it is likely that we will not find a
-  // legal vector of the same size. Replace the concatenate vector with a
-  // nasty build vector.
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
+  EVT InVT = N->getOperand(0).getValueType();
   SDLoc dl(N);
+
+  // If the widen width for this operand is the same as the width of the concat
+  // and all but the first operand is undef, just use the widened operand.
+  unsigned NumOperands = N->getNumOperands();
+  if (VT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+    unsigned i;
+    for (i = 1; i < NumOperands; ++i)
+      if (!N->getOperand(i).isUndef())
+        break;
+
+    if (i == NumOperands)
+      return GetWidenedVector(N->getOperand(0));
+  }
+
+  // Otherwise, fall back to a nasty build vector.
   unsigned NumElts = VT.getVectorNumElements();
   SmallVector<SDValue, 16> Ops(NumElts);
 
-  EVT InVT = N->getOperand(0).getValueType();
   unsigned NumInElts = InVT.getVectorNumElements();
 
   unsigned Idx = 0;
-  unsigned NumOperands = N->getNumOperands();
   for (unsigned i=0; i < NumOperands; ++i) {
     SDValue InOp = N->getOperand(i);
     assert(getTypeAction(InOp.getValueType()) ==
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0f8bd0808672b3107d2e4474c16ecaedb00f709c..fce14d53c22a9a3c84e0a2acc58f343fd04b64e2 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1118,39 +1118,6 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
                  getConstant(Imm, DL, Op.getValueType()));
 }
 
-SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL,
-                                              EVT VT) {
-  assert(VT.isVector() && "This DAG node is restricted to vector types.");
-  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
-         "The sizes of the input and result must match in order to perform the "
-         "extend in-register.");
-  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
-         "The destination vector type must have fewer lanes than the input.");
-  return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
-}
-
-SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL,
-                                               EVT VT) {
-  assert(VT.isVector() && "This DAG node is restricted to vector types.");
-  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
-         "The sizes of the input and result must match in order to perform the "
-         "extend in-register.");
-  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
-         "The destination vector type must have fewer lanes than the input.");
-  return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
-}
-
-SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
-                                               EVT VT) {
-  assert(VT.isVector() && "This DAG node is restricted to vector types.");
-  assert(VT.getSizeInBits() == Op.getValueSizeInBits() &&
-         "The sizes of the input and result must match in order to perform the "
-         "extend in-register.");
-  assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
-         "The destination vector type must have fewer lanes than the input.");
-  return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op);
-}
-
 /// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
 SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
   EVT EltVT = VT.getScalarType();
@@ -3712,9 +3679,30 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
     // TODO: Refine on operand
     return false;
   }
-
-  // TODO: Handle FMINNUM/FMAXNUM/FMINNAN/FMAXNAN when there is an agreement on
-  // what they should do.
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM: {
+    // Only one needs to be known not-nan, since it will be returned if the
+    // other ends up being one.
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) ||
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE: {
+    if (SNaN)
+      return true;
+    // This can return a NaN if either operand is an sNaN, or if both operands
+    // are NaN.
+    return (isKnownNeverNaN(Op.getOperand(0), false, Depth + 1) &&
+            isKnownNeverSNaN(Op.getOperand(1), Depth + 1)) ||
+           (isKnownNeverNaN(Op.getOperand(1), false, Depth + 1) &&
+            isKnownNeverSNaN(Op.getOperand(0), Depth + 1));
+  }
+  case ISD::FMINIMUM:
+  case ISD::FMAXIMUM: {
+    // TODO: Does this quiet or return the origina NaN as-is?
+    return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+           isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+  }
   case ISD::EXTRACT_VECTOR_ELT: {
     return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
   }
@@ -3784,6 +3772,38 @@ bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
   return (computeKnownBits(A).Zero | computeKnownBits(B).Zero).isAllOnesValue();
 }
 
+static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
+                                ArrayRef<SDValue> Ops,
+                                SelectionDAG &DAG) {
+  int NumOps = Ops.size();
+  assert(NumOps != 0 && "Can't build an empty vector!");
+  assert(VT.getVectorNumElements() == (unsigned)NumOps &&
+         "Incorrect element count in BUILD_VECTOR!");
+
+  // BUILD_VECTOR of UNDEFs is UNDEF.
+  if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+    return DAG.getUNDEF(VT);
+
+  // BUILD_VECTOR of seq extract/insert from the same vector + type is Identity.
+  SDValue IdentitySrc;
+  bool IsIdentity = true;
+  for (int i = 0; i != NumOps; ++i) {
+    if (Ops[i].getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        Ops[i].getOperand(0).getValueType() != VT ||
+        (IdentitySrc && Ops[i].getOperand(0) != IdentitySrc) ||
+        !isa<ConstantSDNode>(Ops[i].getOperand(1)) ||
+        cast<ConstantSDNode>(Ops[i].getOperand(1))->getAPIntValue() != i) {
+      IsIdentity = false;
+      break;
+    }
+    IdentitySrc = Ops[i].getOperand(0);
+  }
+  if (IsIdentity)
+    return IdentitySrc;
+
+  return SDValue();
+}
+
 static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
                                   ArrayRef<SDValue> Ops,
                                   SelectionDAG &DAG) {
@@ -3867,9 +3887,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     case ISD::SIGN_EXTEND:
       return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
                          C->isTargetOpcode(), C->isOpaque());
+    case ISD::TRUNCATE:
+      if (C->isOpaque())
+        break;
+      LLVM_FALLTHROUGH;
     case ISD::ANY_EXTEND:
     case ISD::ZERO_EXTEND:
-    case ISD::TRUNCATE:
       return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
                          C->isTargetOpcode(), C->isOpaque());
     case ISD::UINT_TO_FP:
@@ -4035,6 +4058,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::MERGE_VALUES:
   case ISD::CONCAT_VECTORS:
     return Operand;         // Factor, merge or concat of one node?  No need.
+  case ISD::BUILD_VECTOR: {
+    // Attempt to simplify BUILD_VECTOR.
+    SDValue Ops[] = {Operand};
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
   case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
   case ISD::FP_EXTEND:
     assert(VT.isFloatingPoint() &&
@@ -4133,6 +4163,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (OpOpcode == ISD::UNDEF)
       return getUNDEF(VT);
     break;
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    assert(VT.isVector() && "This DAG node is restricted to vector types.");
+    assert(VT.getSizeInBits() == Operand.getValueSizeInBits() &&
+           "The sizes of the input and result must match in order to perform the "
+           "extend in-register.");
+    assert(VT.getVectorNumElements() <
+             Operand.getValueType().getVectorNumElements() &&
+           "The destination vector type must have fewer lanes than the input.");
+    break;
   case ISD::ABS:
     assert(VT.isInteger() && VT == Operand.getValueType() &&
            "Invalid ABS!");
@@ -4368,10 +4409,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
     SDValue V2 = BV2->getOperand(I);
 
     if (SVT.isInteger()) {
-        if (V1->getValueType(0).bitsGT(SVT))
-          V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
-        if (V2->getValueType(0).bitsGT(SVT))
-          V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
+      if (V1->getValueType(0).bitsGT(SVT))
+        V1 = getNode(ISD::TRUNCATE, DL, SVT, V1);
+      if (V2->getValueType(0).bitsGT(SVT))
+        V2 = getNode(ISD::TRUNCATE, DL, SVT, V2);
     }
 
     if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
@@ -4524,6 +4565,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (N2.getOpcode() == ISD::EntryToken) return N1;
     if (N1 == N2) return N1;
     break;
+  case ISD::BUILD_VECTOR: {
+    // Attempt to simplify BUILD_VECTOR.
+    SDValue Ops[] = {N1, N2};
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
   case ISD::CONCAT_VECTORS: {
     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
     SDValue Ops[] = {N1, N2};
@@ -4995,6 +5043,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     }
     break;
   }
+  case ISD::BUILD_VECTOR: {
+    // Attempt to simplify BUILD_VECTOR.
+    SDValue Ops[] = {N1, N2, N3};
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
+  }
   case ISD::CONCAT_VECTORS: {
     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
     SDValue Ops[] = {N1, N2, N3};
@@ -5003,6 +5058,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     break;
   }
   case ISD::SETCC: {
+    assert(VT.isInteger() && "SETCC result type must be an integer!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           "SETCC operands must have the same type!");
+    assert(VT.isVector() == N1.getValueType().isVector() &&
+           "SETCC type should be vector iff the operand type is vector!");
+    assert((!VT.isVector() ||
+            VT.getVectorNumElements() == N1.getValueType().getVectorNumElements()) &&
+           "SETCC vector element counts must match!");
     // Use FoldSetCC to simplify SETCC's.
     if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
       return V;
@@ -5136,8 +5199,11 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
     assert(C->getAPIntValue().getBitWidth() == 8);
     APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
-    if (VT.isInteger())
-      return DAG.getConstant(Val, dl, VT);
+    if (VT.isInteger()) {
+      bool IsOpaque = VT.getSizeInBits() > 64 ||
+          !DAG.getTargetLoweringInfo().isLegalStoreImmediate(C->getSExtValue());
+      return DAG.getConstant(Val, dl, VT, false, IsOpaque);
+    }
     return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl,
                              VT);
   }
@@ -6753,6 +6819,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
 
   switch (Opcode) {
   default: break;
+  case ISD::BUILD_VECTOR:
+    // Attempt to simplify BUILD_VECTOR.
+    if (SDValue V = FoldBUILD_VECTOR(DL, VT, Ops, *this))
+      return V;
+    break;
   case ISD::CONCAT_VECTORS:
     // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
     if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
@@ -7311,6 +7382,12 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
     NewOpc = ISD::FNEARBYINT;
     IsUnary = true;
     break;
+  case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break;
+  case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break;
+  case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; IsUnary = true; break;
+  case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; IsUnary = true; break;
+  case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; IsUnary = true; break;
+  case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; IsUnary = true; break;
   }
 
   // We're taking this node out of the chain, so we need to re-link things.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index c859f16e74fef991ff63e351292e148af17b2302..8c57f18183e95777dd6210dd7440225559509117 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -19,8 +19,9 @@
 
 using namespace llvm;
 
-bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
-                                     const SelectionDAG &DAG, int64_t &Off) {
+bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other,
+                                     const SelectionDAG &DAG,
+                                     int64_t &Off) const {
   // Conservatively fail if we a match failed..
   if (!Base.getNode() || !Other.Base.getNode())
     return false;
@@ -75,7 +76,7 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
 }
 
 /// Parses tree in Ptr for base, index, offset addresses.
-BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
+BaseIndexOffset BaseIndexOffset::match(const LSBaseSDNode *N,
                                        const SelectionDAG &DAG) {
   SDValue Ptr = N->getBasePtr();
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 868160c77a3558b2ff05f56b0b2d29a5c8f1d41c..bf24d7f75622747e28589419c2c64f2ffe1c8734 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -88,6 +88,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
@@ -121,6 +122,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "isel"
 
@@ -1824,7 +1826,6 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
   SwitchCases.push_back(CB);
 }
 
-/// FindMergedConditions - If Cond is an expression like
 void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
                                                MachineBasicBlock *TBB,
                                                MachineBasicBlock *FBB,
@@ -1836,13 +1837,12 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
                                                bool InvertCond) {
   // Skip over not part of the tree and remember to invert op and operands at
   // next level.
-  if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) {
-    const Value *CondOp = BinaryOperator::getNotArgument(Cond);
-    if (InBlock(CondOp, CurBB->getBasicBlock())) {
-      FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
-                           !InvertCond);
-      return;
-    }
+  Value *NotCond;
+  if (match(Cond, m_OneUse(m_Not(m_Value(NotCond)))) &&
+      InBlock(NotCond, CurBB->getBasicBlock())) {
+    FindMergedConditions(NotCond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+                         !InvertCond);
+    return;
   }
 
   const Instruction *BOp = dyn_cast<Instruction>(Cond);
@@ -2972,16 +2972,16 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
     case SPF_FMINNUM:
       switch (SPR.NaNBehavior) {
       case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
-      case SPNB_RETURNS_NAN:   Opc = ISD::FMINNAN; break;
+      case SPNB_RETURNS_NAN:   Opc = ISD::FMINIMUM; break;
       case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
       case SPNB_RETURNS_ANY: {
         if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
           Opc = ISD::FMINNUM;
-        else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT))
-          Opc = ISD::FMINNAN;
+        else if (TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT))
+          Opc = ISD::FMINIMUM;
         else if (UseScalarMinMax)
           Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
-            ISD::FMINNUM : ISD::FMINNAN;
+            ISD::FMINNUM : ISD::FMINIMUM;
         break;
       }
       }
@@ -2989,17 +2989,17 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
     case SPF_FMAXNUM:
       switch (SPR.NaNBehavior) {
       case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
-      case SPNB_RETURNS_NAN:   Opc = ISD::FMAXNAN; break;
+      case SPNB_RETURNS_NAN:   Opc = ISD::FMAXIMUM; break;
       case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
       case SPNB_RETURNS_ANY:
 
         if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
           Opc = ISD::FMAXNUM;
-        else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT))
-          Opc = ISD::FMAXNAN;
+        else if (TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT))
+          Opc = ISD::FMAXIMUM;
         else if (UseScalarMinMax)
           Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
-            ISD::FMAXNUM : ISD::FMAXNAN;
+            ISD::FMAXNUM : ISD::FMAXIMUM;
         break;
       }
       break;
@@ -5050,6 +5050,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout())));
     return nullptr;
+  case Intrinsic::sponentry:
+    setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
+                             TLI.getPointerTy(DAG.getDataLayout())));
+    return nullptr;
   case Intrinsic::frameaddress:
     setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
                              TLI.getPointerTy(DAG.getDataLayout()),
@@ -5565,8 +5569,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::minnum: {
     auto VT = getValue(I.getArgOperand(0)).getValueType();
     unsigned Opc =
-        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)
-            ? ISD::FMINNAN
+        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINIMUM, VT)
+            ? ISD::FMINIMUM
             : ISD::FMINNUM;
     setValue(&I, DAG.getNode(Opc, sdl, VT,
                              getValue(I.getArgOperand(0)),
@@ -5576,14 +5580,26 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::maxnum: {
     auto VT = getValue(I.getArgOperand(0)).getValueType();
     unsigned Opc =
-        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)
-            ? ISD::FMAXNAN
+        I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXIMUM, VT)
+            ? ISD::FMAXIMUM
             : ISD::FMAXNUM;
     setValue(&I, DAG.getNode(Opc, sdl, VT,
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1))));
     return nullptr;
   }
+  case Intrinsic::minimum:
+    setValue(&I, DAG.getNode(ISD::FMINIMUM, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return nullptr;
+  case Intrinsic::maximum:
+    setValue(&I, DAG.getNode(ISD::FMAXIMUM, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return nullptr;
   case Intrinsic::copysign:
     setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
@@ -5615,6 +5631,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::experimental_constrained_log2:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
+  case Intrinsic::experimental_constrained_maxnum:
+  case Intrinsic::experimental_constrained_minnum:
+  case Intrinsic::experimental_constrained_ceil:
+  case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_round:
+  case Intrinsic::experimental_constrained_trunc:
     visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
     return nullptr;
   case Intrinsic::fmuladd: {
@@ -5759,6 +5781,30 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Or));
     return nullptr;
   }
+  case Intrinsic::sadd_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::SADDSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
+  case Intrinsic::uadd_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::UADDSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
+  case Intrinsic::ssub_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::SSUBSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
+  case Intrinsic::usub_sat: {
+    SDValue Op1 = getValue(I.getArgOperand(0));
+    SDValue Op2 = getValue(I.getArgOperand(1));
+    setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
+    return nullptr;
+  }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
     Res = DAG.getNode(
@@ -5851,6 +5897,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, Res);
     return nullptr;
   }
+
+  case Intrinsic::is_constant:
+    // If this wasn't constant-folded away by now, then it's not a
+    // constant.
+    setValue(&I, DAG.getConstant(0, sdl, MVT::i1));
+    return nullptr;
+
   case Intrinsic::annotation:
   case Intrinsic::ptr_annotation:
   case Intrinsic::launder_invariant_group:
@@ -6270,12 +6323,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return nullptr;
   }
 
-  case Intrinsic::wasm_landingpad_index: {
-    // TODO store landing pad index in a map, which will be used when generating
-    // LSDA information
+  case Intrinsic::wasm_landingpad_index:
+    // Information this intrinsic contained has been transferred to
+    // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely
+    // delete it now.
     return nullptr;
   }
-  }
 }
 
 void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
@@ -6338,6 +6391,24 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
   case Intrinsic::experimental_constrained_nearbyint:
     Opcode = ISD::STRICT_FNEARBYINT;
     break;
+  case Intrinsic::experimental_constrained_maxnum:
+    Opcode = ISD::STRICT_FMAXNUM;
+    break;
+  case Intrinsic::experimental_constrained_minnum:
+    Opcode = ISD::STRICT_FMINNUM;
+    break;
+  case Intrinsic::experimental_constrained_ceil:
+    Opcode = ISD::STRICT_FCEIL;
+    break;
+  case Intrinsic::experimental_constrained_floor:
+    Opcode = ISD::STRICT_FFLOOR;
+    break;
+  case Intrinsic::experimental_constrained_round:
+    Opcode = ISD::STRICT_FROUND;
+    break;
+  case Intrinsic::experimental_constrained_trunc:
+    Opcode = ISD::STRICT_FTRUNC;
+    break;
   }
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Chain = getRoot();
@@ -6432,7 +6503,7 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
       WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
       EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
                                 BeginLabel, EndLabel);
-    } else {
+    } else if (!isScopedEHPersonality(Pers)) {
       MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
     }
   }
@@ -7949,7 +8020,8 @@ SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
     return Op;
 
   APInt Hi = CR.getUnsignedMax();
-  unsigned Bits = Hi.getActiveBits();
+  unsigned Bits = std::max(Hi.getActiveBits(),
+                           static_cast<unsigned>(IntegerType::MIN_INT_BITS));
 
   EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
 
@@ -9237,7 +9309,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
 /// the end.
 void
 SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
-  const TerminatorInst *TI = LLVMBB->getTerminator();
+  const Instruction *TI = LLVMBB->getTerminator();
 
   SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 594a587e412f4edbdc4a5fb0742cefd60492a50d..02d45df5864d906aadf12959eff1ab9cfe64b3e9 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -124,6 +124,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::RETURNADDR:                 return "RETURNADDR";
   case ISD::ADDROFRETURNADDR:           return "ADDROFRETURNADDR";
   case ISD::FRAMEADDR:                  return "FRAMEADDR";
+  case ISD::SPONENTRY:                  return "SPONENTRY";
   case ISD::LOCAL_RECOVER:              return "LOCAL_RECOVER";
   case ISD::READ_REGISTER:              return "READ_REGISTER";
   case ISD::WRITE_REGISTER:             return "WRITE_REGISTER";
@@ -175,9 +176,13 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   // Unary operators
   case ISD::FABS:                       return "fabs";
   case ISD::FMINNUM:                    return "fminnum";
+  case ISD::STRICT_FMINNUM:             return "strict_fminnum";
   case ISD::FMAXNUM:                    return "fmaxnum";
-  case ISD::FMINNAN:                    return "fminnan";
-  case ISD::FMAXNAN:                    return "fmaxnan";
+  case ISD::STRICT_FMAXNUM:             return "strict_fmaxnum";
+  case ISD::FMINNUM_IEEE:               return "fminnum_ieee";
+  case ISD::FMAXNUM_IEEE:               return "fmaxnum_ieee";
+  case ISD::FMINIMUM:                   return "fminimum";
+  case ISD::FMAXIMUM:                   return "fmaximum";
   case ISD::FNEG:                       return "fneg";
   case ISD::FSQRT:                      return "fsqrt";
   case ISD::STRICT_FSQRT:               return "strict_fsqrt";
@@ -188,13 +193,17 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::STRICT_FCOS:                return "strict_fcos";
   case ISD::FSINCOS:                    return "fsincos";
   case ISD::FTRUNC:                     return "ftrunc";
+  case ISD::STRICT_FTRUNC:              return "strict_ftrunc";
   case ISD::FFLOOR:                     return "ffloor";
+  case ISD::STRICT_FFLOOR:              return "strict_ffloor";
   case ISD::FCEIL:                      return "fceil";
+  case ISD::STRICT_FCEIL:               return "strict_fceil";
   case ISD::FRINT:                      return "frint";
   case ISD::STRICT_FRINT:               return "strict_frint";
   case ISD::FNEARBYINT:                 return "fnearbyint";
   case ISD::STRICT_FNEARBYINT:          return "strict_fnearbyint";
   case ISD::FROUND:                     return "fround";
+  case ISD::STRICT_FROUND:              return "strict_fround";
   case ISD::FEXP:                       return "fexp";
   case ISD::STRICT_FEXP:                return "strict_fexp";
   case ISD::FEXP2:                      return "fexp2";
@@ -282,6 +291,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::SRA_PARTS:                  return "sra_parts";
   case ISD::SRL_PARTS:                  return "srl_parts";
 
+  case ISD::SADDSAT:                    return "saddsat";
+  case ISD::UADDSAT:                    return "uaddsat";
+  case ISD::SSUBSAT:                    return "ssubsat";
+  case ISD::USUBSAT:                    return "usubsat";
+
   // Conversion operators.
   case ISD::SIGN_EXTEND:                return "sign_extend";
   case ISD::ZERO_EXTEND:                return "zero_extend";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index feb57eeafe7d17f3292a4da30199f0ee9b5db22c..dca358032fb59fec6881a27b0a3ac9a136cc15fb 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -451,7 +452,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       if (!succ_empty(&BB))
         continue;
 
-      const TerminatorInst *Term = BB.getTerminator();
+      const Instruction *Term = BB.getTerminator();
       if (isa<UnreachableInst>(Term) || isa<ReturnInst>(Term))
         continue;
 
@@ -1128,6 +1129,37 @@ static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) {
   return false;
 }
 
+// wasm.landingpad.index intrinsic is for associating a landing pad index number
+// with a catchpad instruction. Retrieve the landing pad index in the intrinsic
+// and store the mapping in the function.
+static void mapWasmLandingPadIndex(MachineBasicBlock *MBB,
+                                   const CatchPadInst *CPI) {
+  MachineFunction *MF = MBB->getParent();
+  // In case of single catch (...), we don't emit LSDA, so we don't need
+  // this information.
+  bool IsSingleCatchAllClause =
+      CPI->getNumArgOperands() == 1 &&
+      cast<Constant>(CPI->getArgOperand(0))->isNullValue();
+  if (!IsSingleCatchAllClause) {
+    // Create a mapping from landing pad label to landing pad index.
+    bool IntrFound = false;
+    for (const User *U : CPI->users()) {
+      if (const auto *Call = dyn_cast<IntrinsicInst>(U)) {
+        Intrinsic::ID IID = Call->getIntrinsicID();
+        if (IID == Intrinsic::wasm_landingpad_index) {
+          Value *IndexArg = Call->getArgOperand(1);
+          int Index = cast<ConstantInt>(IndexArg)->getZExtValue();
+          MF->setWasmLandingPadIndex(MBB, Index);
+          IntrFound = true;
+          break;
+        }
+      }
+    }
+    assert(IntrFound && "wasm.landingpad.index intrinsic not found!");
+    (void)IntrFound;
+  }
+}
+
 /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
 /// do other setup for EH landing-pad blocks.
 bool SelectionDAGISel::PrepareEHLandingPad() {
@@ -1137,44 +1169,48 @@ bool SelectionDAGISel::PrepareEHLandingPad() {
   const TargetRegisterClass *PtrRC =
       TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
 
+  auto Pers = classifyEHPersonality(PersonalityFn);
+
   // Catchpads have one live-in register, which typically holds the exception
   // pointer or code.
-  if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
-    if (hasExceptionPointerOrCodeUser(CPI)) {
-      // Get or create the virtual register to hold the pointer or code.  Mark
-      // the live in physreg and copy into the vreg.
-      MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
-      assert(EHPhysReg && "target lacks exception pointer register");
-      MBB->addLiveIn(EHPhysReg);
-      unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
-      BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
-              TII->get(TargetOpcode::COPY), VReg)
-          .addReg(EHPhysReg, RegState::Kill);
+  if (isFuncletEHPersonality(Pers)) {
+    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
+      if (hasExceptionPointerOrCodeUser(CPI)) {
+        // Get or create the virtual register to hold the pointer or code.  Mark
+        // the live in physreg and copy into the vreg.
+        MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
+        assert(EHPhysReg && "target lacks exception pointer register");
+        MBB->addLiveIn(EHPhysReg);
+        unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
+        BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+                TII->get(TargetOpcode::COPY), VReg)
+            .addReg(EHPhysReg, RegState::Kill);
+      }
     }
     return true;
   }
 
-  if (!LLVMBB->isLandingPad())
-    return true;
-
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MCSymbol *Label = MF->addLandingPad(MBB);
 
-  // Assign the call site to the landing pad's begin label.
-  MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
-
   const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
-  // Mark exception register as live in.
-  if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
-    FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
-
-  // Mark exception selector register as live in.
-  if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
-    FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+  if (Pers == EHPersonality::Wasm_CXX) {
+    if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI()))
+      mapWasmLandingPadIndex(MBB, CPI);
+  } else {
+    // Assign the call site to the landing pad's begin label.
+    MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+    // Mark exception register as live in.
+    if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
+      FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
+    // Mark exception selector register as live in.
+    if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
+      FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+  }
 
   return true;
 }
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d3a50788f79c802bce72ca1608e74ca40a2a9039..2bc9090428b4a45a7e1c58091f5c29f7ff0a45cc 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1335,6 +1335,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     LLVM_FALLTHROUGH;
   }
   default:
+    if (Op.getOpcode() >= ISD::BUILTIN_OP_END) {
+      if (SimplifyDemandedBitsForTargetNode(Op, DemandedBits, Known, TLO,
+                                            Depth))
+        return true;
+      break;
+    }
+
     // Just use computeKnownBits to compute output bits.
     TLO.DAG.computeKnownBits(Op, Known, Depth);
     break;
@@ -1712,7 +1719,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
     break;
   }
   case ISD::ADD:
-  case ISD::SUB: {
+  case ISD::SUB:
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM: {
     APInt SrcUndef, SrcZero;
     if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
                                    SrcZero, TLO, Depth + 1))
@@ -1798,6 +1810,23 @@ bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   return false;
 }
 
+bool TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &DemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+          Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+          Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+         "Should use SimplifyDemandedBits if you don't know whether Op"
+         " is a target node!");
+  EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
+  return false;
+}
+
 bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                   const SelectionDAG &DAG,
                                                   bool SNaN,
@@ -2255,7 +2284,8 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       }
       if (bestWidth) {
         EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
-        if (newVT.isRound()) {
+        if (newVT.isRound() &&
+            shouldReduceLoadWidth(Lod, ISD::NON_EXTLOAD, newVT)) {
           EVT PtrType = Lod->getOperand(1).getValueType();
           SDValue Ptr = Lod->getBasePtr();
           if (bestOffset != 0)
@@ -4047,64 +4077,382 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
 
 bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
                                SelectionDAG &DAG) const {
-  EVT VT = Node->getOperand(0).getValueType();
-  EVT NVT = Node->getValueType(0);
+  SDValue Src = Node->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
   SDLoc dl(SDValue(Node, 0));
 
   // FIXME: Only f32 to i64 conversions are supported.
-  if (VT != MVT::f32 || NVT != MVT::i64)
+  if (SrcVT != MVT::f32 || DstVT != MVT::i64)
     return false;
 
   // Expand f32 -> i64 conversion
   // This algorithm comes from compiler-rt's implementation of fixsfdi:
   // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
-  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
-                                VT.getSizeInBits());
+  unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
+  EVT IntVT = SrcVT.changeTypeToInteger();
+  EVT IntShVT = getShiftAmountTy(IntVT, DAG.getDataLayout());
+
   SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
   SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
   SDValue Bias = DAG.getConstant(127, dl, IntVT);
-  SDValue SignMask = DAG.getConstant(APInt::getSignMask(VT.getSizeInBits()), dl,
-                                     IntVT);
-  SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, dl, IntVT);
+  SDValue SignMask = DAG.getConstant(APInt::getSignMask(SrcEltBits), dl, IntVT);
+  SDValue SignLowBit = DAG.getConstant(SrcEltBits - 1, dl, IntVT);
   SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);
 
-  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
+  SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Src);
 
-  auto &DL = DAG.getDataLayout();
   SDValue ExponentBits = DAG.getNode(
       ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
-      DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT, DL)));
+      DAG.getZExtOrTrunc(ExponentLoBit, dl, IntShVT));
   SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
 
-  SDValue Sign = DAG.getNode(
-      ISD::SRA, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
-      DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT, DL)));
-  Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
+  SDValue Sign = DAG.getNode(ISD::SRA, dl, IntVT,
+                             DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+                             DAG.getZExtOrTrunc(SignLowBit, dl, IntShVT));
+  Sign = DAG.getSExtOrTrunc(Sign, dl, DstVT);
 
   SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
-      DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
-      DAG.getConstant(0x00800000, dl, IntVT));
+                          DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+                          DAG.getConstant(0x00800000, dl, IntVT));
 
-  R = DAG.getZExtOrTrunc(R, dl, NVT);
+  R = DAG.getZExtOrTrunc(R, dl, DstVT);
 
   R = DAG.getSelectCC(
       dl, Exponent, ExponentLoBit,
-      DAG.getNode(ISD::SHL, dl, NVT, R,
+      DAG.getNode(ISD::SHL, dl, DstVT, R,
                   DAG.getZExtOrTrunc(
                       DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
-                      dl, getShiftAmountTy(IntVT, DL))),
-      DAG.getNode(ISD::SRL, dl, NVT, R,
+                      dl, IntShVT)),
+      DAG.getNode(ISD::SRL, dl, DstVT, R,
                   DAG.getZExtOrTrunc(
                       DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
-                      dl, getShiftAmountTy(IntVT, DL))),
+                      dl, IntShVT)),
       ISD::SETGT);
 
-  SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
-      DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
-      Sign);
+  SDValue Ret = DAG.getNode(ISD::SUB, dl, DstVT,
+                            DAG.getNode(ISD::XOR, dl, DstVT, R, Sign), Sign);
 
   Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, dl, IntVT),
-      DAG.getConstant(0, dl, NVT), Ret, ISD::SETLT);
+                           DAG.getConstant(0, dl, DstVT), Ret, ISD::SETLT);
+  return true;
+}
+
+bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
+                                      SelectionDAG &DAG) const {
+  SDLoc dl(SDValue(Node, 0));
+  SDValue Src = Node->getOperand(0);
+
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
+  EVT SetCCVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (DstVT.isVector() && (!isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) ||
+                           !isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
+    return false;
+
+  // Expand based on maximum range of FP_TO_SINT:
+  // True = fp_to_sint(Src)
+  // False = 0x8000000000000000 + fp_to_sint(Src - 0x8000000000000000)
+  // Result = select (Src < 0x8000000000000000), True, False
+  APFloat apf(DAG.EVTToAPFloatSemantics(SrcVT),
+              APInt::getNullValue(SrcVT.getScalarSizeInBits()));
+  APInt x = APInt::getSignMask(DstVT.getScalarSizeInBits());
+  (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
+
+  SDValue Tmp1 = DAG.getConstantFP(apf, dl, SrcVT);
+  SDValue Tmp2 = DAG.getSetCC(dl, SetCCVT, Src, Tmp1, ISD::SETLT);
+  SDValue True = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+  // TODO: Should any fast-math-flags be set for the FSUB?
+  SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT,
+                              DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Tmp1));
+  False =
+      DAG.getNode(ISD::XOR, dl, DstVT, False, DAG.getConstant(x, dl, DstVT));
+  Result = DAG.getSelect(dl, DstVT, Tmp2, True, False);
+  return true;
+}
+
+bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
+                                      SelectionDAG &DAG) const {
+  SDValue Src = Node->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Node->getValueType(0);
+
+  if (SrcVT.getScalarType() != MVT::i64)
+    return false;
+
+  SDLoc dl(SDValue(Node, 0));
+  EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
+
+  if (DstVT.getScalarType() == MVT::f32) {
+    // Only expand vector types if we have the appropriate vector bit
+    // operations.
+    if (SrcVT.isVector() &&
+        (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+         !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+         !isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+      return false;
+
+    // For unsigned conversions, convert them to signed conversions using the
+    // algorithm from the x86_64 __floatundidf in compiler_rt.
+    SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
+
+    SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
+    SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst);
+    SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
+    SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst);
+    SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
+
+    SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or);
+    SDValue Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt);
+
+    // TODO: This really should be implemented using a branch rather than a
+    // select.  We happen to get lucky and machinesink does the right
+    // thing most of the time.  This would be a good candidate for a
+    // pseudo-op, or, even better, for whole-function isel.
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
+
+    SDValue SignBitTest = DAG.getSetCC(
+        dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+    Result = DAG.getSelect(dl, DstVT, SignBitTest, Slow, Fast);
+    return true;
+  }
+
+  if (DstVT.getScalarType() == MVT::f64) {
+    // Only expand vector types if we have the appropriate vector bit
+    // operations.
+    if (SrcVT.isVector() &&
+        (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+         !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+         !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+         !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
+      return false;
+
+    // Implementation of unsigned i64 to f64 following the algorithm in
+    // __floatundidf in compiler_rt. This implementation has the advantage
+    // of performing rounding correctly, both in the default rounding mode
+    // and in all alternate rounding modes.
+    SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
+    SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
+        BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
+    SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
+    SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
+    SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
+
+    SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
+    SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
+    SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
+    SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
+    SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
+    SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
+    SDValue HiSub = DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
+    Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
+    return true;
+  }
+
+  return false;
+}
+
+SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
+                                              SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ?
+    ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+  EVT VT = Node->getValueType(0);
+  if (isOperationLegalOrCustom(NewOp, VT)) {
+    SDValue Quiet0 = Node->getOperand(0);
+    SDValue Quiet1 = Node->getOperand(1);
+
+    if (!Node->getFlags().hasNoNaNs()) {
+      // Insert canonicalizes if it's possible we need to quiet to get correct
+      // sNaN behavior.
+      if (!DAG.isKnownNeverSNaN(Quiet0)) {
+        Quiet0 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet0,
+                             Node->getFlags());
+      }
+      if (!DAG.isKnownNeverSNaN(Quiet1)) {
+        Quiet1 = DAG.getNode(ISD::FCANONICALIZE, dl, VT, Quiet1,
+                             Node->getFlags());
+      }
+    }
+
+    return DAG.getNode(NewOp, dl, VT, Quiet0, Quiet1, Node->getFlags());
+  }
+
+  return SDValue();
+}
+
+bool TargetLowering::expandCTPOP(SDNode *Node, SDValue &Result,
+                                 SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue Op = Node->getOperand(0);
+  unsigned Len = VT.getScalarSizeInBits();
+  assert(VT.isInteger() && "CTPOP not implemented for this type.");
+
+  // TODO: Add support for irregular type lengths.
+  if (!(Len <= 128 && Len % 8 == 0))
+    return false;
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::ADD, VT) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        (Len != 8 && !isOperationLegalOrCustom(ISD::MUL, VT)) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT)))
+    return false;
+
+  // This is the "best" algorithm from
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  SDValue Mask55 =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), dl, VT);
+  SDValue Mask33 =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), dl, VT);
+  SDValue Mask0F =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), dl, VT);
+  SDValue Mask01 =
+      DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
+
+  // v = v - ((v >> 1) & 0x55555555...)
+  Op = DAG.getNode(ISD::SUB, dl, VT, Op,
+                   DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, VT, Op,
+                                           DAG.getConstant(1, dl, ShVT)),
+                               Mask55));
+  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+  Op = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
+                   DAG.getNode(ISD::AND, dl, VT,
+                               DAG.getNode(ISD::SRL, dl, VT, Op,
+                                           DAG.getConstant(2, dl, ShVT)),
+                               Mask33));
+  // v = (v + (v >> 4)) & 0x0F0F0F0F...
+  Op = DAG.getNode(ISD::AND, dl, VT,
+                   DAG.getNode(ISD::ADD, dl, VT, Op,
+                               DAG.getNode(ISD::SRL, dl, VT, Op,
+                                           DAG.getConstant(4, dl, ShVT))),
+                   Mask0F);
+  // v = (v * 0x01010101...) >> (Len - 8)
+  if (Len > 8)
+    Op =
+        DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+                    DAG.getConstant(Len - 8, dl, ShVT));
+
+  Result = Op;
+  return true;
+}
+
+bool TargetLowering::expandCTLZ(SDNode *Node, SDValue &Result,
+                                SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue Op = Node->getOperand(0);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // If the non-ZERO_UNDEF version is supported we can use that instead.
+  if (Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
+      isOperationLegalOrCustom(ISD::CTLZ, VT)) {
+    Result = DAG.getNode(ISD::CTLZ, dl, VT, Op);
+    return true;
+  }
+
+  // If the ZERO_UNDEF version is supported use that and handle the zero case.
+  if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+    Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+                         DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
+    return true;
+  }
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+                        !isOperationLegalOrCustom(ISD::CTPOP, VT) ||
+                        !isOperationLegalOrCustom(ISD::SRL, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
+    return false;
+
+  // for now, we do this:
+  // x = x | (x >> 1);
+  // x = x | (x >> 2);
+  // ...
+  // x = x | (x >>16);
+  // x = x | (x >>32); // for 64-bit input
+  // return popcount(~x);
+  //
+  // Ref: "Hacker's Delight" by Henry Warren
+  for (unsigned i = 0; (1U << i) <= (NumBitsPerElt / 2); ++i) {
+    SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
+    Op = DAG.getNode(ISD::OR, dl, VT, Op,
+                     DAG.getNode(ISD::SRL, dl, VT, Op, Tmp));
+  }
+  Op = DAG.getNOT(dl, Op, VT);
+  Result = DAG.getNode(ISD::CTPOP, dl, VT, Op);
+  return true;
+}
+
+bool TargetLowering::expandCTTZ(SDNode *Node, SDValue &Result,
+                                SelectionDAG &DAG) const {
+  SDLoc dl(Node);
+  EVT VT = Node->getValueType(0);
+  SDValue Op = Node->getOperand(0);
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
+  // If the non-ZERO_UNDEF version is supported we can use that instead.
+  if (Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF &&
+      isOperationLegalOrCustom(ISD::CTTZ, VT)) {
+    Result = DAG.getNode(ISD::CTTZ, dl, VT, Op);
+    return true;
+  }
+
+  // If the ZERO_UNDEF version is supported use that and handle the zero case.
+  if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) {
+    EVT SetCCVT =
+        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
+    SDValue Zero = DAG.getConstant(0, dl, VT);
+    SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+    Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+                         DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
+    return true;
+  }
+
+  // Only expand vector types if we have the appropriate vector bit operations.
+  if (VT.isVector() && (!isPowerOf2_32(NumBitsPerElt) ||
+                        (!isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+                         !isOperationLegalOrCustom(ISD::CTLZ, VT)) ||
+                        !isOperationLegalOrCustom(ISD::SUB, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
+    return false;
+
+  // for now, we use: { return popcount(~x & (x - 1)); }
+  // unless the target has ctlz but not ctpop, in which case we use:
+  // { return 32 - nlz(~x & (x-1)); }
+  // Ref: "Hacker's Delight" by Henry Warren
+  SDValue Tmp = DAG.getNode(
+      ISD::AND, dl, VT, DAG.getNOT(dl, Op, VT),
+      DAG.getNode(ISD::SUB, dl, VT, Op, DAG.getConstant(1, dl, VT)));
+
+  // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+  if (isOperationLegal(ISD::CTLZ, VT) && !isOperationLegal(ISD::CTPOP, VT)) {
+    Result =
+        DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(NumBitsPerElt, dl, VT),
+                    DAG.getNode(ISD::CTLZ, dl, VT, Tmp));
+    return true;
+  }
+
+  Result = DAG.getNode(ISD::CTPOP, dl, VT, Tmp);
   return true;
 }
 
@@ -4646,3 +4994,71 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
   }
   return SDValue();
 }
+
+SDValue TargetLowering::getExpandedSaturationAdditionSubtraction(
+    SDNode *Node, SelectionDAG &DAG) const {
+  unsigned Opcode = Node->getOpcode();
+  unsigned OverflowOp;
+  switch (Opcode) {
+  case ISD::SADDSAT:
+    OverflowOp = ISD::SADDO;
+    break;
+  case ISD::UADDSAT:
+    OverflowOp = ISD::UADDO;
+    break;
+  case ISD::SSUBSAT:
+    OverflowOp = ISD::SSUBO;
+    break;
+  case ISD::USUBSAT:
+    OverflowOp = ISD::USUBO;
+    break;
+  default:
+    llvm_unreachable("Expected method to receive signed or unsigned saturation "
+                     "addition or subtraction node.");
+  }
+  assert(Node->getNumOperands() == 2 && "Expected node to have 2 operands.");
+
+  SDLoc dl(Node);
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  assert(LHS.getValueType().isScalarInteger() &&
+         "Expected operands to be integers. Vector of int arguments should "
+         "already be unrolled.");
+  assert(RHS.getValueType().isScalarInteger() &&
+         "Expected operands to be integers. Vector of int arguments should "
+         "already be unrolled.");
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected both operands to be the same type");
+
+  unsigned BitWidth = LHS.getValueSizeInBits();
+  EVT ResultType = LHS.getValueType();
+  EVT BoolVT =
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType);
+  SDValue Result =
+      DAG.getNode(OverflowOp, dl, DAG.getVTList(ResultType, BoolVT), LHS, RHS);
+  SDValue SumDiff = Result.getValue(0);
+  SDValue Overflow = Result.getValue(1);
+  SDValue Zero = DAG.getConstant(0, dl, ResultType);
+
+  if (Opcode == ISD::UADDSAT) {
+    // Just need to check overflow for SatMax.
+    APInt MaxVal = APInt::getMaxValue(BitWidth);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
+    return DAG.getSelect(dl, ResultType, Overflow, SatMax, SumDiff);
+  } else if (Opcode == ISD::USUBSAT) {
+    // Just need to check overflow for SatMin.
+    APInt MinVal = APInt::getMinValue(BitWidth);
+    SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
+    return DAG.getSelect(dl, ResultType, Overflow, SatMin, SumDiff);
+  } else {
+    // SatMax -> Overflow && SumDiff < 0
+    // SatMin -> Overflow && SumDiff >= 0
+    APInt MinVal = APInt::getSignedMinValue(BitWidth);
+    APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
+    SDValue SatMin = DAG.getConstant(MinVal, dl, ResultType);
+    SDValue SatMax = DAG.getConstant(MaxVal, dl, ResultType);
+    SDValue SumNeg = DAG.getSetCC(dl, BoolVT, SumDiff, Zero, ISD::SETLT);
+    Result = DAG.getSelect(dl, ResultType, SumNeg, SatMax, SatMin);
+    return DAG.getSelect(dl, ResultType, Overflow, Result, SumDiff);
+  }
+}
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index b785fdc42a31861bbb4292ba0eadccddeb55b66b..166ff18e7759efc3ff3df0397b83d3d9431aca41 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -600,14 +600,20 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
     setOperationAction(ISD::FMINNUM, VT, Expand);
     setOperationAction(ISD::FMAXNUM, VT, Expand);
-    setOperationAction(ISD::FMINNAN, VT, Expand);
-    setOperationAction(ISD::FMAXNAN, VT, Expand);
+    setOperationAction(ISD::FMINNUM_IEEE, VT, Expand);
+    setOperationAction(ISD::FMAXNUM_IEEE, VT, Expand);
+    setOperationAction(ISD::FMINIMUM, VT, Expand);
+    setOperationAction(ISD::FMAXIMUM, VT, Expand);
     setOperationAction(ISD::FMAD, VT, Expand);
     setOperationAction(ISD::SMIN, VT, Expand);
     setOperationAction(ISD::SMAX, VT, Expand);
     setOperationAction(ISD::UMIN, VT, Expand);
     setOperationAction(ISD::UMAX, VT, Expand);
     setOperationAction(ISD::ABS, VT, Expand);
+    setOperationAction(ISD::SADDSAT, VT, Expand);
+    setOperationAction(ISD::UADDSAT, VT, Expand);
+    setOperationAction(ISD::SSUBSAT, VT, Expand);
+    setOperationAction(ISD::USUBSAT, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
@@ -1098,7 +1104,7 @@ void TargetLoweringBase::computeRegisterProperties(
       LegalIntReg = IntReg;
     } else {
       RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
-        (const MVT::SimpleValueType)LegalIntReg;
+        (MVT::SimpleValueType)LegalIntReg;
       ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
     }
   }
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index f6882c40531ac97eddb596da4753568ff844d995..341ab927861039d114dad9516291094003b2b287 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -506,6 +506,30 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
   return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
 }
 
+static unsigned getEntrySizeForKind(SectionKind Kind) {
+  if (Kind.isMergeable1ByteCString())
+    return 1;
+  else if (Kind.isMergeable2ByteCString())
+    return 2;
+  else if (Kind.isMergeable4ByteCString())
+    return 4;
+  else if (Kind.isMergeableConst4())
+    return 4;
+  else if (Kind.isMergeableConst8())
+    return 8;
+  else if (Kind.isMergeableConst16())
+    return 16;
+  else if (Kind.isMergeableConst32())
+    return 32;
+  else {
+    // We shouldn't have mergeable C strings or mergeable constants that we
+    // didn't handle above.
+    assert(!Kind.isMergeableCString() && "unknown string width");
+    assert(!Kind.isMergeableConst() && "unknown data width");
+    return 0;
+  }
+}
+
 MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
   StringRef SectionName = GO->getSection();
@@ -550,7 +574,7 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
 
   MCSectionELF *Section = getContext().getELFSection(
       SectionName, getELFSectionType(SectionName, Kind), Flags,
-      /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
+      getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol);
   // Make sure that we did not get some other section with incompatible sh_link.
   // This should not be possible due to UniqueID code above.
   assert(Section->getAssociatedSymbol() == AssociatedSymbol &&
@@ -577,30 +601,6 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
   return ".data.rel.ro";
 }
 
-static unsigned getEntrySizeForKind(SectionKind Kind) {
-  if (Kind.isMergeable1ByteCString())
-    return 1;
-  else if (Kind.isMergeable2ByteCString())
-    return 2;
-  else if (Kind.isMergeable4ByteCString())
-    return 4;
-  else if (Kind.isMergeableConst4())
-    return 4;
-  else if (Kind.isMergeableConst8())
-    return 8;
-  else if (Kind.isMergeableConst16())
-    return 16;
-  else if (Kind.isMergeableConst32())
-    return 32;
-  else {
-    // We shouldn't have mergeable C strings or mergeable constants that we
-    // didn't handle above.
-    assert(!Kind.isMergeableCString() && "unknown string width");
-    assert(!Kind.isMergeableConst() && "unknown data width");
-    return 0;
-  }
-}
-
 static MCSectionELF *selectELFSectionForGlobal(
     MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
     const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
@@ -1748,6 +1748,10 @@ const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
 void TargetLoweringObjectFileWasm::InitializeWasm() {
   StaticCtorSection =
       getContext().getWasmSection(".init_array", SectionKind::getData());
+
+  // We don't use PersonalityEncoding and LSDAEncoding because we don't emit
+  // .cfi directives. We use TTypeEncoding to encode typeinfo global variables.
+  TTypeEncoding = dwarf::DW_EH_PE_absptr;
 }
 
 MCSection *TargetLoweringObjectFileWasm::getStaticCtorSection(
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index a3b24d1cd6645eef6e53d8235daa86374c365d9e..9adacd2ed718a7170f7e9b0d822b6fe73732a682 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -419,8 +419,13 @@ TargetPassConfig::TargetPassConfig()
                      "triple set?");
 }
 
-bool TargetPassConfig::hasLimitedCodeGenPipeline() const {
-  return StartBefore || StartAfter || StopBefore || StopAfter;
+bool TargetPassConfig::willCompleteCodeGenPipeline() {
+  return StopBeforeOpt.empty() && StopAfterOpt.empty();
+}
+
+bool TargetPassConfig::hasLimitedCodeGenPipeline() {
+  return !StartBeforeOpt.empty() || !StartAfterOpt.empty() ||
+         !willCompleteCodeGenPipeline();
 }
 
 std::string
@@ -806,15 +811,17 @@ void TargetPassConfig::addMachinePasses() {
   AddingMachinePasses = true;
 
   // Insert a machine instr printer pass after the specified pass.
-  if (!StringRef(PrintMachineInstrs.getValue()).equals("") &&
-      !StringRef(PrintMachineInstrs.getValue()).equals("option-unspecified")) {
-    const PassRegistry *PR = PassRegistry::getPassRegistry();
-    const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
-    const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
-    assert (TPI && IPI && "Pass ID not registered!");
-    const char *TID = (const char *)(TPI->getTypeInfo());
-    const char *IID = (const char *)(IPI->getTypeInfo());
-    insertPass(TID, IID);
+  StringRef PrintMachineInstrsPassName = PrintMachineInstrs.getValue();
+  if (!PrintMachineInstrsPassName.equals("") &&
+      !PrintMachineInstrsPassName.equals("option-unspecified")) {
+    if (const PassInfo *TPI = getPassInfo(PrintMachineInstrsPassName)) {
+      const PassRegistry *PR = PassRegistry::getPassRegistry();
+      const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
+      assert(IPI && "failed to get \"machineinstr-printer\" PassInfo!");
+      const char *TID = (const char *)(TPI->getTypeInfo());
+      const char *IID = (const char *)(IPI->getTypeInfo());
+      insertPass(TID, IID);
+    }
   }
 
   // Print the instruction selected machine code...
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 99ccb0f9c9f306499b452ced32dd4e1b8c70630b..2e2fe72e5395017ab033c2bbc87ed6ed1eb57822 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1608,23 +1608,28 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
   }
 
   if (AllUsesCopied) {
+    bool ReplacedAllUntiedUses = true;
     if (!IsEarlyClobber) {
       // Replace other (un-tied) uses of regB with LastCopiedReg.
       for (MachineOperand &MO : MI->operands()) {
-        if (MO.isReg() && MO.getReg() == RegB &&
-            MO.isUse()) {
-          if (MO.isKill()) {
-            MO.setIsKill(false);
-            RemovedKillFlag = true;
+        if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+          if (MO.getSubReg() == SubRegB) {
+            if (MO.isKill()) {
+              MO.setIsKill(false);
+              RemovedKillFlag = true;
+            }
+            MO.setReg(LastCopiedReg);
+            MO.setSubReg(0);
+          } else {
+            ReplacedAllUntiedUses = false;
           }
-          MO.setReg(LastCopiedReg);
-          MO.setSubReg(MO.getSubReg());
         }
       }
     }
 
     // Update live variables for regB.
-    if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(*MI)) {
+    if (RemovedKillFlag && ReplacedAllUntiedUses &&
+        LV && LV->getVarInfo(RegB).removeKill(*MI)) {
       MachineBasicBlock::iterator PrevMI = MI;
       --PrevMI;
       LV->addVirtualRegisterKilled(RegB, *PrevMI);
diff --git a/lib/CodeGen/WasmEHPrepare.cpp b/lib/CodeGen/WasmEHPrepare.cpp
index 83d04da5dd0c816fe82a613a45b0db4b18e33186..6f02a05f56157e33a7932e16a20b97ec1ac47158 100644
--- a/lib/CodeGen/WasmEHPrepare.cpp
+++ b/lib/CodeGen/WasmEHPrepare.cpp
@@ -300,7 +300,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, unsigned Index) {
   // This is to create a map of <landingpad EH label, landingpad index> in
   // SelectionDAGISel, which is to be used in EHStreamer to emit LSDA tables.
   // Pseudocode: wasm.landingpad.index(Index);
-  IRB.CreateCall(LPadIndexF, IRB.getInt32(Index));
+  IRB.CreateCall(LPadIndexF, {FPI, IRB.getInt32(Index)});
 
   // Pseudocode: __wasm_lpad_context.lpad_index = index;
   IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index a3243235854f8a3931d1d79d5db7443c6da1a990..6a15240fa6e0ad401338ea15bfa1071b4f3d5d19 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -218,7 +218,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn,
 // to. If the unwind edge came from an invoke, return null.
 static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB,
                                                  Value *ParentPad) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   if (isa<InvokeInst>(TI))
     return nullptr;
   if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
@@ -977,7 +977,7 @@ void WinEHPrepare::removeImplausibleInstructions(Function &F) {
         break;
       }
 
-      TerminatorInst *TI = BB->getTerminator();
+      Instruction *TI = BB->getTerminator();
       // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst.
       bool IsUnreachableRet = isa<ReturnInst>(TI) && FuncletPad;
       // The token consumed by a CatchReturnInst must match the funclet token.
diff --git a/lib/DebugInfo/CodeView/CodeViewError.cpp b/lib/DebugInfo/CodeView/CodeViewError.cpp
index 914157ef0c17c5610c3e5253dfcc16710b3b5641..2a9753add311e6d43f859a4a2a784b1983d6e5b8 100644
--- a/lib/DebugInfo/CodeView/CodeViewError.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewError.cpp
@@ -41,6 +41,8 @@ public:
 };
 
 static llvm::ManagedStatic<CodeViewErrorCategory> CodeViewErrCategory;
-const std::error_category &llvm::codeview::CVErrorCategory() { return *CodeViewErrCategory; }
+const std::error_category &llvm::codeview::CVErrorCategory() {
+  return *CodeViewErrCategory;
+}
 
 char CodeViewError::ID;
diff --git a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
index e73c69fff44ffd788333e2bbd43a9e2db55c9b5a..2af8205cebc35991731f09d6a3fe32536e2cf325 100644
--- a/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
+++ b/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
@@ -529,6 +529,7 @@ EncodedFramePtrReg codeview::encodeFramePtrReg(RegisterId Reg, CPUType CPU) {
     default:
       break;
     }
+    break;
   case CPUType::X64:
     switch (Reg) {
     case RegisterId::RSP:
@@ -540,6 +541,7 @@ EncodedFramePtrReg codeview::encodeFramePtrReg(RegisterId Reg, CPUType CPU) {
     default:
       break;
     }
+    break;
   }
   return EncodedFramePtrReg::None;
 }
diff --git a/lib/DebugInfo/CodeView/TypeIndex.cpp b/lib/DebugInfo/CodeView/TypeIndex.cpp
index 24fe5fcb28d4f96efa743b837883cd653f407b48..332d67470da50dd12360bdc067dc1a2bdda5a8e8 100644
--- a/lib/DebugInfo/CodeView/TypeIndex.cpp
+++ b/lib/DebugInfo/CodeView/TypeIndex.cpp
@@ -74,6 +74,9 @@ StringRef TypeIndex::simpleTypeName(TypeIndex TI) {
   if (TI.isNoneType())
     return "<no type>";
 
+  if (TI == TypeIndex::NullptrT())
+    return "std::nullptr_t";
+
   // This is a simple type.
   for (const auto &SimpleTypeName : SimpleTypeNames) {
     if (SimpleTypeName.Kind == TI.getSimpleKind()) {
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index 1a1d516ff3cb6a8c4634d0e66e3aa00070b19da7..803818226e54b7b450ed19723c962886d0652019 100644
--- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
 #include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
@@ -63,7 +64,12 @@ class TypeStreamMerger {
 public:
   explicit TypeStreamMerger(SmallVectorImpl<TypeIndex> &SourceToDest)
       : IndexMap(SourceToDest) {
-    SourceToDest.clear();
+    // When dealing with precompiled headers objects, all data in SourceToDest
+    // belongs to the precompiled headers object, and is assumed to be already
+    // remapped to the target PDB. Any forthcoming type that will be merged in
+    // might potentially back-reference this data. We also don't want to resolve
+    // twice the types in the precompiled object.
+    CurIndex += SourceToDest.size();
   }
 
   static const TypeIndex Untranslated;
@@ -71,7 +77,8 @@ public:
   // Local hashing entry points
   Error mergeTypesAndIds(MergingTypeTableBuilder &DestIds,
                          MergingTypeTableBuilder &DestTypes,
-                         const CVTypeArray &IdsAndTypes);
+                         const CVTypeArray &IdsAndTypes,
+                         Optional<EndPrecompRecord> &EP);
   Error mergeIdRecords(MergingTypeTableBuilder &Dest,
                        ArrayRef<TypeIndex> TypeSourceToDest,
                        const CVTypeArray &Ids);
@@ -82,13 +89,15 @@ public:
   Error mergeTypesAndIds(GlobalTypeTableBuilder &DestIds,
                          GlobalTypeTableBuilder &DestTypes,
                          const CVTypeArray &IdsAndTypes,
-                         ArrayRef<GloballyHashedType> Hashes);
+                         ArrayRef<GloballyHashedType> Hashes,
+                         Optional<EndPrecompRecord> &EP);
   Error mergeIdRecords(GlobalTypeTableBuilder &Dest,
                        ArrayRef<TypeIndex> TypeSourceToDest,
                        const CVTypeArray &Ids,
                        ArrayRef<GloballyHashedType> Hashes);
   Error mergeTypeRecords(GlobalTypeTableBuilder &Dest, const CVTypeArray &Types,
-                         ArrayRef<GloballyHashedType> Hashes);
+                         ArrayRef<GloballyHashedType> Hashes,
+                         Optional<EndPrecompRecord> &EP);
 
 private:
   Error doit(const CVTypeArray &Types);
@@ -156,6 +165,8 @@ private:
     return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
   }
 
+  Expected<bool> shouldRemapType(const CVType &Type);
+
   Optional<Error> LastError;
 
   bool UseGlobalHashes = false;
@@ -185,6 +196,8 @@ private:
   /// Temporary storage that we use to copy a record's data while re-writing
   /// its type indices.
   SmallVector<uint8_t, 256> RemapStorage;
+
+  Optional<EndPrecompRecord> EndPrecomp; 
 };
 
 } // end anonymous namespace
@@ -261,22 +274,27 @@ Error TypeStreamMerger::mergeIdRecords(MergingTypeTableBuilder &Dest,
 
 Error TypeStreamMerger::mergeTypesAndIds(MergingTypeTableBuilder &DestIds,
                                          MergingTypeTableBuilder &DestTypes,
-                                         const CVTypeArray &IdsAndTypes) {
+                                         const CVTypeArray &IdsAndTypes,
+                                         Optional<EndPrecompRecord> &EP) {
   DestIdStream = &DestIds;
   DestTypeStream = &DestTypes;
   UseGlobalHashes = false;
-  return doit(IdsAndTypes);
+  auto Err = doit(IdsAndTypes);
+  EP = EndPrecomp;
+  return Err;
 }
 
 // Global hashing entry points
 Error TypeStreamMerger::mergeTypeRecords(GlobalTypeTableBuilder &Dest,
                                          const CVTypeArray &Types,
-                                         ArrayRef<GloballyHashedType> Hashes) {
+                                         ArrayRef<GloballyHashedType> Hashes,
+                                         Optional<EndPrecompRecord> &EP) {
   DestGlobalTypeStream = &Dest;
   UseGlobalHashes = true;
   GlobalHashes = Hashes;
-
-  return doit(Types);
+  auto Err = doit(Types);
+  EP = EndPrecomp;
+  return Err;
 }
 
 Error TypeStreamMerger::mergeIdRecords(GlobalTypeTableBuilder &Dest,
@@ -294,12 +312,15 @@ Error TypeStreamMerger::mergeIdRecords(GlobalTypeTableBuilder &Dest,
 Error TypeStreamMerger::mergeTypesAndIds(GlobalTypeTableBuilder &DestIds,
                                          GlobalTypeTableBuilder &DestTypes,
                                          const CVTypeArray &IdsAndTypes,
-                                         ArrayRef<GloballyHashedType> Hashes) {
+                                         ArrayRef<GloballyHashedType> Hashes,
+                                         Optional<EndPrecompRecord> &EP) {
   DestGlobalIdStream = &DestIds;
   DestGlobalTypeStream = &DestTypes;
   UseGlobalHashes = true;
   GlobalHashes = Hashes;
-  return doit(IdsAndTypes);
+  auto Err = doit(IdsAndTypes);
+  EP = EndPrecomp;
+  return Err;
 }
 
 Error TypeStreamMerger::doit(const CVTypeArray &Types) {
@@ -345,25 +366,30 @@ Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) {
 }
 
 Error TypeStreamMerger::remapType(const CVType &Type) {
-  auto DoSerialize =
-      [this, Type](MutableArrayRef<uint8_t> Storage) -> ArrayRef<uint8_t> {
-    return remapIndices(Type, Storage);
-  };
+  auto R = shouldRemapType(Type);
+  if (!R)
+    return R.takeError();
 
   TypeIndex DestIdx = Untranslated;
-  if (LLVM_LIKELY(UseGlobalHashes)) {
-    GlobalTypeTableBuilder &Dest =
-        isIdRecord(Type.kind()) ? *DestGlobalIdStream : *DestGlobalTypeStream;
-    GloballyHashedType H = GlobalHashes[CurIndex.toArrayIndex()];
-    DestIdx = Dest.insertRecordAs(H, Type.RecordData.size(), DoSerialize);
-  } else {
-    MergingTypeTableBuilder &Dest =
-        isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
-
-    RemapStorage.resize(Type.RecordData.size());
-    ArrayRef<uint8_t> Result = DoSerialize(RemapStorage);
-    if (!Result.empty())
-      DestIdx = Dest.insertRecordBytes(Result);
+  if (*R) {
+    auto DoSerialize =
+        [this, Type](MutableArrayRef<uint8_t> Storage) -> ArrayRef<uint8_t> {
+      return remapIndices(Type, Storage);
+    };
+    if (LLVM_LIKELY(UseGlobalHashes)) {
+      GlobalTypeTableBuilder &Dest =
+          isIdRecord(Type.kind()) ? *DestGlobalIdStream : *DestGlobalTypeStream;
+      GloballyHashedType H = GlobalHashes[CurIndex.toArrayIndex()];
+      DestIdx = Dest.insertRecordAs(H, Type.RecordData.size(), DoSerialize);
+    } else {
+      MergingTypeTableBuilder &Dest =
+          isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
+
+      RemapStorage.resize(Type.RecordData.size());
+      ArrayRef<uint8_t> Result = DoSerialize(RemapStorage);
+      if (!Result.empty())
+        DestIdx = Dest.insertRecordBytes(Result);
+    }
   }
   addMapping(DestIdx);
 
@@ -418,25 +444,29 @@ Error llvm::codeview::mergeIdRecords(MergingTypeTableBuilder &Dest,
 
 Error llvm::codeview::mergeTypeAndIdRecords(
     MergingTypeTableBuilder &DestIds, MergingTypeTableBuilder &DestTypes,
-    SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes) {
+    SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes,
+    Optional<EndPrecompRecord> &EndPrecomp) {
   TypeStreamMerger M(SourceToDest);
-  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes);
+  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes, EndPrecomp);
 }
 
 Error llvm::codeview::mergeTypeAndIdRecords(
     GlobalTypeTableBuilder &DestIds, GlobalTypeTableBuilder &DestTypes,
     SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes,
-    ArrayRef<GloballyHashedType> Hashes) {
+    ArrayRef<GloballyHashedType> Hashes,
+    Optional<EndPrecompRecord> &EndPrecomp) {
   TypeStreamMerger M(SourceToDest);
-  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes, Hashes);
+  return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes, Hashes,
+                            EndPrecomp);
 }
 
 Error llvm::codeview::mergeTypeRecords(GlobalTypeTableBuilder &Dest,
                                        SmallVectorImpl<TypeIndex> &SourceToDest,
                                        const CVTypeArray &Types,
-                                       ArrayRef<GloballyHashedType> Hashes) {
+                                       ArrayRef<GloballyHashedType> Hashes,
+                                       Optional<EndPrecompRecord> &EndPrecomp) {
   TypeStreamMerger M(SourceToDest);
-  return M.mergeTypeRecords(Dest, Types, Hashes);
+  return M.mergeTypeRecords(Dest, Types, Hashes, EndPrecomp);
 }
 
 Error llvm::codeview::mergeIdRecords(GlobalTypeTableBuilder &Dest,
@@ -447,3 +477,18 @@ Error llvm::codeview::mergeIdRecords(GlobalTypeTableBuilder &Dest,
   TypeStreamMerger M(SourceToDest);
   return M.mergeIdRecords(Dest, Types, Ids, Hashes);
 }
+
+Expected<bool> TypeStreamMerger::shouldRemapType(const CVType &Type) {
+  // For object files containing precompiled types, we need to extract the
+  // signature, through EndPrecompRecord. This is done here for performance
+  // reasons, to avoid re-parsing the Types stream.
+  if (Type.kind() == LF_ENDPRECOMP) {
+    assert(!EndPrecomp);
+    EndPrecomp.emplace();
+    if (auto EC = TypeDeserializer::deserializeAs(const_cast<CVType &>(Type),
+                                                  EndPrecomp.getValue()))
+      return joinErrors(std::move(EC), errorCorruptRecord());
+    return false;
+  }
+  return true;
+}
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index ddabc7a46528b947c712dfe5614cf74e7f5aa10d..7ab54de6bc49e3ad604dc2d5e575381acf1ba403 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -269,9 +269,11 @@ static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData,
 }
 
 // Dump the .debug_rnglists or .debug_rnglists.dwo section (DWARF v5).
-static void dumpRnglistsSection(raw_ostream &OS,
-                                DWARFDataExtractor &rnglistData,
-                                DIDumpOptions DumpOpts) {
+static void
+dumpRnglistsSection(raw_ostream &OS, DWARFDataExtractor &rnglistData,
+                    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+                        LookupPooledAddress,
+                    DIDumpOptions DumpOpts) {
   uint32_t Offset = 0;
   while (rnglistData.isValidOffset(Offset)) {
     llvm::DWARFDebugRnglistTable Rnglists;
@@ -285,11 +287,32 @@ static void dumpRnglistsSection(raw_ostream &OS,
         break;
       Offset = TableOffset + Length;
     } else {
-      Rnglists.dump(OS, DumpOpts);
+      Rnglists.dump(OS, LookupPooledAddress, DumpOpts);
     }
   }
 }
 
+static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
+                                DWARFDataExtractor Data,
+                                const MCRegisterInfo *MRI,
+                                Optional<uint64_t> DumpOffset) {
+  uint32_t Offset = 0;
+  DWARFDebugLoclists Loclists;
+
+  DWARFListTableHeader Header(".debug_loclists", "locations");
+  if (Error E = Header.extract(Data, &Offset)) {
+    WithColor::error() << toString(std::move(E)) << '\n';
+    return;
+  }
+
+  Header.dump(OS, DumpOpts);
+  DataExtractor LocData(Data.getData().drop_front(Offset),
+                        Data.isLittleEndian(), Header.getAddrSize());
+
+  Loclists.parse(LocData, Header.getVersion());
+  Loclists.dump(OS, 0, MRI, DumpOffset);
+}
+
 void DWARFContext::dump(
     raw_ostream &OS, DIDumpOptions DumpOpts,
     std::array<Optional<uint64_t>, DIDT_ID_Count> DumpOffsets) {
@@ -328,20 +351,22 @@ void DWARFContext::dump(
                  DObj->getAbbrevDWOSection()))
     getDebugAbbrevDWO()->dump(OS);
 
-  auto dumpDebugInfo = [&](unit_iterator_range Units) {
-    if (DumpOffset)
-      getDIEForOffset(DumpOffset.getValue())
-          .dump(OS, 0, DumpOpts.noImplicitRecursion());
+  auto dumpDebugInfo = [&](const char *Name, unit_iterator_range Units) {
+    OS << '\n' << Name << " contents:\n";
+    if ((DumpOffset = DumpOffsets[DIDT_ID_DebugInfo]))
+      for (const auto &U : Units)
+        U->getDIEForOffset(DumpOffset.getValue())
+            .dump(OS, 0, DumpOpts.noImplicitRecursion());
     else
       for (const auto &U : Units)
         U->dump(OS, DumpOpts);
   };
-  if (shouldDump(Explicit, ".debug_info", DIDT_ID_DebugInfo,
-                 DObj->getInfoSection().Data))
-    dumpDebugInfo(info_section_units());
-  if (shouldDump(ExplicitDWO, ".debug_info.dwo", DIDT_ID_DebugInfo,
-                 DObj->getInfoDWOSection().Data))
-    dumpDebugInfo(dwo_info_section_units());
+  if ((DumpType & DIDT_DebugInfo)) {
+    if (Explicit || getNumCompileUnits())
+      dumpDebugInfo(".debug_info", info_section_units());
+    if (ExplicitDWO || getNumDWOCompileUnits())
+      dumpDebugInfo(".debug_info.dwo", dwo_info_section_units());
+  }
 
   auto dumpDebugType = [&](const char *Name, unit_iterator_range Units) {
     OS << '\n' << Name << " contents:\n";
@@ -364,9 +389,15 @@ void DWARFContext::dump(
                  DObj->getLocSection().Data)) {
     getDebugLoc()->dump(OS, getRegisterInfo(), DumpOffset);
   }
+  if (shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists,
+                 DObj->getLoclistsSection().Data)) {
+    DWARFDataExtractor Data(*DObj, DObj->getLoclistsSection(), isLittleEndian(),
+                            0);
+    dumpLoclistsSection(OS, DumpOpts, Data, getRegisterInfo(), DumpOffset);
+  }
   if (shouldDump(ExplicitDWO, ".debug_loc.dwo", DIDT_ID_DebugLoc,
                  DObj->getLocDWOSection().Data)) {
-    getDebugLocDWO()->dump(OS, getRegisterInfo(), DumpOffset);
+    getDebugLocDWO()->dump(OS, 0, getRegisterInfo(), DumpOffset);
   }
 
   if (shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
@@ -495,18 +526,26 @@ void DWARFContext::dump(
     }
   }
 
+  auto LookupPooledAddress = [&](uint32_t Index) -> Optional<SectionedAddress> {
+    const auto &CUs = compile_units();
+    auto I = CUs.begin();
+    if (I == CUs.end())
+      return None;
+    return (*I)->getAddrOffsetSectionItem(Index);
+  };
+
   if (shouldDump(Explicit, ".debug_rnglists", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsSection(),
                                    isLittleEndian(), 0);
-    dumpRnglistsSection(OS, RnglistData, DumpOpts);
+    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(ExplicitDWO, ".debug_rnglists.dwo", DIDT_ID_DebugRnglists,
                  DObj->getRnglistsDWOSection().Data)) {
     DWARFDataExtractor RnglistData(*DObj, DObj->getRnglistsDWOSection(),
                                    isLittleEndian(), 0);
-    dumpRnglistsSection(OS, RnglistData, DumpOpts);
+    dumpRnglistsSection(OS, RnglistData, LookupPooledAddress, DumpOpts);
   }
 
   if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
@@ -544,7 +583,7 @@ void DWARFContext::dump(
                              DObj->getStringDWOSection(), dwo_units(),
                              isLittleEndian(), getMaxDWOVersion());
 
-  if (shouldDump(Explicit, ".gnu_index", DIDT_ID_GdbIndex,
+  if (shouldDump(Explicit, ".gdb_index", DIDT_ID_GdbIndex,
                  DObj->getGdbIndexSection())) {
     getGdbIndex().dump(OS);
   }
@@ -686,16 +725,19 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() {
   return Loc.get();
 }
 
-const DWARFDebugLocDWO *DWARFContext::getDebugLocDWO() {
+const DWARFDebugLoclists *DWARFContext::getDebugLocDWO() {
   if (LocDWO)
     return LocDWO.get();
 
-  LocDWO.reset(new DWARFDebugLocDWO());
+  LocDWO.reset(new DWARFDebugLoclists());
   // Assume all compile units have the same address byte size.
   // FIXME: We don't need AddressSize for split DWARF since relocatable
   // addresses cannot appear there. At the moment DWARFExpression requires it.
   DataExtractor LocData(DObj->getLocDWOSection().Data, isLittleEndian(), 4);
-  LocDWO->parse(LocData);
+  // Use version 4. DWO does not support the DWARF v5 .debug_loclists yet and
+  // that means we are parsing the new style .debug_loc (pre-standatized version
+  // of the .debug_loclists).
+  LocDWO->parse(LocData, 4 /* Version */);
   return LocDWO.get();
 }
 
@@ -832,7 +874,9 @@ Expected<const DWARFDebugLine::LineTable *> DWARFContext::getLineTableForUnit(
 void DWARFContext::parseNormalUnits() {
   if (!NormalUnits.empty())
     return;
-  NormalUnits.addUnitsForSection(*this, DObj->getInfoSection(), DW_SECT_INFO);
+  DObj->forEachInfoSections([&](const DWARFSection &S) {
+    NormalUnits.addUnitsForSection(*this, S, DW_SECT_INFO);
+  });
   NormalUnits.finishedInfoUnits();
   DObj->forEachTypesSections([&](const DWARFSection &S) {
     NormalUnits.addUnitsForSection(*this, S, DW_SECT_TYPES);
@@ -842,8 +886,9 @@ void DWARFContext::parseNormalUnits() {
 void DWARFContext::parseDWOUnits(bool Lazy) {
   if (!DWOUnits.empty())
     return;
-  DWOUnits.addUnitsForDWOSection(*this, DObj->getInfoDWOSection(), DW_SECT_INFO,
-                                 Lazy);
+  DObj->forEachInfoDWOSections([&](const DWARFSection &S) {
+    DWOUnits.addUnitsForDWOSection(*this, S, DW_SECT_INFO, Lazy);
+  });
   DWOUnits.finishedInfoUnits();
   DObj->forEachTypesDWOSections([&](const DWARFSection &S) {
     DWOUnits.addUnitsForDWOSection(*this, S, DW_SECT_TYPES, Lazy);
@@ -1195,19 +1240,20 @@ class DWARFObjInMemory final : public DWARFObject {
   const object::ObjectFile *Obj = nullptr;
   std::vector<SectionName> SectionNames;
 
-  using TypeSectionMap = MapVector<object::SectionRef, DWARFSectionMap,
+  using InfoSectionMap = MapVector<object::SectionRef, DWARFSectionMap,
                                    std::map<object::SectionRef, unsigned>>;
 
-  TypeSectionMap TypesSections;
-  TypeSectionMap TypesDWOSections;
+  InfoSectionMap InfoSections;
+  InfoSectionMap TypesSections;
+  InfoSectionMap InfoDWOSections;
+  InfoSectionMap TypesDWOSections;
 
-  DWARFSectionMap InfoSection;
   DWARFSectionMap LocSection;
+  DWARFSectionMap LocListsSection;
   DWARFSectionMap LineSection;
   DWARFSectionMap RangeSection;
   DWARFSectionMap RnglistsSection;
   DWARFSectionMap StringOffsetSection;
-  DWARFSectionMap InfoDWOSection;
   DWARFSectionMap LineDWOSection;
   DWARFSectionMap LocDWOSection;
   DWARFSectionMap StringOffsetDWOSection;
@@ -1222,13 +1268,12 @@ class DWARFObjInMemory final : public DWARFObject {
 
   DWARFSectionMap *mapNameToDWARFSection(StringRef Name) {
     return StringSwitch<DWARFSectionMap *>(Name)
-        .Case("debug_info", &InfoSection)
         .Case("debug_loc", &LocSection)
+        .Case("debug_loclists", &LocListsSection)
         .Case("debug_line", &LineSection)
         .Case("debug_str_offsets", &StringOffsetSection)
         .Case("debug_ranges", &RangeSection)
         .Case("debug_rnglists", &RnglistsSection)
-        .Case("debug_info.dwo", &InfoDWOSection)
         .Case("debug_loc.dwo", &LocDWOSection)
         .Case("debug_line.dwo", &LineDWOSection)
         .Case("debug_names", &DebugNamesSection)
@@ -1317,6 +1362,16 @@ public:
     for (const auto &SecIt : Sections) {
       if (StringRef *SectionData = mapSectionToMember(SecIt.first()))
         *SectionData = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_info")
+        // Find debug_info and debug_types data by section rather than name as
+        // there are multiple, comdat grouped, of these sections.
+        InfoSections[SectionRef()].Data = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_info.dwo")
+        InfoDWOSections[SectionRef()].Data = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_types")
+        TypesSections[SectionRef()].Data = SecIt.second->getBuffer();
+      else if (SecIt.first() == "debug_types.dwo")
+        TypesDWOSections[SectionRef()].Data = SecIt.second->getBuffer();
     }
   }
   DWARFObjInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L,
@@ -1371,9 +1426,13 @@ public:
           // FIXME: Use the other dwo range section when we emit it.
           RangeDWOSection.Data = Data;
         }
+      } else if (Name == "debug_info") {
+        // Find debug_info and debug_types data by section rather than name as
+        // there are multiple, comdat grouped, of these sections.
+        InfoSections[Section].Data = Data;
+      } else if (Name == "debug_info.dwo") {
+        InfoDWOSections[Section].Data = Data;
       } else if (Name == "debug_types") {
-        // Find debug_types data by section rather than name as there are
-        // multiple, comdat grouped, debug_types sections.
         TypesSections[Section].Data = Data;
       } else if (Name == "debug_types.dwo") {
         TypesDWOSections[Section].Data = Data;
@@ -1408,9 +1467,16 @@ public:
       DWARFSectionMap *Sec = mapNameToDWARFSection(RelSecName);
       RelocAddrMap *Map = Sec ? &Sec->Relocs : nullptr;
       if (!Map) {
-        // Find debug_types relocs by section rather than name as there are
-        // multiple, comdat grouped, debug_types sections.
-        if (RelSecName == "debug_types")
+        // Find debug_info and debug_types relocs by section rather than name
+        // as there are multiple, comdat grouped, of these sections.
+        if (RelSecName == "debug_info")
+          Map = &static_cast<DWARFSectionMap &>(InfoSections[*RelocatedSection])
+                     .Relocs;
+        else if (RelSecName == "debug_info.dwo")
+          Map = &static_cast<DWARFSectionMap &>(
+                     InfoDWOSections[*RelocatedSection])
+                     .Relocs;
+        else if (RelSecName == "debug_types")
           Map =
               &static_cast<DWARFSectionMap &>(TypesSections[*RelocatedSection])
                    .Relocs;
@@ -1508,8 +1574,10 @@ public:
   StringRef getLineStringSection() const override { return LineStringSection; }
 
   // Sections for DWARF5 split dwarf proposal.
-  const DWARFSection &getInfoDWOSection() const override {
-    return InfoDWOSection;
+  void forEachInfoDWOSections(
+      function_ref<void(const DWARFSection &)> F) const override {
+    for (auto &P : InfoDWOSections)
+      F(P.second);
   }
   void forEachTypesDWOSections(
       function_ref<void(const DWARFSection &)> F) const override {
@@ -1519,6 +1587,7 @@ public:
 
   StringRef getAbbrevSection() const override { return AbbrevSection; }
   const DWARFSection &getLocSection() const override { return LocSection; }
+  const DWARFSection &getLoclistsSection() const override { return LocListsSection; }
   StringRef getARangeSection() const override { return ARangeSection; }
   StringRef getDebugFrameSection() const override { return DebugFrameSection; }
   StringRef getEHFrameSection() const override { return EHFrameSection; }
@@ -1555,7 +1624,11 @@ public:
 
   StringRef getFileName() const override { return FileName; }
   uint8_t getAddressSize() const override { return AddressSize; }
-  const DWARFSection &getInfoSection() const override { return InfoSection; }
+  void forEachInfoSections(
+      function_ref<void(const DWARFSection &)> F) const override {
+    for (auto &P : InfoSections)
+      F(P.second);
+  }
   void forEachTypesSections(
       function_ref<void(const DWARFSection &)> F) const override {
     for (auto &P : TypesSections)
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 617b914ecce97b1d0b677f230850c21fcf74fd71..9146b457a5ddae1bfdae098e337b3053b094fa93 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -144,51 +144,74 @@ void DWARFDebugLoc::parse(const DWARFDataExtractor &data) {
     WithColor::error() << "failed to consume entire .debug_loc section\n";
 }
 
-Optional<DWARFDebugLocDWO::LocationList>
-DWARFDebugLocDWO::parseOneLocationList(DataExtractor Data, unsigned *Offset) {
+Optional<DWARFDebugLoclists::LocationList>
+DWARFDebugLoclists::parseOneLocationList(DataExtractor Data, unsigned *Offset,
+                                         unsigned Version) {
   LocationList LL;
   LL.Offset = *Offset;
 
   // dwarf::DW_LLE_end_of_list_entry is 0 and indicates the end of the list.
   while (auto Kind =
              static_cast<dwarf::LocationListEntry>(Data.getU8(Offset))) {
-    if (Kind != dwarf::DW_LLE_startx_length) {
+
+    Entry E;
+    E.Kind = Kind;
+    switch (Kind) {
+    case dwarf::DW_LLE_startx_length:
+      E.Value0 = Data.getULEB128(Offset);
+      // Pre-DWARF 5 has different interpretation of the length field. We have
+      // to support both pre- and standartized styles for the compatibility.
+      if (Version < 5)
+        E.Value1 = Data.getU32(Offset);
+      else
+        E.Value1 = Data.getULEB128(Offset);
+      break;
+    case dwarf::DW_LLE_start_length:
+      E.Value0 = Data.getAddress(Offset);
+      E.Value1 = Data.getULEB128(Offset);
+      break;
+    case dwarf::DW_LLE_offset_pair:
+      E.Value0 = Data.getULEB128(Offset);
+      E.Value1 = Data.getULEB128(Offset);
+      break;
+    case dwarf::DW_LLE_base_address:
+      E.Value0 = Data.getAddress(Offset);
+      break;
+    default:
       WithColor::error() << "dumping support for LLE of kind " << (int)Kind
                          << " not implemented\n";
       return None;
     }
 
-    Entry E;
-    E.Start = Data.getULEB128(Offset);
-    E.Length = Data.getU32(Offset);
-
-    unsigned Bytes = Data.getU16(Offset);
-    // A single location description describing the location of the object...
-    StringRef str = Data.getData().substr(*Offset, Bytes);
-    *Offset += Bytes;
-    E.Loc.resize(str.size());
-    std::copy(str.begin(), str.end(), E.Loc.begin());
+    if (Kind != dwarf::DW_LLE_base_address) {
+      unsigned Bytes = Data.getU16(Offset);
+      // A single location description describing the location of the object...
+      StringRef str = Data.getData().substr(*Offset, Bytes);
+      *Offset += Bytes;
+      E.Loc.resize(str.size());
+      std::copy(str.begin(), str.end(), E.Loc.begin());
+    }
 
     LL.Entries.push_back(std::move(E));
   }
   return LL;
 }
 
-void DWARFDebugLocDWO::parse(DataExtractor data) {
+void DWARFDebugLoclists::parse(DataExtractor data, unsigned Version) {
   IsLittleEndian = data.isLittleEndian();
   AddressSize = data.getAddressSize();
 
   uint32_t Offset = 0;
   while (data.isValidOffset(Offset)) {
-    if (auto LL = parseOneLocationList(data, &Offset))
+    if (auto LL = parseOneLocationList(data, &Offset, Version))
       Locations.push_back(std::move(*LL));
     else
       return;
   }
 }
 
-DWARFDebugLocDWO::LocationList const *
-DWARFDebugLocDWO::getLocationListAtOffset(uint64_t Offset) const {
+DWARFDebugLoclists::LocationList const *
+DWARFDebugLoclists::getLocationListAtOffset(uint64_t Offset) const {
   auto It = std::lower_bound(
       Locations.begin(), Locations.end(), Offset,
       [](const LocationList &L, uint64_t Offset) { return L.Offset < Offset; });
@@ -197,23 +220,49 @@ DWARFDebugLocDWO::getLocationListAtOffset(uint64_t Offset) const {
   return nullptr;
 }
 
-void DWARFDebugLocDWO::LocationList::dump(raw_ostream &OS, bool IsLittleEndian,
-                                          unsigned AddressSize,
-                                          const MCRegisterInfo *MRI,
-                                          unsigned Indent) const {
+void DWARFDebugLoclists::LocationList::dump(raw_ostream &OS, uint64_t BaseAddr,
+                                            bool IsLittleEndian,
+                                            unsigned AddressSize,
+                                            const MCRegisterInfo *MRI,
+                                            unsigned Indent) const {
   for (const Entry &E : Entries) {
-    OS << '\n';
-    OS.indent(Indent);
-    OS << "Addr idx " << E.Start << " (w/ length " << E.Length << "): ";
+    switch (E.Kind) {
+    case dwarf::DW_LLE_startx_length:
+      OS << '\n';
+      OS.indent(Indent);
+      OS << "Addr idx " << E.Value0 << " (w/ length " << E.Value1 << "): ";
+      break;
+    case dwarf::DW_LLE_start_length:
+      OS << '\n';
+      OS.indent(Indent);
+      OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2,
+                   AddressSize * 2, E.Value0, AddressSize * 2, AddressSize * 2,
+                   E.Value0 + E.Value1);
+      break;
+    case dwarf::DW_LLE_offset_pair:
+      OS << '\n';
+      OS.indent(Indent);
+      OS << format("[0x%*.*" PRIx64 ", 0x%*.*x): ", AddressSize * 2,
+                   AddressSize * 2, BaseAddr + E.Value0, AddressSize * 2,
+                   AddressSize * 2, BaseAddr + E.Value1);
+      break;
+    case dwarf::DW_LLE_base_address:
+      BaseAddr = E.Value0;
+      break;
+    default:
+      llvm_unreachable("unreachable locations list kind");
+    }
+
     dumpExpression(OS, E.Loc, IsLittleEndian, AddressSize, MRI);
   }
 }
 
-void DWARFDebugLocDWO::dump(raw_ostream &OS, const MCRegisterInfo *MRI,
-                            Optional<uint64_t> Offset) const {
+void DWARFDebugLoclists::dump(raw_ostream &OS, uint64_t BaseAddr,
+                              const MCRegisterInfo *MRI,
+                              Optional<uint64_t> Offset) const {
   auto DumpLocationList = [&](const LocationList &L) {
     OS << format("0x%8.8x: ", L.Offset);
-    L.dump(OS, IsLittleEndian, AddressSize, MRI, /*Indent=*/12);
+    L.dump(OS, BaseAddr, IsLittleEndian, AddressSize, MRI, /*Indent=*/12);
     OS << "\n\n";
   };
 
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index 84e3c634f54fc3454826e0e085bce0838157f0ff..dfb913000a46219904866f4995ed13b22c7d5e35 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -69,7 +69,7 @@ void DWARFDebugRangeList::dump(raw_ostream &OS) const {
 }
 
 DWARFAddressRangesVector DWARFDebugRangeList::getAbsoluteRanges(
-    llvm::Optional<BaseAddress> BaseAddr) const {
+    llvm::Optional<SectionedAddress> BaseAddr) const {
   DWARFAddressRangesVector Res;
   for (const RangeListEntry &RLE : Entries) {
     if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
index eeb85edf5b2f91ab858cacd0cf1d78bfe68d703d..cb5fb0d49dabaea6de5b58f15ec970a7ed47feae 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp
@@ -32,21 +32,34 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
     Value0 = Value1 = 0;
     break;
   // TODO: Support other encodings.
-  case dwarf::DW_RLE_base_addressx:
-    return createStringError(errc::not_supported,
-                       "unsupported rnglists encoding DW_RLE_base_addressx "
-                       "at offset 0x%" PRIx32,
-                       *OffsetPtr - 1);
+  case dwarf::DW_RLE_base_addressx: {
+    uint32_t PreviousOffset = *OffsetPtr - 1;
+    Value0 = Data.getULEB128(OffsetPtr);
+    if (End < *OffsetPtr)
+      return createStringError(
+          errc::invalid_argument,
+          "read past end of table when reading "
+          "DW_RLE_base_addressx encoding at offset 0x%" PRIx32,
+          PreviousOffset);
+    break;
+  }
   case dwarf::DW_RLE_startx_endx:
     return createStringError(errc::not_supported,
                        "unsupported rnglists encoding DW_RLE_startx_endx at "
                        "offset 0x%" PRIx32,
                        *OffsetPtr - 1);
-  case dwarf::DW_RLE_startx_length:
-    return createStringError(errc::not_supported,
-                       "unsupported rnglists encoding DW_RLE_startx_length "
-                       "at offset 0x%" PRIx32,
-                       *OffsetPtr - 1);
+  case dwarf::DW_RLE_startx_length: {
+    uint32_t PreviousOffset = *OffsetPtr - 1;
+    Value0 = Data.getULEB128(OffsetPtr);
+    Value1 = Data.getULEB128(OffsetPtr);
+    if (End < *OffsetPtr)
+      return createStringError(
+          errc::invalid_argument,
+          "read past end of table when reading "
+          "DW_RLE_startx_length encoding at offset 0x%" PRIx32,
+          PreviousOffset);
+    break;
+  }
   case dwarf::DW_RLE_offset_pair: {
     uint32_t PreviousOffset = *OffsetPtr - 1;
     Value0 = Data.getULEB128(OffsetPtr);
@@ -100,12 +113,19 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint32_t End,
   return Error::success();
 }
 
-DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
-    llvm::Optional<BaseAddress> BaseAddr) const {
+DWARFAddressRangesVector
+DWARFDebugRnglist::getAbsoluteRanges(llvm::Optional<SectionedAddress> BaseAddr,
+                                     DWARFUnit &U) const {
   DWARFAddressRangesVector Res;
   for (const RangeListEntry &RLE : Entries) {
     if (RLE.EntryKind == dwarf::DW_RLE_end_of_list)
       break;
+    if (RLE.EntryKind == dwarf::DW_RLE_base_addressx) {
+      BaseAddr = U.getAddrOffsetSectionItem(RLE.Value0);
+      if (!BaseAddr)
+        BaseAddr = {RLE.Value0, 0};
+      continue;
+    }
     if (RLE.EntryKind == dwarf::DW_RLE_base_address) {
       BaseAddr = {RLE.Value0, RLE.SectionIndex};
       continue;
@@ -133,6 +153,15 @@ DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
       E.LowPC = RLE.Value0;
       E.HighPC = E.LowPC + RLE.Value1;
       break;
+    case dwarf::DW_RLE_startx_length: {
+      auto Start = U.getAddrOffsetSectionItem(RLE.Value0);
+      if (!Start)
+        Start = {0, 0};
+      E.SectionIndex = Start->SectionIndex;
+      E.LowPC = Start->Address;
+      E.HighPC = E.LowPC + RLE.Value1;
+      break;
+    }
     default:
       // Unsupported encodings should have been reported during extraction,
       // so we should not run into any here.
@@ -143,9 +172,11 @@ DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges(
   return Res;
 }
 
-void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
-                          uint8_t MaxEncodingStringLength,
-                          uint64_t &CurrentBase, DIDumpOptions DumpOpts) const {
+void RangeListEntry::dump(
+    raw_ostream &OS, uint8_t AddrSize, uint8_t MaxEncodingStringLength,
+    uint64_t &CurrentBase, DIDumpOptions DumpOpts,
+    llvm::function_ref<Optional<SectionedAddress>(uint32_t)>
+        LookupPooledAddress) const {
   auto PrintRawEntry = [](raw_ostream &OS, const RangeListEntry &Entry,
                           uint8_t AddrSize, DIDumpOptions DumpOpts) {
     if (DumpOpts.Verbose) {
@@ -172,6 +203,17 @@ void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
   case dwarf::DW_RLE_end_of_list:
     OS << (DumpOpts.Verbose ? "" : "<End of list>");
     break;
+    //  case dwarf::DW_RLE_base_addressx:
+  case dwarf::DW_RLE_base_addressx: {
+    if (auto SA = LookupPooledAddress(Value0))
+      CurrentBase = SA->Address;
+    else
+      CurrentBase = Value0;
+    if (!DumpOpts.Verbose)
+      return;
+    OS << format(" 0x%*.*" PRIx64, AddrSize * 2, AddrSize * 2, Value0);
+    break;
+  }
   case dwarf::DW_RLE_base_address:
     // In non-verbose mode we do not print anything for this entry.
     CurrentBase = Value0;
@@ -191,6 +233,14 @@ void RangeListEntry::dump(raw_ostream &OS, uint8_t AddrSize,
   case dwarf::DW_RLE_start_end:
     DWARFAddressRange(Value0, Value1).dump(OS, AddrSize, DumpOpts);
     break;
+  case dwarf::DW_RLE_startx_length: {
+    PrintRawEntry(OS, *this, AddrSize, DumpOpts);
+    uint64_t Start = 0;
+    if (auto SA = LookupPooledAddress(Value0))
+      Start = SA->Address;
+    DWARFAddressRange(Start, Start + Value1).dump(OS, AddrSize, DumpOpts);
+    break;
+  } break;
   default:
     llvm_unreachable("Unsupported range list encoding");
   }
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 35567d0f67a010395ad7f36137f326cf1c7a8fcb..31c4cd5e472a5efefae5ccf4b8918eb52cdbfafc 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -59,12 +59,14 @@ static void dumpRanges(const DWARFObject &Obj, raw_ostream &OS,
                        const DWARFAddressRangesVector &Ranges,
                        unsigned AddressSize, unsigned Indent,
                        const DIDumpOptions &DumpOpts) {
+  if (!DumpOpts.ShowAddresses)
+    return;
+
   ArrayRef<SectionName> SectionNames;
   if (DumpOpts.Verbose)
     SectionNames = Obj.getSectionNames();
 
   for (const DWARFAddressRange &R : Ranges) {
-
     OS << '\n';
     OS.indent(Indent);
     R.dump(OS, AddressSize);
@@ -99,27 +101,45 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
 
   FormValue.dump(OS, DumpOpts);
   if (FormValue.isFormClass(DWARFFormValue::FC_SectionOffset)) {
-    const DWARFSection &LocSection = Obj.getLocSection();
-    const DWARFSection &LocDWOSection = Obj.getLocDWOSection();
     uint32_t Offset = *FormValue.getAsSectionOffset();
-    if (!LocSection.Data.empty()) {
+    if (!U->isDWOUnit() && !U->getLocSection()->Data.empty()) {
       DWARFDebugLoc DebugLoc;
-      DWARFDataExtractor Data(Obj, LocSection, Ctx.isLittleEndian(),
+      DWARFDataExtractor Data(Obj, *U->getLocSection(), Ctx.isLittleEndian(),
                               Obj.getAddressSize());
       auto LL = DebugLoc.parseOneLocationList(Data, &Offset);
       if (LL) {
         uint64_t BaseAddr = 0;
-        if (Optional<BaseAddress> BA = U->getBaseAddress())
+        if (Optional<SectionedAddress> BA = U->getBaseAddress())
           BaseAddr = BA->Address;
         LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, BaseAddr,
                  Indent);
       } else
         OS << "error extracting location list.";
-    } else if (!LocDWOSection.Data.empty()) {
-      DataExtractor Data(LocDWOSection.Data, Ctx.isLittleEndian(), 0);
-      auto LL = DWARFDebugLocDWO::parseOneLocationList(Data, &Offset);
+      return;
+    }
+
+    bool UseLocLists = !U->isDWOUnit();
+    StringRef LoclistsSectionData =
+        UseLocLists ? Obj.getLoclistsSection().Data : U->getLocSectionData();
+
+    if (!LoclistsSectionData.empty()) {
+      DataExtractor Data(LoclistsSectionData, Ctx.isLittleEndian(),
+                         Obj.getAddressSize());
+
+      // Old-style location list were used in DWARF v4 (.debug_loc.dwo section).
+      // Modern locations list (.debug_loclists) are used starting from v5.
+      // Ideally we should take the version from the .debug_loclists section
+      // header, but using CU's version for simplicity.
+      auto LL = DWARFDebugLoclists::parseOneLocationList(
+          Data, &Offset, UseLocLists ? U->getVersion() : 4);
+
+      uint64_t BaseAddr = 0;
+      if (Optional<SectionedAddress> BA = U->getBaseAddress())
+        BaseAddr = BA->Address;
+
       if (LL)
-        LL->dump(OS, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI, Indent);
+        LL->dump(OS, BaseAddr, Ctx.isLittleEndian(), Obj.getAddressSize(), MRI,
+                 Indent);
       else
         OS << "error extracting location list.";
     }
diff --git a/lib/DebugInfo/DWARF/DWARFExpression.cpp b/lib/DebugInfo/DWARF/DWARFExpression.cpp
index a9ea26c476ca32e572498d5abededfc89c3aa0f1..2df4456053fb13e03edd753df709aad2146a67c2 100644
--- a/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -94,6 +94,7 @@ static DescVector getDescriptions() {
       Desc(Op::Dwarf3, Op::SizeLEB, Op::SizeBlock);
   Descriptions[DW_OP_stack_value] = Desc(Op::Dwarf3);
   Descriptions[DW_OP_GNU_push_tls_address] = Desc(Op::Dwarf3);
+  Descriptions[DW_OP_addrx] = Desc(Op::Dwarf4, Op::SizeLEB);
   Descriptions[DW_OP_GNU_addr_index] = Desc(Op::Dwarf4, Op::SizeLEB);
   Descriptions[DW_OP_GNU_const_index] = Desc(Op::Dwarf4, Op::SizeLEB);
   return Descriptions;
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index 27895da80585e7a64c2c7ee91ebd4f75e317ef83..9226dcad39a9438f35651c6fae5acd300978867b 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -308,6 +308,7 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
       break;
     case DW_FORM_GNU_addr_index:
     case DW_FORM_GNU_str_index:
+    case DW_FORM_addrx:
     case DW_FORM_strx:
       Value.uval = Data.getULEB128(OffsetPtr);
       break;
@@ -340,13 +341,17 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   case DW_FORM_addr:
     AddrOS << format("0x%016" PRIx64, UValue);
     break;
+  case DW_FORM_addrx:
+  case DW_FORM_addrx1:
+  case DW_FORM_addrx2:
+  case DW_FORM_addrx3:
+  case DW_FORM_addrx4:
   case DW_FORM_GNU_addr_index: {
     AddrOS << format(" indexed (%8.8x) address = ", (uint32_t)UValue);
-    uint64_t Address;
     if (U == nullptr)
       OS << "<invalid dwarf unit>";
-    else if (U->getAddrOffsetSectionItem(UValue, Address))
-      AddrOS << format("0x%016" PRIx64, Address);
+    else if (Optional<SectionedAddress> A = U->getAddrOffsetSectionItem(UValue))
+      AddrOS << format("0x%016" PRIx64, A->Address);
     else
       OS << "<no .debug_addr section>";
     break;
@@ -537,10 +542,12 @@ Optional<const char *> DWARFFormValue::getAsCString() const {
   if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx ||
       Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 ||
       Form == DW_FORM_strx4) {
-    uint64_t StrOffset;
-    if (!U || !U->getStringOffsetSectionItem(Offset, StrOffset))
+    if (!U)
       return None;
-    Offset = StrOffset;
+    Optional<uint64_t> StrOffset = U->getStringOffsetSectionItem(Offset);
+    if (!StrOffset)
+      return None;
+    Offset = *StrOffset;
   }
   // Prefer the Unit's string extractor, because for .dwo it will point to
   // .debug_str.dwo, while the Context's extractor always uses .debug_str.
@@ -555,16 +562,23 @@ Optional<const char *> DWARFFormValue::getAsCString() const {
 }
 
 Optional<uint64_t> DWARFFormValue::getAsAddress() const {
+  if (auto SA = getAsSectionedAddress())
+    return SA->Address;
+  return None;
+}
+Optional<SectionedAddress> DWARFFormValue::getAsSectionedAddress() const {
   if (!isFormClass(FC_Address))
     return None;
-  if (Form == DW_FORM_GNU_addr_index) {
+  if (Form == DW_FORM_GNU_addr_index || Form == DW_FORM_addrx) {
     uint32_t Index = Value.uval;
-    uint64_t Result;
-    if (!U || !U->getAddrOffsetSectionItem(Index, Result))
+    if (!U)
       return None;
-    return Result;
+    Optional<SectionedAddress> SA = U->getAddrOffsetSectionItem(Index);
+    if (!SA)
+      return None;
+    return SA;
   }
-  return Value.uval;
+  return {{Value.uval, Value.SectionIndex}};
 }
 
 Optional<uint64_t> DWARFFormValue::getAsReference() const {
diff --git a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
index ebd6104ab8785604365ca0a60500802dc6888828..1abd931e3b8b307c1d27f8c67c3e9a31c715d4bd 100644
--- a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -33,6 +34,16 @@ void DWARFGdbIndex::dumpCUList(raw_ostream &OS) const {
                  CU.Length);
 }
 
+void DWARFGdbIndex::dumpTUList(raw_ostream &OS) const {
+  OS << formatv("\n  Types CU list offset = {0:x}, has {1} entries:\n",
+                TuListOffset, TuList.size());
+  uint32_t I = 0;
+  for (const TypeUnitEntry &TU : TuList)
+    OS << formatv("    {0}: offset = {1:x8}, type_offset = {2:x8}, "
+                  "type_signature = {3:x16}\n",
+                  I++, TU.Offset, TU.TypeOffset, TU.TypeSignature);
+}
+
 void DWARFGdbIndex::dumpAddressArea(raw_ostream &OS) const {
   OS << format("\n  Address area offset = 0x%x, has %" PRId64 " entries:",
                AddressAreaOffset, (uint64_t)AddressArea.size())
@@ -94,6 +105,7 @@ void DWARFGdbIndex::dump(raw_ostream &OS) {
   if (HasContent) {
     OS << "  Version = " << Version << '\n';
     dumpCUList(OS);
+    dumpTUList(OS);
     dumpAddressArea(OS);
     dumpSymbolTable(OS);
     dumpConstantPool(OS);
@@ -127,9 +139,14 @@ bool DWARFGdbIndex::parseImpl(DataExtractor Data) {
 
   // CU Types are no longer needed as DWARF skeleton type units never made it
   // into the standard.
-  uint32_t CuTypesListSize = (AddressAreaOffset - CuTypesOffset) / 24;
-  if (CuTypesListSize != 0)
-    return false;
+  uint32_t TuListSize = (AddressAreaOffset - CuTypesOffset) / 24;
+  TuList.resize(TuListSize);
+  for (uint32_t I = 0; I < TuListSize; ++I) {
+    uint64_t CuOffset = Data.getU64(&Offset);
+    uint64_t TypeOffset = Data.getU64(&Offset);
+    uint64_t Signature = Data.getU64(&Offset);
+    TuList[I] = {CuOffset, TypeOffset, Signature};
+  }
 
   uint32_t AddressAreaSize = (SymbolTableOffset - AddressAreaOffset) / 20;
   AddressArea.reserve(AddressAreaSize);
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 6c3c62d86ef084373bb7078622c4e23a36818bc3..1caaa249bef94f7c4174b71f982dfc8942d8c006 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -39,9 +39,10 @@ void DWARFUnitVector::addUnitsForSection(DWARFContext &C,
                                          DWARFSectionKind SectionKind) {
   const DWARFObject &D = C.getDWARFObj();
   addUnitsImpl(C, D, Section, C.getDebugAbbrev(), &D.getRangeSection(),
-               D.getStringSection(), D.getStringOffsetSection(),
-               &D.getAddrSection(), D.getLineSection(), D.isLittleEndian(),
-               false, false, SectionKind);
+               &D.getLocSection(), D.getStringSection(),
+               D.getStringOffsetSection(), &D.getAddrSection(),
+               D.getLineSection(), D.isLittleEndian(), false, false,
+               SectionKind);
 }
 
 void DWARFUnitVector::addUnitsForDWOSection(DWARFContext &C,
@@ -50,16 +51,18 @@ void DWARFUnitVector::addUnitsForDWOSection(DWARFContext &C,
                                             bool Lazy) {
   const DWARFObject &D = C.getDWARFObj();
   addUnitsImpl(C, D, DWOSection, C.getDebugAbbrevDWO(), &D.getRangeDWOSection(),
-               D.getStringDWOSection(), D.getStringOffsetDWOSection(),
-               &D.getAddrSection(), D.getLineDWOSection(), C.isLittleEndian(),
-               true, Lazy, SectionKind);
+               &D.getLocDWOSection(), D.getStringDWOSection(),
+               D.getStringOffsetDWOSection(), &D.getAddrSection(),
+               D.getLineDWOSection(), C.isLittleEndian(), true, Lazy,
+               SectionKind);
 }
 
 void DWARFUnitVector::addUnitsImpl(
     DWARFContext &Context, const DWARFObject &Obj, const DWARFSection &Section,
-    const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS,
-    const DWARFSection &SOS, const DWARFSection *AOS, const DWARFSection &LS,
-    bool LE, bool IsDWO, bool Lazy, DWARFSectionKind SectionKind) {
+    const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+    const DWARFSection *LocSection, StringRef SS, const DWARFSection &SOS,
+    const DWARFSection *AOS, const DWARFSection &LS, bool LE, bool IsDWO,
+    bool Lazy, DWARFSectionKind SectionKind) {
   DWARFDataExtractor Data(Obj, Section, LE, 0);
   // Lazy initialization of Parser, now that we have all section info.
   if (!Parser) {
@@ -79,12 +82,12 @@ void DWARFUnitVector::addUnitsImpl(
       std::unique_ptr<DWARFUnit> U;
       if (Header.isTypeUnit())
         U = llvm::make_unique<DWARFTypeUnit>(Context, InfoSection, Header, DA,
-                                             RS, SS, SOS, AOS, LS, LE, IsDWO,
-                                             *this);
+                                             RS, LocSection, SS, SOS, AOS, LS,
+                                             LE, IsDWO, *this);
       else
         U = llvm::make_unique<DWARFCompileUnit>(Context, InfoSection, Header,
-                                                DA, RS, SS, SOS, AOS, LS, LE,
-                                                IsDWO, *this);
+                                                DA, RS, LocSection, SS, SOS,
+                                                AOS, LS, LE, IsDWO, *this);
       return U;
     };
   }
@@ -164,16 +167,25 @@ DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) {
 }
 
 DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
-                     const DWARFUnitHeader &Header,
-                     const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+                     const DWARFUnitHeader &Header, const DWARFDebugAbbrev *DA,
+                     const DWARFSection *RS, const DWARFSection *LocSection,
                      StringRef SS, const DWARFSection &SOS,
                      const DWARFSection *AOS, const DWARFSection &LS, bool LE,
                      bool IsDWO, const DWARFUnitVector &UnitVector)
     : Context(DC), InfoSection(Section), Header(Header), Abbrev(DA),
-      RangeSection(RS), LineSection(LS), StringSection(SS),
-      StringOffsetSection(SOS),  AddrOffsetSection(AOS), isLittleEndian(LE),
-      isDWO(IsDWO), UnitVector(UnitVector) {
+      RangeSection(RS), LocSection(LocSection), LineSection(LS),
+      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+      isLittleEndian(LE), IsDWO(IsDWO), UnitVector(UnitVector) {
   clear();
+  // For split DWARF we only need to keep track of the location list section's
+  // data (no relocations), and if we are reading a package file, we need to
+  // adjust the location list data based on the index entries.
+  if (IsDWO) {
+    LocSectionData = LocSection->Data;
+    if (auto *IndexEntry = Header.getIndexEntry())
+      if (const auto *C = IndexEntry->getOffset(DW_SECT_LOC))
+        LocSectionData = LocSectionData.substr(C->Offset, C->Length);
+  }
 }
 
 DWARFUnit::~DWARFUnit() = default;
@@ -183,29 +195,38 @@ DWARFDataExtractor DWARFUnit::getDebugInfoExtractor() const {
                             getAddressByteSize());
 }
 
-bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
-                                                uint64_t &Result) const {
+Optional<SectionedAddress>
+DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const {
+  if (IsDWO) {
+    auto R = Context.info_section_units();
+    auto I = R.begin();
+    // Surprising if a DWO file has more than one skeleton unit in it - this
+    // probably shouldn't be valid, but if a use case is found, here's where to
+    // support it (probably have to linearly search for the matching skeleton CU
+    // here)
+    if (I != R.end() && std::next(I) == R.end())
+      return (*I)->getAddrOffsetSectionItem(Index);
+  }
   uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize();
   if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize())
-    return false;
+    return None;
   DWARFDataExtractor DA(Context.getDWARFObj(), *AddrOffsetSection,
                         isLittleEndian, getAddressByteSize());
-  Result = DA.getRelocatedAddress(&Offset);
-  return true;
+  uint64_t Section;
+  uint64_t Address = DA.getRelocatedAddress(&Offset, &Section);
+  return {{Address, Section}};
 }
 
-bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
-                                           uint64_t &Result) const {
+Optional<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const {
   if (!StringOffsetsTableContribution)
-    return false;
+    return None;
   unsigned ItemSize = getDwarfStringOffsetsByteSize();
   uint32_t Offset = getStringOffsetsBase() + Index * ItemSize;
   if (StringOffsetSection.Data.size() < Offset + ItemSize)
-    return false;
+    return None;
   DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection,
                         isLittleEndian, 0);
-  Result = DA.getRelocatedValue(ItemSize, &Offset);
-  return true;
+  return DA.getRelocatedValue(ItemSize, &Offset);
 }
 
 bool DWARFUnitHeader::extract(DWARFContext &Context,
@@ -386,11 +407,13 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
     DWARFDie UnitDie = getUnitDIE();
     if (Optional<uint64_t> DWOId = toUnsigned(UnitDie.find(DW_AT_GNU_dwo_id)))
       Header.setDWOId(*DWOId);
-    if (!isDWO) {
+    if (!IsDWO) {
       assert(AddrOffsetSectionBase == 0);
       assert(RangeSectionBase == 0);
-      AddrOffsetSectionBase =
-          toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0);
+      AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_addr_base), 0);
+      if (!AddrOffsetSectionBase)
+        AddrOffsetSectionBase =
+            toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0);
       RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0);
     }
 
@@ -401,27 +424,19 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
     // offsets table starting at offset 0 of the debug_str_offsets.dwo section.
     // In both cases we need to determine the format of the contribution,
     // which may differ from the unit's format.
-    uint64_t StringOffsetsContributionBase =
-        isDWO ? 0 : toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);
-    auto IndexEntry = Header.getIndexEntry();
-    if (IndexEntry)
-      if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
-        StringOffsetsContributionBase += C->Offset;
-
     DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection,
                           isLittleEndian, 0);
-    if (isDWO)
+    if (IsDWO)
       StringOffsetsTableContribution =
-          determineStringOffsetsTableContributionDWO(
-              DA, StringOffsetsContributionBase);
+          determineStringOffsetsTableContributionDWO(DA);
     else if (getVersion() >= 5)
-      StringOffsetsTableContribution = determineStringOffsetsTableContribution(
-          DA, StringOffsetsContributionBase);
+      StringOffsetsTableContribution =
+          determineStringOffsetsTableContribution(DA);
 
     // DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to
     // describe address ranges.
     if (getVersion() >= 5) {
-      if (isDWO)
+      if (IsDWO)
         setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
       else
         setRangesSection(&Context.getDWARFObj().getRnglistsSection(),
@@ -441,20 +456,20 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
 
         // In a split dwarf unit, there is no DW_AT_rnglists_base attribute.
         // Adjust RangeSectionBase to point past the table header.
-        if (isDWO && RngListTable)
+        if (IsDWO && RngListTable)
           RangeSectionBase = RngListTable->getHeaderSize();
       }
     }
 
     // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
     // skeleton CU DIE, so that DWARF users not aware of it are not broken.
-  }
+    }
 
   return DieArray.size();
 }
 
 bool DWARFUnit::parseDWO() {
-  if (isDWO)
+  if (IsDWO)
     return false;
   if (DWO.get())
     return false;
@@ -524,7 +539,7 @@ DWARFUnit::findRnglistFromOffset(uint32_t Offset) {
                                   isLittleEndian, RngListTable->getAddrSize());
     auto RangeListOrError = RngListTable->findList(RangesData, Offset);
     if (RangeListOrError)
-      return RangeListOrError.get().getAbsoluteRanges(getBaseAddress());
+      return RangeListOrError.get().getAbsoluteRanges(getBaseAddress(), *this);
     return RangeListOrError.takeError();
   }
 
@@ -748,15 +763,13 @@ const DWARFAbbreviationDeclarationSet *DWARFUnit::getAbbreviations() const {
   return Abbrevs;
 }
 
-llvm::Optional<BaseAddress> DWARFUnit::getBaseAddress() {
+llvm::Optional<SectionedAddress> DWARFUnit::getBaseAddress() {
   if (BaseAddr)
     return BaseAddr;
 
   DWARFDie UnitDie = getUnitDIE();
   Optional<DWARFFormValue> PC = UnitDie.find({DW_AT_low_pc, DW_AT_entry_pc});
-  if (Optional<uint64_t> Addr = toAddress(PC))
-    BaseAddr = {*Addr, PC->getSectionIndex()};
-
+  BaseAddr = toSectionedAddress(PC);
   return BaseAddr;
 }
 
@@ -771,7 +784,7 @@ StrOffsetsContributionDescriptor::validateContributionSize(
   if (ValidationSize >= Size)
     if (DA.isValidOffsetForDataOfSize((uint32_t)Base, ValidationSize))
       return *this;
-  return Optional<StrOffsetsContributionDescriptor>();
+  return None;
 }
 
 // Look for a DWARF64-formatted contribution to the string offsets table
@@ -779,18 +792,17 @@ StrOffsetsContributionDescriptor::validateContributionSize(
 static Optional<StrOffsetsContributionDescriptor>
 parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
   if (!DA.isValidOffsetForDataOfSize(Offset, 16))
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
 
   if (DA.getU32(&Offset) != 0xffffffff)
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
 
   uint64_t Size = DA.getU64(&Offset);
   uint8_t Version = DA.getU16(&Offset);
   (void)DA.getU16(&Offset); // padding
   // The encoded length includes the 2-byte version field and the 2-byte
   // padding, so we need to subtract them out when we populate the descriptor.
-  return StrOffsetsContributionDescriptor(Offset, Size - 4, Version, DWARF64);
-  //return Optional<StrOffsetsContributionDescriptor>(Descriptor);
+  return {{Offset, Size - 4, Version, DWARF64}};
 }
 
 // Look for a DWARF32-formatted contribution to the string offsets table
@@ -798,22 +810,20 @@ parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
 static Optional<StrOffsetsContributionDescriptor>
 parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) {
   if (!DA.isValidOffsetForDataOfSize(Offset, 8))
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
   uint32_t ContributionSize = DA.getU32(&Offset);
   if (ContributionSize >= 0xfffffff0)
-    return Optional<StrOffsetsContributionDescriptor>();
+    return None;
   uint8_t Version = DA.getU16(&Offset);
   (void)DA.getU16(&Offset); // padding
   // The encoded length includes the 2-byte version field and the 2-byte
   // padding, so we need to subtract them out when we populate the descriptor.
-  return StrOffsetsContributionDescriptor(Offset, ContributionSize - 4, Version,
-                                          DWARF32);
-  //return Optional<StrOffsetsContributionDescriptor>(Descriptor);
+  return {{Offset, ContributionSize - 4, Version, DWARF32}};
 }
 
 Optional<StrOffsetsContributionDescriptor>
-DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
-                                                   uint64_t Offset) {
+DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA) {
+  auto Offset = toSectionOffset(getUnitDIE().find(DW_AT_str_offsets_base), 0);
   Optional<StrOffsetsContributionDescriptor> Descriptor;
   // Attempt to find a DWARF64 contribution 16 bytes before the base.
   if (Offset >= 16)
@@ -826,8 +836,13 @@ DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA,
 }
 
 Optional<StrOffsetsContributionDescriptor>
-DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
-                                                      uint64_t Offset) {
+DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor & DA) {
+  uint64_t Offset = 0;
+  auto IndexEntry = Header.getIndexEntry();
+  const auto *C =
+      IndexEntry ? IndexEntry->getOffset(DW_SECT_STR_OFFSETS) : nullptr;
+  if (C)
+    Offset = C->Offset;
   if (getVersion() >= 5) {
     // Look for a valid contribution at the given offset.
     auto Descriptor =
@@ -839,15 +854,9 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA,
   // Prior to DWARF v5, we derive the contribution size from the
   // index table (in a package file). In a .dwo file it is simply
   // the length of the string offsets section.
-  uint64_t Size = 0;
-  auto IndexEntry = Header.getIndexEntry();
   if (!IndexEntry)
-    Size = StringOffsetSection.Data.size();
-  else if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
-    Size = C->Length;
-  // Return a descriptor with the given offset as base, version 4 and
-  // DWARF32 format.
-  //return Optional<StrOffsetsContributionDescriptor>(
-      //StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32));
-  return StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32);
+    return {{0, StringOffsetSection.Data.size(), 4, DWARF32}};
+  if (C)
+    return {{C->Offset, C->Length, 4, DWARF32}};
+  return None;
 }
diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index c433fe470cb354fdb0eeedb4d80d279bdfe2592d..f8370178b6279dc798152333b56cad687d68cf19 100644
--- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -325,9 +325,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
       case dwarf::DW_UT_split_type: {
         Unit = TypeUnitVector.addUnit(llvm::make_unique<DWARFTypeUnit>(
             DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangeSection(),
-            DObj.getStringSection(), DObj.getStringOffsetSection(),
-            &DObj.getAppleObjCSection(), DObj.getLineSection(),
-            DCtx.isLittleEndian(), false, TypeUnitVector));
+            &DObj.getLocSection(), DObj.getStringSection(),
+            DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(),
+            DObj.getLineSection(), DCtx.isLittleEndian(), false,
+            TypeUnitVector));
         break;
       }
       case dwarf::DW_UT_skeleton:
@@ -338,9 +339,10 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
       case 0: {
         Unit = CompileUnitVector.addUnit(llvm::make_unique<DWARFCompileUnit>(
             DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangeSection(),
-            DObj.getStringSection(), DObj.getStringOffsetSection(),
-            &DObj.getAppleObjCSection(), DObj.getLineSection(),
-            DCtx.isLittleEndian(), false, CompileUnitVector));
+            &DObj.getLocSection(), DObj.getStringSection(),
+            DObj.getStringOffsetSection(), &DObj.getAppleObjCSection(),
+            DObj.getLineSection(), DCtx.isLittleEndian(), false,
+            CompileUnitVector));
         break;
       }
       default: { llvm_unreachable("Invalid UnitType."); }
@@ -362,15 +364,18 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
 
 bool DWARFVerifier::handleDebugInfo() {
   const DWARFObject &DObj = DCtx.getDWARFObj();
+  unsigned NumErrors = 0;
 
   OS << "Verifying .debug_info Unit Header Chain...\n";
-  unsigned result = verifyUnitSection(DObj.getInfoSection(), DW_SECT_INFO);
+  DObj.forEachInfoSections([&](const DWARFSection &S) {
+    NumErrors += verifyUnitSection(S, DW_SECT_INFO);
+  });
 
   OS << "Verifying .debug_types Unit Header Chain...\n";
   DObj.forEachTypesSections([&](const DWARFSection &S) {
-    result += verifyUnitSection(S, DW_SECT_TYPES);
+    NumErrors += verifyUnitSection(S, DW_SECT_TYPES);
   });
-  return result == 0;
+  return NumErrors == 0;
 }
 
 unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
@@ -392,20 +397,42 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
   // Build RI for this DIE and check that ranges within this DIE do not
   // overlap.
   DieRangeInfo RI(Die);
-  for (auto Range : Ranges) {
-    if (!Range.valid()) {
-      ++NumErrors;
-      error() << "Invalid address range " << Range << "\n";
-      continue;
-    }
 
-    // Verify that ranges don't intersect.
-    const auto IntersectingRange = RI.insert(Range);
-    if (IntersectingRange != RI.Ranges.end()) {
-      ++NumErrors;
-      error() << "DIE has overlapping address ranges: " << Range << " and "
-              << *IntersectingRange << "\n";
-      break;
+  // TODO support object files better
+  //
+  // Some object file formats (i.e. non-MachO) support COMDAT.  ELF in
+  // particular does so by placing each function into a section.  The DWARF data
+  // for the function at that point uses a section relative DW_FORM_addrp for
+  // the DW_AT_low_pc and a DW_FORM_data4 for the offset as the DW_AT_high_pc.
+  // In such a case, when the Die is the CU, the ranges will overlap, and we
+  // will flag valid conflicting ranges as invalid.
+  //
+  // For such targets, we should read the ranges from the CU and partition them
+  // by the section id.  The ranges within a particular section should be
+  // disjoint, although the ranges across sections may overlap.  We would map
+  // the child die to the entity that it references and the section with which
+  // it is associated.  The child would then be checked against the range
+  // information for the associated section.
+  //
+  // For now, simply elide the range verification for the CU DIEs if we are
+  // processing an object file.
+
+  if (!IsObjectFile || IsMachOObject || Die.getTag() != DW_TAG_compile_unit) {
+    for (auto Range : Ranges) {
+      if (!Range.valid()) {
+        ++NumErrors;
+        error() << "Invalid address range " << Range << "\n";
+        continue;
+      }
+
+      // Verify that ranges don't intersect.
+      const auto IntersectingRange = RI.insert(Range);
+      if (IntersectingRange != RI.Ranges.end()) {
+        ++NumErrors;
+        error() << "DIE has overlapping address ranges: " << Range << " and "
+                << *IntersectingRange << "\n";
+        break;
+      }
     }
   }
 
@@ -508,14 +535,15 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
                   "incompatible tag " +
                   TagString(RefTag));
     }
+    break;
   }
   case DW_AT_type: {
     DWARFDie TypeDie = Die.getAttributeValueAsReferencedDie(DW_AT_type);
     if (TypeDie && !isType(TypeDie.getTag())) {
       ReportError("DIE has " + AttributeString(Attr) +
                   " with incompatible tag " + TagString(TypeDie.getTag()));
-      break;
     }
+    break;
   }
   default:
     break;
@@ -526,6 +554,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
 unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
                                             DWARFAttribute &AttrValue) {
   const DWARFObject &DObj = DCtx.getDWARFObj();
+  auto DieCU = Die.getDwarfUnit();
   unsigned NumErrors = 0;
   const auto Form = AttrValue.Value.getForm();
   switch (Form) {
@@ -538,7 +567,6 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
     Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
     assert(RefVal);
     if (RefVal) {
-      auto DieCU = Die.getDwarfUnit();
       auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset();
       auto CUOffset = AttrValue.Value.getRawUValue();
       if (CUOffset >= CUSize) {
@@ -563,7 +591,7 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
     Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
     assert(RefVal);
     if (RefVal) {
-      if (*RefVal >= DObj.getInfoSection().Data.size()) {
+      if (*RefVal >= DieCU->getInfoSection().Data.size()) {
         ++NumErrors;
         error() << "DW_FORM_ref_addr offset beyond .debug_info "
                    "bounds:\n";
@@ -586,6 +614,45 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
     }
     break;
   }
+  case DW_FORM_strx:
+  case DW_FORM_strx1:
+  case DW_FORM_strx2:
+  case DW_FORM_strx3:
+  case DW_FORM_strx4: {
+    auto Index = AttrValue.Value.getRawUValue();
+    auto DieCU = Die.getDwarfUnit();
+    // Check that we have a valid DWARF v5 string offsets table.
+    if (!DieCU->getStringOffsetsTableContribution()) {
+      ++NumErrors;
+      error() << FormEncodingString(Form)
+              << " used without a valid string offsets table:\n";
+      dump(Die) << '\n';
+      break;
+    }
+    // Check that the index is within the bounds of the section. 
+    unsigned ItemSize = DieCU->getDwarfStringOffsetsByteSize();
+    // Use a 64-bit type to calculate the offset to guard against overflow.
+    uint64_t Offset =
+        (uint64_t)DieCU->getStringOffsetsBase() + Index * ItemSize;
+    if (DObj.getStringOffsetSection().Data.size() < Offset + ItemSize) {
+      ++NumErrors;
+      error() << FormEncodingString(Form) << " uses index "
+              << format("%" PRIu64, Index) << ", which is too large:\n";
+      dump(Die) << '\n';
+      break;
+    }
+    // Check that the string offset is valid.
+    uint64_t StringOffset = *DieCU->getStringOffsetSectionItem(Index);
+    if (StringOffset >= DObj.getStringSection().size()) {
+      ++NumErrors;
+      error() << FormEncodingString(Form) << " uses index "
+              << format("%" PRIu64, Index)
+              << ", but the referenced string"
+                 " offset is beyond .debug_str bounds:\n";
+      dump(Die) << '\n';
+    }
+    break;
+  }
   default:
     break;
   }
@@ -742,6 +809,16 @@ void DWARFVerifier::verifyDebugLineRows() {
   }
 }
 
+DWARFVerifier::DWARFVerifier(raw_ostream &S, DWARFContext &D,
+                             DIDumpOptions DumpOpts)
+    : OS(S), DCtx(D), DumpOpts(std::move(DumpOpts)), IsObjectFile(false),
+      IsMachOObject(false) {
+  if (const auto *F = DCtx.getDWARFObj().getFile()) {
+    IsObjectFile = F->isRelocatableObject();
+    IsMachOObject = F->isMachO();
+  }
+}
+
 bool DWARFVerifier::handleDebugLine() {
   NumDebugLineErrors = 0;
   OS << "Verifying .debug_line...\n";
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index 86dcfdaa163cf1b20c6fd2c11f6685ff85a62289..d9d379f6d09129b1c3b6c297f6c8d563a08b15ff 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -14,6 +14,7 @@ if(LLVM_ENABLE_DIA_SDK)
   add_pdb_impl_folder(DIA
     DIA/DIADataStream.cpp
     DIA/DIAEnumDebugStreams.cpp
+    DIA/DIAEnumFrameData.cpp
     DIA/DIAEnumInjectedSources.cpp
     DIA/DIAEnumLineNumbers.cpp
     DIA/DIAEnumSectionContribs.cpp
@@ -21,6 +22,7 @@ if(LLVM_ENABLE_DIA_SDK)
     DIA/DIAEnumSymbols.cpp
     DIA/DIAEnumTables.cpp
     DIA/DIAError.cpp
+    DIA/DIAFrameData.cpp
     DIA/DIAInjectedSource.cpp
     DIA/DIALineNumber.cpp
     DIA/DIARawSymbol.cpp
diff --git a/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f873f3525df59b1cd0ff1bcdb781b3cc5a5c45c2
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAEnumFrameData.cpp
@@ -0,0 +1,42 @@
+//==- DIAEnumFrameData.cpp ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+
+using namespace llvm::pdb;
+
+DIAEnumFrameData::DIAEnumFrameData(CComPtr<IDiaEnumFrameData> DiaEnumerator)
+    : Enumerator(DiaEnumerator) {}
+
+uint32_t DIAEnumFrameData::getChildCount() const {
+  LONG Count = 0;
+  return (S_OK == Enumerator->get_Count(&Count)) ? Count : 0;
+}
+
+std::unique_ptr<IPDBFrameData>
+DIAEnumFrameData::getChildAtIndex(uint32_t Index) const {
+  CComPtr<IDiaFrameData> Item;
+  if (S_OK != Enumerator->Item(Index, &Item))
+    return nullptr;
+
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Item));
+}
+
+std::unique_ptr<IPDBFrameData> DIAEnumFrameData::getNext() {
+  CComPtr<IDiaFrameData> Item;
+  ULONG NumFetched = 0;
+  if (S_OK != Enumerator->Next(1, &Item, &NumFetched))
+    return nullptr;
+
+  return std::unique_ptr<IPDBFrameData>(new DIAFrameData(Item));
+}
+
+void DIAEnumFrameData::reset() { Enumerator->Reset(); }
diff --git a/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..533cce7923c0d8089e3ac9d6bfd34a645ab49dde
--- /dev/null
+++ b/lib/DebugInfo/PDB/DIA/DIAFrameData.cpp
@@ -0,0 +1,53 @@
+//===- DIAFrameData.cpp - DIA impl. of IPDBFrameData -------------- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/DIA/DIAFrameData.h"
+#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAUtils.h"
+
+using namespace llvm::pdb;
+
+DIAFrameData::DIAFrameData(CComPtr<IDiaFrameData> DiaFrameData)
+    : FrameData(DiaFrameData) {}
+
+template <typename ArgType>
+ArgType
+PrivateGetDIAValue(IDiaFrameData *FrameData,
+                   HRESULT (__stdcall IDiaFrameData::*Method)(ArgType *)) {
+  ArgType Value;
+  if (S_OK == (FrameData->*Method)(&Value))
+    return static_cast<ArgType>(Value);
+
+  return ArgType();
+}
+
+uint32_t DIAFrameData::getAddressOffset() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressOffset);
+}
+
+uint32_t DIAFrameData::getAddressSection() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_addressSection);
+}
+
+uint32_t DIAFrameData::getLengthBlock() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_lengthBlock);
+}
+
+std::string DIAFrameData::getProgram() const {
+  return invokeBstrMethod(*FrameData, &IDiaFrameData::get_program);
+}
+
+uint32_t DIAFrameData::getRelativeVirtualAddress() const {
+  return PrivateGetDIAValue(FrameData,
+                            &IDiaFrameData::get_relativeVirtualAddress);
+}
+
+uint64_t DIAFrameData::getVirtualAddress() const {
+  return PrivateGetDIAValue(FrameData, &IDiaFrameData::get_virtualAddress);
+}
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 7726fe1326449785837f246862517c5606ca2206..bd375e172ac09819c4d9fb995bfbb12b70151a30 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -9,6 +9,7 @@
 #include "llvm/DebugInfo/PDB/DIA/DIASession.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+#include "llvm/DebugInfo/PDB/DIA/DIAEnumFrameData.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumInjectedSources.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumSectionContribs.h"
@@ -419,3 +420,13 @@ DIASession::getSectionContribs() const {
 
   return llvm::make_unique<DIAEnumSectionContribs>(*this, Sections);
 }
+
+std::unique_ptr<IPDBEnumFrameData>
+DIASession::getFrameData() const {
+  CComPtr<IDiaEnumFrameData> FD =
+      getTableEnumerator<IDiaEnumFrameData>(*Session);
+  if (!FD)
+    return nullptr;
+
+  return llvm::make_unique<DIAEnumFrameData>(FD);
+}
diff --git a/lib/DebugInfo/PDB/GenericError.cpp b/lib/DebugInfo/PDB/GenericError.cpp
index 95f6c15cd30e71e8281f42d5a9cacfff76dfb798..5f5ff69fe3f829eb3d41b9b09c7ce23b395ffd2a 100644
--- a/lib/DebugInfo/PDB/GenericError.cpp
+++ b/lib/DebugInfo/PDB/GenericError.cpp
@@ -24,8 +24,6 @@ public:
     switch (static_cast<pdb_error_code>(Condition)) {
     case pdb_error_code::unspecified:
       return "An unknown error has occurred.";
-    case pdb_error_code::type_server_not_found:
-        return "Type server PDB was not found.";
     case pdb_error_code::dia_sdk_not_present:
       return "LLVM was not compiled with support for DIA. This usually means "
              "that you are not using MSVC, or your Visual Studio "
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
index baab0a2399ce80c890bcf7d3a3b096a97e2026da..7807e312365c9acb0d76f6a1fc6b9859b76d109c 100644
--- a/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -200,6 +200,11 @@ NativeSession::getSectionContribs() const {
   return nullptr;
 }
 
+std::unique_ptr<IPDBEnumFrameData>
+NativeSession::getFrameData() const {
+  return nullptr;
+}
+
 void NativeSession::initializeExeSymbol() {
   if (ExeSymbol == 0)
     ExeSymbol = Cache.createSymbol<NativeExeSymbol>();
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index 96221f7d6ec09477efbc6ec7a027468313bbf373..44781705bfaed221953a93df9652182e7a17ed18 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
+#include "llvm/DebugInfo/CodeView/RecordName.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
@@ -158,6 +159,20 @@ void TpiStream::buildHashMap() {
   }
 }
 
+std::vector<TypeIndex> TpiStream::findRecordsByName(StringRef Name) const {
+  uint32_t Bucket = hashStringV1(Name) % Header->NumHashBuckets;
+  if (Bucket > HashMap.size())
+    return {};
+
+  std::vector<TypeIndex> Result;
+  for (TypeIndex TI : HashMap[Bucket]) {
+    std::string ThisName = computeTypeName(*Types, TI);
+    if (ThisName == Name)
+      Result.push_back(TI);
+  }
+  return Result;
+}
+
 bool TpiStream::supportsTypeLookup() const { return !HashMap.empty(); }
 
 Expected<TypeIndex>
@@ -199,6 +214,10 @@ TpiStream::findFullDeclForForwardRef(TypeIndex ForwardRefTI) const {
   return ForwardRefTI;
 }
 
+codeview::CVType TpiStream::getType(codeview::TypeIndex Index) {
+  return Types->getType(Index);
+}
+
 BinarySubstreamRef TpiStream::getTypeRecordsSubstream() const {
   return TypeRecordsSubstream;
 }
diff --git a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
index c62796507a0157bfc35855fbd082ebfe7a1e5a1b..951909295d13e36102037193da3231905768224c 100644
--- a/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
+++ b/lib/DebugInfo/PDB/PDBInterfaceAnchors.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/IPDBDataStream.h"
+#include "llvm/DebugInfo/PDB/IPDBFrameData.h"
 #include "llvm/DebugInfo/PDB/IPDBInjectedSource.h"
 #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
@@ -35,3 +36,5 @@ IPDBTable::~IPDBTable() = default;
 IPDBInjectedSource::~IPDBInjectedSource() = default;
 
 IPDBSectionContrib::~IPDBSectionContrib() = default;
+
+IPDBFrameData::~IPDBFrameData() = default;
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 2165cbab7e706a5f6bee4225323b68b2b29dba8e..b6b11dbddf26b5d841331e6a6d9d750318da47ea 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -112,14 +112,20 @@ struct DumpVisitor {
     printStr("}");
     --Depth;
   }
+
   // Overload used when T is exactly 'bool', not merely convertible to 'bool'.
-  template<typename T, T * = (bool*)nullptr>
-  void print(T B) {
-    printStr(B ? "true" : "false");
+  void print(bool B) { printStr(B ? "true" : "false"); }
+
+  template <class T>
+  typename std::enable_if<std::is_unsigned<T>::value>::type print(T N) {
+    fprintf(stderr, "%llu", (unsigned long long)N);
   }
-  void print(size_t N) {
-    fprintf(stderr, "%zu", N);
+
+  template <class T>
+  typename std::enable_if<std::is_signed<T>::value>::type print(T N) {
+    fprintf(stderr, "%lld", (long long)N);
   }
+
   void print(ReferenceKind RK) {
     switch (RK) {
     case ReferenceKind::LValue:
@@ -316,7 +322,7 @@ public:
 // Code beyond this point should not be synchronized with libc++abi.
 //===----------------------------------------------------------------------===//
 
-using Demangler = itanium_demangle::Db<DefaultAllocator>;
+using Demangler = itanium_demangle::ManglingParser<DefaultAllocator>;
 
 char *llvm::itaniumDemangle(const char *MangledName, char *Buf,
                             size_t *N, int *Status) {
diff --git a/lib/Demangle/MicrosoftDemangle.cpp b/lib/Demangle/MicrosoftDemangle.cpp
index 9f60eb22cc4307e1aa4abf671c23db0a314db3c8..882e4a578455dbcac5189167b0839d522d3ba43f 100644
--- a/lib/Demangle/MicrosoftDemangle.cpp
+++ b/lib/Demangle/MicrosoftDemangle.cpp
@@ -14,8 +14,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MicrosoftDemangleNodes.h"
+#include "llvm/Demangle/MicrosoftDemangle.h"
 #include "llvm/Demangle/Demangle.h"
+#include "llvm/Demangle/MicrosoftDemangleNodes.h"
 
 #include "llvm/Demangle/Compiler.h"
 #include "llvm/Demangle/StringView.h"
@@ -33,21 +34,12 @@ static bool startsWithDigit(StringView S) {
   return !S.empty() && std::isdigit(S.front());
 }
 
-enum class QualifierMangleMode { Drop, Mangle, Result };
 
 struct NodeList {
   Node *N = nullptr;
   NodeList *Next = nullptr;
 };
 
-enum class FunctionIdentifierCodeGroup { Basic, Under, DoubleUnder };
-
-enum NameBackrefBehavior : uint8_t {
-  NBB_None = 0,          // don't save any names as backrefs.
-  NBB_Template = 1 << 0, // save template instanations.
-  NBB_Simple = 1 << 1,   // save simple names.
-};
-
 static bool isMemberPointer(StringView MangledName) {
   switch (MangledName.popFront()) {
   case '$':
@@ -246,151 +238,6 @@ demanglePointerCVQualifiers(StringView &MangledName) {
   return std::make_pair(Q_None, PointerAffinity::Pointer);
 }
 
-namespace {
-
-struct BackrefContext {
-  static constexpr size_t Max = 10;
-
-  TypeNode *FunctionParams[Max];
-  size_t FunctionParamCount = 0;
-
-  // The first 10 BackReferences in a mangled name can be back-referenced by
-  // special name @[0-9]. This is a storage for the first 10 BackReferences.
-  NamedIdentifierNode *Names[Max];
-  size_t NamesCount = 0;
-};
-
-// Demangler class takes the main role in demangling symbols.
-// It has a set of functions to parse mangled symbols into Type instances.
-// It also has a set of functions to cnovert Type instances to strings.
-class Demangler {
-public:
-  Demangler() = default;
-  virtual ~Demangler() = default;
-
-  // You are supposed to call parse() first and then check if error is true.  If
-  // it is false, call output() to write the formatted name to the given stream.
-  SymbolNode *parse(StringView &MangledName);
-
-  // True if an error occurred.
-  bool Error = false;
-
-  void dumpBackReferences();
-
-private:
-  SymbolNode *demangleEncodedSymbol(StringView &MangledName,
-                                    QualifiedNameNode *QN);
-
-  VariableSymbolNode *demangleVariableEncoding(StringView &MangledName,
-                                               StorageClass SC);
-  FunctionSymbolNode *demangleFunctionEncoding(StringView &MangledName);
-
-  Qualifiers demanglePointerExtQualifiers(StringView &MangledName);
-
-  // Parser functions. This is a recursive-descent parser.
-  TypeNode *demangleType(StringView &MangledName, QualifierMangleMode QMM);
-  PrimitiveTypeNode *demanglePrimitiveType(StringView &MangledName);
-  CustomTypeNode *demangleCustomType(StringView &MangledName);
-  TagTypeNode *demangleClassType(StringView &MangledName);
-  PointerTypeNode *demanglePointerType(StringView &MangledName);
-  PointerTypeNode *demangleMemberPointerType(StringView &MangledName);
-  FunctionSignatureNode *demangleFunctionType(StringView &MangledName,
-                                              bool HasThisQuals);
-
-  ArrayTypeNode *demangleArrayType(StringView &MangledName);
-
-  NodeArrayNode *demangleTemplateParameterList(StringView &MangledName);
-  NodeArrayNode *demangleFunctionParameterList(StringView &MangledName);
-
-  std::pair<uint64_t, bool> demangleNumber(StringView &MangledName);
-  uint64_t demangleUnsigned(StringView &MangledName);
-  int64_t demangleSigned(StringView &MangledName);
-
-  void memorizeString(StringView s);
-  void memorizeIdentifier(IdentifierNode *Identifier);
-
-  /// Allocate a copy of \p Borrowed into memory that we own.
-  StringView copyString(StringView Borrowed);
-
-  QualifiedNameNode *demangleFullyQualifiedTypeName(StringView &MangledName);
-  QualifiedNameNode *demangleFullyQualifiedSymbolName(StringView &MangledName);
-
-  IdentifierNode *demangleUnqualifiedTypeName(StringView &MangledName,
-                                              bool Memorize);
-  IdentifierNode *demangleUnqualifiedSymbolName(StringView &MangledName,
-                                                NameBackrefBehavior NBB);
-
-  QualifiedNameNode *demangleNameScopeChain(StringView &MangledName,
-                                            IdentifierNode *UnqualifiedName);
-  IdentifierNode *demangleNameScopePiece(StringView &MangledName);
-
-  NamedIdentifierNode *demangleBackRefName(StringView &MangledName);
-  IdentifierNode *demangleTemplateInstantiationName(StringView &MangledName,
-                                                    NameBackrefBehavior NBB);
-  IdentifierNode *demangleFunctionIdentifierCode(StringView &MangledName);
-  IdentifierNode *
-  demangleFunctionIdentifierCode(StringView &MangledName,
-                                 FunctionIdentifierCodeGroup Group);
-  StructorIdentifierNode *demangleStructorIdentifier(StringView &MangledName,
-                                                     bool IsDestructor);
-  ConversionOperatorIdentifierNode *
-  demangleConversionOperatorIdentifier(StringView &MangledName);
-  LiteralOperatorIdentifierNode *
-  demangleLiteralOperatorIdentifier(StringView &MangledName);
-
-  SymbolNode *demangleSpecialIntrinsic(StringView &MangledName);
-  SpecialTableSymbolNode *
-  demangleSpecialTableSymbolNode(StringView &MangledName,
-                                 SpecialIntrinsicKind SIK);
-  LocalStaticGuardVariableNode *
-  demangleLocalStaticGuard(StringView &MangledName);
-  VariableSymbolNode *demangleUntypedVariable(ArenaAllocator &Arena,
-                                              StringView &MangledName,
-                                              StringView VariableName);
-  VariableSymbolNode *
-  demangleRttiBaseClassDescriptorNode(ArenaAllocator &Arena,
-                                      StringView &MangledName);
-  FunctionSymbolNode *demangleInitFiniStub(StringView &MangledName,
-                                           bool IsDestructor);
-
-  NamedIdentifierNode *demangleSimpleName(StringView &MangledName,
-                                          bool Memorize);
-  NamedIdentifierNode *demangleAnonymousNamespaceName(StringView &MangledName);
-  NamedIdentifierNode *demangleLocallyScopedNamePiece(StringView &MangledName);
-  EncodedStringLiteralNode *demangleStringLiteral(StringView &MangledName);
-  FunctionSymbolNode *demangleVcallThunkNode(StringView &MangledName);
-
-  StringView demangleSimpleString(StringView &MangledName, bool Memorize);
-
-  FuncClass demangleFunctionClass(StringView &MangledName);
-  CallingConv demangleCallingConvention(StringView &MangledName);
-  StorageClass demangleVariableStorageClass(StringView &MangledName);
-  void demangleThrowSpecification(StringView &MangledName);
-  wchar_t demangleWcharLiteral(StringView &MangledName);
-  uint8_t demangleCharLiteral(StringView &MangledName);
-
-  std::pair<Qualifiers, bool> demangleQualifiers(StringView &MangledName);
-
-  // Memory allocator.
-  ArenaAllocator Arena;
-
-  // A single type uses one global back-ref table for all function params.
-  // This means back-refs can even go "into" other types.  Examples:
-  //
-  //  // Second int* is a back-ref to first.
-  //  void foo(int *, int*);
-  //
-  //  // Second int* is not a back-ref to first (first is not a function param).
-  //  int* foo(int*);
-  //
-  //  // Second int* is a back-ref to first (ALL function types share the same
-  //  // back-ref map.
-  //  using F = void(*)(int*);
-  //  F G(int *);
-  BackrefContext Backrefs;
-};
-} // namespace
-
 StringView Demangler::copyString(StringView Borrowed) {
   char *Stable = Arena.allocUnalignedBuffer(Borrowed.size() + 1);
   std::strcpy(Stable, Borrowed.begin());
@@ -652,7 +499,7 @@ Demangler::demangleLiteralOperatorIdentifier(StringView &MangledName) {
   return N;
 }
 
-IntrinsicFunctionKind
+static IntrinsicFunctionKind
 translateIntrinsicFunctionCode(char CH, FunctionIdentifierCodeGroup Group) {
   // Not all ? identifiers are intrinsics *functions*.  This function only maps
   // operator codes for the special functions, all others are handled elsewhere,
@@ -886,6 +733,16 @@ SymbolNode *Demangler::parse(StringView &MangledName) {
   return Symbol;
 }
 
+TagTypeNode *Demangler::parseTagUniqueName(StringView &MangledName) {
+  if (!MangledName.consumeFront(".?A"))
+    return nullptr;
+  MangledName.consumeFront(".?A");
+  if (MangledName.empty())
+    return nullptr;
+
+  return demangleClassType(MangledName);
+}
+
 // <type-encoding> ::= <storage-class> <variable-type>
 // <storage-class> ::= 0  # private static member
 //                 ::= 1  # protected static member
@@ -1220,7 +1077,7 @@ static void outputEscapedChar(OutputStream &OS, unsigned C) {
   outputHex(OS, C);
 }
 
-unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
+static unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
   const uint8_t *End = StringBytes + Length - 1;
   unsigned Count = 0;
   while (Length > 0 && *End == 0) {
@@ -1231,7 +1088,8 @@ unsigned countTrailingNullBytes(const uint8_t *StringBytes, int Length) {
   return Count;
 }
 
-unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) {
+static unsigned countEmbeddedNulls(const uint8_t *StringBytes,
+                                   unsigned Length) {
   unsigned Result = 0;
   for (unsigned I = 0; I < Length; ++I) {
     if (*StringBytes++ == 0)
@@ -1240,8 +1098,8 @@ unsigned countEmbeddedNulls(const uint8_t *StringBytes, unsigned Length) {
   return Result;
 }
 
-unsigned guessCharByteSize(const uint8_t *StringBytes, unsigned NumChars,
-                           unsigned NumBytes) {
+static unsigned guessCharByteSize(const uint8_t *StringBytes, unsigned NumChars,
+                                  unsigned NumBytes) {
   assert(NumBytes > 0);
 
   // If the number of bytes is odd, this is guaranteed to be a char string.
diff --git a/lib/Demangle/MicrosoftDemangleNodes.cpp b/lib/Demangle/MicrosoftDemangleNodes.cpp
index 93719f8934279869778312ac3bb64a070ddd4f0b..af893b9b68e1ce1bb56e54d4b10b79eedd250d08 100644
--- a/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MicrosoftDemangleNodes.h"
+#include "llvm/Demangle/MicrosoftDemangleNodes.h"
 #include "llvm/Demangle/Compiler.h"
 #include "llvm/Demangle/Utility.h"
 #include <cctype>
@@ -558,6 +558,7 @@ void VariableSymbolNode::output(OutputStream &OS, OutputFlags Flags) const {
   case StorageClass::PublicStatic:
   case StorageClass::ProtectedStatic:
     OS << "static ";
+    break;
   default:
     break;
   }
diff --git a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index ae1c7e84259706f7a14d76f1c2d87226daa35e93..241eb3600da748463aa8c7e461240153e581740e 100644
--- a/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -68,14 +68,16 @@ namespace orc {
 class PartitioningIRMaterializationUnit : public IRMaterializationUnit {
 public:
   PartitioningIRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM,
-                                    CompileOnDemandLayer2 &Parent)
-      : IRMaterializationUnit(ES, std::move(TSM)), Parent(Parent) {}
+                                    VModuleKey K, CompileOnDemandLayer &Parent)
+      : IRMaterializationUnit(ES, std::move(TSM), std::move(K)),
+        Parent(Parent) {}
 
   PartitioningIRMaterializationUnit(
       ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
       SymbolNameToDefinitionMap SymbolToDefinition,
-      CompileOnDemandLayer2 &Parent)
-      : IRMaterializationUnit(std::move(TSM), std::move(SymbolFlags),
+      CompileOnDemandLayer &Parent)
+      : IRMaterializationUnit(std::move(TSM), std::move(K),
+                              std::move(SymbolFlags),
                               std::move(SymbolToDefinition)),
         Parent(Parent) {}
 
@@ -93,31 +95,31 @@ private:
   }
 
   mutable std::mutex SourceModuleMutex;
-  CompileOnDemandLayer2 &Parent;
+  CompileOnDemandLayer &Parent;
 };
 
-Optional<CompileOnDemandLayer2::GlobalValueSet>
-CompileOnDemandLayer2::compileRequested(GlobalValueSet Requested) {
+Optional<CompileOnDemandLayer::GlobalValueSet>
+CompileOnDemandLayer::compileRequested(GlobalValueSet Requested) {
   return std::move(Requested);
 }
 
-Optional<CompileOnDemandLayer2::GlobalValueSet>
-CompileOnDemandLayer2::compileWholeModule(GlobalValueSet Requested) {
+Optional<CompileOnDemandLayer::GlobalValueSet>
+CompileOnDemandLayer::compileWholeModule(GlobalValueSet Requested) {
   return None;
 }
 
-CompileOnDemandLayer2::CompileOnDemandLayer2(
+CompileOnDemandLayer::CompileOnDemandLayer(
     ExecutionSession &ES, IRLayer &BaseLayer, LazyCallThroughManager &LCTMgr,
     IndirectStubsManagerBuilder BuildIndirectStubsManager)
     : IRLayer(ES), BaseLayer(BaseLayer), LCTMgr(LCTMgr),
       BuildIndirectStubsManager(std::move(BuildIndirectStubsManager)) {}
 
-void CompileOnDemandLayer2::setPartitionFunction(PartitionFunction Partition) {
+void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) {
   this->Partition = std::move(Partition);
 }
 
-void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K,
-                                 ThreadSafeModule TSM) {
+void CompileOnDemandLayer::emit(MaterializationResponsibility R,
+                                ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Null module");
 
   auto &ES = getExecutionSession();
@@ -149,25 +151,32 @@ void CompileOnDemandLayer2::emit(MaterializationResponsibility R, VModuleKey K,
   // implementation dylib.
   if (auto Err = PDR.getImplDylib().define(
           llvm::make_unique<PartitioningIRMaterializationUnit>(
-              ES, std::move(TSM), *this))) {
+              ES, std::move(TSM), R.getVModuleKey(), *this))) {
     ES.reportError(std::move(Err));
     R.failMaterialization();
     return;
   }
 
-  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables)));
+  R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), true));
   R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(),
                           std::move(Callables)));
 }
 
-CompileOnDemandLayer2::PerDylibResources &
-CompileOnDemandLayer2::getPerDylibResources(JITDylib &TargetD) {
+CompileOnDemandLayer::PerDylibResources &
+CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
   auto I = DylibResources.find(&TargetD);
   if (I == DylibResources.end()) {
-    auto &ImplD =
-        getExecutionSession().createJITDylib(TargetD.getName() + ".impl");
-    TargetD.withSearchOrderDo([&](const JITDylibList &TargetSearchOrder) {
-      ImplD.setSearchOrder(TargetSearchOrder, false);
+    auto &ImplD = getExecutionSession().createJITDylib(
+        TargetD.getName() + ".impl", false);
+    TargetD.withSearchOrderDo([&](const JITDylibSearchList &TargetSearchOrder) {
+      auto NewSearchOrder = TargetSearchOrder;
+      assert(!NewSearchOrder.empty() &&
+             NewSearchOrder.front().first == &TargetD &&
+             NewSearchOrder.front().second == true &&
+             "TargetD must be at the front of its own search order and match "
+             "non-exported symbol");
+      NewSearchOrder.insert(std::next(NewSearchOrder.begin()), {&ImplD, true});
+      ImplD.setSearchOrder(std::move(NewSearchOrder), false);
     });
     PerDylibResources PDR(ImplD, BuildIndirectStubsManager());
     I = DylibResources.insert(std::make_pair(&TargetD, std::move(PDR))).first;
@@ -176,7 +185,7 @@ CompileOnDemandLayer2::getPerDylibResources(JITDylib &TargetD) {
   return I->second;
 }
 
-void CompileOnDemandLayer2::cleanUpModule(Module &M) {
+void CompileOnDemandLayer::cleanUpModule(Module &M) {
   for (auto &F : M.functions()) {
     if (F.isDeclaration())
       continue;
@@ -189,7 +198,7 @@ void CompileOnDemandLayer2::cleanUpModule(Module &M) {
   }
 }
 
-void CompileOnDemandLayer2::expandPartition(GlobalValueSet &Partition) {
+void CompileOnDemandLayer::expandPartition(GlobalValueSet &Partition) {
   // Expands the partition to ensure the following rules hold:
   // (1) If any alias is in the partition, its aliasee is also in the partition.
   // (2) If any aliasee is in the partition, its aliases are also in the
@@ -221,7 +230,7 @@ void CompileOnDemandLayer2::expandPartition(GlobalValueSet &Partition) {
     Partition.insert(GV);
 }
 
-void CompileOnDemandLayer2::emitPartition(
+void CompileOnDemandLayer::emitPartition(
     MaterializationResponsibility R, ThreadSafeModule TSM,
     IRMaterializationUnit::SymbolNameToDefinitionMap Defs) {
 
@@ -245,7 +254,7 @@ void CompileOnDemandLayer2::emitPartition(
   // unmodified to the base layer.
   if (GVsToExtract == None) {
     Defs.clear();
-    BaseLayer.emit(std::move(R), ES.allocateVModule(), std::move(TSM));
+    BaseLayer.emit(std::move(R), std::move(TSM));
     return;
   }
 
@@ -285,9 +294,9 @@ void CompileOnDemandLayer2::emitPartition(
 
   auto ExtractedTSM = extractSubModule(TSM, ".submodule", ShouldExtract);
   R.replace(llvm::make_unique<PartitioningIRMaterializationUnit>(
-      ES, std::move(TSM), *this));
+      ES, std::move(TSM), R.getVModuleKey(), *this));
 
-  BaseLayer.emit(std::move(R), ES.allocateVModule(), std::move(ExtractedTSM));
+  BaseLayer.emit(std::move(R), std::move(ExtractedTSM));
 }
 
 } // end namespace orc
diff --git a/lib/ExecutionEngine/Orc/Core.cpp b/lib/ExecutionEngine/Orc/Core.cpp
index 86a7ecaaf0717716870462c7fc98b2d6ac62f4c1..f99cbec6d3b60a0a50199498e11fa9f220215a19 100644
--- a/lib/ExecutionEngine/Orc/Core.cpp
+++ b/lib/ExecutionEngine/Orc/Core.cpp
@@ -134,6 +134,8 @@ struct PrintSymbolMapElemsMatchingCLOpts {
 namespace llvm {
 namespace orc {
 
+  SymbolStringPool::PoolMapEntry SymbolStringPtr::Tombstone(0);
+
 char FailedToMaterialize::ID = 0;
 char SymbolsNotFound::ID = 0;
 char SymbolsCouldNotBeRemoved::ID = 0;
@@ -168,7 +170,8 @@ raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) {
 }
 
 raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) {
-  return OS << format("0x%016x", Sym.getAddress()) << " " << Sym.getFlags();
+  return OS << format("0x%016" PRIx64, Sym.getAddress()) << " "
+            << Sym.getFlags();
 }
 
 raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) {
@@ -203,14 +206,16 @@ raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) {
   return OS << ")";
 }
 
-raw_ostream &operator<<(raw_ostream &OS, const JITDylibList &JDs) {
+raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs) {
   OS << "[";
   if (!JDs.empty()) {
-    assert(JDs.front() && "JITDylibList entries must not be null");
-    OS << " " << JDs.front()->getName();
-    for (auto *JD : make_range(std::next(JDs.begin()), JDs.end())) {
-      assert(JD && "JITDylibList entries must not be null");
-      OS << ", " << JD->getName();
+    assert(JDs.front().first && "JITDylibList entries must not be null");
+    OS << " (\"" << JDs.front().first->getName() << "\", "
+       << (JDs.front().second ? "true" : "false") << ")";
+    for (auto &KV : make_range(std::next(JDs.begin()), JDs.end())) {
+      assert(KV.first && "JITDylibList entries must not be null");
+      OS << ", (\"" << KV.first->getName() << "\", "
+         << (KV.second ? "true" : "false") << ")";
     }
   }
   OS << " ]";
@@ -366,8 +371,8 @@ void AsynchronousSymbolQuery::detach() {
 }
 
 MaterializationResponsibility::MaterializationResponsibility(
-    JITDylib &JD, SymbolFlagsMap SymbolFlags)
-    : JD(JD), SymbolFlags(std::move(SymbolFlags)) {
+    JITDylib &JD, SymbolFlagsMap SymbolFlags, VModuleKey K)
+    : JD(JD), SymbolFlags(std::move(SymbolFlags)), K(std::move(K)) {
   assert(!this->SymbolFlags.empty() && "Materializing nothing?");
 
 #ifndef NDEBUG
@@ -459,7 +464,12 @@ void MaterializationResponsibility::replace(
 }
 
 MaterializationResponsibility
-MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) {
+MaterializationResponsibility::delegate(const SymbolNameSet &Symbols,
+                                        VModuleKey NewKey) {
+
+  if (NewKey == VModuleKey())
+    NewKey = K;
+
   SymbolFlagsMap DelegatedFlags;
 
   for (auto &Name : Symbols) {
@@ -472,7 +482,8 @@ MaterializationResponsibility::delegate(const SymbolNameSet &Symbols) {
     SymbolFlags.erase(I);
   }
 
-  return MaterializationResponsibility(JD, std::move(DelegatedFlags));
+  return MaterializationResponsibility(JD, std::move(DelegatedFlags),
+                                       std::move(NewKey));
 }
 
 void MaterializationResponsibility::addDependencies(
@@ -489,8 +500,9 @@ void MaterializationResponsibility::addDependenciesForAll(
 }
 
 AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit(
-    SymbolMap Symbols)
-    : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {}
+    SymbolMap Symbols, VModuleKey K)
+    : MaterializationUnit(extractFlags(Symbols), std::move(K)),
+      Symbols(std::move(Symbols)) {}
 
 StringRef AbsoluteSymbolsMaterializationUnit::getName() const {
   return "<Absolute Symbols>";
@@ -517,8 +529,10 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
 }
 
 ReExportsMaterializationUnit::ReExportsMaterializationUnit(
-    JITDylib *SourceJD, SymbolAliasMap Aliases)
-    : MaterializationUnit(extractFlags(Aliases)), SourceJD(SourceJD),
+    JITDylib *SourceJD, bool MatchNonExported, SymbolAliasMap Aliases,
+    VModuleKey K)
+    : MaterializationUnit(extractFlags(Aliases), std::move(K)),
+      SourceJD(SourceJD), MatchNonExported(MatchNonExported),
       Aliases(std::move(Aliases)) {}
 
 StringRef ReExportsMaterializationUnit::getName() const {
@@ -547,7 +561,7 @@ void ReExportsMaterializationUnit::materialize(
 
   if (!Aliases.empty()) {
     if (SourceJD)
-      R.replace(reexports(*SourceJD, std::move(Aliases)));
+      R.replace(reexports(*SourceJD, std::move(Aliases), MatchNonExported));
     else
       R.replace(symbolAliases(std::move(Aliases)));
   }
@@ -575,20 +589,22 @@ void ReExportsMaterializationUnit::materialize(
     SymbolNameSet QuerySymbols;
     SymbolAliasMap QueryAliases;
 
-    for (auto I = RequestedAliases.begin(), E = RequestedAliases.end();
-         I != E;) {
-      auto Tmp = I++;
-
+    // Collect as many aliases as we can without including a chain.
+    for (auto &KV : RequestedAliases) {
       // Chain detected. Skip this symbol for this round.
-      if (&SrcJD == &TgtJD && (QueryAliases.count(Tmp->second.Aliasee) ||
-                               RequestedAliases.count(Tmp->second.Aliasee)))
+      if (&SrcJD == &TgtJD && (QueryAliases.count(KV.second.Aliasee) ||
+                               RequestedAliases.count(KV.second.Aliasee)))
         continue;
 
-      ResponsibilitySymbols.insert(Tmp->first);
-      QuerySymbols.insert(Tmp->second.Aliasee);
-      QueryAliases[Tmp->first] = std::move(Tmp->second);
-      RequestedAliases.erase(Tmp);
+      ResponsibilitySymbols.insert(KV.first);
+      QuerySymbols.insert(KV.second.Aliasee);
+      QueryAliases[KV.first] = std::move(KV.second);
     }
+
+    // Remove the aliases collected this round from the RequestedAliases map.
+    for (auto &KV : QueryAliases)
+      RequestedAliases.erase(KV.first);
+
     assert(!QuerySymbols.empty() && "Alias cycle detected!");
 
     auto QueryInfo = std::make_shared<OnResolveInfo>(
@@ -645,7 +661,8 @@ void ReExportsMaterializationUnit::materialize(
 
     auto OnReady = [&ES](Error Err) { ES.reportError(std::move(Err)); };
 
-    ES.lookup({&SrcJD}, QuerySymbols, std::move(OnResolve), std::move(OnReady),
+    ES.lookup(JITDylibSearchList({{&SrcJD, MatchNonExported}}), QuerySymbols,
+              std::move(OnResolve), std::move(OnReady),
               std::move(RegisterDependencies));
   }
 }
@@ -686,26 +703,28 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) {
   return Result;
 }
 
-ReexportsFallbackDefinitionGenerator::ReexportsFallbackDefinitionGenerator(
-    JITDylib &BackingJD, SymbolPredicate Allow)
-    : BackingJD(BackingJD), Allow(std::move(Allow)) {}
+ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD,
+                                       bool MatchNonExported,
+                                       SymbolPredicate Allow)
+    : SourceJD(SourceJD), MatchNonExported(MatchNonExported),
+      Allow(std::move(Allow)) {}
 
-SymbolNameSet ReexportsFallbackDefinitionGenerator::
-operator()(JITDylib &JD, const SymbolNameSet &Names) {
+SymbolNameSet ReexportsGenerator::operator()(JITDylib &JD,
+                                             const SymbolNameSet &Names) {
   orc::SymbolNameSet Added;
   orc::SymbolAliasMap AliasMap;
 
-  auto Flags = BackingJD.lookupFlags(Names);
+  auto Flags = SourceJD.lookupFlags(Names);
 
   for (auto &KV : Flags) {
-    if (!Allow(KV.first))
+    if (Allow && !Allow(KV.first))
       continue;
     AliasMap[KV.first] = SymbolAliasMapEntry(KV.first, KV.second);
     Added.insert(KV.first);
   }
 
   if (!Added.empty())
-    cantFail(JD.define(reexports(BackingJD, AliasMap)));
+    cantFail(JD.define(reexports(SourceJD, AliasMap, MatchNonExported)));
 
   return Added;
 }
@@ -1030,30 +1049,41 @@ void JITDylib::notifyFailed(const SymbolNameSet &FailedSymbols) {
     Q->handleFailed(make_error<FailedToMaterialize>(FailedSymbols));
 }
 
-void JITDylib::setSearchOrder(JITDylibList NewSearchOrder,
-                              bool SearchThisJITDylibFirst) {
-  if (SearchThisJITDylibFirst && NewSearchOrder.front() != this)
-    NewSearchOrder.insert(NewSearchOrder.begin(), this);
+void JITDylib::setSearchOrder(JITDylibSearchList NewSearchOrder,
+                              bool SearchThisJITDylibFirst,
+                              bool MatchNonExportedInThisDylib) {
+  if (SearchThisJITDylibFirst && NewSearchOrder.front().first != this)
+    NewSearchOrder.insert(NewSearchOrder.begin(),
+                          {this, MatchNonExportedInThisDylib});
 
   ES.runSessionLocked([&]() { SearchOrder = std::move(NewSearchOrder); });
 }
 
-void JITDylib::addToSearchOrder(JITDylib &JD) {
-  ES.runSessionLocked([&]() { SearchOrder.push_back(&JD); });
+void JITDylib::addToSearchOrder(JITDylib &JD, bool MatchNonExported) {
+  ES.runSessionLocked([&]() {
+    SearchOrder.push_back({&JD, MatchNonExported});
+  });
 }
 
-void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD) {
+void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
+                                    bool MatchNonExported) {
   ES.runSessionLocked([&]() {
-    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &OldJD);
+    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+                          [&](const JITDylibSearchList::value_type &KV) {
+                            return KV.first == &OldJD;
+                          });
 
     if (I != SearchOrder.end())
-      *I = &NewJD;
+      *I = {&NewJD, MatchNonExported};
   });
 }
 
 void JITDylib::removeFromSearchOrder(JITDylib &JD) {
   ES.runSessionLocked([&]() {
-    auto I = std::find(SearchOrder.begin(), SearchOrder.end(), &JD);
+    auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+                          [&](const JITDylibSearchList::value_type &KV) {
+                            return KV.first == &JD;
+                          });
     if (I != SearchOrder.end())
       SearchOrder.erase(I);
   });
@@ -1117,10 +1147,10 @@ SymbolFlagsMap JITDylib::lookupFlags(const SymbolNameSet &Names) {
   return ES.runSessionLocked([&, this]() {
     SymbolFlagsMap Result;
     auto Unresolved = lookupFlagsImpl(Result, Names);
-    if (FallbackDefinitionGenerator && !Unresolved.empty()) {
-      auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
-      if (!FallbackDefs.empty()) {
-        auto Unresolved2 = lookupFlagsImpl(Result, FallbackDefs);
+    if (DefGenerator && !Unresolved.empty()) {
+      auto NewDefs = DefGenerator(*this, Unresolved);
+      if (!NewDefs.empty()) {
+        auto Unresolved2 = lookupFlagsImpl(Result, NewDefs);
         (void)Unresolved2;
         assert(Unresolved2.empty() &&
                "All fallback defs should have been found by lookupFlagsImpl");
@@ -1150,18 +1180,18 @@ SymbolNameSet JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags,
 }
 
 void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
-                          SymbolNameSet &Unresolved,
+                          SymbolNameSet &Unresolved, bool MatchNonExported,
                           MaterializationUnitList &MUs) {
   assert(Q && "Query can not be null");
 
-  lodgeQueryImpl(Q, Unresolved, MUs);
-  if (FallbackDefinitionGenerator && !Unresolved.empty()) {
-    auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
-    if (!FallbackDefs.empty()) {
-      for (auto &D : FallbackDefs)
+  lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs);
+  if (DefGenerator && !Unresolved.empty()) {
+    auto NewDefs = DefGenerator(*this, Unresolved);
+    if (!NewDefs.empty()) {
+      for (auto &D : NewDefs)
         Unresolved.erase(D);
-      lodgeQueryImpl(Q, FallbackDefs, MUs);
-      assert(FallbackDefs.empty() &&
+      lodgeQueryImpl(Q, NewDefs, MatchNonExported, MUs);
+      assert(NewDefs.empty() &&
              "All fallback defs should have been found by lookupImpl");
     }
   }
@@ -1169,19 +1199,23 @@ void JITDylib::lodgeQuery(std::shared_ptr<AsynchronousSymbolQuery> &Q,
 
 void JITDylib::lodgeQueryImpl(
     std::shared_ptr<AsynchronousSymbolQuery> &Q, SymbolNameSet &Unresolved,
+    bool MatchNonExported,
     std::vector<std::unique_ptr<MaterializationUnit>> &MUs) {
-  for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
-    auto TmpI = I++;
-    auto Name = *TmpI;
 
+  std::vector<SymbolStringPtr> ToRemove;
+  for (auto Name : Unresolved) {
     // Search for the name in Symbols. Skip it if not found.
     auto SymI = Symbols.find(Name);
     if (SymI == Symbols.end())
       continue;
 
-    // If we found Name in JD, remove it frome the Unresolved set and add it
-    // to the added set.
-    Unresolved.erase(TmpI);
+    // If this is a non exported symbol and we're skipping those then skip it.
+    if (!SymI->second.getFlags().isExported() && !MatchNonExported)
+      continue;
+
+    // If we matched against Name in JD, mark it to be removed from the Unresolved
+    // set.
+    ToRemove.push_back(Name);
 
     // If the symbol has an address then resolve it.
     if (SymI->second.getAddress() != 0)
@@ -1226,6 +1260,10 @@ void JITDylib::lodgeQueryImpl(
     MI.PendingQueries.push_back(Q);
     Q->addQueryDependence(*this, Name);
   }
+
+  // Remove any symbols that we found.
+  for (auto &Name : ToRemove)
+    Unresolved.erase(Name);
 }
 
 SymbolNameSet JITDylib::legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
@@ -1240,15 +1278,15 @@ SymbolNameSet JITDylib::legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
   SymbolNameSet Unresolved = std::move(Names);
   ES.runSessionLocked([&, this]() {
     ActionFlags = lookupImpl(Q, MUs, Unresolved);
-    if (FallbackDefinitionGenerator && !Unresolved.empty()) {
+    if (DefGenerator && !Unresolved.empty()) {
       assert(ActionFlags == None &&
              "ActionFlags set but unresolved symbols remain?");
-      auto FallbackDefs = FallbackDefinitionGenerator(*this, Unresolved);
-      if (!FallbackDefs.empty()) {
-        for (auto &D : FallbackDefs)
+      auto NewDefs = DefGenerator(*this, Unresolved);
+      if (!NewDefs.empty()) {
+        for (auto &D : NewDefs)
           Unresolved.erase(D);
-        ActionFlags = lookupImpl(Q, MUs, FallbackDefs);
-        assert(FallbackDefs.empty() &&
+        ActionFlags = lookupImpl(Q, MUs, NewDefs);
+        assert(NewDefs.empty() &&
                "All fallback defs should have been found by lookupImpl");
       }
     }
@@ -1285,19 +1323,17 @@ JITDylib::lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
                      std::vector<std::unique_ptr<MaterializationUnit>> &MUs,
                      SymbolNameSet &Unresolved) {
   LookupImplActionFlags ActionFlags = None;
+  std::vector<SymbolStringPtr> ToRemove;
 
-  for (auto I = Unresolved.begin(), E = Unresolved.end(); I != E;) {
-    auto TmpI = I++;
-    auto Name = *TmpI;
+  for (auto Name : Unresolved) {
 
     // Search for the name in Symbols. Skip it if not found.
     auto SymI = Symbols.find(Name);
     if (SymI == Symbols.end())
       continue;
 
-    // If we found Name, remove it frome the Unresolved set and add it
-    // to the dependencies set.
-    Unresolved.erase(TmpI);
+    // If we found Name, mark it to be removed from the Unresolved set.
+    ToRemove.push_back(Name);
 
     // If the symbol has an address then resolve it.
     if (SymI->second.getAddress() != 0) {
@@ -1348,20 +1384,28 @@ JITDylib::lookupImpl(std::shared_ptr<AsynchronousSymbolQuery> &Q,
     Q->addQueryDependence(*this, Name);
   }
 
+  // Remove any marked symbols from the Unresolved set.
+  for (auto &Name : ToRemove)
+    Unresolved.erase(Name);
+
   return ActionFlags;
 }
 
 void JITDylib::dump(raw_ostream &OS) {
   ES.runSessionLocked([&, this]() {
-    OS << "JITDylib \"" << JITDylibName
-       << "\" (ES: " << format("0x%016x", reinterpret_cast<uintptr_t>(&ES))
-       << "):\n"
+    OS << "JITDylib \"" << JITDylibName << "\" (ES: "
+       << format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) << "):\n"
+       << "Search order: [";
+    for (auto &KV : SearchOrder)
+      OS << " (\"" << KV.first->getName() << "\", "
+         << (KV.second ? "all" : "exported only") << ")";
+    OS << " ]\n"
        << "Symbol table:\n";
 
     for (auto &KV : Symbols) {
       OS << "    \"" << *KV.first << "\": ";
       if (auto Addr = KV.second.getAddress())
-        OS << format("0x%016x", Addr);
+        OS << format("0x%016" PRIx64, Addr) << ", " << KV.second.getFlags();
       else
         OS << "<not resolved>";
       if (KV.second.getFlags().isLazy() ||
@@ -1375,7 +1419,7 @@ void JITDylib::dump(raw_ostream &OS) {
         }
         if (KV.second.getFlags().isMaterializing())
           OS << " Materializing";
-        OS << " )\n";
+        OS << ", " << KV.second.getFlags() << " )\n";
       } else
         OS << "\n";
     }
@@ -1402,7 +1446,7 @@ void JITDylib::dump(raw_ostream &OS) {
 
 JITDylib::JITDylib(ExecutionSession &ES, std::string Name)
     : ES(ES), JITDylibName(std::move(Name)) {
-  SearchOrder.push_back(this);
+  SearchOrder.push_back({this, true});
 }
 
 Error JITDylib::defineImpl(MaterializationUnit &MU) {
@@ -1696,17 +1740,17 @@ Expected<SymbolMap> ExecutionSession::legacyLookup(
 }
 
 void ExecutionSession::lookup(
-    const JITDylibList &JDs, SymbolNameSet Symbols,
+    const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols,
     SymbolsResolvedCallback OnResolve, SymbolsReadyCallback OnReady,
     RegisterDependenciesFunction RegisterDependencies) {
 
   // lookup can be re-entered recursively if running on a single thread. Run any
-  // outstanding MUs in case this query depends on them, otherwise the main
-  // thread will starve waiting for a result from an MU that it failed to run.
+  // outstanding MUs in case this query depends on them, otherwise this lookup
+  // will starve waiting for a result from an MU that is stuck in the queue.
   runOutstandingMUs();
 
   auto Unresolved = std::move(Symbols);
-  std::map<JITDylib *, MaterializationUnitList> MUsMap;
+  std::map<JITDylib *, MaterializationUnitList> CollectedMUsMap;
   auto Q = std::make_shared<AsynchronousSymbolQuery>(
       Unresolved, std::move(OnResolve), std::move(OnReady));
   bool QueryIsFullyResolved = false;
@@ -1714,11 +1758,14 @@ void ExecutionSession::lookup(
   bool QueryFailed = false;
 
   runSessionLocked([&]() {
-    for (auto *JD : JDs) {
-      assert(JD && "JITDylibList entries must not be null");
-      assert(!MUsMap.count(JD) &&
+    for (auto &KV : SearchOrder) {
+      assert(KV.first && "JITDylibList entries must not be null");
+      assert(!CollectedMUsMap.count(KV.first) &&
              "JITDylibList should not contain duplicate entries");
-      JD->lodgeQuery(Q, Unresolved, MUsMap[JD]);
+
+      auto &JD = *KV.first;
+      auto MatchNonExported = KV.second;
+      JD.lodgeQuery(Q, Unresolved, MatchNonExported, CollectedMUsMap[&JD]);
     }
 
     if (Unresolved.empty()) {
@@ -1741,7 +1788,7 @@ void ExecutionSession::lookup(
       Q->detach();
 
       // Replace the MUs.
-      for (auto &KV : MUsMap)
+      for (auto &KV : CollectedMUsMap)
         for (auto &MU : KV.second)
           KV.first->replace(std::move(MU));
     }
@@ -1761,7 +1808,7 @@ void ExecutionSession::lookup(
   {
     std::lock_guard<std::recursive_mutex> Lock(OutstandingMUsMutex);
 
-    for (auto &KV : MUsMap)
+    for (auto &KV : CollectedMUsMap)
       for (auto &MU : KV.second)
         OutstandingMUs.push_back(std::make_pair(KV.first, std::move(MU)));
   }
@@ -1769,10 +1816,9 @@ void ExecutionSession::lookup(
   runOutstandingMUs();
 }
 
-Expected<SymbolMap>
-ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
-                         RegisterDependenciesFunction RegisterDependencies,
-                         bool WaitUntilReady) {
+Expected<SymbolMap> ExecutionSession::lookup(
+    const JITDylibSearchList &SearchOrder, const SymbolNameSet &Symbols,
+    RegisterDependenciesFunction RegisterDependencies, bool WaitUntilReady) {
 #if LLVM_ENABLE_THREADS
   // In the threaded case we use promises to return the results.
   std::promise<SymbolMap> PromisedResult;
@@ -1839,7 +1885,7 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 
   // Perform the asynchronous lookup.
-  lookup(JDs, Symbols, OnResolve, OnReady, RegisterDependencies);
+  lookup(SearchOrder, Symbols, OnResolve, OnReady, RegisterDependencies);
 
 #if LLVM_ENABLE_THREADS
   auto ResultFuture = PromisedResult.get_future();
@@ -1882,6 +1928,37 @@ ExecutionSession::lookup(const JITDylibList &JDs, const SymbolNameSet &Symbols,
 #endif
 }
 
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(const JITDylibSearchList &SearchOrder,
+                         SymbolStringPtr Name) {
+  SymbolNameSet Names({Name});
+
+  if (auto ResultMap = lookup(SearchOrder, std::move(Names),
+                              NoDependenciesToRegister, true)) {
+    assert(ResultMap->size() == 1 && "Unexpected number of results");
+    assert(ResultMap->count(Name) && "Missing result for symbol");
+    return std::move(ResultMap->begin()->second);
+  } else
+    return ResultMap.takeError();
+}
+
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder,
+                         SymbolStringPtr Name) {
+  SymbolNameSet Names({Name});
+
+  JITDylibSearchList FullSearchOrder(SearchOrder.size());
+  for (auto *JD : SearchOrder)
+    FullSearchOrder.push_back({JD, false});
+
+  return lookup(FullSearchOrder, Name);
+}
+
+Expected<JITEvaluatedSymbol>
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, StringRef Name) {
+  return lookup(SearchOrder, intern(Name));
+}
+
 void ExecutionSession::dump(raw_ostream &OS) {
   runSessionLocked([this, &OS]() {
     for (auto &JD : JDs)
@@ -1910,28 +1987,6 @@ void ExecutionSession::runOutstandingMUs() {
   }
 }
 
-Expected<SymbolMap> lookup(const JITDylibList &JDs, SymbolNameSet Names) {
-
-  if (JDs.empty())
-    return SymbolMap();
-
-  auto &ES = (*JDs.begin())->getExecutionSession();
-
-  return ES.lookup(JDs, Names, NoDependenciesToRegister, true);
-}
-
-/// Look up a symbol by searching a list of JDs.
-Expected<JITEvaluatedSymbol> lookup(const JITDylibList &JDs,
-                                    SymbolStringPtr Name) {
-  SymbolNameSet Names({Name});
-  if (auto ResultMap = lookup(JDs, std::move(Names))) {
-    assert(ResultMap->size() == 1 && "Unexpected number of results");
-    assert(ResultMap->count(Name) && "Missing result for symbol");
-    return std::move(ResultMap->begin()->second);
-  } else
-    return ResultMap.takeError();
-}
-
 MangleAndInterner::MangleAndInterner(ExecutionSession &ES, const DataLayout &DL)
     : ES(ES), DL(DL) {}
 
diff --git a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 47cb273ee12dd18d19536ebc922061e57bc6ab79..7c3c50b4d6e534549a1b4b3a274fe63c1a42613e 100644
--- a/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -87,8 +87,8 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M) {
                     CtorDtorIterator(DtorsList, true));
 }
 
-void CtorDtorRunner2::add(iterator_range<CtorDtorIterator> CtorDtors) {
-  if (CtorDtors.begin() == CtorDtors.end())
+void CtorDtorRunner::add(iterator_range<CtorDtorIterator> CtorDtors) {
+  if (empty(CtorDtors))
     return;
 
   MangleAndInterner Mangle(
@@ -115,7 +115,7 @@ void CtorDtorRunner2::add(iterator_range<CtorDtorIterator> CtorDtors) {
   }
 }
 
-Error CtorDtorRunner2::run() {
+Error CtorDtorRunner::run() {
   using CtorDtorTy = void (*)();
 
   SymbolNameSet Names;
@@ -128,7 +128,10 @@ Error CtorDtorRunner2::run() {
     }
   }
 
-  if (auto CtorDtorMap = lookup({&JD}, std::move(Names))) {
+  auto &ES = JD.getExecutionSession();
+  if (auto CtorDtorMap =
+          ES.lookup(JITDylibSearchList({{&JD, true}}), std::move(Names),
+                    NoDependenciesToRegister, true)) {
     for (auto &KV : CtorDtorsByPriority) {
       for (auto &Name : KV.second) {
         assert(CtorDtorMap->count(Name) && "No entry for Name");
@@ -162,34 +165,35 @@ int LocalCXXRuntimeOverridesBase::CXAAtExitOverride(DestructorPtr Destructor,
   return 0;
 }
 
-Error LocalCXXRuntimeOverrides2::enable(JITDylib &JD,
+Error LocalCXXRuntimeOverrides::enable(JITDylib &JD,
                                         MangleAndInterner &Mangle) {
-  SymbolMap RuntimeInterposes(
-      {{Mangle("__dso_handle"),
-        JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride),
-                           JITSymbolFlags::Exported)},
-       {Mangle("__cxa_atexit"),
-        JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride),
-                           JITSymbolFlags::Exported)}});
+  SymbolMap RuntimeInterposes;
+  RuntimeInterposes[Mangle("__dso_handle")] =
+    JITEvaluatedSymbol(toTargetAddress(&DSOHandleOverride),
+                       JITSymbolFlags::Exported);
+  RuntimeInterposes[Mangle("__cxa_atexit")] =
+    JITEvaluatedSymbol(toTargetAddress(&CXAAtExitOverride),
+                       JITSymbolFlags::Exported);
 
   return JD.define(absoluteSymbols(std::move(RuntimeInterposes)));
 }
 
-DynamicLibraryFallbackGenerator::DynamicLibraryFallbackGenerator(
+DynamicLibrarySearchGenerator::DynamicLibrarySearchGenerator(
     sys::DynamicLibrary Dylib, const DataLayout &DL, SymbolPredicate Allow)
     : Dylib(std::move(Dylib)), Allow(std::move(Allow)),
       GlobalPrefix(DL.getGlobalPrefix()) {}
 
-Expected<DynamicLibraryFallbackGenerator> DynamicLibraryFallbackGenerator::Load(
-    const char *FileName, const DataLayout &DL, SymbolPredicate Allow) {
+Expected<DynamicLibrarySearchGenerator>
+DynamicLibrarySearchGenerator::Load(const char *FileName, const DataLayout &DL,
+                                    SymbolPredicate Allow) {
   std::string ErrMsg;
   auto Lib = sys::DynamicLibrary::getPermanentLibrary(FileName, &ErrMsg);
   if (!Lib.isValid())
     return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
-  return DynamicLibraryFallbackGenerator(std::move(Lib), DL, std::move(Allow));
+  return DynamicLibrarySearchGenerator(std::move(Lib), DL, std::move(Allow));
 }
 
-SymbolNameSet DynamicLibraryFallbackGenerator::
+SymbolNameSet DynamicLibrarySearchGenerator::
 operator()(JITDylib &JD, const SymbolNameSet &Names) {
   orc::SymbolNameSet Added;
   orc::SymbolMap NewSymbols;
@@ -197,7 +201,10 @@ operator()(JITDylib &JD, const SymbolNameSet &Names) {
   bool HasGlobalPrefix = (GlobalPrefix != '\0');
 
   for (auto &Name : Names) {
-    if (!Allow(Name) || (*Name).empty())
+    if ((*Name).empty())
+      continue;
+
+    if (Allow && !Allow(Name))
       continue;
 
     if (HasGlobalPrefix && (*Name).front() != GlobalPrefix)
@@ -212,8 +219,8 @@ operator()(JITDylib &JD, const SymbolNameSet &Names) {
     }
   }
 
-  // Add any new symbols to JD. Since the fallback generator is only called for
-  // symbols that are not already defined, this will never trigger a duplicate
+  // Add any new symbols to JD. Since the generator is only called for symbols
+  // that are not already defined, this will never trigger a duplicate
   // definition error, so we can wrap this call in a 'cantFail'.
   if (!NewSymbols.empty())
     cantFail(JD.define(absoluteSymbols(std::move(NewSymbols))));
diff --git a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
index 5dee1c80e0b36e2bfd5862bb5a67ad8192f20d2a..d952d1be70dab1e587b93b52a82d5602e1b337ca 100644
--- a/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
+++ b/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
@@ -12,28 +12,28 @@
 namespace llvm {
 namespace orc {
 
-IRCompileLayer2::IRCompileLayer2(ExecutionSession &ES, ObjectLayer &BaseLayer,
+IRCompileLayer::IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
                                  CompileFunction Compile)
     : IRLayer(ES), BaseLayer(BaseLayer), Compile(std::move(Compile)) {}
 
-void IRCompileLayer2::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
+void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
   std::lock_guard<std::mutex> Lock(IRLayerMutex);
   this->NotifyCompiled = std::move(NotifyCompiled);
 }
 
-void IRCompileLayer2::emit(MaterializationResponsibility R, VModuleKey K,
-                           ThreadSafeModule TSM) {
+void IRCompileLayer::emit(MaterializationResponsibility R,
+                          ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Module must not be null");
 
   if (auto Obj = Compile(*TSM.getModule())) {
     {
       std::lock_guard<std::mutex> Lock(IRLayerMutex);
       if (NotifyCompiled)
-        NotifyCompiled(K, std::move(TSM));
+        NotifyCompiled(R.getVModuleKey(), std::move(TSM));
       else
         TSM = ThreadSafeModule();
     }
-    BaseLayer.emit(std::move(R), std::move(K), std::move(*Obj));
+    BaseLayer.emit(std::move(R), std::move(*Obj));
   } else {
     R.failMaterialization();
     getExecutionSession().reportError(Obj.takeError());
diff --git a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
index 7a79a382d8d8397a98b08f9607edafb5973050b0..7bc0d696e3acaba058381ef5fcb9b83ed500ea8f 100644
--- a/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
+++ b/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
@@ -13,17 +13,17 @@
 namespace llvm {
 namespace orc {
 
-IRTransformLayer2::IRTransformLayer2(ExecutionSession &ES,
+IRTransformLayer::IRTransformLayer(ExecutionSession &ES,
                                      IRLayer &BaseLayer,
                                      TransformFunction Transform)
     : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-void IRTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K,
-                             ThreadSafeModule TSM) {
+void IRTransformLayer::emit(MaterializationResponsibility R,
+                            ThreadSafeModule TSM) {
   assert(TSM.getModule() && "Module must not be null");
 
   if (auto TransformedTSM = Transform(std::move(TSM), R))
-    BaseLayer.emit(std::move(R), std::move(K), std::move(*TransformedTSM));
+    BaseLayer.emit(std::move(R), std::move(*TransformedTSM));
   else {
     R.failMaterialization();
     getExecutionSession().reportError(TransformedTSM.takeError());
diff --git a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index d7fd57b6e53783cbc9f02db6ad74cbf4fd4236e2..82000ec5b32bc4e708217a601c6ed472ee81126f 100644
--- a/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -27,8 +27,9 @@ public:
   using CompileFunction = JITCompileCallbackManager::CompileFunction;
 
   CompileCallbackMaterializationUnit(SymbolStringPtr Name,
-                                     CompileFunction Compile)
-      : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}})),
+                                     CompileFunction Compile, VModuleKey K)
+      : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}),
+                            std::move(K)),
         Name(std::move(Name)), Compile(std::move(Compile)) {}
 
   StringRef getName() const override { return "<Compile Callbacks>"; }
@@ -67,7 +68,8 @@ JITCompileCallbackManager::getCompileCallback(CompileFunction Compile) {
     AddrToSymbol[*TrampolineAddr] = CallbackName;
     cantFail(CallbacksJD.define(
         llvm::make_unique<CompileCallbackMaterializationUnit>(
-            std::move(CallbackName), std::move(Compile))));
+            std::move(CallbackName), std::move(Compile),
+            ES.allocateVModule())));
     return *TrampolineAddr;
   } else
     return TrampolineAddr.takeError();
@@ -90,7 +92,7 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       {
         raw_string_ostream ErrMsgStream(ErrMsg);
         ErrMsgStream << "No compile callback for trampoline at "
-                     << format("0x%016x", TrampolineAddr);
+                     << format("0x%016" PRIx64, TrampolineAddr);
       }
       ES.reportError(
           make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode()));
@@ -99,9 +101,10 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback(
       Name = I->second;
   }
 
-  if (auto Sym = lookup({&CallbacksJD}, Name))
+  if (auto Sym = ES.lookup(JITDylibSearchList({{&CallbacksJD, true}}), Name))
     return Sym->getAddress();
   else {
+    llvm::dbgs() << "Didn't find callback.\n";
     // If anything goes wrong materializing Sym then report it to the session
     // and return the ErrorHandlerAddress;
     ES.reportError(Sym.takeError());
diff --git a/lib/ExecutionEngine/Orc/LLJIT.cpp b/lib/ExecutionEngine/Orc/LLJIT.cpp
index 47baa45a8aa9311263dccf9e3450a3e3d9cc3974..e2089f9106bd67ecd286e299b1e3b96027d4a598 100644
--- a/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -21,7 +21,7 @@ namespace {
       : llvm::orc::SimpleCompiler(*TM), TM(std::move(TM)) {}
   private:
     // FIXME: shared because std::functions (and thus
-    // IRCompileLayer2::CompileFunction) are not moveable.
+    // IRCompileLayer::CompileFunction) are not moveable.
     std::shared_ptr<llvm::TargetMachine> TM;
   };
 
@@ -65,27 +65,26 @@ Error LLJIT::addIRModule(JITDylib &JD, ThreadSafeModule TSM) {
   if (auto Err = applyDataLayout(*TSM.getModule()))
     return Err;
 
-  auto K = ES->allocateVModule();
-  return CompileLayer.add(JD, K, std::move(TSM));
+  return CompileLayer.add(JD, std::move(TSM), ES->allocateVModule());
 }
 
 Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
   assert(Obj && "Can not add null object");
 
-  auto K = ES->allocateVModule();
-  return ObjLinkingLayer.add(JD, K, std::move(Obj));
+  return ObjLinkingLayer.add(JD, std::move(Obj), ES->allocateVModule());
 }
 
 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
                                                         StringRef Name) {
-  return llvm::orc::lookup({&JD}, ES->intern(Name));
+  return ES->lookup(JITDylibSearchList({{&JD, true}}), ES->intern(Name));
 }
 
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
              std::unique_ptr<TargetMachine> TM, DataLayout DL)
     : ES(std::move(ES)), Main(this->ES->getMainJITDylib()), DL(std::move(DL)),
-      ObjLinkingLayer(*this->ES,
-                      [this](VModuleKey K) { return getMemoryManager(K); }),
+      ObjLinkingLayer(
+          *this->ES,
+          []() { return llvm::make_unique<SectionMemoryManager>(); }),
       CompileLayer(*this->ES, ObjLinkingLayer,
                    TMOwningSimpleCompiler(std::move(TM))),
       CtorRunner(Main), DtorRunner(Main) {}
@@ -93,10 +92,11 @@ LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES,
 LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
              DataLayout DL, unsigned NumCompileThreads)
     : ES(std::move(ES)), Main(this->ES->getMainJITDylib()), DL(std::move(DL)),
-      ObjLinkingLayer(*this->ES,
-                      [this](VModuleKey K) { return getMemoryManager(K); }),
+      ObjLinkingLayer(
+          *this->ES,
+          []() { return llvm::make_unique<SectionMemoryManager>(); }),
       CompileLayer(*this->ES, ObjLinkingLayer,
-                   MultiThreadedSimpleCompiler(std::move(JTMB))),
+                   ConcurrentIRCompiler(std::move(JTMB))),
       CtorRunner(Main), DtorRunner(Main) {
   assert(NumCompileThreads != 0 &&
          "Multithreaded LLJIT instance can not be created with 0 threads");
@@ -117,11 +117,6 @@ LLJIT::LLJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
       });
 }
 
-std::unique_ptr<RuntimeDyld::MemoryManager>
-LLJIT::getMemoryManager(VModuleKey K) {
-  return llvm::make_unique<SectionMemoryManager>();
-}
-
 std::string LLJIT::mangle(StringRef UnmangledName) {
   std::string MangledName;
   {
@@ -149,13 +144,13 @@ void LLJIT::recordCtorDtors(Module &M) {
 }
 
 Expected<std::unique_ptr<LLLazyJIT>>
-  LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
-                    unsigned NumCompileThreads) {
+LLLazyJIT::Create(JITTargetMachineBuilder JTMB, DataLayout DL,
+                  JITTargetAddress ErrorAddr, unsigned NumCompileThreads) {
   auto ES = llvm::make_unique<ExecutionSession>();
 
   const Triple &TT = JTMB.getTargetTriple();
 
-  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, 0);
+  auto LCTMgr = createLocalLazyCallThroughManager(TT, *ES, ErrorAddr);
   if (!LCTMgr)
     return LCTMgr.takeError();
 
@@ -187,8 +182,7 @@ Error LLLazyJIT::addLazyIRModule(JITDylib &JD, ThreadSafeModule TSM) {
 
   recordCtorDtors(*TSM.getModule());
 
-  auto K = ES->allocateVModule();
-  return CODLayer.add(JD, K, std::move(TSM));
+  return CODLayer.add(JD, std::move(TSM), ES->allocateVModule());
 }
 
 LLLazyJIT::LLLazyJIT(
diff --git a/lib/ExecutionEngine/Orc/Layer.cpp b/lib/ExecutionEngine/Orc/Layer.cpp
index 22dbf5c26d14ce3d53c9c8de3dbffec615b6da07..11af76825e9fd09d2daef003f96871cfe32391ee 100644
--- a/lib/ExecutionEngine/Orc/Layer.cpp
+++ b/lib/ExecutionEngine/Orc/Layer.cpp
@@ -19,14 +19,14 @@ namespace orc {
 IRLayer::IRLayer(ExecutionSession &ES) : ES(ES) {}
 IRLayer::~IRLayer() {}
 
-Error IRLayer::add(JITDylib &JD, VModuleKey K, ThreadSafeModule TSM) {
+Error IRLayer::add(JITDylib &JD, ThreadSafeModule TSM, VModuleKey K) {
   return JD.define(llvm::make_unique<BasicIRLayerMaterializationUnit>(
       *this, std::move(K), std::move(TSM)));
 }
 
 IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
-                                             ThreadSafeModule TSM)
-    : MaterializationUnit(SymbolFlagsMap()), TSM(std::move(TSM)) {
+                                             ThreadSafeModule TSM, VModuleKey K)
+    : MaterializationUnit(SymbolFlagsMap(), std::move(K)), TSM(std::move(TSM)) {
 
   assert(this->TSM && "Module must not be null");
 
@@ -42,10 +42,10 @@ IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
 }
 
 IRMaterializationUnit::IRMaterializationUnit(
-    ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
+    ThreadSafeModule TSM, VModuleKey K, SymbolFlagsMap SymbolFlags,
     SymbolNameToDefinitionMap SymbolToDefinition)
-    : MaterializationUnit(std::move(SymbolFlags)), TSM(std::move(TSM)),
-      SymbolToDefinition(std::move(SymbolToDefinition)) {}
+    : MaterializationUnit(std::move(SymbolFlags), std::move(K)),
+      TSM(std::move(TSM)), SymbolToDefinition(std::move(SymbolToDefinition)) {}
 
 StringRef IRMaterializationUnit::getName() const {
   if (TSM.getModule())
@@ -71,8 +71,9 @@ void IRMaterializationUnit::discard(const JITDylib &JD,
 
 BasicIRLayerMaterializationUnit::BasicIRLayerMaterializationUnit(
     IRLayer &L, VModuleKey K, ThreadSafeModule TSM)
-    : IRMaterializationUnit(L.getExecutionSession(), std::move(TSM)), L(L),
-      K(std::move(K)) {}
+    : IRMaterializationUnit(L.getExecutionSession(), std::move(TSM),
+                            std::move(K)),
+      L(L), K(std::move(K)) {}
 
 void BasicIRLayerMaterializationUnit::materialize(
     MaterializationResponsibility R) {
@@ -94,7 +95,7 @@ void BasicIRLayerMaterializationUnit::materialize(
     dbgs() << "Emitting, for " << R.getTargetJITDylib().getName() << ", "
            << *this << "\n";
   }););
-  L.emit(std::move(R), std::move(K), std::move(TSM));
+  L.emit(std::move(R), std::move(TSM));
   LLVM_DEBUG(ES.runSessionLocked([&]() {
     dbgs() << "Finished emitting, for " << R.getTargetJITDylib().getName()
            << ", " << *this << "\n";
@@ -105,8 +106,8 @@ ObjectLayer::ObjectLayer(ExecutionSession &ES) : ES(ES) {}
 
 ObjectLayer::~ObjectLayer() {}
 
-Error ObjectLayer::add(JITDylib &JD, VModuleKey K,
-                       std::unique_ptr<MemoryBuffer> O) {
+Error ObjectLayer::add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O,
+                       VModuleKey K) {
   auto ObjMU = BasicObjectLayerMaterializationUnit::Create(*this, std::move(K),
                                                            std::move(O));
   if (!ObjMU)
@@ -131,7 +132,7 @@ BasicObjectLayerMaterializationUnit::Create(ObjectLayer &L, VModuleKey K,
 BasicObjectLayerMaterializationUnit::BasicObjectLayerMaterializationUnit(
     ObjectLayer &L, VModuleKey K, std::unique_ptr<MemoryBuffer> O,
     SymbolFlagsMap SymbolFlags)
-    : MaterializationUnit(std::move(SymbolFlags)), L(L), K(std::move(K)),
+    : MaterializationUnit(std::move(SymbolFlags), std::move(K)), L(L),
       O(std::move(O)) {}
 
 StringRef BasicObjectLayerMaterializationUnit::getName() const {
@@ -142,7 +143,7 @@ StringRef BasicObjectLayerMaterializationUnit::getName() const {
 
 void BasicObjectLayerMaterializationUnit::materialize(
     MaterializationResponsibility R) {
-  L.emit(std::move(R), std::move(K), std::move(O));
+  L.emit(std::move(R), std::move(O));
 }
 
 void BasicObjectLayerMaterializationUnit::discard(const JITDylib &JD,
diff --git a/lib/ExecutionEngine/Orc/LazyReexports.cpp b/lib/ExecutionEngine/Orc/LazyReexports.cpp
index 0d8049178b545fcc0fba2ad3ce8832cb05298436..55f4a7c5afcec2f8ed2d76e2e301343f9561ad71 100644
--- a/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -52,8 +52,8 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) {
     SymbolName = I->second.second;
   }
 
-  auto LookupResult =
-      ES.lookup({SourceJD}, {SymbolName}, NoDependenciesToRegister);
+  auto LookupResult = ES.lookup(JITDylibSearchList({{SourceJD, true}}),
+                                {SymbolName}, NoDependenciesToRegister, true);
 
   if (!LookupResult) {
     ES.reportError(LookupResult.takeError());
@@ -125,8 +125,8 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES,
 
 LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit(
     LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager,
-    JITDylib &SourceJD, SymbolAliasMap CallableAliases)
-    : MaterializationUnit(extractFlags(CallableAliases)),
+    JITDylib &SourceJD, SymbolAliasMap CallableAliases, VModuleKey K)
+    : MaterializationUnit(extractFlags(CallableAliases), std::move(K)),
       LCTManager(LCTManager), ISManager(ISManager), SourceJD(SourceJD),
       CallableAliases(std::move(CallableAliases)),
       NotifyResolved(LazyCallThroughManager::createNotifyResolvedFunction(
diff --git a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
index 6980c8140fd03d8489f5cd1e82221b79466f54f5..825f532047360a948affd515573bc804a224d769 100644
--- a/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
+++ b/lib/ExecutionEngine/Orc/ObjectTransformLayer.cpp
@@ -13,17 +13,17 @@
 namespace llvm {
 namespace orc {
 
-ObjectTransformLayer2::ObjectTransformLayer2(ExecutionSession &ES,
-                                             ObjectLayer &BaseLayer,
-                                             TransformFunction Transform)
+ObjectTransformLayer::ObjectTransformLayer(ExecutionSession &ES,
+                                            ObjectLayer &BaseLayer,
+                                            TransformFunction Transform)
     : ObjectLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
 
-void ObjectTransformLayer2::emit(MaterializationResponsibility R, VModuleKey K,
-                                 std::unique_ptr<MemoryBuffer> O) {
+void ObjectTransformLayer::emit(MaterializationResponsibility R,
+                                std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Module must not be null");
 
   if (auto TransformedObj = Transform(std::move(O)))
-    BaseLayer.emit(std::move(R), std::move(K), std::move(*TransformedObj));
+    BaseLayer.emit(std::move(R), std::move(*TransformedObj));
   else {
     R.failMaterialization();
     getExecutionSession().reportError(TransformedObj.takeError());
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index 3fedba1caa6a581f4f2c2cbea7322b7c8710c166..deddfcb10e12e4d05ee4873615de0f08fa29e160 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -77,9 +77,9 @@ public:
   };
 
   template <>
-  class GenericLayerImpl<orc::RTDyldObjectLinkingLayer> : public GenericLayer {
+  class GenericLayerImpl<orc::LegacyRTDyldObjectLinkingLayer> : public GenericLayer {
   private:
-    using LayerT = orc::RTDyldObjectLinkingLayer;
+    using LayerT = orc::LegacyRTDyldObjectLinkingLayer;
   public:
     GenericLayerImpl(LayerT &Layer) : Layer(Layer) {}
 
@@ -107,10 +107,10 @@ class OrcCBindingsStack {
 public:
 
   using CompileCallbackMgr = orc::JITCompileCallbackManager;
-  using ObjLayerT = orc::RTDyldObjectLinkingLayer;
-  using CompileLayerT = orc::IRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
+  using ObjLayerT = orc::LegacyRTDyldObjectLinkingLayer;
+  using CompileLayerT = orc::LegacyIRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
   using CODLayerT =
-        orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>;
+        orc::LegacyCompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>;
 
   using CallbackManagerBuilder =
       std::function<std::unique_ptr<CompileCallbackMgr>()>;
@@ -312,7 +312,7 @@ public:
 
     // Run the static constructors, and save the static destructor runner for
     // execution when the JIT is torn down.
-    orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), K);
+    orc::LegacyCtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), K);
     if (auto Err = CtorRunner.runViaLayer(*this))
       return std::move(Err);
 
@@ -517,8 +517,8 @@ private:
 
   std::map<orc::VModuleKey, std::unique_ptr<detail::GenericLayer>> KeyLayers;
 
-  orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
-  std::vector<orc::CtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners;
+  orc::LegacyLocalCXXRuntimeOverrides CXXRuntimeOverrides;
+  std::vector<orc::LegacyCtorDtorRunner<OrcCBindingsStack>> IRStaticDestructorRunners;
   std::string ErrMsg;
 
   ResolverMap Resolvers;
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
index 4def579e70974683da2c03f56ed96ee6cc928f45..617bc2fc64b500a1a84532a5eb780f1429ae0104 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
@@ -128,7 +128,7 @@ void OrcMCJITReplacement::runStaticConstructorsDestructors(bool isDtors) {
   auto &CtorDtorsMap = isDtors ? UnexecutedDestructors : UnexecutedConstructors;
 
   for (auto &KV : CtorDtorsMap)
-    cantFail(CtorDtorRunner<LazyEmitLayerT>(std::move(KV.second), KV.first)
+    cantFail(LegacyCtorDtorRunner<LazyEmitLayerT>(std::move(KV.second), KV.first)
                  .runViaLayer(LazyEmitLayer));
 
   CtorDtorsMap.clear();
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index 1195d39561d07af467ec1e12bc6a5dc158fd31b7..36e7e83a8babfbb0774be78019c6f5147d504475 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -461,8 +461,8 @@ private:
     return MangledName;
   }
 
-  using ObjectLayerT = RTDyldObjectLinkingLayer;
-  using CompileLayerT = IRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
+  using ObjectLayerT = LegacyRTDyldObjectLinkingLayer;
+  using CompileLayerT = LegacyIRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
   using LazyEmitLayerT = LazyEmittingLayer<CompileLayerT>;
 
   ExecutionSession ES;
diff --git a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index a2c4a2f2081a4bc0cd91bb30192b7da510169488..299d76183cd41a2542055742e441e4edfd767f4c 100644
--- a/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -50,10 +50,11 @@ public:
       MR.addDependenciesForAll(Deps);
     };
 
-    MR.getTargetJITDylib().withSearchOrderDo([&](const JITDylibList &JDs) {
-      ES.lookup(JDs, InternedSymbols, OnResolvedWithUnwrap, OnReady,
-                RegisterDependencies);
-    });
+    JITDylibSearchList SearchOrder;
+    MR.getTargetJITDylib().withSearchOrderDo(
+        [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; });
+    ES.lookup(SearchOrder, InternedSymbols, OnResolvedWithUnwrap, OnReady,
+              RegisterDependencies);
   }
 
   Expected<LookupSet> getResponsibilitySet(const LookupSet &Symbols) {
@@ -76,16 +77,15 @@ private:
 namespace llvm {
 namespace orc {
 
-RTDyldObjectLinkingLayer2::RTDyldObjectLinkingLayer2(
+RTDyldObjectLinkingLayer::RTDyldObjectLinkingLayer(
     ExecutionSession &ES, GetMemoryManagerFunction GetMemoryManager,
     NotifyLoadedFunction NotifyLoaded, NotifyEmittedFunction NotifyEmitted)
     : ObjectLayer(ES), GetMemoryManager(GetMemoryManager),
       NotifyLoaded(std::move(NotifyLoaded)),
       NotifyEmitted(std::move(NotifyEmitted)) {}
 
-void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
-                                     VModuleKey K,
-                                     std::unique_ptr<MemoryBuffer> O) {
+void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
+                                    std::unique_ptr<MemoryBuffer> O) {
   assert(O && "Object must not be null");
 
   // This method launches an asynchronous link step that will fulfill our
@@ -121,14 +121,15 @@ void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
     }
   }
 
-  auto MemoryManager = GetMemoryManager(K);
-  auto &MemMgr = *MemoryManager;
+  auto K = R.getVModuleKey();
+  RuntimeDyld::MemoryManager *MemMgr = nullptr;
+
+  // Create a record a memory manager for this object.
   {
+    auto Tmp = GetMemoryManager();
     std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
-
-    assert(!MemMgrs.count(K) &&
-           "A memory manager already exists for this key?");
-    MemMgrs[K] = std::move(MemoryManager);
+    MemMgrs.push_back(std::move(Tmp));
+    MemMgr = MemMgrs.back().get();
   }
 
   JITDylibSearchOrderResolver Resolver(*SharedR);
@@ -141,7 +142,7 @@ void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
    * duplicate defs.
    */
   jitLinkForORC(
-      **Obj, std::move(O), MemMgr, Resolver, ProcessAllSections,
+      **Obj, std::move(O), *MemMgr, Resolver, ProcessAllSections,
       [this, K, SharedR, &Obj, InternalSymbols](
           std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo,
           std::map<StringRef, JITEvaluatedSymbol> ResolvedSymbols) {
@@ -153,7 +154,7 @@ void RTDyldObjectLinkingLayer2::emit(MaterializationResponsibility R,
       });
 }
 
-Error RTDyldObjectLinkingLayer2::onObjLoad(
+Error RTDyldObjectLinkingLayer::onObjLoad(
     VModuleKey K, MaterializationResponsibility &R, object::ObjectFile &Obj,
     std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo,
     std::map<StringRef, JITEvaluatedSymbol> Resolved,
@@ -196,7 +197,7 @@ Error RTDyldObjectLinkingLayer2::onObjLoad(
   return Error::success();
 }
 
-void RTDyldObjectLinkingLayer2::onObjEmit(VModuleKey K,
+void RTDyldObjectLinkingLayer::onObjEmit(VModuleKey K,
                                           MaterializationResponsibility &R,
                                           Error Err) {
   if (Err) {
diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 2d6e5c4aea675bdf8cbd94ed0c20bab10a137677..39bdc4b69217adced55de954fda2ff46d2fb8eaa 100644
--- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -37,7 +37,13 @@ private:
     if (!ImageBase) {
       ImageBase = std::numeric_limits<uint64_t>::max();
       for (const SectionEntry &Section : Sections)
-        ImageBase = std::min(ImageBase, Section.getLoadAddress());
+        // The Sections list may contain sections that weren't loaded for
+        // whatever reason: they may be debug sections, and ProcessAllSections
+        // is false, or they may be sections that contain 0 bytes. If the
+        // section isn't loaded, the load address will be 0, and it should not
+        // be included in the ImageBase calculation.
+        if (Section.getLoadAddress() != 0)
+          ImageBase = std::min(ImageBase, Section.getLoadAddress());
     }
     return ImageBase;
   }
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index eb2311da63bafc8c3e8a0ccd04b2317c843bd3af..3b575739263dc1c716c7f41f73356a594429f715 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -2871,6 +2871,7 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
     Out << ", readOnly: " << FFlags.ReadOnly;
     Out << ", noRecurse: " << FFlags.NoRecurse;
     Out << ", returnDoesNotAlias: " << FFlags.ReturnDoesNotAlias;
+    Out << ", noInline: " << FFlags.NoInline;
     Out << ")";
   }
   if (!FS->calls().empty()) {
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index d04af9261e3c205a796d524eefe3dd1847b77462..12ab2e2ace4da07ddbebe5948a37c3c09ba66889 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -135,9 +135,10 @@ const Module *BasicBlock::getModule() const {
   return getParent()->getParent();
 }
 
-const TerminatorInst *BasicBlock::getTerminator() const {
-  if (InstList.empty()) return nullptr;
-  return dyn_cast<TerminatorInst>(&InstList.back());
+const Instruction *BasicBlock::getTerminator() const {
+  if (InstList.empty() || !InstList.back().isTerminator())
+    return nullptr;
+  return &InstList.back();
 }
 
 const CallInst *BasicBlock::getTerminatingMustTailCall() const {
@@ -437,7 +438,7 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) {
 }
 
 void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) {
-  TerminatorInst *TI = getTerminator();
+  Instruction *TI = getTerminator();
   if (!TI)
     // Cope with being called on a BasicBlock that doesn't have a terminator
     // yet. Clang's CodeGenFunction::EmitReturnBlock() likes to do this.
@@ -468,7 +469,7 @@ const LandingPadInst *BasicBlock::getLandingPadInst() const {
 }
 
 Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
-  const TerminatorInst *TI = getTerminator();
+  const Instruction *TI = getTerminator();
   if (MDNode *MDIrrLoopHeader =
       TI->getMetadata(LLVMContext::MD_irr_loop)) {
     MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 410a426a4a2dced4059090c2d1fa70c2fd9c84d2..a3065733c81b280fcce5cf2b0e978afa43c839ed 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/DiagnosticPrinter.h"
@@ -1189,6 +1190,78 @@ void LLVMAddNamedMetadataOperand(LLVMModuleRef M, const char *Name,
   N->addOperand(extractMDNode(unwrap<MetadataAsValue>(Val)));
 }
 
+const char *LLVMGetDebugLocDirectory(LLVMValueRef Val, unsigned *Length) {
+  if (!Length) return nullptr;
+  StringRef S;
+  if (const auto *I = unwrap<Instruction>(Val)) {
+    S = I->getDebugLoc()->getDirectory();
+  } else if (const auto *GV = unwrap<GlobalVariable>(Val)) {
+    SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+    GV->getDebugInfo(GVEs);
+    if (GVEs.size())
+      if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
+        S = DGV->getDirectory();
+  } else if (const auto *F = unwrap<Function>(Val)) {
+    if (const DISubprogram *DSP = F->getSubprogram())
+      S = DSP->getDirectory();
+  } else {
+    assert(0 && "Expected Instruction, GlobalVariable or Function");
+    return nullptr;
+  }
+  *Length = S.size();
+  return S.data();
+}
+
+const char *LLVMGetDebugLocFilename(LLVMValueRef Val, unsigned *Length) {
+  if (!Length) return nullptr;
+  StringRef S;
+  if (const auto *I = unwrap<Instruction>(Val)) {
+    S = I->getDebugLoc()->getFilename();
+  } else if (const auto *GV = unwrap<GlobalVariable>(Val)) {
+    SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+    GV->getDebugInfo(GVEs);
+    if (GVEs.size())
+      if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
+        S = DGV->getFilename();
+  } else if (const auto *F = unwrap<Function>(Val)) {
+    if (const DISubprogram *DSP = F->getSubprogram())
+      S = DSP->getFilename();
+  } else {
+    assert(0 && "Expected Instruction, GlobalVariable or Function");
+    return nullptr;
+  }
+  *Length = S.size();
+  return S.data();
+}
+
+unsigned LLVMGetDebugLocLine(LLVMValueRef Val) {
+  unsigned L = 0;
+  if (const auto *I = unwrap<Instruction>(Val)) {
+    L = I->getDebugLoc()->getLine();
+  } else if (const auto *GV = unwrap<GlobalVariable>(Val)) {
+    SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+    GV->getDebugInfo(GVEs);
+    if (GVEs.size())
+      if (const DIGlobalVariable *DGV = GVEs[0]->getVariable())
+        L = DGV->getLine();
+  } else if (const auto *F = unwrap<Function>(Val)) {
+    if (const DISubprogram *DSP = F->getSubprogram())
+      L = DSP->getLine();
+  } else {
+    assert(0 && "Expected Instruction, GlobalVariable or Function");
+    return -1;
+  }
+  return L;
+}
+
+unsigned LLVMGetDebugLocColumn(LLVMValueRef Val) {
+  unsigned C = 0;
+  if (const auto *I = unwrap<Instruction>(Val))
+    if (const auto &L = I->getDebugLoc())
+      C = L->getColumn();
+  return C;
+}
+
 /*--.. Operations on scalar constants ......................................--*/
 
 LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N,
@@ -2207,6 +2280,50 @@ unsigned LLVMGetIntrinsicID(LLVMValueRef Fn) {
   return 0;
 }
 
+static Intrinsic::ID llvm_map_to_intrinsic_id(unsigned ID) {
+  assert(ID < llvm::Intrinsic::num_intrinsics && "Intrinsic ID out of range");
+  return llvm::Intrinsic::ID(ID);
+}
+
+LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod,
+                                         unsigned ID,
+                                         LLVMTypeRef *ParamTypes,
+                                         size_t ParamCount) {
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys));
+}
+
+const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  auto Str = llvm::Intrinsic::getName(IID);
+  *NameLength = Str.size();
+  return Str.data();
+}
+
+LLVMTypeRef LLVMIntrinsicGetType(LLVMContextRef Ctx, unsigned ID,
+                                 LLVMTypeRef *ParamTypes, size_t ParamCount) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  return wrap(llvm::Intrinsic::getType(*unwrap(Ctx), IID, Tys));
+}
+
+const char *LLVMIntrinsicCopyOverloadedName(unsigned ID,
+                                            LLVMTypeRef *ParamTypes,
+                                            size_t ParamCount,
+                                            size_t *NameLength) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  ArrayRef<Type*> Tys(unwrap(ParamTypes), ParamCount);
+  auto Str = llvm::Intrinsic::getName(IID, Tys);
+  *NameLength = Str.length();
+  return strdup(Str.c_str());
+}
+
+LLVMBool LLVMIntrinsicIsOverloaded(unsigned ID) {
+  auto IID = llvm_map_to_intrinsic_id(ID);
+  return llvm::Intrinsic::isOverloaded(IID);
+}
+
 unsigned LLVMGetFunctionCallConv(LLVMValueRef Fn) {
   return unwrap<Function>(Fn)->getCallingConv();
 }
@@ -2522,6 +2639,11 @@ LLVMValueRef LLVMInstructionClone(LLVMValueRef Inst) {
   return nullptr;
 }
 
+LLVMValueRef LLVMIsATerminatorInst(LLVMValueRef Inst) {
+  Instruction *I = dyn_cast<Instruction>(unwrap(Inst));
+  return (I && I->isTerminator()) ? wrap(I) : nullptr;
+}
+
 unsigned LLVMGetNumArgOperands(LLVMValueRef Instr) {
   if (FuncletPadInst *FPI = dyn_cast<FuncletPadInst>(unwrap(Instr))) {
     return FPI->getNumArgOperands();
@@ -2637,15 +2759,15 @@ void LLVMSetUnwindDest(LLVMValueRef Invoke, LLVMBasicBlockRef B) {
 /*--.. Operations on terminators ...........................................--*/
 
 unsigned LLVMGetNumSuccessors(LLVMValueRef Term) {
-  return unwrap<TerminatorInst>(Term)->getNumSuccessors();
+  return unwrap<Instruction>(Term)->getNumSuccessors();
 }
 
 LLVMBasicBlockRef LLVMGetSuccessor(LLVMValueRef Term, unsigned i) {
-  return wrap(unwrap<TerminatorInst>(Term)->getSuccessor(i));
+  return wrap(unwrap<Instruction>(Term)->getSuccessor(i));
 }
 
 void LLVMSetSuccessor(LLVMValueRef Term, unsigned i, LLVMBasicBlockRef block) {
-  return unwrap<TerminatorInst>(Term)->setSuccessor(i,unwrap(block));
+  return unwrap<Instruction>(Term)->setSuccessor(i, unwrap(block));
 }
 
 /*--.. Operations on branch instructions (only) ............................--*/
@@ -3156,6 +3278,30 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
   return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
 }
 
+LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr, 
+                             LLVMValueRef Val, LLVMValueRef Len,
+                             unsigned Align) {
+  return wrap(unwrap(B)->CreateMemSet(unwrap(Ptr), unwrap(Val), unwrap(Len), Align));
+}
+
+LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B, 
+                             LLVMValueRef Dst, unsigned DstAlign,
+                             LLVMValueRef Src, unsigned SrcAlign,
+                             LLVMValueRef Size) {
+  return wrap(unwrap(B)->CreateMemCpy(unwrap(Dst), DstAlign,
+                                      unwrap(Src), SrcAlign,
+                                      unwrap(Size)));
+}
+
+LLVMValueRef LLVMBuildMemMove(LLVMBuilderRef B,
+                              LLVMValueRef Dst, unsigned DstAlign,
+                              LLVMValueRef Src, unsigned SrcAlign,
+                              LLVMValueRef Size) {
+  return wrap(unwrap(B)->CreateMemMove(unwrap(Dst), DstAlign,
+                                       unwrap(Src), SrcAlign,
+                                       unwrap(Size)));
+}
+
 LLVMValueRef LLVMBuildAlloca(LLVMBuilderRef B, LLVMTypeRef Ty,
                              const char *Name) {
   return wrap(unwrap(B)->CreateAlloca(unwrap(Ty), nullptr, Name));
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index e5fb765f778fb3167ab53b7bc0da1bb144f0640b..02b7953cb5bbed5bcd972443c7241b9cbefddf52 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -280,7 +280,7 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
 }
 
 static MDNode *stripDebugLocFromLoopID(MDNode *N) {
-  assert(N->op_begin() != N->op_end() && "Missing self reference?");
+  assert(!empty(N->operands()) && "Missing self reference?");
 
   // if there is no debug location, we do not have to rewrite this MDNode.
   if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
diff --git a/lib/IR/Dominators.cpp b/lib/IR/Dominators.cpp
index c78f220439af1e8a57673d338f45ae5fbe1d5df7..cf9f5759ba5339231191864f23393cc130151a36 100644
--- a/lib/IR/Dominators.cpp
+++ b/lib/IR/Dominators.cpp
@@ -41,7 +41,7 @@ static constexpr bool ExpensiveChecksEnabled = false;
 #endif
 
 bool BasicBlockEdge::isSingleEdge() const {
-  const TerminatorInst *TI = Start->getTerminator();
+  const Instruction *TI = Start->getTerminator();
   unsigned NumEdgesToEnd = 0;
   for (unsigned int i = 0, n = TI->getNumSuccessors(); i < n; ++i) {
     if (TI->getSuccessor(i) == End)
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 36ba8d0721f38dc4020b9831eee8273f7632afcb..ec094812ceb2f3c61a68225896535506f48fe003 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -195,9 +195,9 @@ LLVMContext &Function::getContext() const {
   return getType()->getContext();
 }
 
-unsigned Function::getInstructionCount() {
+unsigned Function::getInstructionCount() const {
   unsigned NumInstrs = 0;
-  for (BasicBlock &BB : BasicBlocks)
+  for (const BasicBlock &BB : BasicBlocks)
     NumInstrs += std::distance(BB.instructionsWithoutDebug().begin(),
                                BB.instructionsWithoutDebug().end());
   return NumInstrs;
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 3f57b1dbfa889d71a2f992dc9996a450f5a1382a..cbd6450a20c9b138c9c222a0778497606bb4e5ff 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -252,7 +252,7 @@ bool GlobalValue::canIncreaseAlignment() const {
   // Conservatively assume ELF if there's no parent pointer.
   bool isELF =
       (!Parent || Triple(Parent->getTargetTriple()).isOSBinFormatELF());
-  if (isELF && hasDefaultVisibility() && !hasLocalLinkage())
+  if (isELF && !isDSOLocal())
     return false;
 
   return true;
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index 126a96635ee8c106c9ada7313d5ed21b7898f9a3..7d4b6df18d93911840a0683e0fd51dd53a370c28 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -622,55 +622,53 @@ LandingPadInst *InvokeInst::getLandingPadInst() const {
 //===----------------------------------------------------------------------===//
 
 ReturnInst::ReturnInst(const ReturnInst &RI)
-  : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this) -
-                     RI.getNumOperands(),
-                   RI.getNumOperands()) {
+    : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this) - RI.getNumOperands(),
+                  RI.getNumOperands()) {
   if (RI.getNumOperands())
     Op<0>() = RI.Op<0>();
   SubclassOptionalData = RI.SubclassOptionalData;
 }
 
 ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
-                   InsertBefore) {
+    : Instruction(Type::getVoidTy(C), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+                  InsertBefore) {
   if (retVal)
     Op<0>() = retVal;
 }
 
 ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
-                   InsertAtEnd) {
+    : Instruction(Type::getVoidTy(C), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+                  InsertAtEnd) {
   if (retVal)
     Op<0>() = retVal;
 }
 
 ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
-                   OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
-}
+    : Instruction(Type::getVoidTy(Context), Instruction::Ret,
+                  OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {}
 
 //===----------------------------------------------------------------------===//
 //                        ResumeInst Implementation
 //===----------------------------------------------------------------------===//
 
 ResumeInst::ResumeInst(const ResumeInst &RI)
-  : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Resume,
-                   OperandTraits<ResumeInst>::op_begin(this), 1) {
+    : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Resume,
+                  OperandTraits<ResumeInst>::op_begin(this), 1) {
   Op<0>() = RI.Op<0>();
 }
 
 ResumeInst::ResumeInst(Value *Exn, Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
-                   OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
+    : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+                  OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
   Op<0>() = Exn;
 }
 
 ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
-                   OperandTraits<ResumeInst>::op_begin(this), 1, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+                  OperandTraits<ResumeInst>::op_begin(this), 1, InsertAtEnd) {
   Op<0>() = Exn;
 }
 
@@ -679,10 +677,10 @@ ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
 //===----------------------------------------------------------------------===//
 
 CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI)
-    : TerminatorInst(CRI.getType(), Instruction::CleanupRet,
-                     OperandTraits<CleanupReturnInst>::op_end(this) -
-                         CRI.getNumOperands(),
-                     CRI.getNumOperands()) {
+    : Instruction(CRI.getType(), Instruction::CleanupRet,
+                  OperandTraits<CleanupReturnInst>::op_end(this) -
+                      CRI.getNumOperands(),
+                  CRI.getNumOperands()) {
   setInstructionSubclassData(CRI.getSubclassDataFromInstruction());
   Op<0>() = CRI.Op<0>();
   if (CRI.hasUnwindDest())
@@ -700,19 +698,19 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) {
 
 CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
                                      unsigned Values, Instruction *InsertBefore)
-    : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()),
-                     Instruction::CleanupRet,
-                     OperandTraits<CleanupReturnInst>::op_end(this) - Values,
-                     Values, InsertBefore) {
+    : Instruction(Type::getVoidTy(CleanupPad->getContext()),
+                  Instruction::CleanupRet,
+                  OperandTraits<CleanupReturnInst>::op_end(this) - Values,
+                  Values, InsertBefore) {
   init(CleanupPad, UnwindBB);
 }
 
 CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
                                      unsigned Values, BasicBlock *InsertAtEnd)
-    : TerminatorInst(Type::getVoidTy(CleanupPad->getContext()),
-                     Instruction::CleanupRet,
-                     OperandTraits<CleanupReturnInst>::op_end(this) - Values,
-                     Values, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(CleanupPad->getContext()),
+                  Instruction::CleanupRet,
+                  OperandTraits<CleanupReturnInst>::op_end(this) - Values,
+                  Values, InsertAtEnd) {
   init(CleanupPad, UnwindBB);
 }
 
@@ -725,25 +723,25 @@ void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) {
 }
 
 CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
-    : TerminatorInst(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
-                     OperandTraits<CatchReturnInst>::op_begin(this), 2) {
+    : Instruction(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
+                  OperandTraits<CatchReturnInst>::op_begin(this), 2) {
   Op<0>() = CRI.Op<0>();
   Op<1>() = CRI.Op<1>();
 }
 
 CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
                                  Instruction *InsertBefore)
-    : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
-                     OperandTraits<CatchReturnInst>::op_begin(this), 2,
-                     InsertBefore) {
+    : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
+                  OperandTraits<CatchReturnInst>::op_begin(this), 2,
+                  InsertBefore) {
   init(CatchPad, BB);
 }
 
 CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
                                  BasicBlock *InsertAtEnd)
-    : TerminatorInst(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
-                     OperandTraits<CatchReturnInst>::op_begin(this), 2,
-                     InsertAtEnd) {
+    : Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
+                  OperandTraits<CatchReturnInst>::op_begin(this), 2,
+                  InsertAtEnd) {
   init(CatchPad, BB);
 }
 
@@ -755,8 +753,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumReservedValues,
                                  const Twine &NameStr,
                                  Instruction *InsertBefore)
-    : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
-                     InsertBefore) {
+    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+                  InsertBefore) {
   if (UnwindDest)
     ++NumReservedValues;
   init(ParentPad, UnwindDest, NumReservedValues + 1);
@@ -766,8 +764,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
 CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
                                  unsigned NumReservedValues,
                                  const Twine &NameStr, BasicBlock *InsertAtEnd)
-    : TerminatorInst(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
-                     InsertAtEnd) {
+    : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+                  InsertAtEnd) {
   if (UnwindDest)
     ++NumReservedValues;
   init(ParentPad, UnwindDest, NumReservedValues + 1);
@@ -775,8 +773,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
 }
 
 CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI)
-    : TerminatorInst(CSI.getType(), Instruction::CatchSwitch, nullptr,
-                     CSI.getNumOperands()) {
+    : Instruction(CSI.getType(), Instruction::CatchSwitch, nullptr,
+                  CSI.getNumOperands()) {
   init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands());
   setNumHungOffUseOperands(ReservedSpace);
   Use *OL = getOperandList();
@@ -874,13 +872,11 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
 
 UnreachableInst::UnreachableInst(LLVMContext &Context,
                                  Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
-                   nullptr, 0, InsertBefore) {
-}
+    : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
+                  0, InsertBefore) {}
 UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Context), Instruction::Unreachable,
-                   nullptr, 0, InsertAtEnd) {
-}
+    : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
+                  0, InsertAtEnd) {}
 
 //===----------------------------------------------------------------------===//
 //                        BranchInst Implementation
@@ -893,18 +889,18 @@ void BranchInst::AssertOK() {
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 1,
-                   1, InsertBefore) {
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 1, 1,
+                  InsertBefore) {
   assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
                        Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 3,
-                   3, InsertBefore) {
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 3, 3,
+                  InsertBefore) {
   Op<-1>() = IfTrue;
   Op<-2>() = IfFalse;
   Op<-3>() = Cond;
@@ -914,18 +910,16 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 1,
-                   1, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 1, 1, InsertAtEnd) {
   assert(IfTrue && "Branch destination may not be null!");
   Op<-1>() = IfTrue;
 }
 
 BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
-           BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
-                   OperandTraits<BranchInst>::op_end(this) - 3,
-                   3, InsertAtEnd) {
+                       BasicBlock *InsertAtEnd)
+    : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - 3, 3, InsertAtEnd) {
   Op<-1>() = IfTrue;
   Op<-2>() = IfFalse;
   Op<-3>() = Cond;
@@ -934,10 +928,10 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
 #endif
 }
 
-BranchInst::BranchInst(const BranchInst &BI) :
-  TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
-                 OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
-                 BI.getNumOperands()) {
+BranchInst::BranchInst(const BranchInst &BI)
+    : Instruction(Type::getVoidTy(BI.getContext()), Instruction::Br,
+                  OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
+                  BI.getNumOperands()) {
   Op<-1>() = BI.Op<-1>();
   if (BI.getNumOperands() != 1) {
     assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
@@ -2115,71 +2109,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
                             Op->getType(), Name, InsertAtEnd);
 }
 
-// isConstantAllOnes - Helper function for several functions below
-static inline bool isConstantAllOnes(const Value *V) {
-  if (const Constant *C = dyn_cast<Constant>(V))
-    return C->isAllOnesValue();
-  return false;
-}
-
-bool BinaryOperator::isNeg(const Value *V) {
-  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-    if (Bop->getOpcode() == Instruction::Sub)
-      if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0)))
-        return C->isNegativeZeroValue();
-  return false;
-}
-
-bool BinaryOperator::isFNeg(const Value *V, bool IgnoreZeroSign) {
-  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-    if (Bop->getOpcode() == Instruction::FSub)
-      if (Constant *C = dyn_cast<Constant>(Bop->getOperand(0))) {
-        if (!IgnoreZeroSign)
-          IgnoreZeroSign = cast<Instruction>(V)->hasNoSignedZeros();
-        return !IgnoreZeroSign ? C->isNegativeZeroValue() : C->isZeroValue();
-      }
-  return false;
-}
-
-bool BinaryOperator::isNot(const Value *V) {
-  if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
-    return (Bop->getOpcode() == Instruction::Xor &&
-            (isConstantAllOnes(Bop->getOperand(1)) ||
-             isConstantAllOnes(Bop->getOperand(0))));
-  return false;
-}
-
-Value *BinaryOperator::getNegArgument(Value *BinOp) {
-  return cast<BinaryOperator>(BinOp)->getOperand(1);
-}
-
-const Value *BinaryOperator::getNegArgument(const Value *BinOp) {
-  return getNegArgument(const_cast<Value*>(BinOp));
-}
-
-Value *BinaryOperator::getFNegArgument(Value *BinOp) {
-  return cast<BinaryOperator>(BinOp)->getOperand(1);
-}
-
-const Value *BinaryOperator::getFNegArgument(const Value *BinOp) {
-  return getFNegArgument(const_cast<Value*>(BinOp));
-}
-
-Value *BinaryOperator::getNotArgument(Value *BinOp) {
-  assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
-  BinaryOperator *BO = cast<BinaryOperator>(BinOp);
-  Value *Op0 = BO->getOperand(0);
-  Value *Op1 = BO->getOperand(1);
-  if (isConstantAllOnes(Op0)) return Op1;
-
-  assert(isConstantAllOnes(Op1));
-  return Op0;
-}
-
-const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
-  return getNotArgument(const_cast<Value*>(BinOp));
-}
-
 // Exchange the two operands to this instruction. This instruction is safe to
 // use on any binary instruction and does not modify the semantics of the
 // instruction. If the instruction is order-dependent (SetLT f.e.), the opcode
@@ -3220,15 +3149,18 @@ AddrSpaceCastInst::AddrSpaceCastInst(
 //===----------------------------------------------------------------------===//
 
 CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
-                 Value *RHS, const Twine &Name, Instruction *InsertBefore)
+                 Value *RHS, const Twine &Name, Instruction *InsertBefore,
+                 Instruction *FlagsSource)
   : Instruction(ty, op,
                 OperandTraits<CmpInst>::op_begin(this),
                 OperandTraits<CmpInst>::operands(this),
                 InsertBefore) {
-    Op<0>() = LHS;
-    Op<1>() = RHS;
+  Op<0>() = LHS;
+  Op<1>() = RHS;
   setPredicate((Predicate)predicate);
   setName(Name);
+  if (FlagsSource)
+    copyIRFlags(FlagsSource);
 }
 
 CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
@@ -3567,8 +3499,8 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
 /// constructor can also autoinsert before another instruction.
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        Instruction *InsertBefore)
-  : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                   nullptr, 0, InsertBefore) {
+    : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
+                  nullptr, 0, InsertBefore) {
   init(Value, Default, 2+NumCases*2);
 }
 
@@ -3578,13 +3510,13 @@ SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
 /// constructor also autoinserts at the end of the specified BasicBlock.
 SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
                        BasicBlock *InsertAtEnd)
-  : TerminatorInst(Type::getVoidTy(Value->getContext()), Instruction::Switch,
-                   nullptr, 0, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
+                  nullptr, 0, InsertAtEnd) {
   init(Value, Default, 2+NumCases*2);
 }
 
 SwitchInst::SwitchInst(const SwitchInst &SI)
-  : TerminatorInst(SI.getType(), Instruction::Switch, nullptr, 0) {
+    : Instruction(SI.getType(), Instruction::Switch, nullptr, 0) {
   init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
   setNumHungOffUseOperands(SI.getNumOperands());
   Use *OL = getOperandList();
@@ -3596,7 +3528,6 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
   SubclassOptionalData = SI.SubclassOptionalData;
 }
 
-
 /// addCase - Add an entry to the switch instruction...
 ///
 void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
@@ -3675,21 +3606,21 @@ void IndirectBrInst::growOperands() {
 
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                Instruction *InsertBefore)
-: TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
-                 nullptr, 0, InsertBefore) {
+    : Instruction(Type::getVoidTy(Address->getContext()),
+                  Instruction::IndirectBr, nullptr, 0, InsertBefore) {
   init(Address, NumCases);
 }
 
 IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
                                BasicBlock *InsertAtEnd)
-: TerminatorInst(Type::getVoidTy(Address->getContext()),Instruction::IndirectBr,
-                 nullptr, 0, InsertAtEnd) {
+    : Instruction(Type::getVoidTy(Address->getContext()),
+                  Instruction::IndirectBr, nullptr, 0, InsertAtEnd) {
   init(Address, NumCases);
 }
 
 IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI)
-    : TerminatorInst(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
-                     nullptr, IBI.getNumOperands()) {
+    : Instruction(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
+                  nullptr, IBI.getNumOperands()) {
   allocHungoffUses(IBI.getNumOperands());
   Use *OL = getOperandList();
   const Use *InOL = IBI.getOperandList();
diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp
index 43a93890a6198b7bc100ec72d8c1a8b9142d74b2..df3a38ac147feb08fbf98a9bb4d229ff2ebc0995 100644
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@@ -152,6 +152,10 @@ bool ConstrainedFPIntrinsic::isUnaryOp() const {
     case Intrinsic::experimental_constrained_log2:
     case Intrinsic::experimental_constrained_rint:
     case Intrinsic::experimental_constrained_nearbyint:
+    case Intrinsic::experimental_constrained_ceil:
+    case Intrinsic::experimental_constrained_floor:
+    case Intrinsic::experimental_constrained_round:
+    case Intrinsic::experimental_constrained_trunc:
       return true;
   }
 }
diff --git a/lib/IR/ModuleSummaryIndex.cpp b/lib/IR/ModuleSummaryIndex.cpp
index 4c4466f9a9026922bb5fd369dbbe25a1d15cf1c8..8d85f7901b080476119d4c6d979858ac1ce41921 100644
--- a/lib/IR/ModuleSummaryIndex.cpp
+++ b/lib/IR/ModuleSummaryIndex.cpp
@@ -182,8 +182,9 @@ static std::string linkageToString(GlobalValue::LinkageTypes LT) {
 
 static std::string fflagsToString(FunctionSummary::FFlags F) {
   auto FlagValue = [](unsigned V) { return V ? '1' : '0'; };
-  char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly),
-                    FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias), 0};
+  char FlagRep[] = {FlagValue(F.ReadNone),     FlagValue(F.ReadOnly),
+                    FlagValue(F.NoRecurse),    FlagValue(F.ReturnDoesNotAlias),
+                    FlagValue(F.NoInline), 0};
 
   return FlagRep;
 }
@@ -198,9 +199,12 @@ static std::string getSummaryAttributes(GlobalValueSummary* GVS) {
          ", ffl: " + fflagsToString(FS->fflags());
 }
 
+static std::string getNodeVisualName(GlobalValue::GUID Id) {
+  return std::string("@") + std::to_string(Id);
+}
+
 static std::string getNodeVisualName(const ValueInfo &VI) {
-  return VI.name().empty() ? std::string("@") + std::to_string(VI.getGUID())
-                           : VI.name().str();
+  return VI.name().empty() ? getNodeVisualName(VI.getGUID()) : VI.name().str();
 }
 
 static std::string getNodeLabel(const ValueInfo &VI, GlobalValueSummary *GVS) {
@@ -221,13 +225,19 @@ static std::string getNodeLabel(const ValueInfo &VI, GlobalValueSummary *GVS) {
 // specific module associated with it. Typically this is function
 // or variable defined in native object or library.
 static void defineExternalNode(raw_ostream &OS, const char *Pfx,
-                               const ValueInfo &VI) {
-  auto StrId = std::to_string(VI.getGUID());
-  OS << "  " << StrId << " [label=\"" << getNodeVisualName(VI)
-     << "\"]; // defined externally\n";
+                               const ValueInfo &VI, GlobalValue::GUID Id) {
+  auto StrId = std::to_string(Id);
+  OS << "  " << StrId << " [label=\"";
+
+  if (VI) {
+    OS << getNodeVisualName(VI);
+  } else {
+    OS << getNodeVisualName(Id);
+  }
+  OS << "\"]; // defined externally\n";
 }
 
-void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
+void ModuleSummaryIndex::exportToDot(raw_ostream &OS) const {
   std::vector<Edge> CrossModuleEdges;
   DenseMap<GlobalValue::GUID, std::vector<uint64_t>> NodeMap;
   StringMap<GVSummaryMapTy> ModuleToDefinedGVS;
@@ -241,8 +251,8 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
                                        "_" + std::to_string(Id);
   };
 
-  auto DrawEdge = [&](const char *Pfx, int SrcMod, GlobalValue::GUID SrcId,
-                      int DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
+  auto DrawEdge = [&](const char *Pfx, uint64_t SrcMod, GlobalValue::GUID SrcId,
+                      uint64_t DstMod, GlobalValue::GUID DstId, int TypeOrHotness) {
     // 0 corresponds to alias edge, 1 to ref edge, 2 to call with unknown
     // hotness, ...
     TypeOrHotness += 2;
@@ -311,10 +321,17 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
         Draw(SummaryIt.first, R.getGUID(), -1);
 
       if (auto *AS = dyn_cast_or_null<AliasSummary>(SummaryIt.second)) {
-        auto AliaseeOrigId = AS->getAliasee().getOriginalName();
-        auto AliaseeId = getGUIDFromOriginalID(AliaseeOrigId);
-
-        Draw(SummaryIt.first, AliaseeId ? AliaseeId : AliaseeOrigId, -2);
+        GlobalValue::GUID AliaseeId;
+        if (AS->hasAliaseeGUID())
+          AliaseeId = AS->getAliaseeGUID();
+        else {
+          auto AliaseeOrigId = AS->getAliasee().getOriginalName();
+          AliaseeId = getGUIDFromOriginalID(AliaseeOrigId);
+          if (!AliaseeId)
+            AliaseeId = AliaseeOrigId;
+        }
+
+        Draw(SummaryIt.first, AliaseeId, -2);
         continue;
       }
 
@@ -330,7 +347,7 @@ void ModuleSummaryIndex::exportToDot(raw_ostream& OS) const {
   for (auto &E : CrossModuleEdges) {
     auto &ModList = NodeMap[E.Dst];
     if (ModList.empty()) {
-      defineExternalNode(OS, "  ", getValueInfo(E.Dst));
+      defineExternalNode(OS, "  ", getValueInfo(E.Dst), E.Dst);
       // Add fake module to the list to draw an edge to an external node
       // in the loop below.
       ModList.push_back(-1);
diff --git a/lib/IR/SafepointIRVerifier.cpp b/lib/IR/SafepointIRVerifier.cpp
index 7af48f5301f7a74fcf3bb0ab81ab9ddfe7e37561..3596b31dd25abee549979222a5569e28b23f4f92 100644
--- a/lib/IR/SafepointIRVerifier.cpp
+++ b/lib/IR/SafepointIRVerifier.cpp
@@ -134,7 +134,7 @@ public:
     // Top-down walk of the dominator tree
     ReversePostOrderTraversal<const Function *> RPOT(&F);
     for (const BasicBlock *BB : RPOT) {
-      const TerminatorInst *TI = BB->getTerminator();
+      const Instruction *TI = BB->getTerminator();
       assert(TI && "blocks must be well formed");
 
       // For conditional branches, we can perform simple conditional propagation on
@@ -257,8 +257,7 @@ static bool containsGCPtrType(Type *Ty) {
   if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
     return containsGCPtrType(AT->getElementType());
   if (StructType *ST = dyn_cast<StructType>(Ty))
-    return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
-                       containsGCPtrType);
+    return llvm::any_of(ST->subtypes(), containsGCPtrType);
   return false;
 }
 
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 83016496ff7ec3dfa699a1f3def0f966e68af8bf..0fb079c5ab73956d46d30ec3ed3d67e059bfaa14 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -297,20 +297,26 @@ FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params,
 FunctionType *FunctionType::get(Type *ReturnType,
                                 ArrayRef<Type*> Params, bool isVarArg) {
   LLVMContextImpl *pImpl = ReturnType->getContext().pImpl;
-  FunctionTypeKeyInfo::KeyTy Key(ReturnType, Params, isVarArg);
-  auto I = pImpl->FunctionTypes.find_as(Key);
+  const FunctionTypeKeyInfo::KeyTy Key(ReturnType, Params, isVarArg);
   FunctionType *FT;
-
-  if (I == pImpl->FunctionTypes.end()) {
+  // Since we only want to allocate a fresh function type in case none is found
+  // and we don't want to perform two lookups (one for checking if existent and
+  // one for inserting the newly allocated one), here we instead lookup based on
+  // Key and update the reference to the function type in-place to a newly
+  // allocated one if not found.
+  auto Insertion = pImpl->FunctionTypes.insert_as(nullptr, Key);
+  if (Insertion.second) {
+    // The function type was not found. Allocate one and update FunctionTypes
+    // in-place.
     FT = (FunctionType *)pImpl->TypeAllocator.Allocate(
         sizeof(FunctionType) + sizeof(Type *) * (Params.size() + 1),
         alignof(FunctionType));
     new (FT) FunctionType(ReturnType, Params, isVarArg);
-    pImpl->FunctionTypes.insert(FT);
+    *Insertion.first = FT;
   } else {
-    FT = *I;
+    // The function type was found. Just return it.
+    FT = *Insertion.first;
   }
-
   return FT;
 }
 
@@ -336,18 +342,25 @@ bool FunctionType::isValidArgumentType(Type *ArgTy) {
 StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
                             bool isPacked) {
   LLVMContextImpl *pImpl = Context.pImpl;
-  AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
-  auto I = pImpl->AnonStructTypes.find_as(Key);
-  StructType *ST;
+  const AnonStructTypeKeyInfo::KeyTy Key(ETypes, isPacked);
 
-  if (I == pImpl->AnonStructTypes.end()) {
-    // Value not found.  Create a new type!
+  StructType *ST;
+  // Since we only want to allocate a fresh struct type in case none is found
+  // and we don't want to perform two lookups (one for checking if existent and
+  // one for inserting the newly allocated one), here we instead lookup based on
+  // Key and update the reference to the struct type in-place to a newly
+  // allocated one if not found.
+  auto Insertion = pImpl->AnonStructTypes.insert_as(nullptr, Key);
+  if (Insertion.second) {
+    // The struct type was not found. Allocate one and update AnonStructTypes
+    // in-place.
     ST = new (Context.pImpl->TypeAllocator) StructType(Context);
     ST->setSubclassData(SCDB_IsLiteral);  // Literal struct.
     ST->setBody(ETypes, isPacked);
-    Context.pImpl->AnonStructTypes.insert(ST);
+    *Insertion.first = ST;
   } else {
-    ST = *I;
+    // The struct type was found. Just return it.
+    ST = *Insertion.first;
   }
 
   return ST;
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 4b954c710e33649accc85be8af73fa4fa5c3125a..4d0135d8338e92825462a3c69b834eb022fe4bd1 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -287,7 +287,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
 
   // Maps catchswitches and cleanuppads that unwind to siblings to the
   // terminators that indicate the unwind, used to detect cycles therein.
-  MapVector<Instruction *, TerminatorInst *> SiblingFuncletInfo;
+  MapVector<Instruction *, Instruction *> SiblingFuncletInfo;
 
   /// Cache of constants visited in search of ConstantExprs.
   SmallPtrSet<const Constant *, 32> ConstantExprVisited;
@@ -457,7 +457,7 @@ private:
   void visitStoreInst(StoreInst &SI);
   void verifyDominatesUse(Instruction &I, unsigned i);
   void visitInstruction(Instruction &I);
-  void visitTerminatorInst(TerminatorInst &I);
+  void visitTerminator(Instruction &I);
   void visitBranchInst(BranchInst &BI);
   void visitReturnInst(ReturnInst &RI);
   void visitSwitchInst(SwitchInst &SI);
@@ -632,7 +632,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
     if (ArrayType *ATy = dyn_cast<ArrayType>(GV.getValueType())) {
       StructType *STy = dyn_cast<StructType>(ATy->getElementType());
       PointerType *FuncPtrTy =
-          FunctionType::get(Type::getVoidTy(Context), false)->getPointerTo();
+          FunctionType::get(Type::getVoidTy(Context), false)->
+          getPointerTo(DL.getProgramAddressSpace());
       // FIXME: Reject the 2-field form in LLVM 4.0.
       Assert(STy &&
                  (STy->getNumElements() == 2 || STy->getNumElements() == 3) &&
@@ -2008,7 +2009,7 @@ void Verifier::verifyFrameRecoverIndices() {
   }
 }
 
-static Instruction *getSuccPad(TerminatorInst *Terminator) {
+static Instruction *getSuccPad(Instruction *Terminator) {
   BasicBlock *UnwindDest;
   if (auto *II = dyn_cast<InvokeInst>(Terminator))
     UnwindDest = II->getUnwindDest();
@@ -2027,7 +2028,7 @@ void Verifier::verifySiblingFuncletUnwinds() {
     if (Visited.count(PredPad))
       continue;
     Active.insert(PredPad);
-    TerminatorInst *Terminator = Pair.second;
+    Instruction *Terminator = Pair.second;
     do {
       Instruction *SuccPad = getSuccPad(Terminator);
       if (Active.count(SuccPad)) {
@@ -2036,7 +2037,7 @@ void Verifier::verifySiblingFuncletUnwinds() {
         SmallVector<Instruction *, 8> CycleNodes;
         do {
           CycleNodes.push_back(CyclePad);
-          TerminatorInst *CycleTerminator = SiblingFuncletInfo[CyclePad];
+          Instruction *CycleTerminator = SiblingFuncletInfo[CyclePad];
           if (CycleTerminator != CyclePad)
             CycleNodes.push_back(CycleTerminator);
           CyclePad = getSuccPad(CycleTerminator);
@@ -2351,7 +2352,7 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
   }
 }
 
-void Verifier::visitTerminatorInst(TerminatorInst &I) {
+void Verifier::visitTerminator(Instruction &I) {
   // Ensure that terminators only exist at the end of the basic block.
   Assert(&I == I.getParent()->getTerminator(),
          "Terminator found in the middle of a basic block!", I.getParent());
@@ -2363,7 +2364,7 @@ void Verifier::visitBranchInst(BranchInst &BI) {
     Assert(BI.getCondition()->getType()->isIntegerTy(1),
            "Branch condition is not 'i1' type!", &BI, BI.getCondition());
   }
-  visitTerminatorInst(BI);
+  visitTerminator(BI);
 }
 
 void Verifier::visitReturnInst(ReturnInst &RI) {
@@ -2382,7 +2383,7 @@ void Verifier::visitReturnInst(ReturnInst &RI) {
 
   // Check to make sure that the return value has necessary properties for
   // terminators...
-  visitTerminatorInst(RI);
+  visitTerminator(RI);
 }
 
 void Verifier::visitSwitchInst(SwitchInst &SI) {
@@ -2397,7 +2398,7 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
            "Duplicate integer as switch case", &SI, Case.getCaseValue());
   }
 
-  visitTerminatorInst(SI);
+  visitTerminator(SI);
 }
 
 void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
@@ -2407,7 +2408,7 @@ void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
     Assert(BI.getDestination(i)->getType()->isLabelTy(),
            "Indirectbr destinations must all have pointer type!", &BI);
 
-  visitTerminatorInst(BI);
+  visitTerminator(BI);
 }
 
 void Verifier::visitSelectInst(SelectInst &SI) {
@@ -2986,7 +2987,7 @@ void Verifier::visitInvokeInst(InvokeInst &II) {
       "The unwind destination does not have an exception handling instruction!",
       &II);
 
-  visitTerminatorInst(II);
+  visitTerminator(II);
 }
 
 /// visitBinaryOperator - Check that both arguments to the binary operator are
@@ -3449,7 +3450,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
   Instruction *ToPad = &I;
   Value *ToPadParent = getParentPad(ToPad);
   for (BasicBlock *PredBB : predecessors(BB)) {
-    TerminatorInst *TI = PredBB->getTerminator();
+    Instruction *TI = PredBB->getTerminator();
     Value *FromPad;
     if (auto *II = dyn_cast<InvokeInst>(TI)) {
       Assert(II->getUnwindDest() == BB && II->getNormalDest() != BB,
@@ -3537,7 +3538,7 @@ void Verifier::visitResumeInst(ResumeInst &RI) {
            "inside a function.",
            &RI);
 
-  visitTerminatorInst(RI);
+  visitTerminator(RI);
 }
 
 void Verifier::visitCatchPadInst(CatchPadInst &CPI) {
@@ -3565,7 +3566,7 @@ void Verifier::visitCatchReturnInst(CatchReturnInst &CatchReturn) {
          "CatchReturnInst needs to be provided a CatchPad", &CatchReturn,
          CatchReturn.getOperand(0));
 
-  visitTerminatorInst(CatchReturn);
+  visitTerminator(CatchReturn);
 }
 
 void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) {
@@ -3686,7 +3687,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) {
           // Record cleanup sibling unwinds for verifySiblingFuncletUnwinds
           if (isa<CleanupPadInst>(&FPI) && !isa<ConstantTokenNone>(UnwindPad) &&
               getParentPad(UnwindPad) == getParentPad(&FPI))
-            SiblingFuncletInfo[&FPI] = cast<TerminatorInst>(U);
+            SiblingFuncletInfo[&FPI] = cast<Instruction>(U);
         }
       }
       // Make sure we visit all uses of FPI, but for nested pads stop as
@@ -3787,7 +3788,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) {
   }
 
   visitEHPadPredecessors(CatchSwitch);
-  visitTerminatorInst(CatchSwitch);
+  visitTerminator(CatchSwitch);
 }
 
 void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) {
@@ -3803,7 +3804,7 @@ void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) {
            &CRI);
   }
 
-  visitTerminatorInst(CRI);
+  visitTerminator(CRI);
 }
 
 void Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
@@ -4103,6 +4104,12 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
   case Intrinsic::experimental_constrained_log2:
   case Intrinsic::experimental_constrained_rint:
   case Intrinsic::experimental_constrained_nearbyint:
+  case Intrinsic::experimental_constrained_maxnum:
+  case Intrinsic::experimental_constrained_minnum:
+  case Intrinsic::experimental_constrained_ceil:
+  case Intrinsic::experimental_constrained_floor:
+  case Intrinsic::experimental_constrained_round:
+  case Intrinsic::experimental_constrained_trunc:
     visitConstrainedFPIntrinsic(
         cast<ConstrainedFPIntrinsic>(*CS.getInstruction()));
     break;
@@ -4473,6 +4480,20 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
 
     break;
   }
+  case Intrinsic::sadd_sat:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::ssub_sat:
+  case Intrinsic::usub_sat: {
+    Value *Op1 = CS.getArgOperand(0);
+    Value *Op2 = CS.getArgOperand(1);
+    Assert(Op1->getType()->isIntOrIntVectorTy(),
+           "first operand of [us][add|sub]_sat must be an int type or vector "
+           "of ints");
+    Assert(Op2->getType()->isIntOrIntVectorTy(),
+           "second operand of [us][add|sub]_sat must be an int type or vector "
+           "of ints");
+    break;
+  }
   };
 }
 
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index a6cd15699fb1b948e9a696aa6c86c325f12aaaf3..0eb4bba26760079001118a33f6d105412a0241cb 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -35,6 +35,7 @@ subdirectories =
  BinaryFormat
  ObjectYAML
  Option
+ OptRemarks
  Passes
  ProfileData
  Support
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 50d0075a60813da90b2bd24b83df0d7d2416639d..2726b6785eddfd174e62112b68fc64aca5f45600 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -56,6 +56,11 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+/// Enable global value internalization in LTO.
+cl::opt<bool> EnableLTOInternalization(
+    "enable-lto-internalization", cl::init(true), cl::Hidden,
+    cl::desc("Enable global value internalization in LTO"));
+
 // Returns a unique hash for the Module considering the current list of
 // export/import and other global analysis results.
 // The hash is produced in \p Key.
@@ -263,8 +268,15 @@ static void computeCacheKey(
 
   if (!Conf.SampleProfile.empty()) {
     auto FileOrErr = MemoryBuffer::getFile(Conf.SampleProfile);
-    if (FileOrErr)
+    if (FileOrErr) {
       Hasher.update(FileOrErr.get()->getBuffer());
+
+      if (!Conf.ProfileRemapping.empty()) {
+        FileOrErr = MemoryBuffer::getFile(Conf.ProfileRemapping);
+        if (FileOrErr)
+          Hasher.update(FileOrErr.get()->getBuffer());
+      }
+    }
   }
 
   Key = toHex(Hasher.result());
@@ -337,7 +349,8 @@ static void thinLTOInternalizeAndPromoteGUID(
     if (isExported(S->modulePath(), GUID)) {
       if (GlobalValue::isLocalLinkage(S->linkage()))
         S->setLinkage(GlobalValue::ExternalLinkage);
-    } else if (!GlobalValue::isLocalLinkage(S->linkage()))
+    } else if (EnableLTOInternalization &&
+               !GlobalValue::isLocalLinkage(S->linkage()))
       S->setLinkage(GlobalValue::InternalLinkage);
   }
 }
@@ -869,7 +882,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
         continue;
       GV->setUnnamedAddr(R.second.UnnamedAddr ? GlobalValue::UnnamedAddr::Global
                                               : GlobalValue::UnnamedAddr::None);
-      if (R.second.Partition == 0)
+      if (EnableLTOInternalization && R.second.Partition == 0)
         GV->setLinkage(GlobalValue::InternalLinkage);
     }
 
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index be33ab8493359f82db2ab2467ab1c3dca3b3e4df..1f9d60a5bdff520c1eb7e5baa7476cfdacfd09cf 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -155,13 +155,14 @@ static void runNewPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
                            const ModuleSummaryIndex *ImportSummary) {
   Optional<PGOOptions> PGOOpt;
   if (!Conf.SampleProfile.empty())
-    PGOOpt = PGOOptions("", "", Conf.SampleProfile, false, true);
+    PGOOpt = PGOOptions("", "", Conf.SampleProfile, Conf.ProfileRemapping,
+                        false, true);
 
   PassBuilder PB(TM, PGOOpt);
   AAManager AA;
 
   // Parse a custom AA pipeline if asked to.
-  if (!PB.parseAAPipeline(AA, "default"))
+  if (auto Err = PB.parseAAPipeline(AA, "default"))
     report_fatal_error("Error parsing default AA pipeline");
 
   LoopAnalysisManager LAM(Conf.DebugPassManager);
@@ -220,9 +221,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
 
   // Parse a custom AA pipeline if asked to.
   if (!AAPipelineDesc.empty())
-    if (!PB.parseAAPipeline(AA, AAPipelineDesc))
-      report_fatal_error("unable to parse AA pipeline description: " +
-                         AAPipelineDesc);
+    if (auto Err = PB.parseAAPipeline(AA, AAPipelineDesc))
+      report_fatal_error("unable to parse AA pipeline description '" +
+                         AAPipelineDesc + "': " + toString(std::move(Err)));
 
   LoopAnalysisManager LAM;
   FunctionAnalysisManager FAM;
@@ -245,9 +246,9 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
   MPM.addPass(VerifierPass());
 
   // Now, add all the passes we've been requested to.
-  if (!PB.parsePassPipeline(MPM, PipelineDesc))
-    report_fatal_error("unable to parse pass pipeline description: " +
-                       PipelineDesc);
+  if (auto Err = PB.parsePassPipeline(MPM, PipelineDesc))
+    report_fatal_error("unable to parse pass pipeline description '" +
+                       PipelineDesc + "': " + toString(std::move(Err)));
 
   if (!DisableVerify)
     MPM.addPass(VerifierPass());
diff --git a/lib/LTO/UpdateCompilerUsed.cpp b/lib/LTO/UpdateCompilerUsed.cpp
index c982a5b0e5aa4445ef3ed46cace7146c50d272bd..00482dee6e106f3e7d213625156b6d1e57b1f206 100644
--- a/lib/LTO/UpdateCompilerUsed.cpp
+++ b/lib/LTO/UpdateCompilerUsed.cpp
@@ -95,12 +95,18 @@ private:
     if (GV.hasPrivateLinkage())
       return;
 
-    // Conservatively append user-supplied runtime library functions to
-    // llvm.compiler.used.  These could be internalized and deleted by
-    // optimizations like -globalopt, causing problems when later optimizations
-    // add new library calls (e.g., llvm.memset => memset and printf => puts).
+    // Conservatively append user-supplied runtime library functions (supplied
+    // either directly, or via a function alias) to llvm.compiler.used.  These
+    // could be internalized and deleted by optimizations like -globalopt,
+    // causing problems when later optimizations add new library calls (e.g.,
+    // llvm.memset => memset and printf => puts).
     // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
-    if (isa<Function>(GV) && Libcalls.count(GV.getName())) {
+    GlobalValue *FuncAliasee = nullptr;
+    if (isa<GlobalAlias>(GV)) {
+      auto *A = cast<GlobalAlias>(&GV);
+      FuncAliasee = dyn_cast<Function>(A->getAliasee());
+    }
+    if ((isa<Function>(GV) || FuncAliasee) && Libcalls.count(GV.getName())) {
       LLVMUsed.push_back(&GV);
       return;
     }
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index c4744ac5d51af086064b8b16a5fbc8b73e37ca00..463e9066616d81ba35897522889dde5299cc7045 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -289,6 +289,7 @@ public:
 
   void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override;
   void EmitWinCFIEndProc(SMLoc Loc) override;
+  void EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) override;
   void EmitWinCFIStartChained(SMLoc Loc) override;
   void EmitWinCFIEndChained(SMLoc Loc) override;
   void EmitWinCFIPushReg(unsigned Register, SMLoc Loc) override;
@@ -858,10 +859,14 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
   // supported, emit as vector of 8bits data.
   if (Data.size() == 1 ||
       !(MAI->getAscizDirective() || MAI->getAsciiDirective())) {
-    const char *Directive = MAI->getData8bitsDirective();
-    for (const unsigned char C : Data.bytes()) {
-      OS << Directive << (unsigned)C;
-      EmitEOL();
+    if (MCTargetStreamer *TS = getTargetStreamer()) {
+      TS->emitRawBytes(Data);
+    } else {
+      const char *Directive = MAI->getData8bitsDirective();
+      for (const unsigned char C : Data.bytes()) {
+        OS << Directive << (unsigned)C;
+        EmitEOL();
+      }
     }
     return;
   }
@@ -1585,6 +1590,10 @@ void MCAsmStreamer::EmitWinCFIEndProc(SMLoc Loc) {
   EmitEOL();
 }
 
+// TODO: Implement
+void MCAsmStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) {
+}
+
 void MCAsmStreamer::EmitWinCFIStartChained(SMLoc Loc) {
   MCStreamer::EmitWinCFIStartChained(Loc);
 
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index a4458e64bd359bc686805dcc4e94ce763e738aa5..38f311be7c66b49fac14355c6716b6bbfaa30abc 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -526,6 +526,11 @@ static void AttemptToFoldSymbolOffsetDifference(
     if (Asm->isThumbFunc(&SA))
       Addend |= 1;
 
+    // If symbol is labeled as micromips, we set low-bit to ensure
+    // correct offset in .gcc_except_table
+    if (Asm->getBackend().isMicroMips(&SA))
+      Addend |= 1;
+
     // Clear the symbol expr pointers to indicate we have folded these
     // operands.
     A = B = nullptr;
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index a96dec184441cfa56ca24a47ec171e6e0435ccb0..4e97e7550bcbfc34bcaa4dea08b9336307a819cb 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -22,6 +23,9 @@ namespace {
     /// @name MCStreamer Interface
     /// @{
 
+    bool hasRawTextSupport() const override { return true; }
+    void EmitRawTextImpl(StringRef String) override {}
+
     bool EmitSymbolAttribute(MCSymbol *Symbol,
                              MCSymbolAttr Attribute) override {
       return true;
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index edfccfcb9ed2c50fb69adacdff9eab575f83de79..ab8e0f31db9e7a8faa482fbea124cdef8f8738c8 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -254,9 +254,16 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
   DwarfStrOffSection =
       Ctx->getMachOSection("__DWARF", "__debug_str_offs", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata(), "section_str_off");
+  DwarfAddrSection =
+      Ctx->getMachOSection("__DWARF", "__debug_addr", MachO::S_ATTR_DEBUG,
+                           SectionKind::getMetadata(), "section_info");
   DwarfLocSection =
       Ctx->getMachOSection("__DWARF", "__debug_loc", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata(), "section_debug_loc");
+  DwarfLoclistsSection =
+      Ctx->getMachOSection("__DWARF", "__debug_loclists", MachO::S_ATTR_DEBUG,
+                           SectionKind::getMetadata(), "section_debug_loc");
+
   DwarfARangesSection =
       Ctx->getMachOSection("__DWARF", "__debug_aranges", MachO::S_ATTR_DEBUG,
                            SectionKind::getMetadata());
@@ -432,6 +439,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
       Ctx->getELFSection(".debug_str_offsets", DebugSecType, 0);
   DwarfAddrSection = Ctx->getELFSection(".debug_addr", DebugSecType, 0);
   DwarfRnglistsSection = Ctx->getELFSection(".debug_rnglists", DebugSecType, 0);
+  DwarfLoclistsSection = Ctx->getELFSection(".debug_loclists", DebugSecType, 0);
 
   // Fission Sections
   DwarfInfoDWOSection =
@@ -743,6 +751,12 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
   DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata());
   DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata());
 
+  // Wasm use data section for LSDA.
+  // TODO Consider putting each function's exception table in a separate
+  // section, as in -function-sections, to facilitate lld's --gc-section.
+  LSDASection = Ctx->getWasmSection(".rodata.gcc_except_table",
+                                    SectionKind::getReadOnlyWithRel());
+
   // TODO: Define more sections.
 }
 
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 74835fd70c04b04d34fae53449e39c1de29e9f1b..c8d48f033f65f6a958568a9e956a018d5fde4341 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -243,22 +243,26 @@ static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
 
 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
 // integer as a hexadecimal, possibly with leading zeroes.
-static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
-  const char *FirstHex = nullptr;
+static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
+                               bool LexHex) {
+  const char *FirstNonDec = nullptr;
   const char *LookAhead = CurPtr;
   while (true) {
     if (isDigit(*LookAhead)) {
       ++LookAhead;
-    } else if (isHexDigit(*LookAhead)) {
-      if (!FirstHex)
-        FirstHex = LookAhead;
-      ++LookAhead;
     } else {
-      break;
+      if (!FirstNonDec)
+        FirstNonDec = LookAhead;
+
+      // Keep going if we are looking for a 'h' suffix.
+      if (LexHex && isHexDigit(*LookAhead))
+        ++LookAhead;
+      else
+        break;
     }
   }
-  bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
-  CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
+  bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
+  CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
   if (isHex)
     return 16;
   return DefaultRadix;
@@ -281,7 +285,7 @@ static AsmToken intToken(StringRef Ref, APInt &Value)
 AsmToken AsmLexer::LexDigit() {
   // MASM-flavor binary integer: [01]+[bB]
   // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
-  if (IsParsingMSInlineAsm && isdigit(CurPtr[-1])) {
+  if (LexMasmIntegers && isdigit(CurPtr[-1])) {
     const char *FirstNonBinary = (CurPtr[-1] != '0' && CurPtr[-1] != '1') ?
                                    CurPtr - 1 : nullptr;
     const char *OldCurPtr = CurPtr;
@@ -320,7 +324,7 @@ AsmToken AsmLexer::LexDigit() {
 
   // Decimal integer: [1-9][0-9]*
   if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
-    unsigned Radix = doLookAhead(CurPtr, 10);
+    unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
     bool isHex = Radix == 16;
     // Check for floating point literals.
     if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
@@ -335,8 +339,8 @@ AsmToken AsmLexer::LexDigit() {
       return ReturnError(TokStart, !isHex ? "invalid decimal number" :
                            "invalid hexdecimal number");
 
-    // Consume the [bB][hH].
-    if (Radix == 2 || Radix == 16)
+    // Consume the [hH].
+    if (LexMasmIntegers && Radix == 16)
       ++CurPtr;
 
     // The darwin/x86 (and x86-64) assembler accepts and ignores type
@@ -346,7 +350,7 @@ AsmToken AsmLexer::LexDigit() {
     return intToken(Result, Value);
   }
 
-  if (!IsParsingMSInlineAsm && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
+  if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
     ++CurPtr;
     // See if we actually have "0b" as part of something like "jmp 0b\n"
     if (!isDigit(CurPtr[0])) {
@@ -395,7 +399,7 @@ AsmToken AsmLexer::LexDigit() {
       return ReturnError(TokStart, "invalid hexadecimal number");
 
     // Consume the optional [hH].
-    if (!IsParsingMSInlineAsm && (*CurPtr == 'h' || *CurPtr == 'H'))
+    if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
       ++CurPtr;
 
     // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
@@ -407,7 +411,7 @@ AsmToken AsmLexer::LexDigit() {
 
   // Either octal or hexadecimal.
   APInt Value(128, 0, true);
-  unsigned Radix = doLookAhead(CurPtr, 8);
+  unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
   bool isHex = Radix == 16;
   StringRef Result(TokStart, CurPtr - TokStart);
   if (Result.getAsInteger(Radix, Value))
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 6eb7fd0d0b6b159a40b5e0d256b15adb0193c727..3f7b507791eca9ec957e2b7fde49db974cbd6307 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -229,7 +229,9 @@ public:
 
   void setParsingInlineAsm(bool V) override {
     ParsingInlineAsm = V;
-    Lexer.setParsingMSInlineAsm(V);
+    // When parsing MS inline asm, we must lex 0b1101 and 0ABCH as binary and
+    // hex integer literals.
+    Lexer.setLexMasmIntegers(V);
   }
   bool isParsingInlineAsm() override { return ParsingInlineAsm; }
 
@@ -3919,8 +3921,13 @@ bool AsmParser::parseDirectiveCFIStartProc() {
         parseToken(AsmToken::EndOfStatement))
       return addErrorSuffix(" in '.cfi_startproc' directive");
   }
-
-  getStreamer().EmitCFIStartProc(!Simple.empty());
+  
+  // TODO(kristina): Deal with a corner case of incorrect diagnostic context
+  // being produced if this directive is emitted as part of preprocessor macro
+  // expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
+  // Tools like llvm-mc on the other hand are not affected by it, and report
+  // correct context information.
+  getStreamer().EmitCFIStartProc(!Simple.empty(), Lexer.getLoc());
   return false;
 }
 
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index d439734e76fc7c2cccb5b6bf3797e3dd88b2149a..efedcdc5a314d10e4c1882f0ad9095a136538bb7 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -21,7 +21,7 @@
 
 using namespace llvm;
 
-MCAsmParser::MCAsmParser() : ShowParsedOperands(0) {}
+MCAsmParser::MCAsmParser() {}
 
 MCAsmParser::~MCAsmParser() = default;
 
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index fa0d1f46cbb58e9b9f1df212a5e3e8ed3d8f22d7..3722c0ad3c818292e55c6a5fd48d75055a771835 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -72,6 +72,18 @@ void MCTargetStreamer::emitValue(const MCExpr *Value) {
   Streamer.EmitRawText(OS.str());
 }
 
+void MCTargetStreamer::emitRawBytes(StringRef Data) {
+  const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+  const char *Directive = MAI->getData8bitsDirective();
+  for (const unsigned char C : Data.bytes()) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    OS << Directive << (unsigned)C;
+    Streamer.EmitRawText(OS.str());
+  }
+}
+
 void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {}
 
 MCStreamer::MCStreamer(MCContext &Ctx)
@@ -347,10 +359,10 @@ void MCStreamer::EmitCFISections(bool EH, bool Debug) {
   assert(EH || Debug);
 }
 
-void MCStreamer::EmitCFIStartProc(bool IsSimple) {
+void MCStreamer::EmitCFIStartProc(bool IsSimple, SMLoc Loc) {
   if (hasUnfinishedDwarfFrameInfo())
-    getContext().reportError(
-        SMLoc(), "starting new .cfi frame before finishing the previous one");
+    return getContext().reportError(
+        Loc, "starting new .cfi frame before finishing the previous one");
 
   MCDwarfFrameInfo Frame;
   Frame.IsSimple = IsSimple;
@@ -615,6 +627,17 @@ void MCStreamer::EmitWinCFIEndProc(SMLoc Loc) {
   CurFrame->End = Label;
 }
 
+void MCStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) {
+  WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc);
+  if (!CurFrame)
+    return;
+  if (CurFrame->ChainedParent)
+    getContext().reportError(Loc, "Not all chained regions terminated!");
+
+  MCSymbol *Label = EmitCFILabel();
+  CurFrame->FuncletOrFuncEnd = Label;
+}
+
 void MCStreamer::EmitWinCFIStartChained(SMLoc Loc) {
   WinEH::FrameInfo *CurFrame = EnsureValidWinFrameInfo(Loc);
   if (!CurFrame)
diff --git a/lib/MC/MCWasmStreamer.cpp b/lib/MC/MCWasmStreamer.cpp
index 7321a30dd9431e97405b0f27fa1a4f29261f5174..d2a152058b90dfb024f195a0559f80960ac2cc52 100644
--- a/lib/MC/MCWasmStreamer.cpp
+++ b/lib/MC/MCWasmStreamer.cpp
@@ -61,7 +61,7 @@ void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
 void MCWasmStreamer::ChangeSection(MCSection *Section,
                                    const MCExpr *Subsection) {
   MCAssembler &Asm = getAssembler();
-  auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
+  auto *SectionWasm = cast<MCSectionWasm>(Section);
   const MCSymbol *Grp = SectionWasm->getGroup();
   if (Grp)
     Asm.registerSymbol(*Grp);
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 1407f25e6f2a1bd94faefe0a08c461b7c8b9bcf2..0c8d58e597271a97366d3467417c192659760209 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -11,6 +11,9 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Win64EH.h"
@@ -23,6 +26,8 @@ static uint8_t CountOfUnwindCodes(std::vector<WinEH::Instruction> &Insns) {
   uint8_t Count = 0;
   for (const auto &I : Insns) {
     switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
+    default:
+      llvm_unreachable("Unsupported unwind code");
     case Win64EH::UOP_PushNonVol:
     case Win64EH::UOP_AllocSmall:
     case Win64EH::UOP_SetFPReg:
@@ -60,6 +65,8 @@ static void EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
   uint16_t w;
   b2 = (inst.Operation & 0x0F);
   switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) {
+  default:
+    llvm_unreachable("Unsupported unwind code");
   case Win64EH::UOP_PushNonVol:
     EmitAbsDifference(streamer, inst.Label, begin);
     b2 |= (inst.Register & 0x0F) << 4;
@@ -242,3 +249,343 @@ void llvm::Win64EH::UnwindEmitter::EmitUnwindInfo(
   ::EmitUnwindInfo(Streamer, info);
 }
 
+static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS,
+                                const MCSymbol *RHS) {
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Diff =
+      MCBinaryExpr::createSub(MCSymbolRefExpr::create(LHS, Context),
+                              MCSymbolRefExpr::create(RHS, Context), Context);
+  MCObjectStreamer *OS = (MCObjectStreamer *)(&Streamer);
+  int64_t value;
+  Diff->evaluateAsAbsolute(value, OS->getAssembler());
+  return value;
+}
+
+static uint32_t
+ARM64CountOfUnwindCodes(const std::vector<WinEH::Instruction> &Insns) {
+  uint32_t Count = 0;
+  for (const auto &I : Insns) {
+    switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
+    default:
+      llvm_unreachable("Unsupported ARM64 unwind code");
+    case Win64EH::UOP_AllocSmall:
+      Count += 1;
+      break;
+    case Win64EH::UOP_AllocMedium:
+      Count += 2;
+      break;
+    case Win64EH::UOP_AllocLarge:
+      Count += 4;
+      break;
+    case Win64EH::UOP_SaveFPLRX:
+      Count += 1;
+      break;
+    case Win64EH::UOP_SaveFPLR:
+      Count += 1;
+      break;
+    case Win64EH::UOP_SaveReg:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveRegP:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveRegPX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveRegX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFReg:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFRegP:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFRegX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SaveFRegPX:
+      Count += 2;
+      break;
+    case Win64EH::UOP_SetFP:
+      Count += 1;
+      break;
+    case Win64EH::UOP_AddFP:
+      Count += 2;
+      break;
+    case Win64EH::UOP_Nop:
+      Count += 1;
+      break;
+    case Win64EH::UOP_End:
+      Count += 1;
+      break;
+    }
+  }
+  return Count;
+}
+
+// Unwind opcode encodings and restrictions are documented at
+// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
+                                WinEH::Instruction &inst) {
+  uint8_t b, reg;
+  switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) {
+  default:
+    llvm_unreachable("Unsupported ARM64 unwind code");
+  case Win64EH::UOP_AllocSmall:
+    b = (inst.Offset >> 4) & 0x1F;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_AllocMedium: {
+    uint16_t hw = (inst.Offset >> 4) & 0x7FF;
+    b = 0xC0;
+    b |= (hw >> 8);
+    streamer.EmitIntValue(b, 1);
+    b = hw & 0xFF;
+    streamer.EmitIntValue(b, 1);
+    break;
+  }
+  case Win64EH::UOP_AllocLarge: {
+    uint32_t w;
+    b = 0xE0;
+    streamer.EmitIntValue(b, 1);
+    w = inst.Offset >> 4;
+    b = (w & 0x00FF0000) >> 16;
+    streamer.EmitIntValue(b, 1);
+    b = (w & 0x0000FF00) >> 8;
+    streamer.EmitIntValue(b, 1);
+    b = w & 0x000000FF;
+    streamer.EmitIntValue(b, 1);
+    break;
+  }
+  case Win64EH::UOP_SetFP:
+    b = 0xE1;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_AddFP:
+    b = 0xE2;
+    streamer.EmitIntValue(b, 1);
+    b = (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_Nop:
+    b = 0xE3;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFPLRX:
+    b = 0x80;
+    b |= ((inst.Offset - 1) >> 3) & 0x3F;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFPLR:
+    b = 0x40;
+    b |= (inst.Offset >> 3) & 0x3F;
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveReg:
+    assert(inst.Register >= 19 && "Saved reg must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xD0 | ((reg & 0xC) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveRegX:
+    assert(inst.Register >= 19 && "Saved reg must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xD4 | ((reg & 0x8) >> 3);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x7) << 5) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveRegP:
+    assert(inst.Register >= 19 && "Saved registers must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xC8 | ((reg & 0xC) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveRegPX:
+    assert(inst.Register >= 19 && "Saved registers must be >= 19");
+    reg = inst.Register - 19;
+    b = 0xCC | ((reg & 0xC) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFReg:
+    assert(inst.Register >= 8 && "Saved dreg must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xDC | ((reg & 0x4) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFRegX:
+    assert(inst.Register >= 8 && "Saved dreg must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xDE;
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x7) << 5) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFRegP:
+    assert(inst.Register >= 8 && "Saved dregs must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xD8 | ((reg & 0x4) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_SaveFRegPX:
+    assert(inst.Register >= 8 && "Saved dregs must be >= 8");
+    reg = inst.Register - 8;
+    b = 0xDA | ((reg & 0x4) >> 2);
+    streamer.EmitIntValue(b, 1);
+    b = ((reg & 0x3) << 6) | ((inst.Offset >> 3) - 1);
+    streamer.EmitIntValue(b, 1);
+    break;
+  case Win64EH::UOP_End:
+    b = 0xE4;
+    streamer.EmitIntValue(b, 1);
+    break;
+  }
+}
+
+// Populate the .xdata section.  The format of .xdata on ARM64 is documented at
+// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
+  // If this UNWIND_INFO already has a symbol, it's already been emitted.
+  if (info->Symbol)
+    return;
+
+  MCContext &context = streamer.getContext();
+  MCSymbol *Label = context.createTempSymbol();
+
+  streamer.EmitValueToAlignment(4);
+  streamer.EmitLabel(Label);
+  info->Symbol = Label;
+
+  uint32_t FuncLength = 0x0;
+  FuncLength = (uint32_t)GetAbsDifference(streamer, info->FuncletOrFuncEnd,
+                                          info->Begin);
+  if (FuncLength)
+    FuncLength /= 4;
+  uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions);
+  uint32_t TotalCodeBytes = PrologCodeBytes;
+
+  // Process epilogs.
+  MapVector<MCSymbol *, uint32_t> EpilogInfo;
+  for (auto &I : info->EpilogMap) {
+    MCSymbol *EpilogStart = I.first;
+    auto &EpilogInstrs = I.second;
+    uint32_t CodeBytes = ARM64CountOfUnwindCodes(EpilogInstrs);
+    EpilogInfo[EpilogStart] = TotalCodeBytes;
+    TotalCodeBytes += CodeBytes;
+  }
+
+  // Code Words, Epilog count, E, X, Vers, Function Length
+  uint32_t row1 = 0x0;
+  uint8_t CodeWords = TotalCodeBytes / 4;
+  uint8_t CodeWordsMod = TotalCodeBytes % 4;
+  if (CodeWordsMod)
+    CodeWords++;
+  uint32_t EpilogCount = info->EpilogMap.size();
+  bool ExtensionWord = EpilogCount > 31 || TotalCodeBytes > 124;
+  if (!ExtensionWord) {
+    row1 |= (EpilogCount & 0x1F) << 22;
+    row1 |= (CodeWords & 0x1F) << 27;
+  }
+  // E is always 0 right now, TODO: packed epilog setup
+  if (info->HandlesExceptions) // X
+    row1 |= 1 << 20;
+  row1 |= FuncLength & 0x3FFFF;
+  streamer.EmitIntValue(row1, 4);
+
+  // Extended Code Words, Extended Epilog Count
+  if (ExtensionWord) {
+    uint32_t row2 = 0x0;
+    row2 |= (CodeWords & 0xFF) << 16;
+    row2 |= (EpilogCount & 0xFFFF);
+    streamer.EmitIntValue(row2, 4);
+  }
+
+  // Epilog Start Index, Epilog Start Offset
+  for (auto &I : EpilogInfo) {
+    MCSymbol *EpilogStart = I.first;
+    uint32_t EpilogIndex = I.second;
+    uint32_t EpilogOffset =
+        (uint32_t)GetAbsDifference(streamer, EpilogStart, info->Begin);
+    if (EpilogOffset)
+      EpilogOffset /= 4;
+    uint32_t row3 = EpilogOffset;
+    row3 |= (EpilogIndex & 0x3FF) << 22;
+    streamer.EmitIntValue(row3, 4);
+  }
+
+  // Emit prolog unwind instructions (in reverse order).
+  uint8_t numInst = info->Instructions.size();
+  for (uint8_t c = 0; c < numInst; ++c) {
+    WinEH::Instruction inst = info->Instructions.back();
+    info->Instructions.pop_back();
+    ARM64EmitUnwindCode(streamer, info->Begin, inst);
+  }
+
+  // Emit epilog unwind instructions
+  for (auto &I : info->EpilogMap) {
+    auto &EpilogInstrs = I.second;
+    for (uint32_t i = 0; i < EpilogInstrs.size(); i++) {
+      WinEH::Instruction inst = EpilogInstrs[i];
+      ARM64EmitUnwindCode(streamer, info->Begin, inst);
+    }
+  }
+
+  int32_t BytesMod = CodeWords * 4 - TotalCodeBytes;
+  assert(BytesMod >= 0);
+  for (int i = 0; i < BytesMod; i++)
+    streamer.EmitIntValue(0xE3, 1);
+
+  if (info->HandlesExceptions)
+    streamer.EmitValue(
+        MCSymbolRefExpr::create(info->ExceptionHandler,
+                                MCSymbolRefExpr::VK_COFF_IMGREL32, context),
+        4);
+}
+
+static void ARM64EmitRuntimeFunction(MCStreamer &streamer,
+                                     const WinEH::FrameInfo *info) {
+  MCContext &context = streamer.getContext();
+
+  streamer.EmitValueToAlignment(4);
+  EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
+  streamer.EmitValue(MCSymbolRefExpr::create(info->Symbol,
+                                             MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                             context),
+                     4);
+}
+
+void llvm::Win64EH::ARM64UnwindEmitter::Emit(MCStreamer &Streamer) const {
+  // Emit the unwind info structs first.
+  for (const auto &CFI : Streamer.getWinFrameInfos()) {
+    MCSection *XData = Streamer.getAssociatedXDataSection(CFI->TextSection);
+    Streamer.SwitchSection(XData);
+    ARM64EmitUnwindInfo(Streamer, CFI.get());
+  }
+
+  // Now emit RUNTIME_FUNCTION entries.
+  for (const auto &CFI : Streamer.getWinFrameInfos()) {
+    MCSection *PData = Streamer.getAssociatedPDataSection(CFI->TextSection);
+    Streamer.SwitchSection(PData);
+    ARM64EmitRuntimeFunction(Streamer, CFI.get());
+  }
+}
+
+void llvm::Win64EH::ARM64UnwindEmitter::EmitUnwindInfo(
+    MCStreamer &Streamer, WinEH::FrameInfo *info) const {
+  // Switch sections (the static function above is meant to be called from
+  // here and from Emit().
+  MCSection *XData = Streamer.getAssociatedXDataSection(info->TextSection);
+  Streamer.SwitchSection(XData);
+  ARM64EmitUnwindInfo(Streamer, info);
+}
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index cbbe161ae820a0ecbc004bd728bd5dac5bec4e37..c1e0b7aa7ab01a526f61d4e24f9fcde4f6a8e435 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -92,9 +92,9 @@ struct WasmFunctionTypeDenseMapInfo {
   static unsigned getHashValue(const WasmFunctionType &FuncTy) {
     uintptr_t Value = FuncTy.State;
     for (wasm::ValType Ret : FuncTy.Returns)
-      Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
+      Value += DenseMapInfo<uint32_t>::getHashValue(uint32_t(Ret));
     for (wasm::ValType Param : FuncTy.Params)
-      Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
+      Value += DenseMapInfo<uint32_t>::getHashValue(uint32_t(Param));
     return Value;
   }
   static bool isEqual(const WasmFunctionType &LHS,
@@ -118,7 +118,7 @@ struct WasmDataSegment {
 
 // A wasm function to be written into the function section.
 struct WasmFunction {
-  int32_t Type;
+  uint32_t Type;
   const MCSymbolWasm *Sym;
 };
 
@@ -231,7 +231,7 @@ class WasmObjectWriter : public MCObjectWriter {
   // Map from section to defining function symbol.
   DenseMap<const MCSection *, const MCSymbol *> SectionFunctions;
 
-  DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo>
+  DenseMap<WasmFunctionType, uint32_t, WasmFunctionTypeDenseMapInfo>
       FunctionTypeIndices;
   SmallVector<WasmFunctionType, 4> FunctionTypes;
   SmallVector<WasmGlobal, 4> Globals;
@@ -635,10 +635,12 @@ static void addData(SmallVectorImpl<char> &DataBytes,
         llvm_unreachable("The fill should be an assembler constant");
       DataBytes.insert(DataBytes.end(), Fill->getValueSize() * NumValues,
                        Fill->getValue());
+    } else if (auto *LEB = dyn_cast<MCLEBFragment>(&Frag)) {
+      const SmallVectorImpl<char> &Contents = LEB->getContents();
+      DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     } else {
       const auto &DataFrag = cast<MCDataFragment>(Frag);
       const SmallVectorImpl<char> &Contents = DataFrag.getContents();
-
       DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
     }
   }
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index c6c0befb90f93c937e4ddf43e5eaa1e5c729c3ee..767205390e0782bf655af0882c1751126b364461 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -121,6 +121,11 @@ static void printWithSpacePadding(raw_ostream &OS, T Data, unsigned Size) {
   OS.indent(Size - SizeSoFar);
 }
 
+static bool isDarwin(object::Archive::Kind Kind) {
+  return Kind == object::Archive::K_DARWIN ||
+         Kind == object::Archive::K_DARWIN64;
+}
+
 static bool isBSDLike(object::Archive::Kind Kind) {
   switch (Kind) {
   case object::Archive::K_GNU:
@@ -128,8 +133,8 @@ static bool isBSDLike(object::Archive::Kind Kind) {
     return false;
   case object::Archive::K_BSD:
   case object::Archive::K_DARWIN:
-    return true;
   case object::Archive::K_DARWIN64:
+    return true;
   case object::Archive::K_COFF:
     break;
   }
@@ -314,7 +319,9 @@ static void printNBits(raw_ostream &Out, object::Archive::Kind Kind,
 static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
                              bool Deterministic, ArrayRef<MemberData> Members,
                              StringRef StringTable) {
-  if (StringTable.empty())
+  // We don't write a symbol table on an archive with no members -- except on
+  // Darwin, where the linker will abort unless the archive has a symbol table.
+  if (StringTable.empty() && !isDarwin(Kind))
     return;
 
   unsigned NumSyms = 0;
@@ -322,15 +329,15 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
     NumSyms += M.Symbols.size();
 
   unsigned Size = 0;
-  Size += is64BitKind(Kind) ? 8 : 4; // Number of entries
+  unsigned OffsetSize = is64BitKind(Kind) ? sizeof(uint64_t) : sizeof(uint32_t);
+
+  Size += OffsetSize; // Number of entries
   if (isBSDLike(Kind))
-    Size += NumSyms * 8; // Table
-  else if (is64BitKind(Kind))
-    Size += NumSyms * 8; // Table
+    Size += NumSyms * OffsetSize * 2; // Table
   else
-    Size += NumSyms * 4; // Table
+    Size += NumSyms * OffsetSize; // Table
   if (isBSDLike(Kind))
-    Size += 4; // byte count
+    Size += OffsetSize; // byte count
   Size += StringTable.size();
   // ld64 expects the members to be 8-byte aligned for 64-bit content and at
   // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
@@ -340,25 +347,26 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
   unsigned Pad = OffsetToAlignment(Size, Alignment);
   Size += Pad;
 
-  if (isBSDLike(Kind))
-    printBSDMemberHeader(Out, Out.tell(), "__.SYMDEF", now(Deterministic), 0, 0,
-                         0, Size);
-  else if (is64BitKind(Kind))
-    printGNUSmallMemberHeader(Out, "/SYM64", now(Deterministic), 0, 0, 0, Size);
-  else
-    printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, Size);
+  if (isBSDLike(Kind)) {
+    const char *Name = is64BitKind(Kind) ? "__.SYMDEF_64" : "__.SYMDEF";
+    printBSDMemberHeader(Out, Out.tell(), Name, now(Deterministic), 0, 0, 0,
+                         Size);
+  } else {
+    const char *Name = is64BitKind(Kind) ? "/SYM64" : "";
+    printGNUSmallMemberHeader(Out, Name, now(Deterministic), 0, 0, 0, Size);
+  }
 
   uint64_t Pos = Out.tell() + Size;
 
   if (isBSDLike(Kind))
-    print<uint32_t>(Out, Kind, NumSyms * 8);
+    printNBits(Out, Kind, NumSyms * 2 * OffsetSize);
   else
     printNBits(Out, Kind, NumSyms);
 
   for (const MemberData &M : Members) {
     for (unsigned StringOffset : M.Symbols) {
       if (isBSDLike(Kind))
-        print<uint32_t>(Out, Kind, StringOffset);
+        printNBits(Out, Kind, StringOffset);
       printNBits(Out, Kind, Pos); // member offset
     }
     Pos += M.Header.size() + M.Data.size() + M.Padding.size();
@@ -366,7 +374,7 @@ static void writeSymbolTable(raw_ostream &Out, object::Archive::Kind Kind,
 
   if (isBSDLike(Kind))
     // byte count of the string table
-    print<uint32_t>(Out, Kind, StringTable.size());
+    printNBits(Out, Kind, StringTable.size());
   Out << StringTable;
 
   while (Pad--)
@@ -466,9 +474,7 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
   // See also the functions that handle the lookup:
   // in lldb: ObjectContainerBSDArchive::Archive::FindObject()
   // in llvm/tools/dsymutil: BinaryHolder::GetArchiveMemberBuffers().
-  bool UniqueTimestamps =
-      Deterministic && (Kind == object::Archive::K_DARWIN ||
-                        Kind == object::Archive::K_DARWIN64);
+  bool UniqueTimestamps = Deterministic && isDarwin(Kind);
   std::map<StringRef, unsigned> FilenameCount;
   if (UniqueTimestamps) {
     for (const NewArchiveMember &M : NewMembers)
@@ -488,9 +494,8 @@ computeMemberData(raw_ostream &StringTable, raw_ostream &SymNames,
     // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
     // uniformly.  This matches the behaviour with cctools and ensures that ld64
     // is happy with archives that we generate.
-    unsigned MemberPadding = Kind == object::Archive::K_DARWIN
-                                 ? OffsetToAlignment(Data.size(), 8)
-                                 : 0;
+    unsigned MemberPadding =
+        isDarwin(Kind) ? OffsetToAlignment(Data.size(), 8) : 0;
     unsigned TailPadding = OffsetToAlignment(Data.size() + MemberPadding, 2);
     StringRef Padding = StringRef(PaddingData, MemberPadding + TailPadding);
 
@@ -569,8 +574,12 @@ Error llvm::writeArchive(StringRef ArcName,
     // If LastOffset isn't going to fit in a 32-bit varible we need to switch
     // to 64-bit. Note that the file can be larger than 4GB as long as the last
     // member starts before the 4GB offset.
-    if (LastOffset >= (1ULL << Sym64Threshold))
-      Kind = object::Archive::K_GNU64;
+    if (LastOffset >= (1ULL << Sym64Threshold)) {
+      if (Kind == object::Archive::K_DARWIN)
+        Kind = object::Archive::K_DARWIN64;
+      else
+        Kind = object::Archive::K_GNU64;
+    }
   }
 
   Expected<sys::fs::TempFile> Temp =
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
index da56d97c4bc250a55e036dda75bc4d69dd8139ea..2edab0b13735eca0432f1d6a6080212286008ff0 100644
--- a/lib/Object/ELF.cpp
+++ b/lib/Object/ELF.cpp
@@ -139,6 +139,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
       break;
     }
     break;
+  case ELF::EM_MSP430:
+    switch (Type) {
+#include "llvm/BinaryFormat/ELFRelocs/MSP430.def"
+    default:
+      break;
+    }
+    break;
   default:
     break;
   }
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index 75925a5ea1025c9319ae32d0037dd53349be6881..3bd66f9375fe670eab34e7b8f41839c454a8505a 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -193,7 +193,7 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr,
 
 static wasm::WasmLimits readLimits(WasmObjectFile::ReadContext &Ctx) {
   wasm::WasmLimits Result;
-  Result.Flags = readVaruint1(Ctx);
+  Result.Flags = readVaruint32(Ctx);
   Result.Initial = readVaruint32(Ctx);
   if (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
     Result.Maximum = readVaruint32(Ctx);
diff --git a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
index 745f79cd77f37cc4d00a586899881cda6ff429a4..713e9a710e945a88fc14a73e3f04530bdf10574b 100644
--- a/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
+++ b/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
@@ -108,7 +108,7 @@ void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) {
 }
 
 void ScalarBitSetTraits<PublicSymFlags>::bitset(IO &io, PublicSymFlags &Flags) {
-  auto FlagNames = getProcSymFlagNames();
+  auto FlagNames = getPublicSymFlagNames();
   for (const auto &E : FlagNames) {
     io.bitSetCase(Flags, E.Name.str().c_str(),
                   static_cast<PublicSymFlags>(E.Value));
diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index a381a63d600641825a01aae395486e5054390fbd..189d71782bd85d7f840a760f6c689d288b5f9975 100644
--- a/lib/ObjectYAML/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -402,7 +402,9 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX902, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH);
     BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH);
+    BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH);
     BCase(EF_AMDGPU_XNACK);
+    BCase(EF_AMDGPU_SRAM_ECC);
     break;
   case ELF::EM_X86_64:
     break;
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
index 2e7a1d6f6531199d67fde1d8b6bde0d588d7344c..dba950af5892941ef6e79cd3bbb7fa542f040ebd 100644
--- a/lib/ObjectYAML/WasmYAML.cpp
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -416,6 +416,7 @@ void ScalarBitSetTraits<WasmYAML::LimitFlags>::bitset(
     IO &IO, WasmYAML::LimitFlags &Value) {
 #define BCase(X) IO.bitSetCase(Value, #X, wasm::WASM_LIMITS_FLAG_##X)
   BCase(HAS_MAX);
+  BCase(IS_SHARED);
 #undef BCase
 }
 
diff --git a/lib/OptRemarks/CMakeLists.txt b/lib/OptRemarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8fefe1d986b5b2319865ad4e744f2aeabf1de421
--- /dev/null
+++ b/lib/OptRemarks/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMOptRemarks
+  OptRemarksParser.cpp
+)
diff --git a/lib/OptRemarks/LLVMBuild.txt b/lib/OptRemarks/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4c1032296dcb7f0a82ea79f39c99e32fb6066739
--- /dev/null
+++ b/lib/OptRemarks/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/OptRemarks/LLVMBuild.txt ---------------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = OptRemarks
+parent = Libraries
+required_libraries = Support
diff --git a/lib/OptRemarks/OptRemarksParser.cpp b/lib/OptRemarks/OptRemarksParser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0478d2bfbfa600521bc9385cc79d614638c748c0
--- /dev/null
+++ b/lib/OptRemarks/OptRemarksParser.cpp
@@ -0,0 +1,368 @@
+//===- OptRemarksParser.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utility methods used by clients that want to use the
+// parser for optimization remarks in LLVM.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm;
+
+namespace {
+struct RemarkParser {
+  /// Source manager for better error messages.
+  SourceMgr SM;
+  /// Stream for yaml parsing.
+  yaml::Stream Stream;
+  /// Storage for the error stream.
+  std::string ErrorString;
+  /// The error stream.
+  raw_string_ostream ErrorStream;
+  /// Iterator in the YAML stream.
+  yaml::document_iterator DI;
+  /// The parsed remark (if any).
+  Optional<LLVMOptRemarkEntry> LastRemark;
+  /// Temporary parsing buffer for the arguments.
+  SmallVector<LLVMOptRemarkArg, 8> TmpArgs;
+  /// The state used by the parser to parse a remark entry. Invalidated with
+  /// every call to `parseYAMLElement`.
+  struct ParseState {
+    /// Temporary parsing buffer for the arguments.
+    SmallVectorImpl<LLVMOptRemarkArg> *Args;
+    StringRef Type;
+    StringRef Pass;
+    StringRef Name;
+    StringRef Function;
+    /// Optional.
+    Optional<StringRef> File;
+    Optional<unsigned> Line;
+    Optional<unsigned> Column;
+    Optional<unsigned> Hotness;
+
+    ParseState(SmallVectorImpl<LLVMOptRemarkArg> &Args) : Args(&Args) {}
+    /// Use Args only as a **temporary** buffer.
+    ~ParseState() { Args->clear(); }
+  };
+
+  ParseState State;
+
+  /// Set to `true` if we had any errors during parsing.
+  bool HadAnyErrors = false;
+
+  RemarkParser(StringRef Buf)
+      : SM(), Stream(Buf, SM), ErrorString(), ErrorStream(ErrorString),
+        DI(Stream.begin()), LastRemark(), TmpArgs(), State(TmpArgs) {
+    SM.setDiagHandler(RemarkParser::HandleDiagnostic, this);
+  }
+
+  /// Parse a YAML element.
+  Error parseYAMLElement(yaml::Document &Remark);
+
+private:
+  /// Parse one key to a string.
+  /// otherwise.
+  Error parseKey(StringRef &Result, yaml::KeyValueNode &Node);
+  /// Parse one value to a string.
+  Error parseValue(StringRef &Result, yaml::KeyValueNode &Node);
+  /// Parse one value to an unsigned.
+  Error parseValue(Optional<unsigned> &Result, yaml::KeyValueNode &Node);
+  /// Parse a debug location.
+  Error parseDebugLoc(Optional<StringRef> &File, Optional<unsigned> &Line,
+                      Optional<unsigned> &Column, yaml::KeyValueNode &Node);
+  /// Parse an argument.
+  Error parseArg(SmallVectorImpl<LLVMOptRemarkArg> &TmpArgs, yaml::Node &Node);
+
+  /// Handle a diagnostic from the YAML stream. Records the error in the
+  /// RemarkParser class.
+  static void HandleDiagnostic(const SMDiagnostic &Diag, void *Ctx) {
+    assert(Ctx && "Expected non-null Ctx in diagnostic handler.");
+    auto *Parser = static_cast<RemarkParser *>(Ctx);
+    Diag.print(/*ProgName=*/nullptr, Parser->ErrorStream, /*ShowColors*/ false,
+               /*ShowKindLabels*/ true);
+  }
+};
+
+class ParseError : public ErrorInfo<ParseError> {
+public:
+  static char ID;
+
+  ParseError(StringRef Message, yaml::Node &Node)
+      : Message(Message), Node(Node) {}
+
+  void log(raw_ostream &OS) const override { OS << Message; }
+  std::error_code convertToErrorCode() const override {
+    return inconvertibleErrorCode();
+  }
+
+  StringRef getMessage() const { return Message; }
+  yaml::Node &getNode() const { return Node; }
+
+private:
+  StringRef Message; // No need to hold a full copy of the buffer.
+  yaml::Node &Node;
+};
+
+char ParseError::ID = 0;
+
+static LLVMOptRemarkStringRef toOptRemarkStr(StringRef Str) {
+  return {Str.data(), static_cast<uint32_t>(Str.size())};
+}
+
+Error RemarkParser::parseKey(StringRef &Result, yaml::KeyValueNode &Node) {
+  auto *Key = dyn_cast<yaml::ScalarNode>(Node.getKey());
+  if (!Key)
+    return make_error<ParseError>("key is not a string.", Node);
+
+  Result = Key->getRawValue();
+  return Error::success();
+}
+
+Error RemarkParser::parseValue(StringRef &Result, yaml::KeyValueNode &Node) {
+  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
+  if (!Value)
+    return make_error<ParseError>("expected a value of scalar type.", Node);
+  Result = Value->getRawValue();
+
+  if (Result.front() == '\'')
+    Result = Result.drop_front();
+
+  if (Result.back() == '\'')
+    Result = Result.drop_back();
+
+  return Error::success();
+}
+
+Error RemarkParser::parseValue(Optional<unsigned> &Result,
+                               yaml::KeyValueNode &Node) {
+  SmallVector<char, 4> Tmp;
+  auto *Value = dyn_cast<yaml::ScalarNode>(Node.getValue());
+  if (!Value)
+    return make_error<ParseError>("expected a value of scalar type.", Node);
+  unsigned UnsignedValue = 0;
+  if (Value->getValue(Tmp).getAsInteger(10, UnsignedValue))
+    return make_error<ParseError>("expected a value of integer type.", *Value);
+  Result = UnsignedValue;
+  return Error::success();
+}
+
+Error RemarkParser::parseDebugLoc(Optional<StringRef> &File,
+                                  Optional<unsigned> &Line,
+                                  Optional<unsigned> &Column,
+                                  yaml::KeyValueNode &Node) {
+  auto *DebugLoc = dyn_cast<yaml::MappingNode>(Node.getValue());
+  if (!DebugLoc)
+    return make_error<ParseError>("expected a value of mapping type.", Node);
+
+  for (yaml::KeyValueNode &DLNode : *DebugLoc) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, DLNode))
+      return E;
+    if (KeyName == "File") {
+      File = StringRef(); // Set the optional to contain a default constructed
+                          // value, to be passed to the parsing function.
+      if (Error E = parseValue(*File, DLNode))
+        return E;
+    } else if (KeyName == "Column") {
+      if (Error E = parseValue(Column, DLNode))
+        return E;
+    } else if (KeyName == "Line") {
+      if (Error E = parseValue(Line, DLNode))
+        return E;
+    } else {
+      return make_error<ParseError>("unknown entry in DebugLoc map.", DLNode);
+    }
+  }
+
+  // If any of the debug loc fields is missing, return an error.
+  if (!File || !Line || !Column)
+    return make_error<ParseError>("DebugLoc node incomplete.", Node);
+
+  return Error::success();
+}
+
+Error RemarkParser::parseArg(SmallVectorImpl<LLVMOptRemarkArg> &Args,
+                             yaml::Node &Node) {
+  auto *ArgMap = dyn_cast<yaml::MappingNode>(&Node);
+  if (!ArgMap)
+    return make_error<ParseError>("expected a value of mapping type.", Node);
+
+  StringRef ValueStr;
+  StringRef KeyStr;
+  Optional<StringRef> File;
+  Optional<unsigned> Line;
+  Optional<unsigned> Column;
+
+  for (yaml::KeyValueNode &ArgEntry : *ArgMap) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, ArgEntry))
+      return E;
+
+    // Try to parse debug locs.
+    if (KeyName == "DebugLoc") {
+      // Can't have multiple DebugLoc entries per argument.
+      if (File || Line || Column)
+        return make_error<ParseError>(
+            "only one DebugLoc entry is allowed per argument.", ArgEntry);
+
+      if (Error E = parseDebugLoc(File, Line, Column, ArgEntry))
+        return E;
+      continue;
+    }
+
+    // If we already have a string, error out.
+    if (!ValueStr.empty())
+      return make_error<ParseError>(
+          "only one string entry is allowed per argument.", ArgEntry);
+
+    // Try to parse a string.
+    if (Error E = parseValue(ValueStr, ArgEntry))
+      return E;
+
+    // Keep the key from the string.
+    KeyStr = KeyName;
+  }
+
+  if (KeyStr.empty())
+    return make_error<ParseError>("argument key is missing.", *ArgMap);
+  if (ValueStr.empty())
+    return make_error<ParseError>("argument value is missing.", *ArgMap);
+
+  Args.push_back(LLVMOptRemarkArg{
+      toOptRemarkStr(KeyStr), toOptRemarkStr(ValueStr),
+      LLVMOptRemarkDebugLoc{toOptRemarkStr(File.getValueOr(StringRef())),
+                            Line.getValueOr(0), Column.getValueOr(0)}});
+
+  return Error::success();
+}
+
+Error RemarkParser::parseYAMLElement(yaml::Document &Remark) {
+  // Parsing a new remark, clear the previous one.
+  LastRemark = None;
+  State = ParseState(TmpArgs);
+
+  auto *Root = dyn_cast<yaml::MappingNode>(Remark.getRoot());
+  if (!Root)
+    return make_error<ParseError>("document root is not of mapping type.",
+                                  *Remark.getRoot());
+
+  State.Type = Root->getRawTag();
+
+  for (yaml::KeyValueNode &RemarkField : *Root) {
+    StringRef KeyName;
+    if (Error E = parseKey(KeyName, RemarkField))
+      return E;
+
+    if (KeyName == "Pass") {
+      if (Error E = parseValue(State.Pass, RemarkField))
+        return E;
+    } else if (KeyName == "Name") {
+      if (Error E = parseValue(State.Name, RemarkField))
+        return E;
+    } else if (KeyName == "Function") {
+      if (Error E = parseValue(State.Function, RemarkField))
+        return E;
+    } else if (KeyName == "Hotness") {
+      if (Error E = parseValue(State.Hotness, RemarkField))
+        return E;
+    } else if (KeyName == "DebugLoc") {
+      if (Error E =
+              parseDebugLoc(State.File, State.Line, State.Column, RemarkField))
+        return E;
+    } else if (KeyName == "Args") {
+      auto *Args = dyn_cast<yaml::SequenceNode>(RemarkField.getValue());
+      if (!Args)
+        return make_error<ParseError>("wrong value type for key.", RemarkField);
+
+      for (yaml::Node &Arg : *Args)
+        if (Error E = parseArg(*State.Args, Arg))
+          return E;
+    } else {
+      return make_error<ParseError>("unknown key.", RemarkField);
+    }
+  }
+
+  // If the YAML parsing failed, don't even continue parsing. We might
+  // encounter malformed YAML.
+  if (Stream.failed())
+    return make_error<ParseError>("YAML parsing failed.", *Remark.getRoot());
+
+  // Check if any of the mandatory fields are missing.
+  if (State.Type.empty() || State.Pass.empty() || State.Name.empty() ||
+      State.Function.empty())
+    return make_error<ParseError>("Type, Pass, Name or Function missing.",
+                                  *Remark.getRoot());
+
+  LastRemark = LLVMOptRemarkEntry{
+      toOptRemarkStr(State.Type),
+      toOptRemarkStr(State.Pass),
+      toOptRemarkStr(State.Name),
+      toOptRemarkStr(State.Function),
+      LLVMOptRemarkDebugLoc{toOptRemarkStr(State.File.getValueOr(StringRef())),
+                            State.Line.getValueOr(0),
+                            State.Column.getValueOr(0)},
+      State.Hotness.getValueOr(0),
+      static_cast<uint32_t>(State.Args->size()),
+      State.Args->data()};
+
+  return Error::success();
+}
+} // namespace
+
+// Create wrappers for C Binding types (see CBindingWrapping.h).
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(RemarkParser, LLVMOptRemarkParserRef)
+
+extern "C" LLVMOptRemarkParserRef LLVMOptRemarkParserCreate(const void *Buf,
+                                                            uint64_t Size) {
+  return wrap(
+      new RemarkParser(StringRef(static_cast<const char *>(Buf), Size)));
+}
+
+extern "C" LLVMOptRemarkEntry *
+LLVMOptRemarkParserGetNext(LLVMOptRemarkParserRef Parser) {
+  RemarkParser &TheParser = *unwrap(Parser);
+  // Check for EOF.
+  if (TheParser.HadAnyErrors || TheParser.DI == TheParser.Stream.end())
+    return nullptr;
+
+  // Try to parse an entry.
+  if (Error E = TheParser.parseYAMLElement(*TheParser.DI)) {
+    handleAllErrors(std::move(E), [&](const ParseError &PE) {
+      TheParser.Stream.printError(&PE.getNode(),
+                                  Twine(PE.getMessage()) + Twine('\n'));
+      TheParser.HadAnyErrors = true;
+    });
+    return nullptr;
+  }
+
+  // Move on.
+  ++TheParser.DI;
+
+  // Return the just-parsed remark.
+  if (Optional<LLVMOptRemarkEntry> &Entry = TheParser.LastRemark)
+    return &*Entry;
+  return nullptr;
+}
+
+extern "C" LLVMBool LLVMOptRemarkParserHasError(LLVMOptRemarkParserRef Parser) {
+  return unwrap(Parser)->HadAnyErrors;
+}
+
+extern "C" const char *
+LLVMOptRemarkParserGetErrorMessage(LLVMOptRemarkParserRef Parser) {
+  return unwrap(Parser)->ErrorStream.str().c_str();
+}
+
+extern "C" void LLVMOptRemarkParserDispose(LLVMOptRemarkParserRef Parser) {
+  delete unwrap(Parser);
+}
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index a880befc0d50b786602a31a7a829141186a2f0a1..0c6dfff06f1887b1306ffaab762fd2a2fff1ed63 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
@@ -505,7 +506,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                                     PassBuilder::OptimizationLevel Level,
                                     bool RunProfileGen,
                                     std::string ProfileGenFile,
-                                    std::string ProfileUseFile) {
+                                    std::string ProfileUseFile,
+                                    std::string ProfileRemappingFile) {
   // Generally running simplification passes and the inliner with an high
   // threshold results in smaller executables, but there may be cases where
   // the size grows, so let's be conservative here and skip this simplification
@@ -559,7 +561,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
   }
 
   if (!ProfileUseFile.empty())
-    MPM.addPass(PGOInstrumentationUse(ProfileUseFile));
+    MPM.addPass(PGOInstrumentationUse(ProfileUseFile, ProfileRemappingFile));
 }
 
 static InlineParams
@@ -605,6 +607,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
     // Annotate sample profile right after early FPM to ensure freshness of
     // the debug info.
     MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile,
+                                        PGOOpt->ProfileRemappingFile,
                                         Phase == ThinLTOPhase::PreLink));
     // Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard
     // for the profile annotation to be accurate in the ThinLTO backend.
@@ -617,9 +620,6 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
                                            true));
   }
 
-  if (EnableHotColdSplit)
-    MPM.addPass(HotColdSplittingPass());
-
   // Interprocedural constant propagation now that basic cleanup has occurred
   // and prior to optimizing globals.
   // FIXME: This position in the pipeline hasn't been carefully considered in
@@ -657,7 +657,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   if (PGOOpt && Phase != ThinLTOPhase::PostLink &&
       (!PGOOpt->ProfileGenFile.empty() || !PGOOpt->ProfileUseFile.empty())) {
     addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
-                      PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
+                      PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile,
+                      PGOOpt->ProfileRemappingFile);
     MPM.addPass(PGOIndirectCallPromotion(false, false));
   }
 
@@ -708,6 +709,11 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       buildFunctionSimplificationPipeline(Level, Phase, DebugLogging)));
 
+  // We only want to do hot cold splitting once for ThinLTO, during the
+  // post-link ThinLTO.
+  if (EnableHotColdSplit && Phase != ThinLTOPhase::PreLink)
+    MPM.addPass(HotColdSplittingPass());
+
   for (auto &C : CGSCCOptimizerLateEPCallbacks)
     C(MainCGPipeline, Level);
 
@@ -824,7 +830,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
     OptimizePM.addPass(
         createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
   }
-  OptimizePM.addPass(LoopUnrollPass(Level));
+  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(Level)));
   OptimizePM.addPass(InstCombinePass());
   OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging));
@@ -1398,9 +1404,9 @@ PassBuilder::parsePipelineText(StringRef Text) {
   return {std::move(ResultPipeline)};
 }
 
-bool PassBuilder::parseModulePass(ModulePassManager &MPM,
-                                  const PipelineElement &E, bool VerifyEachPass,
-                                  bool DebugLogging) {
+Error PassBuilder::parseModulePass(ModulePassManager &MPM,
+                                   const PipelineElement &E,
+                                   bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1408,50 +1414,56 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
   if (!InnerPipeline.empty()) {
     if (Name == "module") {
       ModulePassManager NestedMPM(DebugLogging);
-      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
-                                   DebugLogging))
-        return false;
+      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
+                                             VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(std::move(NestedMPM));
-      return true;
+      return Error::success();
     }
     if (Name == "cgscc") {
       CGSCCPassManager CGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(CGPM, InnerPipeline, VerifyEachPass,
+                                            DebugLogging))
+        return Err;
       MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
-      return true;
+      return Error::success();
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       ModulePassManager NestedMPM(DebugLogging);
-      if (!parseModulePassPipeline(NestedMPM, InnerPipeline, VerifyEachPass,
-                                   DebugLogging))
-        return false;
+      if (auto Err = parseModulePassPipeline(NestedMPM, InnerPipeline,
+                                             VerifyEachPass, DebugLogging))
+        return Err;
       MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : ModulePipelineParsingCallbacks)
       if (C(Name, MPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as module pipeline", Name).str(),
+        inconvertibleErrorCode());
+    ;
   }
 
   // Manually handle aliases for pre-configured pipeline fragments.
   if (startsWithDefaultPipelineAliasPrefix(Name)) {
     SmallVector<StringRef, 3> Matches;
     if (!DefaultAliasRegex.match(Name, &Matches))
-      return false;
+      return make_error<StringError>(
+          formatv("unknown default pipeline alias '{0}'", Name).str(),
+          inconvertibleErrorCode());
+
     assert(Matches.size() == 3 && "Must capture two matched strings!");
 
     OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
@@ -1463,7 +1475,7 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
                               .Case("Oz", Oz);
     if (L == O0)
       // At O0 we do nothing at all!
-      return true;
+      return Error::success();
 
     if (Matches[1] == "default") {
       MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
@@ -1477,38 +1489,40 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
       assert(Matches[1] == "lto" && "Not one of the matched options!");
       MPM.addPass(buildLTODefaultPipeline(L, DebugLogging, nullptr));
     }
-    return true;
+    return Error::success();
   }
 
   // Finally expand the basic registered passes from the .inc file.
 #define MODULE_PASS(NAME, CREATE_PASS)                                         \
   if (Name == NAME) {                                                          \
     MPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define MODULE_ANALYSIS(NAME, CREATE_PASS)                                     \
   if (Name == "require<" NAME ">") {                                           \
     MPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Module>());    \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     MPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : ModulePipelineParsingCallbacks)
     if (C(Name, MPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown module pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
-                                 const PipelineElement &E, bool VerifyEachPass,
-                                 bool DebugLogging) {
+Error PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
+                                  const PipelineElement &E, bool VerifyEachPass,
+                                  bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1516,53 +1530,55 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
   if (!InnerPipeline.empty()) {
     if (Name == "cgscc") {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(std::move(NestedCGPM));
-      return true;
+      return Error::success();
     }
     if (Name == "function") {
       FunctionPassManager FPM(DebugLogging);
-      if (!parseFunctionPassPipeline(FPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(FPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       CGPM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       CGPM.addPass(createRepeatedPass(*Count, std::move(NestedCGPM)));
-      return true;
+      return Error::success();
     }
     if (auto MaxRepetitions = parseDevirtPassName(Name)) {
       CGSCCPassManager NestedCGPM(DebugLogging);
-      if (!parseCGSCCPassPipeline(NestedCGPM, InnerPipeline, VerifyEachPass,
-                                  DebugLogging))
-        return false;
+      if (auto Err = parseCGSCCPassPipeline(NestedCGPM, InnerPipeline,
+                                            VerifyEachPass, DebugLogging))
+        return Err;
       CGPM.addPass(
           createDevirtSCCRepeatedPass(std::move(NestedCGPM), *MaxRepetitions));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : CGSCCPipelineParsingCallbacks)
       if (C(Name, CGPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as cgscc pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define CGSCC_PASS(NAME, CREATE_PASS)                                          \
   if (Name == NAME) {                                                          \
     CGPM.addPass(CREATE_PASS);                                                 \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define CGSCC_ANALYSIS(NAME, CREATE_PASS)                                      \
   if (Name == "require<" NAME ">") {                                           \
@@ -1570,24 +1586,26 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
                  std::remove_reference<decltype(CREATE_PASS)>::type,           \
                  LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,    \
                  CGSCCUpdateResult &>());                                      \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     CGPM.addPass(InvalidateAnalysisPass<                                       \
                  std::remove_reference<decltype(CREATE_PASS)>::type>());       \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : CGSCCPipelineParsingCallbacks)
     if (C(Name, CGPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown cgscc pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
-                                    const PipelineElement &E,
-                                    bool VerifyEachPass, bool DebugLogging) {
+Error PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
+                                     const PipelineElement &E,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto &Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1595,68 +1613,72 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
   if (!InnerPipeline.empty()) {
     if (Name == "function") {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(std::move(NestedFPM));
-      return true;
+      return Error::success();
     }
     if (Name == "loop") {
       LoopPassManager LPM(DebugLogging);
-      if (!parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(LPM, InnerPipeline, VerifyEachPass,
+                                           DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       FPM.addPass(
           createFunctionToLoopPassAdaptor(std::move(LPM), DebugLogging));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       FunctionPassManager NestedFPM(DebugLogging);
-      if (!parseFunctionPassPipeline(NestedFPM, InnerPipeline, VerifyEachPass,
-                                     DebugLogging))
-        return false;
+      if (auto Err = parseFunctionPassPipeline(NestedFPM, InnerPipeline,
+                                               VerifyEachPass, DebugLogging))
+        return Err;
       FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : FunctionPipelineParsingCallbacks)
       if (C(Name, FPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as function pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define FUNCTION_PASS(NAME, CREATE_PASS)                                       \
   if (Name == NAME) {                                                          \
     FPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define FUNCTION_ANALYSIS(NAME, CREATE_PASS)                                   \
   if (Name == "require<" NAME ">") {                                           \
     FPM.addPass(                                                               \
         RequireAnalysisPass<                                                   \
             std::remove_reference<decltype(CREATE_PASS)>::type, Function>());  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     FPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : FunctionPipelineParsingCallbacks)
     if (C(Name, FPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(
+      formatv("unknown function pass '{0}'", Name).str(),
+      inconvertibleErrorCode());
 }
 
-bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
-                                bool VerifyEachPass, bool DebugLogging) {
+Error PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
+                                 bool VerifyEachPass, bool DebugLogging) {
   StringRef Name = E.Name;
   auto &InnerPipeline = E.InnerPipeline;
 
@@ -1664,35 +1686,37 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
   if (!InnerPipeline.empty()) {
     if (Name == "loop") {
       LoopPassManager NestedLPM(DebugLogging);
-      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
+                                           VerifyEachPass, DebugLogging))
+        return Err;
       // Add the nested pass manager with the appropriate adaptor.
       LPM.addPass(std::move(NestedLPM));
-      return true;
+      return Error::success();
     }
     if (auto Count = parseRepeatPassName(Name)) {
       LoopPassManager NestedLPM(DebugLogging);
-      if (!parseLoopPassPipeline(NestedLPM, InnerPipeline, VerifyEachPass,
-                                 DebugLogging))
-        return false;
+      if (auto Err = parseLoopPassPipeline(NestedLPM, InnerPipeline,
+                                           VerifyEachPass, DebugLogging))
+        return Err;
       LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM)));
-      return true;
+      return Error::success();
     }
 
     for (auto &C : LoopPipelineParsingCallbacks)
       if (C(Name, LPM, InnerPipeline))
-        return true;
+        return Error::success();
 
     // Normal passes can't have pipelines.
-    return false;
+    return make_error<StringError>(
+        formatv("invalid use of '{0}' pass as loop pipeline", Name).str(),
+        inconvertibleErrorCode());
   }
 
 // Now expand the basic registered passes from the .inc file.
 #define LOOP_PASS(NAME, CREATE_PASS)                                           \
   if (Name == NAME) {                                                          \
     LPM.addPass(CREATE_PASS);                                                  \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #define LOOP_ANALYSIS(NAME, CREATE_PASS)                                       \
   if (Name == "require<" NAME ">") {                                           \
@@ -1700,19 +1724,20 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
                 std::remove_reference<decltype(CREATE_PASS)>::type, Loop,      \
                 LoopAnalysisManager, LoopStandardAnalysisResults &,            \
                 LPMUpdater &>());                                              \
-    return true;                                                               \
+    return Error::success();                                                   \
   }                                                                            \
   if (Name == "invalidate<" NAME ">") {                                        \
     LPM.addPass(InvalidateAnalysisPass<                                        \
                 std::remove_reference<decltype(CREATE_PASS)>::type>());        \
-    return true;                                                               \
+    return Error::success();                                                   \
   }
 #include "PassRegistry.def"
 
   for (auto &C : LoopPipelineParsingCallbacks)
     if (C(Name, LPM, InnerPipeline))
-      return true;
-  return false;
+      return Error::success();
+  return make_error<StringError>(formatv("unknown loop pass '{0}'", Name).str(),
+                                 inconvertibleErrorCode());
 }
 
 bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
@@ -1736,41 +1761,42 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
   return false;
 }
 
-bool PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
-                                        ArrayRef<PipelineElement> Pipeline,
-                                        bool VerifyEachPass,
-                                        bool DebugLogging) {
+Error PassBuilder::parseLoopPassPipeline(LoopPassManager &LPM,
+                                         ArrayRef<PipelineElement> Pipeline,
+                                         bool VerifyEachPass,
+                                         bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseLoopPass(LPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     // FIXME: No verifier support for Loop passes!
   }
-  return true;
+  return Error::success();
 }
 
-bool PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
-                                            ArrayRef<PipelineElement> Pipeline,
-                                            bool VerifyEachPass,
-                                            bool DebugLogging) {
+Error PassBuilder::parseFunctionPassPipeline(FunctionPassManager &FPM,
+                                             ArrayRef<PipelineElement> Pipeline,
+                                             bool VerifyEachPass,
+                                             bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err =
+            parseFunctionPass(FPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     if (VerifyEachPass)
       FPM.addPass(VerifierPass());
   }
-  return true;
+  return Error::success();
 }
 
-bool PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
-                                         ArrayRef<PipelineElement> Pipeline,
-                                         bool VerifyEachPass,
-                                         bool DebugLogging) {
+Error PassBuilder::parseCGSCCPassPipeline(CGSCCPassManager &CGPM,
+                                          ArrayRef<PipelineElement> Pipeline,
+                                          bool VerifyEachPass,
+                                          bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseCGSCCPass(CGPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     // FIXME: No verifier support for CGSCC passes!
   }
-  return true;
+  return Error::success();
 }
 
 void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
@@ -1786,28 +1812,30 @@ void PassBuilder::crossRegisterProxies(LoopAnalysisManager &LAM,
   LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); });
 }
 
-bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
-                                          ArrayRef<PipelineElement> Pipeline,
-                                          bool VerifyEachPass,
-                                          bool DebugLogging) {
+Error PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
+                                           ArrayRef<PipelineElement> Pipeline,
+                                           bool VerifyEachPass,
+                                           bool DebugLogging) {
   for (const auto &Element : Pipeline) {
-    if (!parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
-      return false;
+    if (auto Err = parseModulePass(MPM, Element, VerifyEachPass, DebugLogging))
+      return Err;
     if (VerifyEachPass)
       MPM.addPass(VerifierPass());
   }
-  return true;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c ModulePassManager
 // FIXME: Should this routine accept a TargetMachine or require the caller to
 // pre-populate the analysis managers with target-specific stuff?
-bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(ModulePassManager &MPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   // If the first name isn't at the module layer, wrap the pipeline up
   // automatically.
@@ -1824,73 +1852,106 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
     } else {
       for (auto &C : TopLevelPipelineParsingCallbacks)
         if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging))
-          return true;
-
-      // Unknown pass name!
-      return false;
+          return Error::success();
+
+      // Unknown pass or pipeline name!
+      auto &InnerPipeline = Pipeline->front().InnerPipeline;
+      return make_error<StringError>(
+          formatv("unknown {0} name '{1}'",
+                  (InnerPipeline.empty() ? "pass" : "pipeline"), FirstName)
+              .str(),
+          inconvertibleErrorCode());
     }
   }
 
-  return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging);
+  if (auto Err =
+          parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c CGSCCPassManager
-bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks))
-    return false;
-
-  return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+    return make_error<StringError>(
+        formatv("unknown cgscc pass '{0}' in pipeline '{1}'", FirstName,
+                PipelineText)
+            .str(),
+        inconvertibleErrorCode());
+
+  if (auto Err =
+          parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c
 // FunctionPassManager
-bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
   StringRef FirstName = Pipeline->front().Name;
   if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks))
-    return false;
-
-  return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
-                                   DebugLogging);
+    return make_error<StringError>(
+        formatv("unknown function pass '{0}' in pipeline '{1}'", FirstName,
+                PipelineText)
+            .str(),
+        inconvertibleErrorCode());
+
+  if (auto Err = parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
+                                           DebugLogging))
+    return Err;
+  return Error::success();
 }
 
 // Primary pass pipeline description parsing routine for a \c LoopPassManager
-bool PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
-                                    StringRef PipelineText, bool VerifyEachPass,
-                                    bool DebugLogging) {
+Error PassBuilder::parsePassPipeline(LoopPassManager &CGPM,
+                                     StringRef PipelineText,
+                                     bool VerifyEachPass, bool DebugLogging) {
   auto Pipeline = parsePipelineText(PipelineText);
   if (!Pipeline || Pipeline->empty())
-    return false;
+    return make_error<StringError>(
+        formatv("invalid pipeline '{0}'", PipelineText).str(),
+        inconvertibleErrorCode());
 
-  return parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+  if (auto Err =
+          parseLoopPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging))
+    return Err;
+
+  return Error::success();
 }
 
-bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
+Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
   // If the pipeline just consists of the word 'default' just replace the AA
   // manager with our default one.
   if (PipelineText == "default") {
     AA = buildDefaultAAPipeline();
-    return true;
+    return Error::success();
   }
 
   while (!PipelineText.empty()) {
     StringRef Name;
     std::tie(Name, PipelineText) = PipelineText.split(',');
     if (!parseAAPassName(AA, Name))
-      return false;
+      return make_error<StringError>(
+          formatv("unknown alias analysis name '{0}'", Name).str(),
+          inconvertibleErrorCode());
   }
 
-  return true;
+  return Error::success();
 }
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 8de4541a77232b935327457edbca20e830ae2ed3..99df2ad2719cf78505d0a0ec76ed6ceea7e6b944 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -215,6 +215,7 @@ FUNCTION_PASS("sroa", SROA())
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("unroll", LoopUnrollPass())
+FUNCTION_PASS("unroll<peeling;no-runtime>",LoopUnrollPass(LoopUnrollOptions().setPeeling(true).setRuntime(false)))
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
 FUNCTION_PASS("verify<loops>", LoopVerifierPass())
diff --git a/lib/Passes/StandardInstrumentations.cpp b/lib/Passes/StandardInstrumentations.cpp
index aa34584fa1265aec6ae09d955d6a69ad75b5f7e3..48d36e5a01e58ad9f9335dca6be6fb3bbe5875c8 100644
--- a/lib/Passes/StandardInstrumentations.cpp
+++ b/lib/Passes/StandardInstrumentations.cpp
@@ -37,10 +37,6 @@ namespace PrintIR {
 /// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into
 /// llvm::Any and does actual print job.
 void unwrapAndPrint(StringRef Banner, Any IR) {
-  if (any_isa<const CallGraphSCC *>(IR) ||
-      any_isa<const LazyCallGraph::SCC *>(IR))
-    return;
-
   SmallString<40> Extra{"\n"};
   const Module *M = nullptr;
   if (any_isa<const Module *>(IR)) {
@@ -55,6 +51,34 @@ void unwrapAndPrint(StringRef Banner, Any IR) {
     }
     M = F->getParent();
     Extra = formatv(" (function: {0})\n", F->getName());
+  } else if (any_isa<const LazyCallGraph::SCC *>(IR)) {
+    const LazyCallGraph::SCC *C = any_cast<const LazyCallGraph::SCC *>(IR);
+    assert(C);
+    if (!llvm::forcePrintModuleIR()) {
+      Extra = formatv(" (scc: {0})\n", C->getName());
+      bool BannerPrinted = false;
+      for (const LazyCallGraph::Node &N : *C) {
+        const Function &F = N.getFunction();
+        if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) {
+          if (!BannerPrinted) {
+            dbgs() << Banner << Extra;
+            BannerPrinted = true;
+          }
+          F.print(dbgs());
+        }
+      }
+      return;
+    }
+    for (const LazyCallGraph::Node &N : *C) {
+      const Function &F = N.getFunction();
+      if (!F.isDeclaration() && isFunctionInPrintList(F.getName())) {
+        M = F.getParent();
+        break;
+      }
+    }
+    if (!M)
+      return;
+    Extra = formatv(" (for scc: {0})\n", C->getName());
   } else if (any_isa<const Loop *>(IR)) {
     const Loop *L = any_cast<const Loop *>(IR);
     const Function *F = L->getHeader()->getParent();
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index 3b704158a5c5ba5ce1938bceb6ea46b631c463b3..eaf0eb04bfbfe2adff00b48865b34ed61ae28abe 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/ProfileSummary.h"
@@ -23,6 +24,7 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SymbolRemappingReader.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include <algorithm>
 #include <cctype>
@@ -88,16 +90,29 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
 }
 
 Expected<std::unique_ptr<IndexedInstrProfReader>>
-IndexedInstrProfReader::create(const Twine &Path) {
+IndexedInstrProfReader::create(const Twine &Path, const Twine &RemappingPath) {
   // Set up the buffer to read.
   auto BufferOrError = setupMemoryBuffer(Path);
   if (Error E = BufferOrError.takeError())
     return std::move(E);
-  return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
+
+  // Set up the remapping buffer if requested.
+  std::unique_ptr<MemoryBuffer> RemappingBuffer;
+  std::string RemappingPathStr = RemappingPath.str();
+  if (!RemappingPathStr.empty()) {
+    auto RemappingBufferOrError = setupMemoryBuffer(RemappingPathStr);
+    if (Error E = RemappingBufferOrError.takeError())
+      return std::move(E);
+    RemappingBuffer = std::move(RemappingBufferOrError.get());
+  }
+
+  return IndexedInstrProfReader::create(std::move(BufferOrError.get()),
+                                        std::move(RemappingBuffer));
 }
 
 Expected<std::unique_ptr<IndexedInstrProfReader>>
-IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer,
+                               std::unique_ptr<MemoryBuffer> RemappingBuffer) {
   // Sanity check the buffer.
   if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<unsigned>::max())
     return make_error<InstrProfError>(instrprof_error::too_large);
@@ -105,7 +120,8 @@ IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
   // Create the reader.
   if (!IndexedInstrProfReader::hasFormat(*Buffer))
     return make_error<InstrProfError>(instrprof_error::bad_magic);
-  auto Result = llvm::make_unique<IndexedInstrProfReader>(std::move(Buffer));
+  auto Result = llvm::make_unique<IndexedInstrProfReader>(
+      std::move(Buffer), std::move(RemappingBuffer));
 
   // Initialize the reader and return the result.
   if (Error E = initializeReader(*Result))
@@ -587,6 +603,124 @@ InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex(
   RecordIterator = HashTable->data_begin();
 }
 
+namespace {
+/// A remapper that does not apply any remappings.
+class InstrProfReaderNullRemapper : public InstrProfReaderRemapper {
+  InstrProfReaderIndexBase &Underlying;
+
+public:
+  InstrProfReaderNullRemapper(InstrProfReaderIndexBase &Underlying)
+      : Underlying(Underlying) {}
+
+  Error getRecords(StringRef FuncName,
+                   ArrayRef<NamedInstrProfRecord> &Data) override {
+    return Underlying.getRecords(FuncName, Data);
+  }
+};
+}
+
+/// A remapper that applies remappings based on a symbol remapping file.
+template <typename HashTableImpl>
+class llvm::InstrProfReaderItaniumRemapper
+    : public InstrProfReaderRemapper {
+public:
+  InstrProfReaderItaniumRemapper(
+      std::unique_ptr<MemoryBuffer> RemapBuffer,
+      InstrProfReaderIndex<HashTableImpl> &Underlying)
+      : RemapBuffer(std::move(RemapBuffer)), Underlying(Underlying) {
+  }
+
+  /// Extract the original function name from a PGO function name.
+  static StringRef extractName(StringRef Name) {
+    // We can have multiple :-separated pieces; there can be pieces both
+    // before and after the mangled name. Find the first part that starts
+    // with '_Z'; we'll assume that's the mangled name we want.
+    std::pair<StringRef, StringRef> Parts = {StringRef(), Name};
+    while (true) {
+      Parts = Parts.second.split(':');
+      if (Parts.first.startswith("_Z"))
+        return Parts.first;
+      if (Parts.second.empty())
+        return Name;
+    }
+  }
+
+  /// Given a mangled name extracted from a PGO function name, and a new
+  /// form for that mangled name, reconstitute the name.
+  static void reconstituteName(StringRef OrigName, StringRef ExtractedName,
+                               StringRef Replacement,
+                               SmallVectorImpl<char> &Out) {
+    Out.reserve(OrigName.size() + Replacement.size() - ExtractedName.size());
+    Out.insert(Out.end(), OrigName.begin(), ExtractedName.begin());
+    Out.insert(Out.end(), Replacement.begin(), Replacement.end());
+    Out.insert(Out.end(), ExtractedName.end(), OrigName.end());
+  }
+
+  Error populateRemappings() override {
+    if (Error E = Remappings.read(*RemapBuffer))
+      return E;
+    for (StringRef Name : Underlying.HashTable->keys()) {
+      StringRef RealName = extractName(Name);
+      if (auto Key = Remappings.insert(RealName)) {
+        // FIXME: We could theoretically map the same equivalence class to
+        // multiple names in the profile data. If that happens, we should
+        // return NamedInstrProfRecords from all of them.
+        MappedNames.insert({Key, RealName});
+      }
+    }
+    return Error::success();
+  }
+
+  Error getRecords(StringRef FuncName,
+                   ArrayRef<NamedInstrProfRecord> &Data) override {
+    StringRef RealName = extractName(FuncName);
+    if (auto Key = Remappings.lookup(RealName)) {
+      StringRef Remapped = MappedNames.lookup(Key);
+      if (!Remapped.empty()) {
+        if (RealName.begin() == FuncName.begin() &&
+            RealName.end() == FuncName.end())
+          FuncName = Remapped;
+        else {
+          // Try rebuilding the name from the given remapping.
+          SmallString<256> Reconstituted;
+          reconstituteName(FuncName, RealName, Remapped, Reconstituted);
+          Error E = Underlying.getRecords(Reconstituted, Data);
+          if (!E)
+            return E;
+
+          // If we failed because the name doesn't exist, fall back to asking
+          // about the original name.
+          if (Error Unhandled = handleErrors(
+                  std::move(E), [](std::unique_ptr<InstrProfError> Err) {
+                    return Err->get() == instrprof_error::unknown_function
+                               ? Error::success()
+                               : Error(std::move(Err));
+                  }))
+            return Unhandled;
+        }
+      }
+    }
+    return Underlying.getRecords(FuncName, Data);
+  }
+
+private:
+  /// The memory buffer containing the remapping configuration. Remappings
+  /// holds pointers into this buffer.
+  std::unique_ptr<MemoryBuffer> RemapBuffer;
+
+  /// The mangling remapper.
+  SymbolRemappingReader Remappings;
+
+  /// Mapping from mangled name keys to the name used for the key in the
+  /// profile data.
+  /// FIXME: Can we store a location within the on-disk hash table instead of
+  /// redoing lookup?
+  DenseMap<SymbolRemappingReader::Key, StringRef> MappedNames;
+
+  /// The real profile data reader.
+  InstrProfReaderIndex<HashTableImpl> &Underlying;
+};
+
 bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
   using namespace support;
 
@@ -683,10 +817,22 @@ Error IndexedInstrProfReader::readHeader() {
   uint64_t HashOffset = endian::byte_swap<uint64_t, little>(Header->HashOffset);
 
   // The rest of the file is an on disk hash table.
-  InstrProfReaderIndexBase *IndexPtr = nullptr;
-  IndexPtr = new InstrProfReaderIndex<OnDiskHashTableImplV3>(
-      Start + HashOffset, Cur, Start, HashType, FormatVersion);
-  Index.reset(IndexPtr);
+  auto IndexPtr =
+      llvm::make_unique<InstrProfReaderIndex<OnDiskHashTableImplV3>>(
+          Start + HashOffset, Cur, Start, HashType, FormatVersion);
+
+  // Load the remapping table now if requested.
+  if (RemappingBuffer) {
+    Remapper = llvm::make_unique<
+        InstrProfReaderItaniumRemapper<OnDiskHashTableImplV3>>(
+        std::move(RemappingBuffer), *IndexPtr);
+    if (Error E = Remapper->populateRemappings())
+      return E;
+  } else {
+    Remapper = llvm::make_unique<InstrProfReaderNullRemapper>(*IndexPtr);
+  }
+  Index = std::move(IndexPtr);
+
   return success();
 }
 
@@ -707,7 +853,7 @@ Expected<InstrProfRecord>
 IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
                                            uint64_t FuncHash) {
   ArrayRef<NamedInstrProfRecord> Data;
-  Error Err = Index->getRecords(FuncName, Data);
+  Error Err = Remapper->getRecords(FuncName, Data);
   if (Err)
     return std::move(Err);
   // Found it. Look for counters with the right hash.
diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp
index 2b4551b984993e33a68d70e037b3841bbc77b99b..a68d1e9d3ab06262106d1b0beaca99499c749c89 100644
--- a/lib/ProfileData/SampleProfReader.cpp
+++ b/lib/ProfileData/SampleProfReader.cpp
@@ -912,6 +912,40 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) {
   return Magic == "adcg*704";
 }
 
+std::error_code SampleProfileReaderItaniumRemapper::read() {
+  // If the underlying data is in compact format, we can't remap it because
+  // we don't know what the original function names were.
+  if (getFormat() == SPF_Compact_Binary) {
+    Ctx.diagnose(DiagnosticInfoSampleProfile(
+        Buffer->getBufferIdentifier(),
+        "Profile data remapping cannot be applied to profile data "
+        "in compact format (original mangled names are not available).",
+        DS_Warning));
+    return sampleprof_error::success;
+  }
+
+  if (Error E = Remappings.read(*Buffer)) {
+    handleAllErrors(
+        std::move(E), [&](const SymbolRemappingParseError &ParseError) {
+          reportError(ParseError.getLineNum(), ParseError.getMessage());
+        });
+    return sampleprof_error::malformed;
+  }
+
+  for (auto &Sample : getProfiles())
+    if (auto Key = Remappings.insert(Sample.first()))
+      SampleMap.insert({Key, &Sample.second});
+
+  return sampleprof_error::success;
+}
+
+FunctionSamples *
+SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) {
+  if (auto Key = Remappings.lookup(Fname))
+    return SampleMap.lookup(Key);
+  return SampleProfileReader::getSamplesFor(Fname);
+}
+
 /// Prepare a memory buffer for the contents of \p Filename.
 ///
 /// \returns an error code indicating the status of the buffer.
@@ -944,6 +978,27 @@ SampleProfileReader::create(const Twine &Filename, LLVMContext &C) {
   return create(BufferOrError.get(), C);
 }
 
+/// Create a sample profile remapper from the given input, to remap the
+/// function names in the given profile data.
+///
+/// \param Filename The file to open.
+///
+/// \param C The LLVM context to use to emit diagnostics.
+///
+/// \param Underlying The underlying profile data reader to remap.
+///
+/// \returns an error code indicating the status of the created reader.
+ErrorOr<std::unique_ptr<SampleProfileReader>>
+SampleProfileReaderItaniumRemapper::create(
+    const Twine &Filename, LLVMContext &C,
+    std::unique_ptr<SampleProfileReader> Underlying) {
+  auto BufferOrError = setupMemoryBuffer(Filename);
+  if (std::error_code EC = BufferOrError.getError())
+    return EC;
+  return llvm::make_unique<SampleProfileReaderItaniumRemapper>(
+      std::move(BufferOrError.get()), C, std::move(Underlying));
+}
+
 /// Create a sample profile reader based on the format of the input data.
 ///
 /// \param B The memory buffer to create the reader from (assumes ownership).
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index b169bb609643e595f5b93d0cc0267e781a012f4f..cb2a2e557fa982488973d5edc879bb3314ca177e 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -1061,8 +1061,27 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
 }
 
 bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
-                                 StringRef Overview, raw_ostream *Errs) {
-  return GlobalParser->ParseCommandLineOptions(argc, argv, Overview,
+                                 StringRef Overview, raw_ostream *Errs,
+                                 const char *EnvVar) {
+  SmallVector<const char *, 20> NewArgv;
+  BumpPtrAllocator A;
+  StringSaver Saver(A);
+  NewArgv.push_back(argv[0]);
+
+  // Parse options from environment variable.
+  if (EnvVar) {
+    if (llvm::Optional<std::string> EnvValue =
+            sys::Process::GetEnv(StringRef(EnvVar)))
+      TokenizeGNUCommandLine(*EnvValue, Saver, NewArgv);
+  }
+
+  // Append options from command line.
+  for (int I = 1; I < argc; ++I)
+    NewArgv.push_back(argv[I]);
+  int NewArgc = static_cast<int>(NewArgv.size());
+
+  // Parse all options.
+  return GlobalParser->ParseCommandLineOptions(NewArgc, &NewArgv[0], Overview,
                                                Errs);
 }
 
diff --git a/lib/Support/DebugCounter.cpp b/lib/Support/DebugCounter.cpp
index 9c8260dbe07ca5843e387cb17dc079f3d196af26..6598103658daa78c8449274829346b2099d5ea27 100644
--- a/lib/Support/DebugCounter.cpp
+++ b/lib/Support/DebugCounter.cpp
@@ -49,8 +49,18 @@ static DebugCounterList DebugCounterOption(
     cl::desc("Comma separated list of debug counter skip and count"),
     cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance()));
 
+static cl::opt<bool> PrintDebugCounter(
+    "print-debug-counter", cl::Hidden, cl::init(false), cl::Optional,
+    cl::desc("Print out debug counter info after all counters accumulated"));
+
 static ManagedStatic<DebugCounter> DC;
 
+// Print information when destroyed, iff command line option is specified.
+DebugCounter::~DebugCounter() {
+  if (isCountingEnabled() && PrintDebugCounter)
+    print(dbgs());
+}
+
 DebugCounter &DebugCounter::instance() { return *DC; }
 
 // This is called by the command line parser when it sees a value for the
@@ -107,11 +117,18 @@ void DebugCounter::push_back(const std::string &Val) {
 }
 
 void DebugCounter::print(raw_ostream &OS) const {
+  SmallVector<StringRef, 16> CounterNames(RegisteredCounters.begin(),
+                                          RegisteredCounters.end());
+  sort(CounterNames.begin(), CounterNames.end());
+
+  auto &Us = instance();
   OS << "Counters and values:\n";
-  for (const auto &KV : Counters)
-    OS << left_justify(RegisteredCounters[KV.first], 32) << ": {"
-       << KV.second.Count << "," << KV.second.Skip << ","
-       << KV.second.StopAfter << "}\n";
+  for (auto &CounterName : CounterNames) {
+    unsigned CounterID = getCounterId(CounterName);
+    OS << left_justify(RegisteredCounters[CounterID], 32) << ": {"
+       << Us.Counters[CounterID].Count << "," << Us.Counters[CounterID].Skip
+       << "," << Us.Counters[CounterID].StopAfter << "}\n";
+  }
 }
 
 LLVM_DUMP_METHOD void DebugCounter::dump() const {
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 7de2a9e3fbb062b53b8dacb3d22d5dee705f258d..91e98a33b371e4db035ba21bf7e5ebe7be83e9aa 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -511,8 +511,8 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family,
 static void
 getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
                                 unsigned Brand_id, unsigned Features,
-                                unsigned Features2, unsigned *Type,
-                                unsigned *Subtype) {
+                                unsigned Features2, unsigned Features3,
+                                unsigned *Type, unsigned *Subtype) {
   if (Brand_id != 0)
     return;
   switch (Family) {
@@ -696,8 +696,8 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
 
-      if (Features2 & (1 << (X86::FEATURE_CLFLUSHOPT - 32))) {
-        if (Features2 & (1 << (X86::FEATURE_SHA - 32))) {
+      if (Features3 & (1 << (X86::FEATURE_CLFLUSHOPT - 64))) {
+        if (Features3 & (1 << (X86::FEATURE_SHA - 64))) {
           *Type = X86::INTEL_GOLDMONT;
         } else {
           *Type = X86::INTEL_COREI7;
@@ -705,7 +705,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         }
         break;
       }
-      if (Features2 & (1 << (X86::FEATURE_ADX - 32))) {
+      if (Features3 & (1 << (X86::FEATURE_ADX - 64))) {
         *Type = X86::INTEL_COREI7;
         *Subtype = X86::INTEL_COREI7_BROADWELL;
         break;
@@ -721,7 +721,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
       if (Features & (1 << X86::FEATURE_SSE4_2)) {
-        if (Features2 & (1 << (X86::FEATURE_MOVBE - 32))) {
+        if (Features3 & (1 << (X86::FEATURE_MOVBE - 64))) {
           *Type = X86::INTEL_SILVERMONT;
         } else {
           *Type = X86::INTEL_COREI7;
@@ -735,7 +735,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         break;
       }
       if (Features & (1 << X86::FEATURE_SSSE3)) {
-        if (Features2 & (1 << (X86::FEATURE_MOVBE - 32))) {
+        if (Features3 & (1 << (X86::FEATURE_MOVBE - 64))) {
           *Type = X86::INTEL_BONNELL; // "bonnell"
         } else {
           *Type = X86::INTEL_CORE2; // "core2"
@@ -743,7 +743,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
         }
         break;
       }
-      if (Features2 & (1 << (X86::FEATURE_EM64T - 32))) {
+      if (Features3 & (1 << (X86::FEATURE_EM64T - 64))) {
         *Type = X86::INTEL_CORE2; // "core2"
         *Subtype = X86::INTEL_CORE2_65;
         break;
@@ -769,7 +769,7 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     }
     break;
   case 15: {
-    if (Features2 & (1 << (X86::FEATURE_EM64T - 32))) {
+    if (Features3 & (1 << (X86::FEATURE_EM64T - 64))) {
       *Type = X86::INTEL_NOCONA;
       break;
     }
@@ -877,40 +877,52 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
 }
 
 static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
-                                 unsigned *FeaturesOut,
-                                 unsigned *Features2Out) {
+                                 unsigned *FeaturesOut, unsigned *Features2Out,
+                                 unsigned *Features3Out) {
   unsigned Features = 0;
   unsigned Features2 = 0;
+  unsigned Features3 = 0;
   unsigned EAX, EBX;
 
+  auto setFeature = [&](unsigned F) {
+    if (F < 32)
+      Features |= 1 << F;
+    else if (F < 64)
+      Features2 |= 1 << (F - 32);
+    else if (F < 96)
+      Features3 |= 1 << (F - 64);
+    else
+      llvm_unreachable("Unexpected FeatureBit");
+  };
+
   if ((EDX >> 15) & 1)
-    Features |= 1 << X86::FEATURE_CMOV;
+    setFeature(X86::FEATURE_CMOV);
   if ((EDX >> 23) & 1)
-    Features |= 1 << X86::FEATURE_MMX;
+    setFeature(X86::FEATURE_MMX);
   if ((EDX >> 25) & 1)
-    Features |= 1 << X86::FEATURE_SSE;
+    setFeature(X86::FEATURE_SSE);
   if ((EDX >> 26) & 1)
-    Features |= 1 << X86::FEATURE_SSE2;
+    setFeature(X86::FEATURE_SSE2);
 
   if ((ECX >> 0) & 1)
-    Features |= 1 << X86::FEATURE_SSE3;
+    setFeature(X86::FEATURE_SSE3);
   if ((ECX >> 1) & 1)
-    Features |= 1 << X86::FEATURE_PCLMUL;
+    setFeature(X86::FEATURE_PCLMUL);
   if ((ECX >> 9) & 1)
-    Features |= 1 << X86::FEATURE_SSSE3;
+    setFeature(X86::FEATURE_SSSE3);
   if ((ECX >> 12) & 1)
-    Features |= 1 << X86::FEATURE_FMA;
+    setFeature(X86::FEATURE_FMA);
   if ((ECX >> 19) & 1)
-    Features |= 1 << X86::FEATURE_SSE4_1;
+    setFeature(X86::FEATURE_SSE4_1);
   if ((ECX >> 20) & 1)
-    Features |= 1 << X86::FEATURE_SSE4_2;
+    setFeature(X86::FEATURE_SSE4_2);
   if ((ECX >> 23) & 1)
-    Features |= 1 << X86::FEATURE_POPCNT;
+    setFeature(X86::FEATURE_POPCNT);
   if ((ECX >> 25) & 1)
-    Features |= 1 << X86::FEATURE_AES;
+    setFeature(X86::FEATURE_AES);
 
   if ((ECX >> 22) & 1)
-    Features2 |= 1 << (X86::FEATURE_MOVBE - 32);
+    setFeature(X86::FEATURE_MOVBE);
 
   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
   // indicates that the AVX registers will be saved and restored on context
@@ -921,49 +933,59 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0);
 
   if (HasAVX)
-    Features |= 1 << X86::FEATURE_AVX;
+    setFeature(X86::FEATURE_AVX);
 
   bool HasLeaf7 =
       MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
 
   if (HasLeaf7 && ((EBX >> 3) & 1))
-    Features |= 1 << X86::FEATURE_BMI;
+    setFeature(X86::FEATURE_BMI);
   if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX)
-    Features |= 1 << X86::FEATURE_AVX2;
+    setFeature(X86::FEATURE_AVX2);
   if (HasLeaf7 && ((EBX >> 9) & 1))
-    Features |= 1 << X86::FEATURE_BMI2;
+    setFeature(X86::FEATURE_BMI2);
   if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512F;
+    setFeature(X86::FEATURE_AVX512F);
   if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512DQ;
+    setFeature(X86::FEATURE_AVX512DQ);
   if (HasLeaf7 && ((EBX >> 19) & 1))
-    Features2 |= 1 << (X86::FEATURE_ADX - 32);
+    setFeature(X86::FEATURE_ADX);
   if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512IFMA;
+    setFeature(X86::FEATURE_AVX512IFMA);
   if (HasLeaf7 && ((EBX >> 23) & 1))
-    Features2 |= 1 << (X86::FEATURE_CLFLUSHOPT - 32);
+    setFeature(X86::FEATURE_CLFLUSHOPT);
   if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512PF;
+    setFeature(X86::FEATURE_AVX512PF);
   if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512ER;
+    setFeature(X86::FEATURE_AVX512ER);
   if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512CD;
+    setFeature(X86::FEATURE_AVX512CD);
   if (HasLeaf7 && ((EBX >> 29) & 1))
-    Features2 |= 1 << (X86::FEATURE_SHA - 32);
+    setFeature(X86::FEATURE_SHA);
   if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512BW;
+    setFeature(X86::FEATURE_AVX512BW);
   if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512VL;
+    setFeature(X86::FEATURE_AVX512VL);
 
   if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512VBMI;
+    setFeature(X86::FEATURE_AVX512VBMI);
+  if (HasLeaf7 && ((ECX >> 6) & 1) && HasAVX512Save)
+    setFeature(X86::FEATURE_AVX512VBMI2);
+  if (HasLeaf7 && ((ECX >> 8) & 1))
+    setFeature(X86::FEATURE_GFNI);
+  if (HasLeaf7 && ((ECX >> 10) & 1) && HasAVX)
+    setFeature(X86::FEATURE_VPCLMULQDQ);
+  if (HasLeaf7 && ((ECX >> 11) & 1) && HasAVX512Save)
+    setFeature(X86::FEATURE_AVX512VNNI);
+  if (HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save)
+    setFeature(X86::FEATURE_AVX512BITALG);
   if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX512VPOPCNTDQ;
+    setFeature(X86::FEATURE_AVX512VPOPCNTDQ);
 
   if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX5124VNNIW;
+    setFeature(X86::FEATURE_AVX5124VNNIW);
   if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save)
-    Features |= 1 << X86::FEATURE_AVX5124FMAPS;
+    setFeature(X86::FEATURE_AVX5124FMAPS);
 
   unsigned MaxExtLevel;
   getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -971,17 +993,18 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
   bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
                      !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
   if (HasExtLeaf1 && ((ECX >> 6) & 1))
-    Features |= 1 << X86::FEATURE_SSE4_A;
+    setFeature(X86::FEATURE_SSE4_A);
   if (HasExtLeaf1 && ((ECX >> 11) & 1))
-    Features |= 1 << X86::FEATURE_XOP;
+    setFeature(X86::FEATURE_XOP);
   if (HasExtLeaf1 && ((ECX >> 16) & 1))
-    Features |= 1 << X86::FEATURE_FMA4;
+    setFeature(X86::FEATURE_FMA4);
 
   if (HasExtLeaf1 && ((EDX >> 29) & 1))
-    Features2 |= 1 << (X86::FEATURE_EM64T - 32);
+    setFeature(X86::FEATURE_EM64T);
 
   *FeaturesOut  = Features;
   *Features2Out = Features2;
+  *Features3Out = Features3;
 }
 
 StringRef sys::getHostCPUName() {
@@ -1002,16 +1025,16 @@ StringRef sys::getHostCPUName() {
 
   unsigned Brand_id = EBX & 0xff;
   unsigned Family = 0, Model = 0;
-  unsigned Features = 0, Features2 = 0;
+  unsigned Features = 0, Features2 = 0, Features3 = 0;
   detectX86FamilyModel(EAX, &Family, &Model);
-  getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2);
+  getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2, &Features3);
 
   unsigned Type = 0;
   unsigned Subtype = 0;
 
   if (Vendor == SIG_INTEL) {
     getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
-                                    Features2, &Type, &Subtype);
+                                    Features2, Features3, &Type, &Subtype);
   } else if (Vendor == SIG_AMD) {
     getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type, &Subtype);
   }
diff --git a/lib/Support/ItaniumManglingCanonicalizer.cpp b/lib/Support/ItaniumManglingCanonicalizer.cpp
index ca63c6d1c7da0cb6735ef34d90b013cfefc2d6c8..e55dcd7618095cb7dea033fbf5381efdfde2057d 100644
--- a/lib/Support/ItaniumManglingCanonicalizer.cpp
+++ b/lib/Support/ItaniumManglingCanonicalizer.cpp
@@ -221,7 +221,8 @@ struct CanonicalizerAllocator::MakeNodeImpl<
 
 // FIXME: Also expand built-in substitutions?
 
-using CanonicalizingDemangler = itanium_demangle::Db<CanonicalizerAllocator>;
+using CanonicalizingDemangler =
+    itanium_demangle::ManglingParser<CanonicalizerAllocator>;
 }
 
 struct ItaniumManglingCanonicalizer::Impl {
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 582e2cf6c11621ee63d80e674320b002a72d780b..a55ad881d01271490839974aea50b40152137ca8 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SMLoc.h"
+#include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
@@ -370,65 +371,48 @@ static bool isNonASCII(char c) {
   return c & 0x80;
 }
 
-void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
-                         bool ShowKindLabel) const {
-  // Display colors only if OS supports colors.
-  ShowColors &= S.has_colors();
+void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
+                         bool ShowColors, bool ShowKindLabel) const {
+  {
+    WithColor S(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors);
 
-  if (ShowColors)
-    S.changeColor(raw_ostream::SAVEDCOLOR, true);
+    if (ProgName && ProgName[0])
+      S << ProgName << ": ";
 
-  if (ProgName && ProgName[0])
-    S << ProgName << ": ";
+    if (!Filename.empty()) {
+      if (Filename == "-")
+        S << "<stdin>";
+      else
+        S << Filename;
 
-  if (!Filename.empty()) {
-    if (Filename == "-")
-      S << "<stdin>";
-    else
-      S << Filename;
-
-    if (LineNo != -1) {
-      S << ':' << LineNo;
-      if (ColumnNo != -1)
-        S << ':' << (ColumnNo+1);
+      if (LineNo != -1) {
+        S << ':' << LineNo;
+        if (ColumnNo != -1)
+          S << ':' << (ColumnNo + 1);
+      }
+      S << ": ";
     }
-    S << ": ";
   }
 
   if (ShowKindLabel) {
     switch (Kind) {
     case SourceMgr::DK_Error:
-      if (ShowColors)
-        S.changeColor(raw_ostream::RED, true);
-      S << "error: ";
+      WithColor::error(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Warning:
-      if (ShowColors)
-        S.changeColor(raw_ostream::MAGENTA, true);
-      S << "warning: ";
+      WithColor::warning(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Note:
-      if (ShowColors)
-        S.changeColor(raw_ostream::BLACK, true);
-      S << "note: ";
+      WithColor::note(OS, "", !ShowColors);
       break;
     case SourceMgr::DK_Remark:
-      if (ShowColors)
-        S.changeColor(raw_ostream::BLUE, true);
-      S << "remark: ";
+      WithColor::remark(OS, "", !ShowColors);
       break;
     }
-
-    if (ShowColors) {
-      S.resetColor();
-      S.changeColor(raw_ostream::SAVEDCOLOR, true);
-    }
   }
 
-  S << Message << '\n';
-
-  if (ShowColors)
-    S.resetColor();
+  WithColor(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors)
+      << Message << '\n';
 
   if (LineNo == -1 || ColumnNo == -1)
     return;
@@ -439,7 +423,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
   // expanding them later, and bail out rather than show incorrect ranges and
   // misaligned fixits for any other odd characters.
   if (find_if(LineContents, isNonASCII) != LineContents.end()) {
-    printSourceLine(S, LineContents);
+    printSourceLine(OS, LineContents);
     return;
   }
   size_t NumColumns = LineContents.size();
@@ -473,29 +457,27 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
   // least.
   CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
 
-  printSourceLine(S, LineContents);
+  printSourceLine(OS, LineContents);
 
-  if (ShowColors)
-    S.changeColor(raw_ostream::GREEN, true);
+  {
+    WithColor S(OS, raw_ostream::GREEN, true, false, !ShowColors);
 
-  // Print out the caret line, matching tabs in the source line.
-  for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
-    if (i >= LineContents.size() || LineContents[i] != '\t') {
-      S << CaretLine[i];
-      ++OutCol;
-      continue;
-    }
+    // Print out the caret line, matching tabs in the source line.
+    for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
+      if (i >= LineContents.size() || LineContents[i] != '\t') {
+        S << CaretLine[i];
+        ++OutCol;
+        continue;
+      }
 
-    // Okay, we have a tab.  Insert the appropriate number of characters.
-    do {
-      S << CaretLine[i];
-      ++OutCol;
-    } while ((OutCol % TabStop) != 0);
+      // Okay, we have a tab.  Insert the appropriate number of characters.
+      do {
+        S << CaretLine[i];
+        ++OutCol;
+      } while ((OutCol % TabStop) != 0);
+    }
+    S << '\n';
   }
-  S << '\n';
-
-  if (ShowColors)
-    S.resetColor();
 
   // Print out the replacement line, matching tabs in the source line.
   if (FixItInsertionLine.empty())
@@ -503,14 +485,14 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
 
   for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
     if (i >= LineContents.size() || LineContents[i] != '\t') {
-      S << FixItInsertionLine[i];
+      OS << FixItInsertionLine[i];
       ++OutCol;
       continue;
     }
 
     // Okay, we have a tab.  Insert the appropriate number of characters.
     do {
-      S << FixItInsertionLine[i];
+      OS << FixItInsertionLine[i];
       // FIXME: This is trying not to break up replacements, but then to re-sync
       // with the tabs between replacements. This will fail, though, if two
       // fix-it replacements are exactly adjacent, or if a fix-it contains a
@@ -521,5 +503,5 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S, bool ShowColors,
       ++OutCol;
     } while (((OutCol % TabStop) != 0) && i != e);
   }
-  S << '\n';
+  OS << '\n';
 }
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index f2fdc23ad85fcc8d2e73494766afdc35f2961973..968b559c08dfa5a268347faa4d6e6f2c1c97987e 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -995,7 +995,7 @@ constexpr GPUInfo R600GPUs[26] = {
 
 // This table should be sorted by the value of GPUKind
 // Don't bother listing the implicitly true features
-constexpr GPUInfo AMDGCNGPUs[32] = {
+constexpr GPUInfo AMDGCNGPUs[33] = {
   // Name         Canonical    Kind        Features
   //              Name
   {{"gfx600"},    {"gfx600"},  GK_GFX600,  FEATURE_FAST_FMA_F32},
@@ -1030,6 +1030,7 @@ constexpr GPUInfo AMDGCNGPUs[32] = {
   {{"gfx902"},    {"gfx902"},  GK_GFX902,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
   {{"gfx904"},    {"gfx904"},  GK_GFX904,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
   {{"gfx906"},    {"gfx906"},  GK_GFX906,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
+  {{"gfx909"},    {"gfx909"},  GK_GFX909,  FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
 };
 
 const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) {
@@ -1124,6 +1125,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
   case GK_GFX902: return {9, 0, 2};
   case GK_GFX904: return {9, 0, 4};
   case GK_GFX906: return {9, 0, 6};
+  case GK_GFX909: return {9, 0, 9};
   default:        return {0, 0, 0};
   }
 }
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index ec3eecb29474f41378dbf882b90036e0d5bb3b41..02b7c2579c9c5bf13fd55ac2933ca7f75471b09b 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -49,6 +49,7 @@
 // For GNU Hurd
 #if defined(__GNU__) && !defined(PATH_MAX)
 # define PATH_MAX 4096
+# define MAXPATHLEN 4096
 #endif
 
 #include <sys/types.h>
@@ -82,7 +83,7 @@
 #define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
 #endif
 
-#if defined(__NetBSD__)
+#if defined(__NetBSD__) || defined(__GNU__)
 #define STATVFS_F_FLAG(vfs) (vfs).f_flag
 #else
 #define STATVFS_F_FLAG(vfs) (vfs).f_flags
@@ -98,7 +99,7 @@ const file_t kInvalidFile = -1;
 
 #if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) ||     \
     defined(__minix) || defined(__FreeBSD_kernel__) || defined(__linux__) ||   \
-    defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX)
+    defined(__CYGWIN__) || defined(__DragonFly__) || defined(_AIX) || defined(__GNU__)
 static int
 test_dir(char ret[PATH_MAX], const char *dir, const char *bin)
 {
@@ -178,14 +179,34 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
   char exe_path[MAXPATHLEN];
   StringRef aPath("/proc/self/exe");
   if (sys::fs::exists(aPath)) {
-      // /proc is not always mounted under Linux (chroot for example).
-      ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
-      if (len >= 0)
-          return std::string(exe_path, len);
+    // /proc is not always mounted under Linux (chroot for example).
+    ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
+    if (len < 0)
+      return "";
+
+    // Null terminate the string for realpath. readlink never null
+    // terminates its output.
+    len = std::min(len, ssize_t(sizeof(exe_path) - 1));
+    exe_path[len] = '\0';
+
+    // On Linux, /proc/self/exe always looks through symlinks. However, on
+    // GNU/Hurd, /proc/self/exe is a symlink to the path that was used to start
+    // the program, and not the eventual binary file. Therefore, call realpath
+    // so this behaves the same on all platforms.
+#if _POSIX_VERSION >= 200112 || defined(__GLIBC__)
+    char *real_path = realpath(exe_path, NULL);
+    std::string ret = std::string(real_path);
+    free(real_path);
+    return ret;
+#else
+    char real_path[MAXPATHLEN];
+    realpath(exe_path, real_path);
+    return std::string(real_path);
+#endif
   } else {
-      // Fall back to the classical detection.
-      if (getprogpath(exe_path, argv0))
-        return exe_path;
+    // Fall back to the classical detection.
+    if (getprogpath(exe_path, argv0))
+      return exe_path;
   }
 #elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
   // Use dladdr to get executable path if available.
@@ -347,7 +368,7 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
 }
 
 static bool is_local_impl(struct STATVFS &Vfs) {
-#if defined(__linux__)
+#if defined(__linux__) || defined(__GNU__)
 #ifndef NFS_SUPER_MAGIC
 #define NFS_SUPER_MAGIC 0x6969
 #endif
@@ -357,7 +378,11 @@ static bool is_local_impl(struct STATVFS &Vfs) {
 #ifndef CIFS_MAGIC_NUMBER
 #define CIFS_MAGIC_NUMBER 0xFF534D42
 #endif
+#ifdef __GNU__
+  switch ((uint32_t)Vfs.__f_type) {
+#else
   switch ((uint32_t)Vfs.f_type) {
+#endif
   case NFS_SUPER_MAGIC:
   case SMB_SUPER_MAGIC:
   case CIFS_MAGIC_NUMBER:
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index de26695d64ea2c3dd6381b51094e43b1b23b60a3..ad88d5e969068e6802d0e9b359d8c4101667f1f5 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -47,6 +47,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <string>
+#include <sysexits.h>
 #ifdef HAVE_BACKTRACE
 # include BACKTRACE_HEADER         // For backtrace().
 #endif
@@ -334,6 +335,10 @@ static RETSIGTYPE SignalHandler(int Sig) {
       if (auto OldInterruptFunction = InterruptFunction.exchange(nullptr))
         return OldInterruptFunction();
 
+      // Send a special return code that drivers can check for, from sysexits.h.
+      if (Sig == SIGPIPE)
+        exit(EX_IOERR);
+
       raise(Sig);   // Execute the default handler.
       return;
    }
diff --git a/lib/Support/VirtualFileSystem.cpp b/lib/Support/VirtualFileSystem.cpp
index 23b5fbceb20ed79ad99b5e82f68313c76bab2764..e8b0435b9cdf12fe5d43603ecea5adf23b8b3a24 100644
--- a/lib/Support/VirtualFileSystem.cpp
+++ b/lib/Support/VirtualFileSystem.cpp
@@ -136,6 +136,10 @@ std::error_code FileSystem::getRealPath(const Twine &Path,
   return errc::operation_not_permitted;
 }
 
+std::error_code FileSystem::isLocal(const Twine &Path, bool &Result) {
+  return errc::operation_not_permitted;
+}
+
 bool FileSystem::exists(const Twine &Path) {
   auto Status = status(Path);
   return Status && Status->exists();
@@ -233,6 +237,7 @@ public:
 
   llvm::ErrorOr<std::string> getCurrentWorkingDirectory() const override;
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
+  std::error_code isLocal(const Twine &Path, bool &Result) override;
   std::error_code getRealPath(const Twine &Path,
                               SmallVectorImpl<char> &Output) const override;
 
@@ -288,6 +293,10 @@ std::error_code RealFileSystem::setCurrentWorkingDirectory(const Twine &Path) {
   return std::error_code();
 }
 
+std::error_code RealFileSystem::isLocal(const Twine &Path, bool &Result) {
+  return llvm::sys::fs::is_local(Path, Result);
+}
+
 std::error_code
 RealFileSystem::getRealPath(const Twine &Path,
                             SmallVectorImpl<char> &Output) const {
@@ -377,6 +386,13 @@ OverlayFileSystem::setCurrentWorkingDirectory(const Twine &Path) {
   return {};
 }
 
+std::error_code OverlayFileSystem::isLocal(const Twine &Path, bool &Result) {
+  for (auto &FS : FSList)
+    if (FS->exists(Path))
+      return FS->isLocal(Path, Result);
+  return errc::no_such_file_or_directory;
+}
+
 std::error_code
 OverlayFileSystem::getRealPath(const Twine &Path,
                                SmallVectorImpl<char> &Output) const {
@@ -913,6 +929,11 @@ InMemoryFileSystem::getRealPath(const Twine &Path,
   return {};
 }
 
+std::error_code InMemoryFileSystem::isLocal(const Twine &Path, bool &Result) {
+  Result = false;
+  return {};
+}
+
 } // namespace vfs
 } // namespace llvm
 
@@ -993,16 +1014,44 @@ public:
   static bool classof(const Entry *E) { return E->getKind() == EK_File; }
 };
 
+// FIXME: reuse implementation common with OverlayFSDirIterImpl as these
+// iterators are conceptually similar.
 class VFSFromYamlDirIterImpl : public llvm::vfs::detail::DirIterImpl {
   std::string Dir;
   RedirectingDirectoryEntry::iterator Current, End;
 
-  std::error_code incrementImpl();
+  // To handle 'fallthrough' mode we need to iterate at first through
+  // RedirectingDirectoryEntry and then through ExternalFS. These operations are
+  // done sequentially, we just need to keep a track of what kind of iteration
+  // we are currently performing.
+
+  /// Flag telling if we should iterate through ExternalFS or stop at the last
+  /// RedirectingDirectoryEntry::iterator.
+  bool IterateExternalFS;
+  /// Flag telling if we have switched to iterating through ExternalFS.
+  bool IsExternalFSCurrent = false;
+  FileSystem &ExternalFS;
+  directory_iterator ExternalDirIter;
+  llvm::StringSet<> SeenNames;
+
+  /// To combine multiple iterations, different methods are responsible for
+  /// different iteration steps.
+  /// @{
+
+  /// Responsible for dispatching between RedirectingDirectoryEntry iteration
+  /// and ExternalFS iteration.
+  std::error_code incrementImpl(bool IsFirstTime);
+  /// Responsible for RedirectingDirectoryEntry iteration.
+  std::error_code incrementContent(bool IsFirstTime);
+  /// Responsible for ExternalFS iteration.
+  std::error_code incrementExternal();
+  /// @}
 
 public:
   VFSFromYamlDirIterImpl(const Twine &Path,
                          RedirectingDirectoryEntry::iterator Begin,
                          RedirectingDirectoryEntry::iterator End,
+                         bool IterateExternalFS, FileSystem &ExternalFS,
                          std::error_code &EC);
 
   std::error_code increment() override;
@@ -1028,7 +1077,7 @@ public:
 ///   'case-sensitive': <boolean, default=true>
 ///   'use-external-names': <boolean, default=true>
 ///   'overlay-relative': <boolean, default=false>
-///   'ignore-non-existent-contents': <boolean, default=true>
+///   'fallthrough': <boolean, default=true>
 ///
 /// Virtual directories are represented as
 /// \verbatim
@@ -1093,13 +1142,9 @@ class RedirectingFileSystem : public vfs::FileSystem {
   /// names of files.  This global value is overridable on a per-file basis.
   bool UseExternalNames = true;
 
-  /// Whether an invalid path obtained via 'external-contents' should
-  /// cause iteration on the VFS to stop. If 'true', the VFS should ignore
-  /// the entry and continue with the next. Allows YAML files to be shared
-  /// across multiple compiler invocations regardless of prior existent
-  /// paths in 'external-contents'. This global value is overridable on a
-  /// per-file basis.
-  bool IgnoreNonExistentContents = true;
+  /// Whether to attempt a file lookup in external file system after it wasn't
+  /// found in VFS.
+  bool IsFallthrough = true;
   /// @}
 
   /// Virtual file paths and external files could be canonicalized without "..",
@@ -1146,10 +1191,16 @@ public:
     return ExternalFS->setCurrentWorkingDirectory(Path);
   }
 
+  std::error_code isLocal(const Twine &Path, bool &Result) override {
+    return ExternalFS->isLocal(Path, Result);
+  }
+
   directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override {
     ErrorOr<Entry *> E = lookupPath(Dir);
     if (!E) {
       EC = E.getError();
+      if (IsFallthrough && EC == errc::no_such_file_or_directory)
+        return ExternalFS->dir_begin(Dir, EC);
       return {};
     }
     ErrorOr<Status> S = status(Dir, *E);
@@ -1165,7 +1216,8 @@ public:
 
     auto *D = cast<RedirectingDirectoryEntry>(*E);
     return directory_iterator(std::make_shared<VFSFromYamlDirIterImpl>(
-        Dir, D->contents_begin(), D->contents_end(), EC));
+        Dir, D->contents_begin(), D->contents_end(),
+        /*IterateExternalFS=*/IsFallthrough, *ExternalFS, EC));
   }
 
   void setExternalContentsPrefixDir(StringRef PrefixDir) {
@@ -1176,8 +1228,6 @@ public:
     return ExternalContentsPrefixDir;
   }
 
-  bool ignoreNonExistentContents() const { return IgnoreNonExistentContents; }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   LLVM_DUMP_METHOD void dump() const {
     for (const auto &Root : Roots)
@@ -1549,7 +1599,7 @@ public:
         KeyStatusPair("case-sensitive", false),
         KeyStatusPair("use-external-names", false),
         KeyStatusPair("overlay-relative", false),
-        KeyStatusPair("ignore-non-existent-contents", false),
+        KeyStatusPair("fallthrough", false),
         KeyStatusPair("roots", true),
     };
 
@@ -1607,8 +1657,8 @@ public:
       } else if (Key == "use-external-names") {
         if (!parseScalarBool(I.getValue(), FS->UseExternalNames))
           return false;
-      } else if (Key == "ignore-non-existent-contents") {
-        if (!parseScalarBool(I.getValue(), FS->IgnoreNonExistentContents))
+      } else if (Key == "fallthrough") {
+        if (!parseScalarBool(I.getValue(), FS->IsFallthrough))
           return false;
       } else {
         llvm_unreachable("key missing from Keys");
@@ -1775,8 +1825,13 @@ ErrorOr<Status> RedirectingFileSystem::status(const Twine &Path, Entry *E) {
 
 ErrorOr<Status> RedirectingFileSystem::status(const Twine &Path) {
   ErrorOr<Entry *> Result = lookupPath(Path);
-  if (!Result)
+  if (!Result) {
+    if (IsFallthrough &&
+        Result.getError() == llvm::errc::no_such_file_or_directory) {
+      return ExternalFS->status(Path);
+    }
     return Result.getError();
+  }
   return status(Path, *Result);
 }
 
@@ -1808,8 +1863,13 @@ public:
 ErrorOr<std::unique_ptr<File>>
 RedirectingFileSystem::openFileForRead(const Twine &Path) {
   ErrorOr<Entry *> E = lookupPath(Path);
-  if (!E)
+  if (!E) {
+    if (IsFallthrough &&
+        E.getError() == llvm::errc::no_such_file_or_directory) {
+      return ExternalFS->openFileForRead(Path);
+    }
     return E.getError();
+  }
 
   auto *F = dyn_cast<RedirectingFileEntry>(*E);
   if (!F) // FIXME: errc::not_a_file?
@@ -1915,7 +1975,7 @@ public:
 
   void write(ArrayRef<YAMLVFSEntry> Entries, Optional<bool> UseExternalNames,
              Optional<bool> IsCaseSensitive, Optional<bool> IsOverlayRelative,
-             Optional<bool> IgnoreNonExistentContents, StringRef OverlayDir);
+             StringRef OverlayDir);
 };
 
 } // namespace
@@ -1973,7 +2033,6 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
                        Optional<bool> UseExternalNames,
                        Optional<bool> IsCaseSensitive,
                        Optional<bool> IsOverlayRelative,
-                       Optional<bool> IgnoreNonExistentContents,
                        StringRef OverlayDir) {
   using namespace llvm::sys;
 
@@ -1991,9 +2050,6 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
     OS << "  'overlay-relative': '" << (UseOverlayRelative ? "true" : "false")
        << "',\n";
   }
-  if (IgnoreNonExistentContents.hasValue())
-    OS << "  'ignore-non-existent-contents': '"
-       << (IgnoreNonExistentContents.getValue() ? "true" : "false") << "',\n";
   OS << "  'roots': [\n";
 
   if (!Entries.empty()) {
@@ -2049,24 +2105,47 @@ void YAMLVFSWriter::write(llvm::raw_ostream &OS) {
   });
 
   JSONWriter(OS).write(Mappings, UseExternalNames, IsCaseSensitive,
-                       IsOverlayRelative, IgnoreNonExistentContents,
-                       OverlayDir);
+                       IsOverlayRelative, OverlayDir);
 }
 
 VFSFromYamlDirIterImpl::VFSFromYamlDirIterImpl(
     const Twine &_Path, RedirectingDirectoryEntry::iterator Begin,
-    RedirectingDirectoryEntry::iterator End, std::error_code &EC)
-    : Dir(_Path.str()), Current(Begin), End(End) {
-  EC = incrementImpl();
+    RedirectingDirectoryEntry::iterator End, bool IterateExternalFS,
+    FileSystem &ExternalFS, std::error_code &EC)
+    : Dir(_Path.str()), Current(Begin), End(End),
+      IterateExternalFS(IterateExternalFS), ExternalFS(ExternalFS) {
+  EC = incrementImpl(/*IsFirstTime=*/true);
 }
 
 std::error_code VFSFromYamlDirIterImpl::increment() {
-  assert(Current != End && "cannot iterate past end");
-  ++Current;
-  return incrementImpl();
+  return incrementImpl(/*IsFirstTime=*/false);
+}
+
+std::error_code VFSFromYamlDirIterImpl::incrementExternal() {
+  assert(!(IsExternalFSCurrent && ExternalDirIter == directory_iterator()) &&
+         "incrementing past end");
+  std::error_code EC;
+  if (IsExternalFSCurrent) {
+    ExternalDirIter.increment(EC);
+  } else if (IterateExternalFS) {
+    ExternalDirIter = ExternalFS.dir_begin(Dir, EC);
+    IsExternalFSCurrent = true;
+    if (EC && EC != errc::no_such_file_or_directory)
+      return EC;
+    EC = {};
+  }
+  if (EC || ExternalDirIter == directory_iterator()) {
+    CurrentEntry = directory_entry();
+  } else {
+    CurrentEntry = *ExternalDirIter;
+  }
+  return EC;
 }
 
-std::error_code VFSFromYamlDirIterImpl::incrementImpl() {
+std::error_code VFSFromYamlDirIterImpl::incrementContent(bool IsFirstTime) {
+  assert((IsFirstTime || Current != End) && "cannot iterate past end");
+  if (!IsFirstTime)
+    ++Current;
   while (Current != End) {
     SmallString<128> PathStr(Dir);
     llvm::sys::path::append(PathStr, (*Current)->getName());
@@ -2080,12 +2159,22 @@ std::error_code VFSFromYamlDirIterImpl::incrementImpl() {
       break;
     }
     CurrentEntry = directory_entry(PathStr.str(), Type);
-    break;
+    return {};
   }
+  return incrementExternal();
+}
 
-  if (Current == End)
-    CurrentEntry = directory_entry();
-  return {};
+std::error_code VFSFromYamlDirIterImpl::incrementImpl(bool IsFirstTime) {
+  while (true) {
+    std::error_code EC = IsExternalFSCurrent ? incrementExternal()
+                                             : incrementContent(IsFirstTime);
+    if (EC || CurrentEntry.path().empty())
+      return EC;
+    StringRef Name = llvm::sys::path::filename(CurrentEntry.path());
+    if (SeenNames.insert(Name).second)
+      return EC; // name not seen before
+  }
+  llvm_unreachable("returned above");
 }
 
 vfs::recursive_directory_iterator::recursive_directory_iterator(
@@ -2093,28 +2182,33 @@ vfs::recursive_directory_iterator::recursive_directory_iterator(
     : FS(&FS_) {
   directory_iterator I = FS->dir_begin(Path, EC);
   if (I != directory_iterator()) {
-    State = std::make_shared<IterState>();
-    State->push(I);
+    State = std::make_shared<detail::RecDirIterState>();
+    State->Stack.push(I);
   }
 }
 
 vfs::recursive_directory_iterator &
 recursive_directory_iterator::increment(std::error_code &EC) {
-  assert(FS && State && !State->empty() && "incrementing past end");
-  assert(!State->top()->path().empty() && "non-canonical end iterator");
+  assert(FS && State && !State->Stack.empty() && "incrementing past end");
+  assert(!State->Stack.top()->path().empty() && "non-canonical end iterator");
   vfs::directory_iterator End;
-  if (State->top()->type() == sys::fs::file_type::directory_file) {
-    vfs::directory_iterator I = FS->dir_begin(State->top()->path(), EC);
-    if (I != End) {
-      State->push(I);
-      return *this;
+
+  if (State->HasNoPushRequest)
+    State->HasNoPushRequest = false;
+  else {
+    if (State->Stack.top()->type() == sys::fs::file_type::directory_file) {
+      vfs::directory_iterator I = FS->dir_begin(State->Stack.top()->path(), EC);
+      if (I != End) {
+        State->Stack.push(I);
+        return *this;
+      }
     }
   }
 
-  while (!State->empty() && State->top().increment(EC) == End)
-    State->pop();
+  while (!State->Stack.empty() && State->Stack.top().increment(EC) == End)
+    State->Stack.pop();
 
-  if (State->empty())
+  if (State->Stack.empty())
     State.reset(); // end iterator
 
   return *this;
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index e719d3c7b728e4cb394b50606cf7c0eb24765a59..45d73ae3dfed29a98ec00444c05492e1354a5eef 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -854,16 +854,37 @@ mapped_file_region::mapped_file_region(int fd, mapmode mode, size_t length,
     Mapping = 0;
 }
 
+static bool hasFlushBufferKernelBug() {
+  static bool Ret{GetWindowsOSVersion() < llvm::VersionTuple(10, 0, 0, 17763)};
+  return Ret;
+}
+
+static bool isEXE(StringRef Magic) {
+  static const char PEMagic[] = {'P', 'E', '\0', '\0'};
+  if (Magic.startswith(StringRef("MZ")) && Magic.size() >= 0x3c + 4) {
+    uint32_t off = read32le(Magic.data() + 0x3c);
+    // PE/COFF file, either EXE or DLL.
+    if (Magic.substr(off).startswith(StringRef(PEMagic, sizeof(PEMagic))))
+      return true;
+  }
+  return false;
+}
+
 mapped_file_region::~mapped_file_region() {
   if (Mapping) {
+
+    bool Exe = isEXE(StringRef((char *)Mapping, Size));
+
     ::UnmapViewOfFile(Mapping);
 
-    if (Mode == mapmode::readwrite) {
+    if (Mode == mapmode::readwrite && Exe && hasFlushBufferKernelBug()) {
       // There is a Windows kernel bug, the exact trigger conditions of which
       // are not well understood.  When triggered, dirty pages are not properly
       // flushed and subsequent process's attempts to read a file can return
       // invalid data.  Calling FlushFileBuffers on the write handle is
       // sufficient to ensure that this bug is not triggered.
+      // The bug only occurs when writing an executable and executing it right
+      // after, under high I/O pressure.
       ::FlushFileBuffers(FileHandle);
     }
 
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index ce646d63609b5b7650994cffe6c37792dc7df212..2b2d7923143438b33defe04ac9173655b28c92ef 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -460,3 +460,27 @@ unsigned Process::GetRandomNumber() {
     ReportLastErrorFatal("Could not generate a random number");
   return Ret;
 }
+
+typedef NTSTATUS(WINAPI* RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
+#define STATUS_SUCCESS ((NTSTATUS)0x00000000L)
+
+llvm::VersionTuple llvm::GetWindowsOSVersion() {
+  HMODULE hMod = ::GetModuleHandleW(L"ntdll.dll");
+  if (hMod) {
+    auto getVer = (RtlGetVersionPtr)::GetProcAddress(hMod, "RtlGetVersion");
+    if (getVer) {
+      RTL_OSVERSIONINFOEXW info{};
+      info.dwOSVersionInfoSize = sizeof(info);
+      if (getVer((PRTL_OSVERSIONINFOW)&info) == STATUS_SUCCESS) {
+        return llvm::VersionTuple(info.dwMajorVersion, info.dwMinorVersion, 0,
+                                  info.dwBuildNumber);
+      }
+    }
+  }
+  return llvm::VersionTuple(0, 0, 0, 0);
+}
+
+bool llvm::RunningWindows8OrGreater() {
+  // Windows 8 is version 6.2, service pack 0.
+  return GetWindowsOSVersion() >= llvm::VersionTuple(6, 2, 0, 0);
+}
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 88c56bc173bb52f5a577ddaf2467ab82ec19c46f..c037956603f268f4fd67618fd03a83b69109a39e 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -105,6 +105,25 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
   return std::string(U8Result.begin(), U8Result.end());
 }
 
+bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix) {
+  if (!ErrMsg)
+    return true;
+  char *buffer = NULL;
+  DWORD LastError = GetLastError();
+  DWORD R = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                               FORMAT_MESSAGE_FROM_SYSTEM |
+                               FORMAT_MESSAGE_MAX_WIDTH_MASK,
+                           NULL, LastError, 0, (LPSTR)&buffer, 1, NULL);
+  if (R)
+    *ErrMsg = prefix + ": " + buffer;
+  else
+    *ErrMsg = prefix + ": Unknown error";
+  *ErrMsg += " (0x" + llvm::utohexstr(LastError) + ")";
+
+  LocalFree(buffer);
+  return R != 0;
+}
+
 static HANDLE RedirectIO(Optional<StringRef> Path, int fd,
                          std::string *ErrMsg) {
   HANDLE h;
diff --git a/lib/Support/Windows/WindowsSupport.h b/lib/Support/Windows/WindowsSupport.h
index c2fd6bb982d42ae0a9f4899874c96678b470ae80..979cc5d0139048d881ef63ae4d1210dfef2dec9e 100644
--- a/lib/Support/Windows/WindowsSupport.h
+++ b/lib/Support/Windows/WindowsSupport.h
@@ -41,6 +41,7 @@
 #include "llvm/Config/config.h" // Get build system configuration settings
 #include "llvm/Support/Chrono.h"
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/VersionTuple.h"
 #include <cassert>
 #include <string>
 #include <system_error>
@@ -49,54 +50,29 @@
 // Must be included after windows.h
 #include <wincrypt.h>
 
+namespace llvm {
+
 /// Determines if the program is running on Windows 8 or newer. This
 /// reimplements one of the helpers in the Windows 8.1 SDK, which are intended
 /// to supercede raw calls to GetVersionEx. Old SDKs, Cygwin, and MinGW don't
 /// yet have VersionHelpers.h, so we have our own helper.
-inline bool RunningWindows8OrGreater() {
-  // Windows 8 is version 6.2, service pack 0.
-  OSVERSIONINFOEXW osvi = {};
-  osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
-  osvi.dwMajorVersion = 6;
-  osvi.dwMinorVersion = 2;
-  osvi.wServicePackMajor = 0;
-
-  DWORDLONG Mask = 0;
-  Mask = VerSetConditionMask(Mask, VER_MAJORVERSION, VER_GREATER_EQUAL);
-  Mask = VerSetConditionMask(Mask, VER_MINORVERSION, VER_GREATER_EQUAL);
-  Mask = VerSetConditionMask(Mask, VER_SERVICEPACKMAJOR, VER_GREATER_EQUAL);
-
-  return VerifyVersionInfoW(&osvi, VER_MAJORVERSION | VER_MINORVERSION |
-                                       VER_SERVICEPACKMAJOR,
-                            Mask) != FALSE;
-}
+bool RunningWindows8OrGreater();
 
-inline bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix) {
-  if (!ErrMsg)
-    return true;
-  char *buffer = NULL;
-  DWORD LastError = GetLastError();
-  DWORD R = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER |
-                               FORMAT_MESSAGE_FROM_SYSTEM |
-                               FORMAT_MESSAGE_MAX_WIDTH_MASK,
-                           NULL, LastError, 0, (LPSTR)&buffer, 1, NULL);
-  if (R)
-    *ErrMsg = prefix + ": " + buffer;
-  else
-    *ErrMsg = prefix + ": Unknown error";
-  *ErrMsg += " (0x" + llvm::utohexstr(LastError) + ")";
-
-  LocalFree(buffer);
-  return R != 0;
-}
+/// Returns the Windows version as Major.Minor.0.BuildNumber. Uses
+/// RtlGetVersion or GetVersionEx under the hood depending on what is available.
+/// GetVersionEx is deprecated, but this API exposes the build number which can
+/// be useful for working around certain kernel bugs.
+llvm::VersionTuple GetWindowsOSVersion();
+
+bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix);
 
 template <typename HandleTraits>
 class ScopedHandle {
   typedef typename HandleTraits::handle_type handle_type;
   handle_type Handle;
 
-  ScopedHandle(const ScopedHandle &other); // = delete;
-  void operator=(const ScopedHandle &other); // = delete;
+  ScopedHandle(const ScopedHandle &other) = delete;
+  void operator=(const ScopedHandle &other) = delete;
 public:
   ScopedHandle()
     : Handle(HandleTraits::GetInvalid()) {}
@@ -201,7 +177,6 @@ typedef ScopedHandle<RegTraits>          ScopedRegHandle;
 typedef ScopedHandle<FindHandleTraits>   ScopedFindHandle;
 typedef ScopedHandle<JobHandleTraits>    ScopedJobHandle;
 
-namespace llvm {
 template <class T>
 class SmallVectorImpl;
 
diff --git a/lib/Support/WithColor.cpp b/lib/Support/WithColor.cpp
index d2e13f0e86de68a7812d2bb1668d7c4d13691b56..cf4c10956f218fba4ef020c9061ceb164dcd2703 100644
--- a/lib/Support/WithColor.cpp
+++ b/lib/Support/WithColor.cpp
@@ -19,15 +19,10 @@ static cl::opt<cl::boolOrDefault>
              cl::desc("Use colors in output (default=autodetect)"),
              cl::init(cl::BOU_UNSET));
 
-bool WithColor::colorsEnabled(raw_ostream &OS) {
-  if (UseColor == cl::BOU_UNSET)
-    return OS.has_colors();
-  return UseColor == cl::BOU_TRUE;
-}
-
-WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
+WithColor::WithColor(raw_ostream &OS, HighlightColor Color, bool DisableColors)
+    : OS(OS), DisableColors(DisableColors) {
   // Detect color from terminal type unless the user passed the --color option.
-  if (colorsEnabled(OS)) {
+  if (colorsEnabled()) {
     switch (Color) {
     case HighlightColor::Address:
       OS.changeColor(raw_ostream::YELLOW);
@@ -56,6 +51,9 @@ WithColor::WithColor(raw_ostream &OS, HighlightColor Color) : OS(OS) {
     case HighlightColor::Note:
       OS.changeColor(raw_ostream::BLACK, true);
       break;
+    case HighlightColor::Remark:
+      OS.changeColor(raw_ostream::BLUE, true);
+      break;
     }
   }
 }
@@ -66,25 +64,58 @@ raw_ostream &WithColor::warning() { return warning(errs()); }
 
 raw_ostream &WithColor::note() { return note(errs()); }
 
-raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::remark() { return remark(errs()); }
+
+raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix,
+                              bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Error).get() << "error: ";
+  return WithColor(OS, HighlightColor::Error, DisableColors).get()
+         << "error: ";
 }
 
-raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix,
+                                bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Warning).get() << "warning: ";
+  return WithColor(OS, HighlightColor::Warning, DisableColors).get()
+         << "warning: ";
 }
 
-raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix) {
+raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix,
+                             bool DisableColors) {
   if (!Prefix.empty())
     OS << Prefix << ": ";
-  return WithColor(OS, HighlightColor::Note).get() << "note: ";
+  return WithColor(OS, HighlightColor::Note, DisableColors).get() << "note: ";
 }
 
-WithColor::~WithColor() {
-  if (colorsEnabled(OS))
+raw_ostream &WithColor::remark(raw_ostream &OS, StringRef Prefix,
+                               bool DisableColors) {
+  if (!Prefix.empty())
+    OS << Prefix << ": ";
+  return WithColor(OS, HighlightColor::Remark, DisableColors).get()
+         << "remark: ";
+}
+
+bool WithColor::colorsEnabled() {
+  if (DisableColors)
+    return false;
+  if (UseColor == cl::BOU_UNSET)
+    return OS.has_colors();
+  return UseColor == cl::BOU_TRUE;
+}
+
+WithColor &WithColor::changeColor(raw_ostream::Colors Color, bool Bold,
+                                  bool BG) {
+  if (colorsEnabled())
+    OS.changeColor(Color, Bold, BG);
+  return *this;
+}
+
+WithColor &WithColor::resetColor() {
+  if (colorsEnabled())
     OS.resetColor();
+  return *this;
 }
+
+WithColor::~WithColor() { resetColor(); }
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index d6345efd00cd3a50ace6f67ed16ff45848553003..f8492c96bab69e681d4fb163c60dd07de09bda5d 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -98,7 +98,7 @@ bool Input::setCurrentDocument() {
       ++DocIterator;
       return setCurrentDocument();
     }
-    TopNode = this->createHNodes(N);
+    TopNode = createHNodes(N);
     CurrentNode = TopNode.get();
     return true;
   }
@@ -343,7 +343,7 @@ void Input::blockScalarString(StringRef &S) { scalarString(S, QuotingType::None)
 
 void Input::setError(HNode *hnode, const Twine &message) {
   assert(hnode && "HNode must not be NULL");
-  this->setError(hnode->_node, message);
+  setError(hnode->_node, message);
 }
 
 void Input::setError(Node *node, const Twine &message) {
@@ -366,7 +366,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
   } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) {
     auto SQHNode = llvm::make_unique<SequenceHNode>(N);
     for (Node &SN : *SQ) {
-      auto Entry = this->createHNodes(&SN);
+      auto Entry = createHNodes(&SN);
       if (EC)
         break;
       SQHNode->Entries.push_back(std::move(Entry));
@@ -391,7 +391,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
         // Copy string to permanent storage
         KeyStr = StringStorage.str().copy(StringAllocator);
       }
-      auto ValueHNode = this->createHNodes(Value);
+      auto ValueHNode = createHNodes(Value);
       if (EC)
         break;
       mapHNode->Mapping[KeyStr] = std::move(ValueHNode);
@@ -406,7 +406,7 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
 }
 
 void Input::setError(const Twine &Message) {
-  this->setError(CurrentNode, Message);
+  setError(CurrentNode, Message);
 }
 
 bool Input::canElideEmptySequence() {
@@ -440,11 +440,11 @@ bool Output::mapTag(StringRef Tag, bool Use) {
         StateStack.size() > 1 && (StateStack[StateStack.size() - 2] == inSeq ||
           StateStack[StateStack.size() - 2] == inFlowSeq);
     if (SequenceElement && StateStack.back() == inMapFirstKey) {
-      this->newLineCheck();
+      newLineCheck();
     } else {
-      this->output(" ");
+      output(" ");
     }
-    this->output(Tag);
+    output(Tag);
     if (SequenceElement) {
       // If we're writing the tag during the first element of a map, the tag
       // takes the place of the first element in the sequence.
@@ -476,8 +476,8 @@ bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
     if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
       flowKey(Key);
     } else {
-      this->newLineCheck();
-      this->paddedKey(Key);
+      newLineCheck();
+      paddedKey(Key);
     }
     return true;
   }
@@ -496,23 +496,23 @@ void Output::postflightKey(void *) {
 
 void Output::beginFlowMapping() {
   StateStack.push_back(inFlowMapFirstKey);
-  this->newLineCheck();
+  newLineCheck();
   ColumnAtMapFlowStart = Column;
   output("{ ");
 }
 
 void Output::endFlowMapping() {
   StateStack.pop_back();
-  this->outputUpToEndOfLine(" }");
+  outputUpToEndOfLine(" }");
 }
 
 void Output::beginDocuments() {
-  this->outputUpToEndOfLine("---");
+  outputUpToEndOfLine("---");
 }
 
 bool Output::preflightDocument(unsigned index) {
   if (index > 0)
-    this->outputUpToEndOfLine("\n---");
+    outputUpToEndOfLine("\n---");
   return true;
 }
 
@@ -542,7 +542,7 @@ void Output::postflightElement(void *) {
 
 unsigned Output::beginFlowSequence() {
   StateStack.push_back(inFlowSeq);
-  this->newLineCheck();
+  newLineCheck();
   ColumnAtFlowStart = Column;
   output("[ ");
   NeedFlowSequenceComma = false;
@@ -551,7 +551,7 @@ unsigned Output::beginFlowSequence() {
 
 void Output::endFlowSequence() {
   StateStack.pop_back();
-  this->outputUpToEndOfLine(" ]");
+  outputUpToEndOfLine(" ]");
 }
 
 bool Output::preflightFlowElement(unsigned, void *&) {
@@ -577,8 +577,8 @@ void Output::beginEnumScalar() {
 
 bool Output::matchEnumScalar(const char *Str, bool Match) {
   if (Match && !EnumerationMatchFound) {
-    this->newLineCheck();
-    this->outputUpToEndOfLine(Str);
+    newLineCheck();
+    outputUpToEndOfLine(Str);
     EnumerationMatchFound = true;
   }
   return false;
@@ -597,7 +597,7 @@ void Output::endEnumScalar() {
 }
 
 bool Output::beginBitSetScalar(bool &DoClear) {
-  this->newLineCheck();
+  newLineCheck();
   output("[ ");
   NeedBitValueComma = false;
   DoClear = false;
@@ -608,27 +608,27 @@ bool Output::bitSetMatch(const char *Str, bool Matches) {
   if (Matches) {
     if (NeedBitValueComma)
       output(", ");
-    this->output(Str);
+    output(Str);
     NeedBitValueComma = true;
   }
   return false;
 }
 
 void Output::endBitSetScalar() {
-  this->outputUpToEndOfLine(" ]");
+  outputUpToEndOfLine(" ]");
 }
 
 void Output::scalarString(StringRef &S, QuotingType MustQuote) {
-  this->newLineCheck();
+  newLineCheck();
   if (S.empty()) {
     // Print '' for the empty string because leaving the field empty is not
     // allowed.
-    this->outputUpToEndOfLine("''");
+    outputUpToEndOfLine("''");
     return;
   }
   if (MustQuote == QuotingType::None) {
     // Only quote if we must.
-    this->outputUpToEndOfLine(S);
+    outputUpToEndOfLine(S);
     return;
   }
 
@@ -645,7 +645,7 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
   // escapes. This is handled in yaml::escape.
   if (MustQuote == QuotingType::Double) {
     output(yaml::escape(Base, /* EscapePrintable= */ false));
-    this->outputUpToEndOfLine(Quote);
+    outputUpToEndOfLine(Quote);
     return;
   }
 
@@ -659,7 +659,7 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
     ++j;
   }
   output(StringRef(&Base[i], j - i));
-  this->outputUpToEndOfLine(Quote); // Ending quote.
+  outputUpToEndOfLine(Quote); // Ending quote.
 }
 
 void Output::blockScalarString(StringRef &S) {
@@ -702,7 +702,7 @@ void Output::output(StringRef s) {
 }
 
 void Output::outputUpToEndOfLine(StringRef s) {
-  this->output(s);
+  output(s);
   if (StateStack.empty() || (StateStack.back() != inFlowSeq &&
                              StateStack.back() != inFlowMapFirstKey &&
                              StateStack.back() != inFlowMapOtherKey))
@@ -723,7 +723,7 @@ void Output::newLineCheck() {
     return;
   NeedsNewLine = false;
 
-  this->outputNewLine();
+  outputNewLine();
 
   assert(StateStack.size() > 0);
   unsigned Indent = StateStack.size() - 1;
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 6472dcd5157f84d54496feb626dff718179281a4..2f0d0bf346d6db04127fc2857bf25c2c7d3f72a3 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -32,6 +32,7 @@ class MachineFunctionPass;
 FunctionPass *createAArch64DeadRegisterDefinitions();
 FunctionPass *createAArch64RedundantCopyEliminationPass();
 FunctionPass *createAArch64CondBrTuning();
+FunctionPass *createAArch64CompressJumpTablesPass();
 FunctionPass *createAArch64ConditionalCompares();
 FunctionPass *createAArch64AdvSIMDScalar();
 FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
@@ -62,6 +63,7 @@ void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
 void initializeAArch64BranchTargetsPass(PassRegistry&);
 void initializeAArch64CollectLOHPass(PassRegistry&);
 void initializeAArch64CondBrTuningPass(PassRegistry &);
+void initializeAArch64CompressJumpTablesPass(PassRegistry&);
 void initializeAArch64ConditionalComparesPass(PassRegistry&);
 void initializeAArch64ConditionOptimizerPass(PassRegistry&);
 void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 368898fd1e66b341d296144e6dd2565d4ccb920e..9d596a1821c8de72fb6b0c54ff67c8c74a3aaed7 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -180,6 +180,10 @@ def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;
 
+def FeatureForce32BitJumpTables
+   : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
+                      "Force jump table entries to be 32-bits wide except at MinSize">;
+
 def FeatureRCPC : SubtargetFeature<"rcpc", "HasRCPC", "true",
                                    "Enable support for RCPC extension">;
 
@@ -404,9 +408,8 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                      FeatureCRC,
                                      FeatureCrypto,
                                      FeatureExynosCheapAsMoveHandling,
-                                     FeatureFPARMv8,
+                                     FeatureForce32BitJumpTables,
                                      FeatureFuseAES,
-                                     FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
@@ -419,9 +422,8 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                      FeatureCRC,
                                      FeatureCrypto,
                                      FeatureExynosCheapAsMoveHandling,
-                                     FeatureFPARMv8,
+                                     FeatureForce32BitJumpTables,
                                      FeatureFuseAES,
-                                     FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
@@ -432,13 +434,12 @@ def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     [FeatureCRC,
                                      FeatureCrypto,
                                      FeatureExynosCheapAsMoveHandling,
-                                     FeatureFPARMv8,
+                                     FeatureForce32BitJumpTables,
                                      FeatureFuseAddress,
                                      FeatureFuseAES,
                                      FeatureFuseCCSelect,
                                      FeatureFuseLiterals,
                                      FeatureLSLFast,
-                                     FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeaturePredictableSelectIsExpensive,
@@ -486,7 +487,7 @@ def ProcSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
                                    FeatureLSLFast,
-                                   HasV8_3aOps]>;
+                                   HasV8_4aOps]>;
 
 def ProcThunderX2T99  : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
                                          "ThunderX2T99",
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 23b6a65555af75bb868441f99765dbd1647ed9d1..1ff0392c0f2692c599dcb7aee580228c95907ff8 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -21,6 +21,7 @@
 #include "InstPrinter/AArch64InstPrinter.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64TargetStreamer.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
@@ -31,6 +32,8 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -77,6 +80,12 @@ public:
     return MCInstLowering.lowerOperand(MO, MCOp);
   }
 
+  void EmitJumpTableInfo() override;
+  void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                          const MachineBasicBlock *MBB, unsigned JTI);
+
+  void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI);
+
   void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                      const MachineInstr &MI);
   void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
@@ -433,6 +442,104 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
   printOperand(MI, NOps - 2, OS);
 }
 
+void AArch64AsmPrinter::EmitJumpTableInfo() {
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  if (!MJTI) return;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  if (JT.empty()) return;
+
+  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+  MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM);
+  OutStreamer->SwitchSection(ReadOnlySec);
+
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+    // If this jump table was deleted, ignore it.
+    if (JTBBs.empty()) continue;
+
+    unsigned Size = AFI->getJumpTableEntrySize(JTI);
+    EmitAlignment(Log2_32(Size));
+    OutStreamer->EmitLabel(GetJTISymbol(JTI));
+
+    for (auto *JTBB : JTBBs)
+      emitJumpTableEntry(MJTI, JTBB, JTI);
+  }
+}
+
+void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                           const MachineBasicBlock *MBB,
+                                           unsigned JTI) {
+  const MCExpr *Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  unsigned Size = AFI->getJumpTableEntrySize(JTI);
+
+  if (Size == 4) {
+    // .word LBB - LJTI
+    const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+    const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, JTI, OutContext);
+    Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+  } else {
+    // .byte (LBB - LBB) >> 2 (or .hword)
+    const MCSymbol *BaseSym = AFI->getJumpTableEntryPCRelSymbol(JTI);
+    const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext);
+    Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+    Value = MCBinaryExpr::createLShr(
+        Value, MCConstantExpr::create(2, OutContext), OutContext);
+  }
+
+  OutStreamer->EmitValue(Value, Size);
+}
+
+/// Small jump tables contain an unsigned byte or half, representing the offset
+/// from the lowest-addressed possible destination to the desired basic
+/// block. Since all instructions are 4-byte aligned, this is further compressed
+/// by counting in instructions rather than bytes (i.e. divided by 4). So, to
+/// materialize the correct destination we need:
+///
+///             adr xDest, .LBB0_0
+///             ldrb wScratch, [xTable, xEntry]   (with "lsl #1" for ldrh).
+///             add xDest, xDest, xScratch, lsl #2
+void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer,
+                                                const llvm::MachineInstr &MI) {
+  unsigned DestReg = MI.getOperand(0).getReg();
+  unsigned ScratchReg = MI.getOperand(1).getReg();
+  unsigned ScratchRegW =
+      STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32);
+  unsigned TableReg = MI.getOperand(2).getReg();
+  unsigned EntryReg = MI.getOperand(3).getReg();
+  int JTIdx = MI.getOperand(4).getIndex();
+  bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8;
+
+  // This has to be first because the compression pass based its reachability
+  // calculations on the start of the JumpTableDest instruction.
+  auto Label =
+      MF->getInfo<AArch64FunctionInfo>()->getJumpTableEntryPCRelSymbol(JTIdx);
+  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR)
+                                  .addReg(DestReg)
+                                  .addExpr(MCSymbolRefExpr::create(
+                                      Label, MF->getContext())));
+
+  // Load the number of instruction-steps to offset from the label.
+  unsigned LdrOpcode = IsByteEntry ? AArch64::LDRBBroX : AArch64::LDRHHroX;
+  EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode)
+                                  .addReg(ScratchRegW)
+                                  .addReg(TableReg)
+                                  .addReg(EntryReg)
+                                  .addImm(0)
+                                  .addImm(IsByteEntry ? 0 : 1));
+
+  // Multiply the steps by 4 and add to the already materialized base label
+  // address.
+  EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                  .addReg(DestReg)
+                                  .addReg(DestReg)
+                                  .addReg(ScratchReg)
+                                  .addImm(2));
+}
+
 void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                                       const MachineInstr &MI) {
   unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
@@ -559,6 +666,8 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     OutStreamer->EmitLabel(LOHLabel);
   }
 
+  AArch64TargetStreamer *TS =
+    static_cast<AArch64TargetStreamer *>(OutStreamer->getTargetStreamer());
   // Do any manual lowerings.
   switch (MI->getOpcode()) {
   default:
@@ -662,6 +771,32 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  case AArch64::JumpTableDest32: {
+    // We want:
+    //     ldrsw xScratch, [xTable, xEntry, lsl #2]
+    //     add xDest, xTable, xScratch
+    unsigned DestReg = MI->getOperand(0).getReg(),
+             ScratchReg = MI->getOperand(1).getReg(),
+             TableReg = MI->getOperand(2).getReg(),
+             EntryReg = MI->getOperand(3).getReg();
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX)
+                                     .addReg(ScratchReg)
+                                     .addReg(TableReg)
+                                     .addReg(EntryReg)
+                                     .addImm(0)
+                                     .addImm(1));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs)
+                                     .addReg(DestReg)
+                                     .addReg(TableReg)
+                                     .addReg(ScratchReg)
+                                     .addImm(0));
+    return;
+  }
+  case AArch64::JumpTableDest16:
+  case AArch64::JumpTableDest8:
+    LowerJumpTableDestSmall(*OutStreamer, *MI);
+    return;
+
   case AArch64::FMOVH0:
   case AArch64::FMOVS0:
   case AArch64::FMOVD0:
@@ -685,6 +820,100 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case TargetOpcode::PATCHABLE_TAIL_CALL:
     LowerPATCHABLE_TAIL_CALL(*MI);
     return;
+
+  case AArch64::SEH_StackAlloc:
+    TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_SaveFPLR:
+    TS->EmitARM64WinCFISaveFPLR(MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_SaveFPLR_X:
+    assert(MI->getOperand(0).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveFPLRX(-MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_SaveReg:
+    TS->EmitARM64WinCFISaveReg(MI->getOperand(0).getImm(),
+                               MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveReg_X:
+    assert(MI->getOperand(1).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveRegX(MI->getOperand(0).getImm(),
+		                -MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveRegP:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp");
+    TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(),
+                                MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveRegP_X:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp_x");
+    assert(MI->getOperand(2).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveRegPX(MI->getOperand(0).getImm(),
+                                 -MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveFReg:
+    TS->EmitARM64WinCFISaveFReg(MI->getOperand(0).getImm(),
+                                MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveFReg_X:
+    assert(MI->getOperand(1).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveFRegX(MI->getOperand(0).getImm(),
+                                 -MI->getOperand(1).getImm());
+    return;
+
+  case AArch64::SEH_SaveFRegP:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp");
+    TS->EmitARM64WinCFISaveFRegP(MI->getOperand(0).getImm(),
+                                 MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveFRegP_X:
+    assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) &&
+            "Non-consecutive registers not allowed for save_regp_x");
+    assert(MI->getOperand(2).getImm() < 0 &&
+           "Pre increment SEH opcode must have a negative offset");
+    TS->EmitARM64WinCFISaveFRegPX(MI->getOperand(0).getImm(),
+                                  -MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SetFP:
+    TS->EmitARM64WinCFISetFP();
+    return;
+
+  case AArch64::SEH_AddFP:
+    TS->EmitARM64WinCFIAddFP(MI->getOperand(0).getImm());
+    return;
+
+  case AArch64::SEH_Nop:
+    TS->EmitARM64WinCFINop();
+    return;
+
+  case AArch64::SEH_PrologEnd:
+    TS->EmitARM64WinCFIPrologEnd();
+    return;
+
+  case AArch64::SEH_EpilogStart:
+    TS->EmitARM64WinCFIEpilogStart();
+    return;
+
+  case AArch64::SEH_EpilogEnd:
+    TS->EmitARM64WinCFIEpilogEnd();
+    return;
   }
 
   // Finally, do the automated lowerings for everything else.
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 91fe3f237aff475ae6a7d87083c322b84b7d6fdd..2f6cb4c8670a2883673a1ada454613af69348127 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -288,6 +288,14 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
                                            D8,  D9,  D10, D11,
                                            D12, D13, D14, D15)>;
 
+// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
+// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
+// and not (LR,FP) pairs.
+def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add FP, LR, X19, X20, X21, X22,
+                                               X23, X24, X25, X26, X27, X28,
+                                               D8, D9, D10, D11,
+                                               D12, D13, D14, D15)>;
+
 // AArch64 PCS for vector functions (VPCS)
 // must (additionally) preserve full Q8-Q23 registers
 def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
diff --git a/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/lib/Target/AArch64/AArch64CompressJumpTables.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0924a27e2586567ade5ff0dc8cb8e89265f17772
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -0,0 +1,162 @@
+//==-- AArch64CompressJumpTables.cpp - Compress jump tables for AArch64 --====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+// This pass looks at the basic blocks each jump-table refers to and works out
+// whether they can be emitted in a compressed form (with 8 or 16-bit
+// entries). If so, it changes the opcode and flags them in the associated
+// AArch64FunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-jump-tables"
+
+STATISTIC(NumJT8, "Number of jump-tables with 1-byte entries");
+STATISTIC(NumJT16, "Number of jump-tables with 2-byte entries");
+STATISTIC(NumJT32, "Number of jump-tables with 4-byte entries");
+
+namespace {
+class AArch64CompressJumpTables : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  MachineFunction *MF;
+  SmallVector<int, 8> BlockInfo;
+
+  int computeBlockSize(MachineBasicBlock &MBB);
+  void scanFunction();
+
+  bool compressJumpTable(MachineInstr &MI, int Offset);
+
+public:
+  static char ID;
+  AArch64CompressJumpTables() : MachineFunctionPass(ID) {
+    initializeAArch64CompressJumpTablesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+  StringRef getPassName() const override {
+    return "AArch64 Compress Jump Tables";
+  }
+};
+char AArch64CompressJumpTables::ID = 0;
+}
+
+INITIALIZE_PASS(AArch64CompressJumpTables, DEBUG_TYPE,
+                "AArch64 compress jump tables pass", false, false)
+
+int AArch64CompressJumpTables::computeBlockSize(MachineBasicBlock &MBB) {
+  int Size = 0;
+  for (const MachineInstr &MI : MBB)
+    Size += TII->getInstSizeInBytes(MI);
+  return Size;
+}
+
+void AArch64CompressJumpTables::scanFunction() {
+  BlockInfo.clear();
+  BlockInfo.resize(MF->getNumBlockIDs());
+
+  int Offset = 0;
+  for (MachineBasicBlock &MBB : *MF) {
+    BlockInfo[MBB.getNumber()] = Offset;
+    Offset += computeBlockSize(MBB);
+  }
+}
+
+bool AArch64CompressJumpTables::compressJumpTable(MachineInstr &MI,
+                                                  int Offset) {
+  if (MI.getOpcode() != AArch64::JumpTableDest32)
+    return false;
+
+  int JTIdx = MI.getOperand(4).getIndex();
+  auto &JTInfo = *MF->getJumpTableInfo();
+  const MachineJumpTableEntry &JT = JTInfo.getJumpTables()[JTIdx];
+
+  // The jump-table might have been optimized away.
+  if (JT.MBBs.empty())
+    return false;
+
+  int MaxOffset = std::numeric_limits<int>::min(),
+      MinOffset = std::numeric_limits<int>::max();
+  MachineBasicBlock *MinBlock = nullptr;
+  for (auto Block : JT.MBBs) {
+    int BlockOffset = BlockInfo[Block->getNumber()];
+    assert(BlockOffset % 4 == 0 && "misaligned basic block");
+
+    MaxOffset = std::max(MaxOffset, BlockOffset);
+    if (BlockOffset <= MinOffset) {
+      MinOffset = BlockOffset;
+      MinBlock = Block;
+    }
+  }
+
+  // The ADR instruction needed to calculate the address of the first reachable
+  // basic block can address +/-1MB.
+  if (!isInt<21>(MinOffset - Offset)) {
+    ++NumJT32;
+    return false;
+  }
+
+  int Span = MaxOffset - MinOffset;
+  auto AFI = MF->getInfo<AArch64FunctionInfo>();
+  if (isUInt<8>(Span / 4)) {
+    AFI->setJumpTableEntryInfo(JTIdx, 1, MinBlock->getSymbol());
+    MI.setDesc(TII->get(AArch64::JumpTableDest8));
+    ++NumJT8;
+    return true;
+  } else if (isUInt<16>(Span / 4)) {
+    AFI->setJumpTableEntryInfo(JTIdx, 2, MinBlock->getSymbol());
+    MI.setDesc(TII->get(AArch64::JumpTableDest16));
+    ++NumJT16;
+    return true;
+  }
+
+  ++NumJT32;
+  return false;
+}
+
+bool AArch64CompressJumpTables::runOnMachineFunction(MachineFunction &MFIn) {
+  bool Changed = false;
+  MF = &MFIn;
+
+  const auto &ST = MF->getSubtarget<AArch64Subtarget>();
+  TII = ST.getInstrInfo();
+
+  if (ST.force32BitJumpTables() && !MF->getFunction().optForMinSize())
+    return false;
+
+  scanFunction();
+
+  for (MachineBasicBlock &MBB : *MF) {
+    int Offset = BlockInfo[MBB.getNumber()];
+    for (MachineInstr &MI : MBB) {
+      Changed |= compressJumpTable(MI, Offset);
+      Offset += TII->getInstSizeInBytes(MI);
+    }
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64CompressJumpTablesPass() {
+  return new AArch64CompressJumpTables();
+}
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 5e4c5dcf09c97a39329588c0488f0c2e6650f6ef..dfc08a12f51343b4524f6978fa111362d7050015 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -3450,6 +3450,21 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     updateValueMap(II, SrcReg);
     return true;
   }
+  case Intrinsic::sponentry: {
+    MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
+
+    // SP = FP + Fixed Object + 16
+    int FI = MFI.CreateFixedObject(4, 0, false);
+    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::ADDXri), ResultReg)
+            .addFrameIndex(FI)
+            .addImm(0)
+            .addImm(0);
+
+    updateValueMap(II, ResultReg);
+    return true;
+  }
   case Intrinsic::memcpy:
   case Intrinsic::memmove: {
     const auto *MTI = cast<MemTransferInst>(II);
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index b0451ca2edb6d928906818b89b24479d56a7935d..9c85001481d18598b86c60e28d5a883b38d871e4 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -115,11 +115,13 @@
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -434,12 +436,154 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   return true;
 }
 
+// Given a load or a store instruction, generate an appropriate unwinding SEH
+// code on Windows.
+static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
+                                             const TargetInstrInfo &TII,
+                                             MachineInstr::MIFlag Flag) {
+  unsigned Opc = MBBI->getOpcode();
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  DebugLoc DL = MBBI->getDebugLoc();
+  unsigned ImmIdx = MBBI->getNumOperands() - 1;
+  int Imm = MBBI->getOperand(ImmIdx).getImm();
+  MachineInstrBuilder MIB;
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
+  switch (Opc) {
+  default:
+    llvm_unreachable("No SEH Opcode for this instruction");
+  case AArch64::LDPDpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPDpre: {
+    unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
+              .addImm(Reg0)
+              .addImm(Reg1)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::LDPXpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STPXpre: {
+    unsigned Reg0 = MBBI->getOperand(1).getReg();
+    unsigned Reg1 = MBBI->getOperand(2).getReg();
+    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    else
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
+                .addImm(RegInfo->getSEHRegNum(Reg0))
+                .addImm(RegInfo->getSEHRegNum(Reg1))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::LDRDpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STRDpre: {
+    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
+              .addImm(Reg)
+              .addImm(Imm)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::LDRXpost:
+    Imm = -Imm;
+    LLVM_FALLTHROUGH;
+  case AArch64::STRXpre: {
+    unsigned Reg =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
+              .addImm(Reg)
+              .addImm(Imm)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STPDi:
+  case AArch64::LDPDi: {
+    unsigned Reg0 =  RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
+    unsigned Reg1 =  RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
+              .addImm(Reg0)
+              .addImm(Reg1)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STPXi:
+  case AArch64::LDPXi: {
+    unsigned Reg0 = MBBI->getOperand(0).getReg();
+    unsigned Reg1 = MBBI->getOperand(1).getReg();
+    if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    else
+      MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
+                .addImm(RegInfo->getSEHRegNum(Reg0))
+                .addImm(RegInfo->getSEHRegNum(Reg1))
+                .addImm(Imm * 8)
+                .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STRXui:
+  case AArch64::LDRXui: {
+    int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
+              .addImm(Reg)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  case AArch64::STRDui:
+  case AArch64::LDRDui: {
+    unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
+    MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
+              .addImm(Reg)
+              .addImm(Imm * 8)
+              .setMIFlag(Flag);
+    break;
+  }
+  }
+  auto I = MBB->insertAfter(MBBI, MIB);
+  return I;
+}
+
+// Fix up the SEH opcode associated with the save/restore instruction.
+static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
+                           unsigned LocalStackSize) {
+  MachineOperand *ImmOpnd = nullptr;
+  unsigned ImmIdx = MBBI->getNumOperands() - 1;
+  switch (MBBI->getOpcode()) {
+  default:
+    llvm_unreachable("Fix the offset in the SEH instruction");
+  case AArch64::SEH_SaveFPLR:
+  case AArch64::SEH_SaveRegP:
+  case AArch64::SEH_SaveReg:
+  case AArch64::SEH_SaveFRegP:
+  case AArch64::SEH_SaveFReg:
+    ImmOpnd = &MBBI->getOperand(ImmIdx);
+    break;
+  }
+  if (ImmOpnd)
+    ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
+}
+
 // Convert callee-save register save/restore instruction to do stack pointer
 // decrement/increment to allocate/deallocate the callee-save stack area by
 // converting store/load to use pre/post increment version.
 static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
+    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
+    bool NeedsWinCFI, bool InProlog = true) {
   // Ignore instructions that do not operate on SP, i.e. shadow call stack
   // instructions.
   while (MBBI->getOpcode() == AArch64::STRXpost ||
@@ -447,7 +591,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     assert(MBBI->getOperand(0).getReg() != AArch64::SP);
     ++MBBI;
   }
-
   unsigned NewOpc;
   int Scale = 1;
   switch (MBBI->getOpcode()) {
@@ -496,6 +639,12 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     NewOpc = AArch64::LDRQpost;
     break;
   }
+  // Get rid of the SEH code associated with the old instruction.
+  if (NeedsWinCFI) {
+    auto SEH = std::next(MBBI);
+    if (AArch64InstrInfo::isSEHInstruction(*SEH))
+      SEH->eraseFromParent();
+  }
 
   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
   MIB.addReg(AArch64::SP, RegState::Define);
@@ -517,13 +666,22 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   MIB.setMIFlags(MBBI->getFlags());
   MIB.setMemRefs(MBBI->memoperands());
 
+  // Generate a new SEH code that corresponds to the new instruction.
+  if (NeedsWinCFI)
+    InsertSEH(*MIB, *TII,
+              InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy);
+
   return std::prev(MBB.erase(MBBI));
 }
 
 // Fixup callee-save register save/restore instructions to take into account
 // combined SP bump by adding the local stack size to the stack offsets.
 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
-                                              unsigned LocalStackSize) {
+                                              unsigned LocalStackSize,
+                                              bool NeedsWinCFI) {
+  if (AArch64InstrInfo::isSEHInstruction(MI))
+    return;
+
   unsigned Opc = MI.getOpcode();
 
   // Ignore instructions that do not operate on SP, i.e. shadow call stack
@@ -563,6 +721,14 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
   // All generated opcodes have scaled offsets.
   assert(LocalStackSize % Scale == 0);
   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
+
+  if (NeedsWinCFI) {
+    auto MBBI = std::next(MachineBasicBlock::iterator(MI));
+    assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
+    assert(AArch64InstrInfo::isSEHInstruction(*MBBI) &&
+           "Expecting a SEH instruction");
+    fixupSEHOpcode(MBBI, LocalStackSize);
+  }
 }
 
 static void adaptForLdStOpt(MachineBasicBlock &MBB,
@@ -597,6 +763,17 @@ static void adaptForLdStOpt(MachineBasicBlock &MBB,
   //
 }
 
+static bool ShouldSignWithAKey(MachineFunction &MF) {
+  const Function &F = MF.getFunction();
+  if (!F.hasFnAttribute("sign-return-address-key"))
+    return true;
+
+  const StringRef Key =
+      F.getFnAttribute("sign-return-address-key").getValueAsString();
+  assert(Key.equals_lower("a_key") || Key.equals_lower("b_key"));
+  return Key.equals_lower("a_key");
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -607,9 +784,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  bool needsFrameMoves = MMI.hasDebugInfo() || F.needsUnwindTableEntry();
+  bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) &&
+                         !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool HasFP = hasFP(MF);
-
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     F.needsUnwindTableEntry();
+  MF.setHasWinCFI(NeedsWinCFI);
   // At this point, we're going to decide whether or not the function uses a
   // redzone. In most cases, the function doesn't have a redzone so let's
   // assume that's false and set it to true in the case that there's a redzone.
@@ -620,7 +800,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   DebugLoc DL;
 
   if (ShouldSignReturnAddress(MF)) {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
+    BuildMI(
+        MBB, MBBI, DL,
+        TII->get(ShouldSignWithAKey(MF) ? AArch64::PACIASP : AArch64::PACIBSP))
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
@@ -632,10 +814,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   int NumBytes = (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
-
     // All of the stack allocation is for locals.
     AFI->setLocalStackSize(NumBytes);
-
     if (!NumBytes)
       return;
     // REDZONE: If the stack size is less than 128 bytes, we don't need
@@ -645,17 +825,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       ++NumRedZoneFunctions;
     } else {
       emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
-
-      // Label used to tie together the PROLOG_LABEL and the MachineMoves.
-      MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
-      // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MF.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlags(MachineInstr::FrameSetup);
+                      MachineInstr::FrameSetup, false, NeedsWinCFI);
+      if (!NeedsWinCFI) {
+        // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+        MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
+        // Encode the stack size of the leaf function.
+        unsigned CFIIndex = MF.addFrameInst(
+            MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+            .addCFIIndex(CFIIndex)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
     }
+
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+          .setMIFlag(MachineInstr::FrameSetup);
+
     return;
   }
 
@@ -666,15 +852,14 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   // All of the remaining stack allocations are for locals.
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
-
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   if (CombineSPBump) {
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
-                    MachineInstr::FrameSetup);
+                    MachineInstr::FrameSetup, false, NeedsWinCFI);
     NumBytes = 0;
   } else if (PrologueSaveSize != 0) {
-    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
-                                                     -PrologueSaveSize);
+    MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
+        MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI);
     NumBytes -= PrologueSaveSize;
   }
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -685,9 +870,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   MachineBasicBlock::iterator End = MBB.end();
   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
     if (CombineSPBump)
-      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
+      fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
+                                        NeedsWinCFI);
     ++MBBI;
   }
+
   if (HasFP) {
     // Only set up FP if we actually need to. Frame pointer is fp =
     // sp - fixedobject - 16.
@@ -700,15 +887,42 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // Note: All stores of callee-saved registers are marked as "FrameSetup".
     // This code marks the instruction(s) that set the FP also.
     emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
-                    MachineInstr::FrameSetup);
+                    MachineInstr::FrameSetup, false, NeedsWinCFI);
   }
 
   if (windowsRequiresStackProbe(MF, NumBytes)) {
     uint32_t NumWords = NumBytes >> 4;
-
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
-        .addImm(NumWords)
-        .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI) {
+      // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
+      // exceed this amount.  We need to move at most 2^24 - 1 into x15.
+      // This is at most two instructions, MOVZ follwed by MOVK.
+      // TODO: Fix to use multiple stack alloc unwind codes for stacks
+      // exceeding 256MB in size.
+      if (NumBytes >= (1 << 28))
+        report_fatal_error("Stack size cannot exceed 256MB for stack "
+                            "unwinding purposes");
+
+      uint32_t LowNumWords = NumWords & 0xFFFF;
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
+            .addImm(LowNumWords)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+            .setMIFlag(MachineInstr::FrameSetup);
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
+      if ((NumWords & 0xFFFF0000) != 0) {
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
+              .addReg(AArch64::X15)
+              .addImm((NumWords & 0xFFFF0000) >> 16) // High half
+              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16))
+              .setMIFlag(MachineInstr::FrameSetup);
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
+      }
+    } else {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
+          .addImm(NumWords)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
 
     switch (MF.getTarget().getCodeModel()) {
     case CodeModel::Tiny:
@@ -718,7 +932,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
           .addExternalSymbol("__chkstk")
           .addReg(AArch64::X15, RegState::Implicit)
+          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
           .setMIFlags(MachineInstr::FrameSetup);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
       break;
     case CodeModel::Large:
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
@@ -726,11 +946,20 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .addExternalSymbol("__chkstk")
           .addExternalSymbol("__chkstk")
           .setMIFlags(MachineInstr::FrameSetup);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
 
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
           .addReg(AArch64::X16, RegState::Kill)
           .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
+          .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead)
+          .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead)
           .setMIFlags(MachineInstr::FrameSetup);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+            .setMIFlag(MachineInstr::FrameSetup);
       break;
     }
 
@@ -739,6 +968,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
         .addReg(AArch64::X15, RegState::Kill)
         .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4))
         .setMIFlags(MachineInstr::FrameSetup);
+    if (NeedsWinCFI)
+       BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+            .addImm(NumBytes)
+            .setMIFlag(MachineInstr::FrameSetup);
     NumBytes = 0;
   }
 
@@ -758,7 +991,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // the correct value here, as NumBytes also includes padding bytes,
       // which shouldn't be counted here.
       emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
+                      MachineInstr::FrameSetup, false, NeedsWinCFI);
 
     if (NeedsRealignment) {
       const unsigned Alignment = MFI.getMaxAlignment();
@@ -781,6 +1014,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .addReg(scratchSPReg, RegState::Kill)
           .addImm(andMaskEncoded);
       AFI->setStackRealigned(true);
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+            .addImm(NumBytes & andMaskEncoded)
+            .setMIFlag(MachineInstr::FrameSetup);
     }
   }
 
@@ -794,8 +1031,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   if (RegInfo->hasBasePointer(MF)) {
     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                      false);
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  // The very last FrameSetup instruction indicates the end of prologue. Emit a
+  // SEH opcode indicating the prologue end.
+  if (NeedsWinCFI)
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
+        .setMIFlag(MachineInstr::FrameSetup);
+
   if (needsFrameMoves) {
     const DataLayout &TD = MF.getDataLayout();
     const int StackGrowth = -TD.getPointerSize(0);
@@ -907,10 +1153,14 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
   // instructions, namely RETA{A,B}, that can be used instead.
   if (Subtarget.hasV8_3aOps() && MBBI != MBB.end() &&
       MBBI->getOpcode() == AArch64::RET_ReallyLR) {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::RETAA)).copyImplicitOps(*MBBI);
+    BuildMI(MBB, MBBI, DL,
+            TII->get(ShouldSignWithAKey(MF) ? AArch64::RETAA : AArch64::RETAB))
+        .copyImplicitOps(*MBBI);
     MBB.erase(MBBI);
   } else {
-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::AUTIASP))
+    BuildMI(
+        MBB, MBBI, DL,
+        TII->get(ShouldSignWithAKey(MF) ? AArch64::AUTIASP : AArch64::AUTIBSP))
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 }
@@ -923,6 +1173,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL;
   bool IsTailCallReturn = false;
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
+
   if (MBB.end() != MBBI) {
     DL = MBBI->getDebugLoc();
     unsigned RetOpcode = MBBI->getOpcode();
@@ -930,8 +1183,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
                        RetOpcode == AArch64::TCRETURNri ||
                        RetOpcode == AArch64::TCRETURNriBTI;
   }
+
   int NumBytes = MFI.getStackSize();
-  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
   // All calls are tail calls in GHC calling conv, and functions have no
   // prologue/epilogue.
@@ -996,14 +1250,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (!CombineSPBump && PrologueSaveSize != 0) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
+    while (AArch64InstrInfo::isSEHInstruction(*Pop))
+      Pop = std::prev(Pop);
     // Converting the last ldp to a post-index ldp is valid only if the last
     // ldp's offset is 0.
     const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
     // If the offset is 0, convert it to a post-index ldp.
-    if (OffsetOp.getImm() == 0) {
-      convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
-                                                PrologueSaveSize);
-    } else {
+    if (OffsetOp.getImm() == 0)
+      convertCalleeSaveRestoreToSPPrePostIncDec(
+          MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, false);
+    else {
       // If not, make sure to emit an add after the last ldp.
       // We're doing this by transfering the size to be restored from the
       // adjustment *before* the CSR pops to the adjustment *after* the CSR
@@ -1023,14 +1279,23 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       ++LastPopI;
       break;
     } else if (CombineSPBump)
-      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
+      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
+                                        NeedsWinCFI);
   }
 
+  if (NeedsWinCFI)
+    BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
+        .setMIFlag(MachineInstr::FrameDestroy);
+
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
-                    NumBytes + AfterCSRPopSize, TII,
-                    MachineInstr::FrameDestroy);
+                    NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy,
+                    false, NeedsWinCFI);
+    if (NeedsWinCFI)
+      BuildMI(MBB, MBB.getFirstTerminator(), DL,
+              TII->get(AArch64::SEH_EpilogEnd))
+          .setMIFlag(MachineInstr::FrameDestroy);
     return;
   }
 
@@ -1058,9 +1323,15 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
 
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
-                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
-    if (Done)
+                    StackRestoreBytes, TII, MachineInstr::FrameDestroy, false,
+                    NeedsWinCFI);
+    if (Done) {
+      if (NeedsWinCFI)
+        BuildMI(MBB, MBB.getFirstTerminator(), DL,
+                TII->get(AArch64::SEH_EpilogEnd))
+            .setMIFlag(MachineInstr::FrameDestroy);
       return;
+    }
 
     NumBytes = 0;
   }
@@ -1072,10 +1343,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
                     -AFI->getCalleeSavedStackSize() + 16, TII,
-                    MachineInstr::FrameDestroy);
+                    MachineInstr::FrameDestroy, false, NeedsWinCFI);
   else if (NumBytes)
     emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
-                    MachineInstr::FrameDestroy);
+                    MachineInstr::FrameDestroy, false, NeedsWinCFI);
 
   // This must be placed after the callee-save restore code because that code
   // assumes the SP is at the same location as it was after the callee-save save
@@ -1096,8 +1367,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
 
     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
-                    AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
+                    AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false,
+                    NeedsWinCFI);
   }
+  if (NeedsWinCFI)
+    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd))
+        .setMIFlag(MachineInstr::FrameDestroy);
 }
 
 /// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1222,6 +1497,23 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) {
            Attrs.hasAttrSomewhere(Attribute::SwiftError));
 }
 
+static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
+                                             bool NeedsWinCFI) {
+  // If we are generating register pairs for a Windows function that requires
+  // EH support, then pair consecutive registers only.  There are no unwind
+  // opcodes for saves/restores of non-consectuve register pairs.
+  // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x.
+  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+
+  // TODO: LR can be paired with any register.  We don't support this yet in
+  // the MCLayer.  We need to add support for the save_lrpair unwind code.
+  if (!NeedsWinCFI)
+    return false;
+  if (Reg2 == Reg1 + 1)
+    return false;
+  return true;
+}
+
 namespace {
 
 struct RegPairInfo {
@@ -1246,6 +1538,8 @@ static void computeCalleeSaveRegisterPairs(
   if (CSI.empty())
     return;
 
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   CallingConv::ID CC = MF.getFunction().getCallingConv();
@@ -1258,7 +1552,11 @@ static void computeCalleeSaveRegisterPairs(
           (Count & 1) == 0) &&
          "Odd number of callee-saved regs to spill!");
   int Offset = AFI->getCalleeSavedStackSize();
-
+  // On Linux, we will have either one or zero non-paired register.  On Windows
+  // with CFI, we can have multiple unpaired registers in order to utilize the
+  // available unwind codes.  This flag assures that the alignment fixup is done
+  // only once, as intened.
+  bool FixupDone = false;
   for (unsigned i = 0; i < Count; ++i) {
     RegPairInfo RPI;
     RPI.Reg1 = CSI[i].getReg();
@@ -1277,11 +1575,13 @@ static void computeCalleeSaveRegisterPairs(
       unsigned NextReg = CSI[i + 1].getReg();
       switch (RPI.Type) {
       case RegPairInfo::GPR:
-        if (AArch64::GPR64RegClass.contains(NextReg))
+        if (AArch64::GPR64RegClass.contains(NextReg) &&
+            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::FPR64:
-        if (AArch64::FPR64RegClass.contains(NextReg))
+        if (AArch64::FPR64RegClass.contains(NextReg) &&
+            !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI))
           RPI.Reg2 = NextReg;
         break;
       case RegPairInfo::FPR128:
@@ -1326,8 +1626,9 @@ static void computeCalleeSaveRegisterPairs(
 
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
-    if (AFI->hasCalleeSaveStackFreeSpace() &&
+    if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
         RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+      FixupDone = true;
       Offset -= 8;
       assert(Offset % 16 == 0);
       assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
@@ -1351,6 +1652,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
 
@@ -1368,6 +1671,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
         .addImm(8)
         .setMIFlag(MachineInstr::FrameSetup);
 
+    if (NeedsWinCFI)
+      BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop))
+          .setMIFlag(MachineInstr::FrameSetup);
+
     // This instruction also makes x18 live-in to the entry block.
     MBB.addLiveIn(AArch64::X18);
   }
@@ -1413,6 +1720,17 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
                dbgs() << ")\n");
 
+    assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
+           "Windows unwdinding requires a consecutive (FP,LR) pair");
+    // Windows unwind codes require consecutive registers if registers are
+    // paired.  Make the switch here, so that the code below will save (x,x+1)
+    // and not (x+1,x).
+    unsigned FrameIdxReg1 = RPI.FrameIdx;
+    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
+    if (NeedsWinCFI && RPI.isPaired()) {
+      std::swap(Reg1, Reg2);
+      std::swap(FrameIdxReg1, FrameIdxReg2);
+    }
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
     if (!MRI.isReserved(Reg1))
       MBB.addLiveIn(Reg1);
@@ -1421,7 +1739,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
         MBB.addLiveIn(Reg2);
       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
       MIB.addMemOperand(MF.getMachineMemOperand(
-          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOStore, Size, Align));
     }
     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
@@ -1430,8 +1748,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameSetup);
     MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+        MachinePointerInfo::getFixedStack(MF,FrameIdxReg1),
         MachineMemOperand::MOStore, Size, Align));
+    if (NeedsWinCFI)
+      InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+
   }
   return true;
 }
@@ -1444,6 +1765,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
+  bool NeedsWinCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+                     MF.getFunction().needsUnwindTableEntry();
 
   if (MI != MBB.end())
     DL = MI->getDebugLoc();
@@ -1489,11 +1812,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
                if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
                dbgs() << ")\n");
 
+    // Windows unwind codes require consecutive registers if registers are
+    // paired.  Make the switch here, so that the code below will save (x,x+1)
+    // and not (x+1,x).
+    unsigned FrameIdxReg1 = RPI.FrameIdx;
+    unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
+    if (NeedsWinCFI && RPI.isPaired()) {
+      std::swap(Reg1, Reg2);
+      std::swap(FrameIdxReg1, FrameIdxReg2);
+    }
     MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
     if (RPI.isPaired()) {
       MIB.addReg(Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
-          MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
+          MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
           MachineMemOperand::MOLoad, Size, Align));
     }
     MIB.addReg(Reg1, getDefRegState(true))
@@ -1502,10 +1834,11 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameDestroy);
     MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
+        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
         MachineMemOperand::MOLoad, Size, Align));
+    if (NeedsWinCFI)
+      InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   };
-
   if (ReverseCSRRestoreSeq)
     for (const RegPairInfo &RPI : reverse(RegPairs))
       EmitMI(RPI);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8cf9d55a9504522016a52859a2845e981b2aed9a..d5d6d5ca23e1de93a117e76ecc773272983b2f80 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -187,7 +187,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
@@ -385,8 +385,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
     setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
     setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
-    setOperationAction(ISD::FMINNAN,     MVT::f16,  Promote);
-    setOperationAction(ISD::FMAXNAN,     MVT::f16,  Promote);
+    setOperationAction(ISD::FMINIMUM,    MVT::f16,  Promote);
+    setOperationAction(ISD::FMAXIMUM,    MVT::f16,  Promote);
 
     // promote v4f16 to v4f32 when that is known to be safe.
     setOperationAction(ISD::FADD,        MVT::v4f16, Promote);
@@ -450,8 +450,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND, Ty, Legal);
     setOperationAction(ISD::FMINNUM, Ty, Legal);
     setOperationAction(ISD::FMAXNUM, Ty, Legal);
-    setOperationAction(ISD::FMINNAN, Ty, Legal);
-    setOperationAction(ISD::FMAXNAN, Ty, Legal);
+    setOperationAction(ISD::FMINIMUM, Ty, Legal);
+    setOperationAction(ISD::FMAXIMUM, Ty, Legal);
   }
 
   if (Subtarget->hasFullFP16()) {
@@ -463,8 +463,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND,  MVT::f16, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
   }
 
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
@@ -792,9 +792,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   for (MVT InnerVT : MVT::all_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 
-  // CNT supports only B element sizes.
+  // CNT supports only B element sizes, then use UADDLP to widen.
   if (VT != MVT::v8i8 && VT != MVT::v16i8)
-    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTPOP, VT, Custom);
 
   setOperationAction(ISD::UDIV, VT, Expand);
   setOperationAction(ISD::SDIV, VT, Expand);
@@ -816,8 +816,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
   if (VT.isFloatingPoint() &&
       (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
-    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
-                            ISD::FMINNUM, ISD::FMAXNUM})
+    for (unsigned Opcode :
+         {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
       setOperationAction(Opcode, VT, Legal);
 
   if (Subtarget->isLittleEndian()) {
@@ -1460,6 +1460,21 @@ static bool isLegalArithImmed(uint64_t C) {
   return IsLegal;
 }
 
+// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
+// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
+// can be set differently by this operation. It comes down to whether
+// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+// everything is fine. If not then the optimization is wrong. Thus general
+// comparisons are only valid if op2 != 0.
+//
+// So, finally, the only LLVM-native comparisons that don't mention C and V
+// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+// the absence of information about op2.
+static bool isCMN(SDValue Op, ISD::CondCode CC) {
+  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
+         (CC == ISD::SETEQ || CC == ISD::SETNE);
+}
+
 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               const SDLoc &dl, SelectionDAG &DAG) {
   EVT VT = LHS.getValueType();
@@ -1482,18 +1497,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
   // register to WZR/XZR if it ends up being unused.
   unsigned Opcode = AArch64ISD::SUBS;
 
-  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
-      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
-    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
-    // can be set differently by this operation. It comes down to whether
-    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
-    // everything is fine. If not then the optimization is wrong. Thus general
-    // comparisons are only valid if op2 != 0.
-
-    // So, finally, the only LLVM-native comparisons that don't mention C and V
-    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
-    // the absence of information about op2.
+  if (isCMN(RHS, CC)) {
+    // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
     Opcode = AArch64ISD::ADDS;
     RHS = RHS.getOperand(1);
   } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
@@ -1516,7 +1521,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
 /// a comparison. They set the NZCV flags to a predefined value if their
 /// predicate is false. This allows to express arbitrary conjunctions, for
-/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))"
+/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
 /// expressed as:
 ///   cmp A
 ///   ccmp B, inv(CB), CA
@@ -1586,14 +1591,12 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
 }
 
-/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
-/// CanPushNegate is set to true if we can push a negate operation through
-/// the tree in a was that we are left with AND operations and negate operations
-/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
-/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
-/// brought into such a form.
-static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
-                                         unsigned Depth = 0) {
+/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
+/// expressed as a conjunction. See \ref AArch64CCMP.
+/// \param CanNegate        Set to true if we can also emit the negation of the
+///                         tree as a conjunction.
+static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
+                               unsigned Depth = 0) {
   if (!Val.hasOneUse())
     return false;
   unsigned Opcode = Val->getOpcode();
@@ -1610,10 +1613,10 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
     SDValue O0 = Val->getOperand(0);
     SDValue O1 = Val->getOperand(1);
     bool CanNegateL;
-    if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
+    if (!canEmitConjunction(O0, CanNegateL, Depth+1))
       return false;
     bool CanNegateR;
-    if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
+    if (!canEmitConjunction(O1, CanNegateR, Depth+1))
       return false;
 
     if (Opcode == ISD::OR) {
@@ -1621,8 +1624,11 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
       // we cannot do the transformation at all.
       if (!CanNegateL && !CanNegateR)
         return false;
-      // We can however change a (not (or x y)) to (and (not x) (not y)) if we
-      // can negate the x and y subtrees.
+      // However if we can negate x and y, then we can change
+      // (not (or x y))
+      // into
+      // (and (not x) (not y))
+      // to eliminate the outer negation.
       CanNegate = CanNegateL && CanNegateR;
     } else {
       // If the operands are OR expressions then we finally need to negate their
@@ -1632,7 +1638,7 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
       bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
       if (NeedsNegOutL && NeedsNegOutR)
         return false;
-      // We cannot negate an AND operation (it would become an OR),
+      // We cannot negate an AND operation.
       CanNegate = false;
     }
     return true;
@@ -1650,7 +1656,7 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
 /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
 /// for the comparisons in the current subtree; @p Depth limits the search
 /// depth to avoid stack overflow.
-static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
+static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
     AArch64CC::CondCode Predicate) {
   // We're at a tree leaf, produce a conditional comparison operation.
@@ -1707,13 +1713,13 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
   if (NegateOpsAndResult) {
     // See which side we can negate.
     bool CanNegateL;
-    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
+    bool isValidL = canEmitConjunction(LHS, CanNegateL);
     assert(isValidL && "Valid conjunction/disjunction tree");
     (void)isValidL;
 
 #ifndef NDEBUG
     bool CanNegateR;
-    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
+    bool isValidR = canEmitConjunction(RHS, CanNegateR);
     assert(isValidR && "Valid conjunction/disjunction tree");
     assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
 #endif
@@ -1735,12 +1741,12 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
   // through if we are already in a PushNegate case, otherwise we can negate
   // the "flags to test" afterwards.
   AArch64CC::CondCode RHSCC;
-  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
+  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, Negate,
                                                    CCOp, Predicate);
   if (NegateOpsAndResult && !Negate)
     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
   // Emit LHS. We may need to negate it.
-  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
+  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC,
                                                    NegateOpsAndResult, CmpR,
                                                    RHSCC);
   // If we transformed an OR to and AND then we have to negate the result
@@ -1750,21 +1756,57 @@ static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
   return CmpL;
 }
 
-/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
-/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
-/// \see emitConjunctionDisjunctionTreeRec().
-static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
-                                              AArch64CC::CondCode &OutCC) {
-  bool CanNegate;
-  if (!isConjunctionDisjunctionTree(Val, CanNegate))
+/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
+/// In some cases this is even possible with OR operations in the expression.
+/// See \ref AArch64CCMP.
+/// \see emitConjunctionRec().
+static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
+                               AArch64CC::CondCode &OutCC) {
+  bool DummyCanNegate;
+  if (!canEmitConjunction(Val, DummyCanNegate))
     return SDValue();
 
-  return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
-                                           AArch64CC::AL);
+  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
 }
 
 /// @}
 
+/// Returns how profitable it is to fold a comparison's operand's shift and/or
+/// extension operations.
+static unsigned getCmpOperandFoldingProfit(SDValue Op) {
+  auto isSupportedExtend = [&](SDValue V) {
+    if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
+      return true;
+
+    if (V.getOpcode() == ISD::AND)
+      if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+        uint64_t Mask = MaskCst->getZExtValue();
+        return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
+      }
+
+    return false;
+  };
+
+  if (!Op.hasOneUse())
+    return 0;
+
+  if (isSupportedExtend(Op))
+    return 1;
+
+  unsigned Opc = Op.getOpcode();
+  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
+    if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      uint64_t Shift = ShiftCst->getZExtValue();
+      if (isSupportedExtend(Op.getOperand(0)))
+        return (Shift <= 4) ? 2 : 1;
+      EVT VT = Op.getValueType();
+      if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
+        return 1;
+    }
+
+  return 0;
+}
+
 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &AArch64cc, SelectionDAG &DAG,
                              const SDLoc &dl) {
@@ -1822,6 +1864,27 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       }
     }
   }
+
+  // Comparisons are canonicalized so that the RHS operand is simpler than the
+  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
+  // can fold some shift+extend operations on the RHS operand, so swap the
+  // operands if that can be done.
+  //
+  // For example:
+  //    lsl     w13, w11, #1
+  //    cmp     w13, w12
+  // can be turned into:
+  //    cmp     w12, w11, lsl #1
+  if (!isa<ConstantSDNode>(RHS) ||
+      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
+    SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
+
+    if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
+      std::swap(LHS, RHS);
+      CC = ISD::getSetCCSwappedOperands(CC);
+    }
+  }
+
   SDValue Cmp;
   AArch64CC::CondCode AArch64CC;
   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
@@ -1860,7 +1923,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     }
 
     if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
-      if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
+      if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
         if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
       }
@@ -2763,6 +2826,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerSELECT_CC(Op, DAG);
   case ISD::JumpTable:
     return LowerJumpTable(Op, DAG);
+  case ISD::BR_JT:
+    return LowerBR_JT(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:
@@ -2799,8 +2864,12 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
+  case ISD::SPONENTRY:
+    return LowerSPONENTRY(Op, DAG);
   case ISD::RETURNADDR:
     return LowerRETURNADDR(Op, DAG);
+  case ISD::ADDROFRETURNADDR:
+    return LowerADDROFRETURNADDR(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
@@ -3084,6 +3153,17 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     // We currently pass all varargs at 8-byte alignment.
     StackOffset = ((StackOffset + 7) & ~7);
     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+
+    if (MFI.hasMustTailInVarArgFunc()) {
+      SmallVector<MVT, 2> RegParmTypes;
+      RegParmTypes.push_back(MVT::i64);
+      RegParmTypes.push_back(MVT::f128);
+      // Compute the set of forwarded registers. The rest are scratch.
+      SmallVectorImpl<ForwardedRegister> &Forwards =
+                                       FuncInfo->getForwardedMustTailRegParms();
+      CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
+                                               CC_AArch64_AAPCS);
+    }
   }
 
   unsigned StackArgSize = CCInfo.getNextStackOffset();
@@ -3544,6 +3624,14 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   SmallVector<SDValue, 8> MemOpChains;
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
+  if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
+    const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
+    for (const auto &F : Forwards) {
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
+       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+    }
+  }
+
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
        ++i, ++realArgIdx) {
@@ -4477,18 +4565,42 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
-  if (VT == MVT::i32)
-    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
-  Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
+  if (VT == MVT::i32 || VT == MVT::i64) {
+    if (VT == MVT::i32)
+      Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
+    Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
 
-  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
-  SDValue UaddLV = DAG.getNode(
-      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
-      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+    SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
+    SDValue UaddLV = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+        DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
 
-  if (VT == MVT::i64)
-    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
-  return UaddLV;
+    if (VT == MVT::i64)
+      UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+    return UaddLV;
+  }
+
+  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
+          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
+
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  Val = DAG.getBitcast(VT8Bit, Val);
+  Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
+
+  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
+  unsigned EltSize = 8;
+  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
+  while (EltSize != VT.getScalarSizeInBits()) {
+    EltSize *= 2;
+    NumElts /= 2;
+    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
+    Val = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
+        DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
+  }
+
+  return Val;
 }
 
 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -4816,6 +4928,22 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
   return getAddr(JT, DAG);
 }
 
+SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  // Jump table entries as PC relative offsets. No additional tweaking
+  // is necessary here. Just get the address of the jump table.
+  SDLoc DL(Op);
+  SDValue JT = Op.getOperand(1);
+  SDValue Entry = Op.getOperand(2);
+  int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
+
+  SDNode *Dest =
+      DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
+                         Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
+  return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
+                     SDValue(Dest, 0));
+}
+
 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                  SelectionDAG &DAG) const {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
@@ -5048,6 +5176,16 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
   return FrameAddr;
 }
 
+SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+  EVT VT = getPointerTy(DAG.getDataLayout());
+  SDLoc DL(Op);
+  int FI = MFI.CreateFixedObject(4, 0, false);
+  return DAG.getFrameIndex(FI, VT);
+}
+
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
@@ -5098,6 +5236,20 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                               + StringRef(RegName)  + "\"."));
 }
 
+SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+  SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
+
+  return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
+}
+
 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                                SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -7868,7 +8020,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
-    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
@@ -9763,10 +9915,10 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_umaxv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
   case Intrinsic::aarch64_neon_fmax:
-    return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
+    return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_fmin:
-    return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
+    return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
   case Intrinsic::aarch64_neon_fmaxnm:
     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
@@ -10913,9 +11065,9 @@ static SDValue performNVCASTCombine(SDNode *N) {
 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                                            const AArch64Subtarget *Subtarget,
                                            const TargetMachine &TM) {
-  auto *GN = dyn_cast<GlobalAddressSDNode>(N);
-  if (!GN || Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
-                 AArch64II::MO_NO_FLAG)
+  auto *GN = cast<GlobalAddressSDNode>(N);
+  if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
+      AArch64II::MO_NO_FLAG)
     return SDValue();
 
   uint64_t MinOffset = -1ull;
@@ -11047,6 +11199,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     default:
       break;
     }
+    break;
   case ISD::GlobalAddress:
     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
   }
@@ -11354,12 +11507,11 @@ unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
-  MVT SVT = VT.getSimpleVT();
+AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
   // During type legalization, we prefer to widen v1i8, v1i16, v1i32  to v8i8,
   // v4i16, v2i32 instead of to promote.
-  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
-      || SVT == MVT::v1f32)
+  if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
+      VT == MVT::v1f32)
     return TypeWidenVector;
 
   return TargetLoweringBase::getPreferredVectorAction(VT);
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 94df7e4c39d9bb9b535ac55387419a0c14d96ae2..7ee3b82a4ac12a7c3f8133122917028b844672b5 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -395,7 +395,7 @@ public:
 
   bool useLoadStackGuardNode() const override;
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   /// If the target has a standard location for the stack protector cookie,
   /// returns the address of that location. Otherwise, returns nullptr.
@@ -607,6 +607,7 @@ private:
                          SDValue TVal, SDValue FVal, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
@@ -616,6 +617,7 @@ private:
   SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 3ebbb446c124c7dae29a42c096936f5827b2cb85..ab90ea3f74ad63032a3fe95ed3f7597241c04b2b 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -263,6 +263,14 @@ class SImmOperand<int width> : AsmOperandClass {
   let PredicateMethod = "isSImm<" # width # ">";
 }
 
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+  let Name = "Imm" # Low # "_" # High;
+  let DiagnosticType = "InvalidImm" # Low # "_" # High;
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
+}
+
 // Authenticated loads for v8.3 can have scaled 10-bit immediate offsets.
 def SImm10s8Operand : SImmScaledMemoryIndexed<10, 8>;
 def simm10Scaled : Operand<i64> {
@@ -287,6 +295,10 @@ def uimm6 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
   let ParserMatchClass = UImm6Operand;
 }
 
+def uimm16 : Operand<i16>, ImmLeaf<i16, [{return Imm >= 0 && Imm < 65536;}]>{
+  let ParserMatchClass = AsmImmRange<0, 65535>;
+}
+
 def SImm9Operand : SImmOperand<9>;
 def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
   let ParserMatchClass = SImm9Operand;
@@ -447,13 +459,6 @@ def simm4s16 : Operand<i64>, ImmLeaf<i64,
   let DecoderMethod = "DecodeSImm<4>";
 }
 
-class AsmImmRange<int Low, int High> : AsmOperandClass {
-  let Name = "Imm" # Low # "_" # High;
-  let DiagnosticType = "InvalidImm" # Low # "_" # High;
-  let RenderMethod = "addImmOperands";
-  let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
-}
-
 def Imm1_8Operand : AsmImmRange<1, 8>;
 def Imm1_16Operand : AsmImmRange<1, 16>;
 def Imm1_32Operand : AsmImmRange<1, 32>;
@@ -708,11 +713,10 @@ def logical_imm64_not : Operand<i64> {
 }
 
 // imm0_65535 predicate - True if the immediate is in the range [0,65535].
-def Imm0_65535Operand : AsmImmRange<0, 65535>;
 def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint32_t)Imm) < 65536;
 }]> {
-  let ParserMatchClass = Imm0_65535Operand;
+  let ParserMatchClass = AsmImmRange<0, 65535>;
   let PrintMethod = "printImmHex";
 }
 
@@ -1937,7 +1941,7 @@ class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
 //---
 
 def movimm32_imm : Operand<i32> {
-  let ParserMatchClass = Imm0_65535Operand;
+  let ParserMatchClass = AsmImmRange<0, 65535>;
   let EncoderMethod = "getMoveWideImmOpValue";
   let PrintMethod = "printImm";
 }
@@ -4082,6 +4086,19 @@ class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
   let Inst{1-0}   = ll;
 }
 
+//---
+// UDF : Permanently UNDEFINED instructions.  Format: Opc = 0x0000, 16 bit imm.
+//--
+let hasSideEffects = 1, isTrap = 1, mayLoad = 0, mayStore = 0 in {
+class UDFType<bits<16> opc, string asm>
+  : I<(outs), (ins uimm16:$imm),
+       asm, "\t$imm", "", []>,
+    Sched<[]> {
+  bits<16> imm;
+  let Inst{31-16} = opc;
+  let Inst{15-0} = imm;
+}
+}
 let Predicates = [HasFPARMv8] in {
 
 //---
@@ -4941,33 +4958,6 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
   let Inst{4-0}   = Rd;
 }
 
-let Predicates = [HasNEON, HasFP16FML] in
-class BaseSIMDThreeSameMult<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
-                                 string kind2> :
-        BaseSIMDThreeSameVector<Q, U, size, 0b11101, V128, asm, kind1, [] > {
-  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
-  let Inst{13} = b13;
-}
-
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
-                                 string kind2, RegisterOperand RegType,
-                                 ValueType AccumType, ValueType InputType,
-                                 SDPatternOperator OpNode> :
-        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
-        [(set (AccumType RegType:$dst),
-              (OpNode (AccumType RegType:$Rd),
-                      (InputType RegType:$Rn),
-                      (InputType RegType:$Rm)))]> {
-  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
-}
-
-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
-                                         v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
-                                         v4i32, v16i8, OpNode>;
-}
-
 // All operand sizes distinguished in the encoding.
 multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
@@ -5208,6 +5198,51 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
         V128:$LHS, V128:$MHS, V128:$RHS)>;
 }
 
+// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
+// bytes from S-sized elements.
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
+                                 string kind2, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+        [(set (AccumType RegType:$dst),
+              (OpNode (AccumType RegType:$Rd),
+                      (InputType RegType:$Rn),
+                      (InputType RegType:$Rm)))]> {
+  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
+}
+
+multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+                                         v2i32, v8i8, OpNode>;
+  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+                                         v4i32, v16i8, OpNode>;
+}
+
+// ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions
+// select inputs from 4H vectors and accumulate outputs to a 2S vector (or from
+// 8H to 4S, when Q=1).
+class BaseSIMDThreeSameVectorFML<bit Q, bit U, bit b13, bits<3> size, string asm, string kind1,
+                                 string kind2, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDThreeSameVectorTied<Q, U, size, 0b11101, RegType, asm, kind1,
+		[(set (AccumType RegType:$dst),
+              (OpNode (AccumType RegType:$Rd),
+                      (InputType RegType:$Rn),
+                      (InputType RegType:$Rm)))]> {
+  let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
+  let Inst{13} = b13;
+}
+
+multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
+                                  SDPatternOperator OpNode> {
+  def v4f16 : BaseSIMDThreeSameVectorFML<0, U, b13, size, asm, ".2s", ".2h", V64,
+                                         v2f32, v4f16, OpNode>;
+  def v8f16 : BaseSIMDThreeSameVectorFML<1, U, b13, size, asm, ".4s", ".4h", V128,
+                                         v4f32, v8f16, OpNode>;
+}
+
 
 //----------------------------------------------------------------------------
 // AdvSIMD two register vector instructions.
@@ -7414,7 +7449,7 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
   let Inst{4-0}   = Rd;
 }
 
-// ARMv8.2 Index Dot product instructions
+// ARMv8.2-A Dot Product Instructions (Indexed)
 class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
                                       string lhs_kind, string rhs_kind,
                                       RegisterOperand RegType,
@@ -7433,26 +7468,40 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
   let Inst{11}    = idx{1};  // H
 }
 
-let Predicates = [HasNEON, HasFP16FML] in
-class BaseSIMDThreeSameMultIndex<bit Q, bit U, bits<4> opc, string asm,
-                                 string dst_kind, string lhs_kind,
-                                 string rhs_kind> :
-        BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, V128, V128, V128,
-                            VectorIndexH, asm, "", dst_kind, lhs_kind,
-                            rhs_kind, []> {
-  //idx = H:L:M
+multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+                                       SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+                                              V64, v2i32, v8i8, OpNode>;
+  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+                                              V128, v4i32, v16i8, OpNode>;
+}
+
+// ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
+class BaseSIMDThreeSameVectorFMLIndex<bit Q, bit U, bits<4> opc, string asm,
+                                      string dst_kind, string lhs_kind,
+                                      string rhs_kind, RegisterOperand RegType,
+                                      ValueType AccumType, ValueType InputType,
+                                      SDPatternOperator OpNode> :
+        BaseSIMDIndexedTied<Q, U, 0, 0b10, opc, RegType, RegType, V128,
+                            VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
+          [(set (AccumType RegType:$dst),
+                (AccumType (OpNode (AccumType RegType:$Rd),
+                                   (InputType RegType:$Rn),
+                                   (InputType (AArch64duplane16 (v8f16 V128:$Rm),
+                                                VectorIndexH:$idx)))))]> {
+  // idx = H:L:M
   bits<3> idx;
   let Inst{11} = idx{2}; // H
   let Inst{21} = idx{1}; // L
   let Inst{20} = idx{0}; // M
 }
 
-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
                                        SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", V64,
-                                              v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", V128,
-                                              v4i32, v16i8, OpNode>;
+  def v4f16 : BaseSIMDThreeSameVectorFMLIndex<0, U, opc, asm, ".2s", ".2h", ".h",
+                                              V64, v2f32, v4f16, OpNode>;
+  def v8f16 : BaseSIMDThreeSameVectorFMLIndex<1, U, opc, asm, ".4s", ".4h", ".h",
+                                              V128, v4f32, v8f16, OpNode>;
 }
 
 multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index f0f5bfa351d39e5da64d42cda7c43c243e24444c..7b4e05128058621a7acf8d144da5a86285fb4811 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -108,6 +108,14 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     // This gets lowered to an instruction sequence which takes 16 bytes
     NumBytes = 16;
     break;
+  case AArch64::JumpTableDest32:
+  case AArch64::JumpTableDest16:
+  case AArch64::JumpTableDest8:
+    NumBytes = 12;
+    break;
+  case AArch64::SPACE:
+    NumBytes = MI.getOperand(1).getImm();
+    break;
   }
 
   return NumBytes;
@@ -696,7 +704,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   // Secondly, check cases specific to sub-targets.
 
   if (Subtarget.hasExynosCheapAsMoveHandling()) {
-    if (isExynosResetFast(MI) || isExynosShiftLeftFast(MI))
+    if (isExynosResetFast(MI) || isExynosShiftExtFast(MI))
       return true;
     else
       return MI.isAsCheapAsAMove();
@@ -750,7 +758,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
-bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) {
   unsigned Reg, Imm, Shift;
 
   switch (MI.getOpcode()) {
@@ -821,7 +829,72 @@ bool AArch64InstrInfo::isExynosResetFast(const MachineInstr &MI) const {
   }
 }
 
-bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isExynosLdStExtFast(const MachineInstr &MI) {
+  unsigned Imm;
+  AArch64_AM::ShiftExtendType Ext;
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // WriteLD
+  case AArch64::PRFMroW:
+  case AArch64::PRFMroX:
+
+  // WriteLDIdx
+  case AArch64::LDRBBroW:
+  case AArch64::LDRBBroX:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroW:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRWroW:
+  case AArch64::LDRWroX:
+  case AArch64::LDRXroW:
+  case AArch64::LDRXroX:
+
+  case AArch64::LDRBroW:
+  case AArch64::LDRBroX:
+  case AArch64::LDRDroW:
+  case AArch64::LDRDroX:
+  case AArch64::LDRHroW:
+  case AArch64::LDRHroX:
+  case AArch64::LDRSroW:
+  case AArch64::LDRSroX:
+
+  // WriteSTIdx
+  case AArch64::STRBBroW:
+  case AArch64::STRBBroX:
+  case AArch64::STRHHroW:
+  case AArch64::STRHHroX:
+  case AArch64::STRWroW:
+  case AArch64::STRWroX:
+  case AArch64::STRXroW:
+  case AArch64::STRXroX:
+
+  case AArch64::STRBroW:
+  case AArch64::STRBroX:
+  case AArch64::STRDroW:
+  case AArch64::STRDroX:
+  case AArch64::STRHroW:
+  case AArch64::STRHroX:
+  case AArch64::STRSroW:
+  case AArch64::STRSroX:
+    Imm = MI.getOperand(3).getImm();
+    Ext = AArch64_AM::getMemExtendType(Imm);
+    return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX);
+  }
+}
+
+bool AArch64InstrInfo::isExynosShiftExtFast(const MachineInstr &MI) {
   unsigned Imm, Shift;
   AArch64_AM::ShiftExtendType Ext;
 
@@ -887,64 +960,10 @@ bool AArch64InstrInfo::isExynosShiftLeftFast(const MachineInstr &MI) const {
     Shift = AArch64_AM::getArithShiftValue(Imm);
     Ext = AArch64_AM::getArithExtendType(Imm);
     return (Shift == 0 || (Shift <= 3 && Ext == AArch64_AM::UXTX));
-
-  case AArch64::PRFMroW:
-  case AArch64::PRFMroX:
-
-  // WriteLDIdx
-  case AArch64::LDRBBroW:
-  case AArch64::LDRBBroX:
-  case AArch64::LDRHHroW:
-  case AArch64::LDRHHroX:
-  case AArch64::LDRSBWroW:
-  case AArch64::LDRSBWroX:
-  case AArch64::LDRSBXroW:
-  case AArch64::LDRSBXroX:
-  case AArch64::LDRSHWroW:
-  case AArch64::LDRSHWroX:
-  case AArch64::LDRSHXroW:
-  case AArch64::LDRSHXroX:
-  case AArch64::LDRSWroW:
-  case AArch64::LDRSWroX:
-  case AArch64::LDRWroW:
-  case AArch64::LDRWroX:
-  case AArch64::LDRXroW:
-  case AArch64::LDRXroX:
-
-  case AArch64::LDRBroW:
-  case AArch64::LDRBroX:
-  case AArch64::LDRDroW:
-  case AArch64::LDRDroX:
-  case AArch64::LDRHroW:
-  case AArch64::LDRHroX:
-  case AArch64::LDRSroW:
-  case AArch64::LDRSroX:
-
-  // WriteSTIdx
-  case AArch64::STRBBroW:
-  case AArch64::STRBBroX:
-  case AArch64::STRHHroW:
-  case AArch64::STRHHroX:
-  case AArch64::STRWroW:
-  case AArch64::STRWroX:
-  case AArch64::STRXroW:
-  case AArch64::STRXroX:
-
-  case AArch64::STRBroW:
-  case AArch64::STRBroX:
-  case AArch64::STRDroW:
-  case AArch64::STRDroX:
-  case AArch64::STRHroW:
-  case AArch64::STRHroX:
-  case AArch64::STRSroW:
-  case AArch64::STRSroX:
-    Imm = MI.getOperand(3).getImm();
-    Ext = AArch64_AM::getMemExtendType(Imm);
-    return (Ext == AArch64_AM::SXTX || Ext == AArch64_AM::UXTX);
   }
 }
 
-bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
+bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return false;
@@ -1066,6 +1085,32 @@ bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
   }
 }
 
+bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+    default:
+      return false;
+    case AArch64::SEH_StackAlloc:
+    case AArch64::SEH_SaveFPLR:
+    case AArch64::SEH_SaveFPLR_X:
+    case AArch64::SEH_SaveReg:
+    case AArch64::SEH_SaveReg_X:
+    case AArch64::SEH_SaveRegP:
+    case AArch64::SEH_SaveRegP_X:
+    case AArch64::SEH_SaveFReg:
+    case AArch64::SEH_SaveFReg_X:
+    case AArch64::SEH_SaveFRegP:
+    case AArch64::SEH_SaveFRegP_X:
+    case AArch64::SEH_SetFP:
+    case AArch64::SEH_AddFP:
+    case AArch64::SEH_Nop:
+    case AArch64::SEH_PrologEnd:
+    case AArch64::SEH_EpilogStart:
+    case AArch64::SEH_EpilogEnd:
+      return true;
+  }
+}
+
 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                              unsigned &SrcReg, unsigned &DstReg,
                                              unsigned &SubIdx) const {
@@ -1118,6 +1163,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
   return false;
 }
 
+bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                            const MachineBasicBlock *MBB,
+                                            const MachineFunction &MF) const {
+  if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
+    return true;
+  return isSEHInstruction(MI);
+}
+
 /// analyzeCompare - For a comparison instruction, return the source registers
 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
 /// Return true if the comparison instruction can be analyzed.
@@ -3007,7 +3060,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
                            unsigned DestReg, unsigned SrcReg, int Offset,
                            const TargetInstrInfo *TII,
-                           MachineInstr::MIFlag Flag, bool SetNZCV) {
+                           MachineInstr::MIFlag Flag, bool SetNZCV,
+                           bool NeedsWinCFI) {
   if (DestReg == SrcReg && Offset == 0)
     return;
 
@@ -3052,6 +3106,11 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
         .setMIFlag(Flag);
 
+   if (NeedsWinCFI && SrcReg == AArch64::SP && DestReg == AArch64::SP)
+     BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
+         .addImm(ThisVal)
+         .setMIFlag(Flag);
+
     SrcReg = DestReg;
     Offset -= ThisVal;
     if (Offset == 0)
@@ -3062,6 +3121,21 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB,
       .addImm(Offset)
       .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
       .setMIFlag(Flag);
+
+  if (NeedsWinCFI) {
+    if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
+        (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
+      if (Offset == 0)
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).
+                setMIFlag(Flag);
+      else
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)).
+                addImm(Offset).setMIFlag(Flag);
+    } else if (DestReg == AArch64::SP) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)).
+              addImm(Offset).setMIFlag(Flag);
+    }
+  }
 }
 
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
@@ -5084,12 +5158,9 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   unsigned FrameID = MachineOutlinerDefault;
   unsigned NumBytesToCreateFrame = 4;
 
-  bool HasBTI =
-      std::any_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
-                  [](outliner::Candidate &C) {
-                    return C.getMF()->getFunction().hasFnAttribute(
-                        "branch-target-enforcement");
-                  });
+  bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
+    return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
+  });
 
   // If the last instruction in any candidate is a terminator, then we should
   // tail call all of the candidates.
@@ -5124,10 +5195,9 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   // LR is live, so we need to save it. Decide whether it should be saved to
   // the stack, or if it can be saved to a register.
   else {
-    if (std::all_of(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
-                    [this](outliner::Candidate &C) {
-                      return findRegisterToSaveLRTo(C);
-                    })) {
+    if (all_of(RepeatedSequenceLocs, [this](outliner::Candidate &C) {
+          return findRegisterToSaveLRTo(C);
+        })) {
       // Every candidate has an available callee-saved register for the save.
       // We can save LR to a register.
       FrameID = MachineOutlinerRegSave;
@@ -5195,8 +5265,7 @@ AArch64InstrInfo::getMachineOutlinerMBBFlags(MachineBasicBlock &MBB) const {
   unsigned Flags = 0x0;
   // Check if there's a call inside this MachineBasicBlock. If there is, then
   // set a flag.
-  if (std::any_of(MBB.begin(), MBB.end(),
-                  [](MachineInstr &MI) { return MI.isCall(); }))
+  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
     Flags |= MachineOutlinerMBBFlags::HasCalls;
 
   // Check if LR is available through all of the MBB. If it's not, then set
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 11882e238b701ff0052968e010ed336e5b62a691..43011dd4c3e57de43d692ee6fa164bd34ff337ea 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -189,6 +189,10 @@ public:
                     unsigned FalseReg) const override;
   void getNoop(MCInst &NopInst) const override;
 
+  bool isSchedulingBoundary(const MachineInstr &MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
+
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
   /// Return true if the comparison instruction can be analyzed.
@@ -250,15 +254,21 @@ public:
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
                      const outliner::Candidate &C) const override;
   bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
-  /// Returns true if the instruction sets to an immediate value that can be
+  /// Returns true if the instruction sets a constant value that can be
   /// executed more efficiently.
-  bool isExynosResetFast(const MachineInstr &MI) const;
-  /// Returns true if the instruction has a shift left that can be executed
+  static bool isExynosResetFast(const MachineInstr &MI);
+  /// Returns true if the load or store has an extension that can be executed
   /// more efficiently.
-  bool isExynosShiftLeftFast(const MachineInstr &MI) const;
+  static bool isExynosLdStExtFast(const MachineInstr &MI);
+  /// Returns true if the instruction has a constant shift left or extension
+  /// that can be executed more efficiently.
+  static bool isExynosShiftExtFast(const MachineInstr &MI);
   /// Returns true if the instruction has a shift by immediate that can be
   /// executed in one cycle less.
-  bool isFalkorShiftExtFast(const MachineInstr &MI) const;
+  static bool isFalkorShiftExtFast(const MachineInstr &MI);
+  /// Return true if the instructions is a SEH instruciton used for unwinding
+  /// on Windows.
+  static bool isSEHInstruction(const MachineInstr &MI);
 
 private:
   /// Sets the offsets on outlined instructions in \p MBB which use SP
@@ -286,7 +296,7 @@ void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
                      int Offset, const TargetInstrInfo *TII,
                      MachineInstr::MIFlag = MachineInstr::NoFlags,
-                     bool SetNZCV = false);
+                     bool SetNZCV = false,  bool NeedsWinCFI = false);
 
 /// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
 /// FP. Return false if the offset could not be handled directly in MI, and
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 1d9e3d0b81235639f0347170123065f9910a7350..37d3967df44efcaa6affde19fe754d684f307775 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -448,6 +448,30 @@ def : Pat<(AArch64LOADgot texternalsym:$addr),
 def : Pat<(AArch64LOADgot tconstpool:$addr),
           (LOADgot tconstpool:$addr)>;
 
+// 32-bit jump table destination is actually only 2 instructions since we can
+// use the table itself as a PC-relative base. But optimization occurs after
+// branch relaxation so be pessimistic.
+let Size = 12, Constraints = "@earlyclobber $dst,@earlyclobber $scratch" in {
+def JumpTableDest32 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest16 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                             (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                      Sched<[]>;
+def JumpTableDest8 : Pseudo<(outs GPR64:$dst, GPR64sp:$scratch),
+                            (ins GPR64:$table, GPR64:$entry, i32imm:$jti), []>,
+                     Sched<[]>;
+}
+
+// Space-consuming pseudo to aid testing of placement and reachability
+// algorithms. Immediate operand is the number of bytes this "instruction"
+// occupies; register operands can be used to enforce dependency and constrain
+// the scheduler.
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+def SPACE : Pseudo<(outs GPR64:$Rd), (ins i32imm:$size, GPR64:$Rn),
+                   [(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
+            Sched<[]>;
+
 //===----------------------------------------------------------------------===//
 // System instructions.
 //===----------------------------------------------------------------------===//
@@ -490,7 +514,7 @@ def TSB   : CRmSystemI<barrier_op, 0b010, "tsb", []> {
 }
 }
 
-// ARMv8.2 Dot Product
+// ARMv8.2-A Dot Product
 let Predicates = [HasDotProd] in {
 defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
 defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
@@ -498,6 +522,18 @@ defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
 defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
 }
 
+// ARMv8.2-A FP16 Fused Multiply-Add Long
+let Predicates = [HasNEON, HasFP16FML] in {
+defm FMLAL      : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
+defm FMLSL      : SIMDThreeSameVectorFML<0, 1, 0b101, "fmlsl", int_aarch64_neon_fmlsl>;
+defm FMLAL2     : SIMDThreeSameVectorFML<1, 0, 0b001, "fmlal2", int_aarch64_neon_fmlal2>;
+defm FMLSL2     : SIMDThreeSameVectorFML<1, 0, 0b101, "fmlsl2", int_aarch64_neon_fmlsl2>;
+defm FMLALlane  : SIMDThreeSameVectorFMLIndex<0, 0b0000, "fmlal", int_aarch64_neon_fmlal>;
+defm FMLSLlane  : SIMDThreeSameVectorFMLIndex<0, 0b0100, "fmlsl", int_aarch64_neon_fmlsl>;
+defm FMLAL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1000, "fmlal2", int_aarch64_neon_fmlal2>;
+defm FMLSL2lane : SIMDThreeSameVectorFMLIndex<1, 0b1100, "fmlsl2", int_aarch64_neon_fmlsl2>;
+}
+
 // Armv8.2-A Crypto extensions
 let Predicates = [HasSHA3] in {
 def SHA512H   : CryptoRRRTied<0b0, 0b00, "sha512h">;
@@ -1594,6 +1630,8 @@ def : InstAlias<"dcps1", (DCPS1 0)>;
 def : InstAlias<"dcps2", (DCPS2 0)>;
 def : InstAlias<"dcps3", (DCPS3 0)>;
 
+def UDF : UDFType<0, "udf">;
+
 //===----------------------------------------------------------------------===//
 // Load instructions.
 //===----------------------------------------------------------------------===//
@@ -3026,18 +3064,18 @@ let SchedRW = [WriteFDiv] in {
 defm FDIV   : TwoOperandFPData<0b0001, "fdiv", fdiv>;
 }
 defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
-defm FMAX   : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
+defm FMAX   : TwoOperandFPData<0b0100, "fmax", fmaximum>;
 defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
-defm FMIN   : TwoOperandFPData<0b0101, "fmin", fminnan>;
+defm FMIN   : TwoOperandFPData<0b0101, "fmin", fminimum>;
 let SchedRW = [WriteFMul] in {
 defm FMUL   : TwoOperandFPData<0b0000, "fmul", fmul>;
 defm FNMUL  : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
 }
 defm FSUB   : TwoOperandFPData<0b0011, "fsub", fsub>;
 
-def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fmaximum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
           (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+def : Pat<(v1f64 (fminimum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
           (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
 def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
           (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
@@ -3114,6 +3152,28 @@ def F128CSEL : Pseudo<(outs FPR128:$Rd),
   let hasNoSchedulingInfo = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Instructions used for emitting unwind opcodes on ARM64 Windows.
+//===----------------------------------------------------------------------===//
+let isPseudo = 1 in {
+  def SEH_StackAlloc : Pseudo<(outs), (ins i32imm:$size), []>, Sched<[]>;
+  def SEH_SaveFPLR : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFPLR_X : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveReg_X : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFReg : Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFReg_X :  Pseudo<(outs), (ins i32imm:$reg, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFRegP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SaveFRegP_X : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>;
+  def SEH_SetFP : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_AddFP : Pseudo<(outs), (ins i32imm:$offs), []>, Sched<[]>;
+  def SEH_Nop : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_PrologEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
 
 //===----------------------------------------------------------------------===//
 // Floating point immediate move.
@@ -3363,11 +3423,11 @@ defm FDIV    : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
 defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
 defm FMAXNM  : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
 defm FMAXP   : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
-defm FMAX    : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
+defm FMAX    : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaximum>;
 defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
 defm FMINNM  : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
 defm FMINP   : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
-defm FMIN    : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;
+defm FMIN    : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminimum>;
 
 // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
 // instruction expects the addend first, while the fma intrinsic puts it last.
@@ -3438,24 +3498,6 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
 defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
                                                     int_aarch64_neon_sqsub>;
 
-// FP16FML
-def FMLAL_2S   : BaseSIMDThreeSameMult<0, 0, 1, 0b001, "fmlal", ".2s", ".2h">;
-def FMLSL_2S   : BaseSIMDThreeSameMult<0, 0, 1, 0b101, "fmlsl", ".2s", ".2h">;
-def FMLAL_4S   : BaseSIMDThreeSameMult<1, 0, 1, 0b001, "fmlal", ".4s", ".4h">;
-def FMLSL_4S   : BaseSIMDThreeSameMult<1, 0, 1, 0b101, "fmlsl", ".4s", ".4h">;
-def FMLAL2_2S  : BaseSIMDThreeSameMult<0, 1, 0, 0b001, "fmlal2", ".2s", ".2h">;
-def FMLSL2_2S  : BaseSIMDThreeSameMult<0, 1, 0, 0b101, "fmlsl2", ".2s", ".2h">;
-def FMLAL2_4S  : BaseSIMDThreeSameMult<1, 1, 0, 0b001, "fmlal2", ".4s", ".4h">;
-def FMLSL2_4S  : BaseSIMDThreeSameMult<1, 1, 0, 0b101, "fmlsl2", ".4s", ".4h">;
-def FMLALI_2s  : BaseSIMDThreeSameMultIndex<0, 0, 0b0000, "fmlal", ".2s", ".2h", ".h">;
-def FMLSLI_2s  : BaseSIMDThreeSameMultIndex<0, 0, 0b0100, "fmlsl", ".2s", ".2h", ".h">;
-def FMLALI_4s  : BaseSIMDThreeSameMultIndex<1, 0, 0b0000, "fmlal", ".4s", ".4h", ".h">;
-def FMLSLI_4s  : BaseSIMDThreeSameMultIndex<1, 0, 0b0100, "fmlsl", ".4s", ".4h", ".h">;
-def FMLALI2_2s : BaseSIMDThreeSameMultIndex<0, 1, 0b1000, "fmlal2", ".2s", ".2h", ".h">;
-def FMLSLI2_2s : BaseSIMDThreeSameMultIndex<0, 1, 0b1100, "fmlsl2", ".2s", ".2h", ".h">;
-def FMLALI2_4s : BaseSIMDThreeSameMultIndex<1, 1, 0b1000, "fmlal2", ".4s", ".4h", ".h">;
-def FMLSLI2_4s : BaseSIMDThreeSameMultIndex<1, 1, 0b1100, "fmlsl2", ".4s", ".4h", ".h">;
-
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
@@ -4161,44 +4203,43 @@ def : Pat<(concat_vectors (v2i32 V64:$Rd),
 
 defm EXT : SIMDBitwiseExtract<"ext">;
 
-def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-
-// We use EXT to handle extract_subvector to copy the upper 64-bits of a
-// 128-bit vector.
-def : Pat<(v8i8  (extract_subvector V128:$Rn, (i64 8))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-
+def AdjustExtImm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 + N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
+  def : Pat<(VT64 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+            (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+  def : Pat<(VT128 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+            (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+  // We use EXT to handle extract_subvector to copy the upper 64-bits of a
+  // 128-bit vector.
+  def : Pat<(VT64 (extract_subvector V128:$Rn, (i64 N))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+  // A 64-bit EXT of two halves of the same 128-bit register can be done as a
+  // single 128-bit EXT.
+  def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 0)),
+                              (extract_subvector V128:$Rn, (i64 N)),
+                              (i32 imm:$imm))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, imm:$imm), dsub)>;
+  // A 64-bit EXT of the high half of a 128-bit register can be done using a
+  // 128-bit EXT of the whole register with an adjustment to the immediate. The
+  // top half of the other operand will be unset, but that doesn't matter as it
+  // will not be used.
+  def : Pat<(VT64 (AArch64ext (extract_subvector V128:$Rn, (i64 N)),
+                              V64:$Rm,
+                              (i32 imm:$imm))),
+            (EXTRACT_SUBREG (EXTv16i8 V128:$Rn,
+                                      (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                                      (AdjustExtImm imm:$imm)), dsub)>;
+}
+
+defm : ExtPat<v8i8, v16i8, 8>;
+defm : ExtPat<v4i16, v8i16, 4>;
+defm : ExtPat<v4f16, v8f16, 4>;
+defm : ExtPat<v2i32, v4i32, 2>;
+defm : ExtPat<v2f32, v4f32, 2>;
+defm : ExtPat<v1i64, v2i64, 1>;
+defm : ExtPat<v1f64, v2f64, 1>;
 
 //----------------------------------------------------------------------------
 // AdvSIMD zip vector
@@ -4896,16 +4937,6 @@ def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
 def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
           (MOVID imm0_255:$shift)>;
 
-def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
-
-def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
-
 // EDIT byte mask: 2d
 
 // The movi_edit node has the immediate value already encoded, so we use
@@ -4926,6 +4957,18 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 
+// Set 64-bit vectors to all 0/1 by extracting from a 128-bit register as the
+// extract is free and this gives better MachineCSE results.
+def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v8i8  immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+
+def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v8i8  immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in
 defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 327c758a7f8e977b198f210a3af4659bab11e414..4b5e10ac4ec503d025f2f58e58c882c2c1581ecc 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -167,9 +167,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .unsupportedIfMemSizeNotPow2()
       // Lower any any-extending loads left into G_ANYEXT and G_LOAD
       .lowerIf([=](const LegalityQuery &Query) {
-        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+        return Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
-      .clampNumElements(0, v2s32, v2s32);
+      .clampNumElements(0, v2s32, v2s32)
+      .clampMaxNumElements(0, s64, 1);
 
   getActionDefinitionsBuilder(G_STORE)
       .legalForTypesWithMemSize({{s8, p0, 8},
@@ -185,9 +186,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
       .unsupportedIfMemSizeNotPow2()
       .lowerIf([=](const LegalityQuery &Query) {
         return Query.Types[0].isScalar() &&
-               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].Size * 8;
+               Query.Types[0].getSizeInBits() != Query.MMODescrs[0].SizeInBits;
       })
-      .clampNumElements(0, v2s32, v2s32);
+      .clampNumElements(0, v2s32, v2s32)
+      .clampMaxNumElements(0, s64, 1);
 
   // Constants
   getActionDefinitionsBuilder(G_CONSTANT)
@@ -385,6 +387,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
                          });
   }
 
+  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
+      .unsupportedIf([=](const LegalityQuery &Query) {
+        const LLT &EltTy = Query.Types[1].getElementType();
+        return Query.Types[0] != EltTy;
+      })
+      .minScalar(2, s64)
+      .legalIf([=](const LegalityQuery &Query) {
+        const LLT &VecTy = Query.Types[1];
+        return VecTy == v4s32 || VecTy == v2s64;
+      });
+
   computeTables();
   verify(*ST.getInstrInfo());
 }
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index e42214d15699cfdc9bab169f51d9a82f6143c8df..5183e7d3c0d0974f4d680e48e06eeb4128191082 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include <cassert>
@@ -97,6 +98,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// attribute, in which case it is set to false at construction.
   Optional<bool> HasRedZone;
 
+  /// ForwardedMustTailRegParms - A list of virtual and physical registers
+  /// that must be forwarded to every musttail call.
+  SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 public:
   AArch64FunctionInfo() = default;
 
@@ -162,6 +166,19 @@ public:
   unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
   void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
 
+  unsigned getJumpTableEntrySize(int Idx) const {
+    auto It = JumpTableEntryInfo.find(Idx);
+    if (It != JumpTableEntryInfo.end())
+      return It->second.first;
+    return 4;
+  }
+  MCSymbol *getJumpTableEntryPCRelSymbol(int Idx) const {
+    return JumpTableEntryInfo.find(Idx)->second.second;
+  }
+  void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym) {
+    JumpTableEntryInfo[Idx] = std::make_pair(Size, PCRelSym);
+  }
+
   using SetOfInstructions = SmallPtrSet<const MachineInstr *, 16>;
 
   const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
@@ -196,10 +213,16 @@ public:
     LOHRelated.insert(Args.begin(), Args.end());
   }
 
+  SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
+    return ForwardedMustTailRegParms;
+  }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
   SetOfInstructions LOHRelated;
+
+  DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
index 43ebcfd989393a58c1a2c18311aefe350fd0033e..fb8a339dc4d8eb478fce6ae7c6d4389dcb43d7ef 100644
--- a/lib/Target/AArch64/AArch64MacroFusion.cpp
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -21,14 +21,16 @@ using namespace llvm;
 namespace {
 
 /// CMN, CMP, TST followed by Bcc
-static bool isArithmeticBccPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                                const MachineInstr *FirstMI) {
-  if (SecondOpcode != AArch64::Bcc)
+static bool isArithmeticBccPair(const MachineInstr *FirstMI,
+                                const MachineInstr &SecondMI) {
+  if (SecondMI.getOpcode() != AArch64::Bcc)
     return false;
 
-  switch (FirstOpcode) {
-  case AArch64::INSTRUCTION_LIST_END:
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  if (FirstMI == nullptr)
     return true;
+
+  switch (FirstMI->getOpcode()) {
   case AArch64::ADDSWri:
   case AArch64::ADDSWrr:
   case AArch64::ADDSXri:
@@ -55,21 +57,24 @@ static bool isArithmeticBccPair(unsigned FirstOpcode, unsigned SecondOpcode,
     // Shift value can be 0 making these behave like the "rr" variant...
     return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
   }
+
   return false;
 }
 
 /// ALU operations followed by CBZ/CBNZ.
-static bool isArithmeticCbzPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                                const MachineInstr *FirstMI) {
-  if (SecondOpcode != AArch64::CBNZW &&
-      SecondOpcode != AArch64::CBNZX &&
-      SecondOpcode != AArch64::CBZW &&
-      SecondOpcode != AArch64::CBZX)
+static bool isArithmeticCbzPair(const MachineInstr *FirstMI,
+                                const MachineInstr &SecondMI) {
+  if (SecondMI.getOpcode() != AArch64::CBZW &&
+      SecondMI.getOpcode() != AArch64::CBZX &&
+      SecondMI.getOpcode() != AArch64::CBNZW &&
+      SecondMI.getOpcode() != AArch64::CBNZX)
     return false;
 
-  switch (FirstOpcode) {
-  case AArch64::INSTRUCTION_LIST_END:
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  if (FirstMI == nullptr)
     return true;
+
+  switch (FirstMI->getOpcode()) {
   case AArch64::ADDWri:
   case AArch64::ADDWrr:
   case AArch64::ADDXri:
@@ -102,34 +107,39 @@ static bool isArithmeticCbzPair(unsigned FirstOpcode, unsigned SecondOpcode,
     // Shift value can be 0 making these behave like the "rr" variant...
     return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
   }
+
   return false;
 }
 
 /// AES crypto encoding or decoding.
-static bool isAESPair(unsigned FirstOpcode, unsigned SecondOpcode) {
+static bool isAESPair(const MachineInstr *FirstMI,
+                      const MachineInstr &SecondMI) {
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  switch (SecondMI.getOpcode()) {
   // AES encode.
-  if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-       FirstOpcode == AArch64::AESErr) &&
-      (SecondOpcode == AArch64::AESMCrr ||
-       SecondOpcode == AArch64::AESMCrrTied))
-    return true;
+  case AArch64::AESMCrr:
+  case AArch64::AESMCrrTied:
+    return FirstMI == nullptr || FirstMI->getOpcode() == AArch64::AESErr;
   // AES decode.
-  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-            FirstOpcode == AArch64::AESDrr) &&
-           (SecondOpcode == AArch64::AESIMCrr ||
-            SecondOpcode == AArch64::AESIMCrrTied))
-    return true;
+  case AArch64::AESIMCrr:
+  case AArch64::AESIMCrrTied:
+    return FirstMI == nullptr || FirstMI->getOpcode() == AArch64::AESDrr;
+  }
 
   return false;
 }
 
 /// AESE/AESD/PMULL + EOR.
-static bool isCryptoEORPair(unsigned FirstOpcode, unsigned SecondOpcode) {
-  if (SecondOpcode != AArch64::EORv16i8)
+static bool isCryptoEORPair(const MachineInstr *FirstMI,
+                            const MachineInstr &SecondMI) {
+  if (SecondMI.getOpcode() != AArch64::EORv16i8)
     return false;
 
-  switch (FirstOpcode) {
-  case AArch64::INSTRUCTION_LIST_END:
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+  if (FirstMI == nullptr)
+    return true;
+
+  switch (FirstMI->getOpcode()) {
   case AArch64::AESErr:
   case AArch64::AESDrr:
   case AArch64::PMULLv16i8:
@@ -138,45 +148,47 @@ static bool isCryptoEORPair(unsigned FirstOpcode, unsigned SecondOpcode) {
   case AArch64::PMULLv2i64:
     return true;
   }
+
   return false;
 }
 
 /// Literal generation.
-static bool isLiteralsPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                           const MachineInstr *FirstMI,
+static bool isLiteralsPair(const MachineInstr *FirstMI,
                            const MachineInstr &SecondMI) {
+  // Assume the 1st instr to be a wildcard if it is unspecified.
+
   // PC relative address.
-  if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-       FirstOpcode == AArch64::ADRP) &&
-      SecondOpcode == AArch64::ADDXri)
+  if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::ADRP) &&
+      SecondMI.getOpcode() == AArch64::ADDXri)
     return true;
+
   // 32 bit immediate.
-  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-            FirstOpcode == AArch64::MOVZWi) &&
-           (SecondOpcode == AArch64::MOVKWi &&
-            SecondMI.getOperand(3).getImm() == 16))
+  if ((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZWi) &&
+      (SecondMI.getOpcode() == AArch64::MOVKWi &&
+       SecondMI.getOperand(3).getImm() == 16))
     return true;
+
   // Lower half of 64 bit immediate.
-  else if((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-           FirstOpcode == AArch64::MOVZXi) &&
-          (SecondOpcode == AArch64::MOVKXi &&
-           SecondMI.getOperand(3).getImm() == 16))
+  if((FirstMI == nullptr || FirstMI->getOpcode() == AArch64::MOVZXi) &&
+     (SecondMI.getOpcode() == AArch64::MOVKXi &&
+      SecondMI.getOperand(3).getImm() == 16))
     return true;
+
   // Upper half of 64 bit immediate.
-  else if ((FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
-            (FirstOpcode == AArch64::MOVKXi &&
-             FirstMI->getOperand(3).getImm() == 32)) &&
-           (SecondOpcode == AArch64::MOVKXi &&
-            SecondMI.getOperand(3).getImm() == 48))
+  if ((FirstMI == nullptr ||
+       (FirstMI->getOpcode() == AArch64::MOVKXi &&
+        FirstMI->getOperand(3).getImm() == 32)) &&
+      (SecondMI.getOpcode() == AArch64::MOVKXi &&
+       SecondMI.getOperand(3).getImm() == 48))
     return true;
 
   return false;
 }
 
-// Fuse address generation and loads or stores.
-static bool isAddressLdStPair(unsigned FirstOpcode, unsigned SecondOpcode,
+/// Fuse address generation and loads or stores.
+static bool isAddressLdStPair(const MachineInstr *FirstMI,
                               const MachineInstr &SecondMI) {
-  switch (SecondOpcode) {
+  switch (SecondMI.getOpcode()) {
   case AArch64::STRBBui:
   case AArch64::STRBui:
   case AArch64::STRDui:
@@ -200,29 +212,32 @@ static bool isAddressLdStPair(unsigned FirstOpcode, unsigned SecondOpcode,
   case AArch64::LDRSHWui:
   case AArch64::LDRSHXui:
   case AArch64::LDRSWui:
-    switch (FirstOpcode) {
-    case AArch64::INSTRUCTION_LIST_END:
+    // Assume the 1st instr to be a wildcard if it is unspecified.
+    if (FirstMI == nullptr)
       return true;
+
+   switch (FirstMI->getOpcode()) {
     case AArch64::ADR:
       return SecondMI.getOperand(2).getImm() == 0;
     case AArch64::ADRP:
       return true;
     }
   }
+
   return false;
 }
 
-// Compare and conditional select.
-static bool isCCSelectPair(unsigned FirstOpcode, unsigned SecondOpcode,
-                           const MachineInstr *FirstMI) {
+/// Compare and conditional select.
+static bool isCCSelectPair(const MachineInstr *FirstMI,
+                           const MachineInstr &SecondMI) {
   // 32 bits
-  if (SecondOpcode == AArch64::CSELWr) {
+  if (SecondMI.getOpcode() == AArch64::CSELWr) {
     // Assume the 1st instr to be a wildcard if it is unspecified.
-    if (FirstOpcode == AArch64::INSTRUCTION_LIST_END)
+    if (FirstMI == nullptr)
       return true;
 
     if (FirstMI->definesRegister(AArch64::WZR))
-      switch (FirstOpcode) {
+      switch (FirstMI->getOpcode()) {
       case AArch64::SUBSWrs:
         return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
       case AArch64::SUBSWrx:
@@ -232,14 +247,15 @@ static bool isCCSelectPair(unsigned FirstOpcode, unsigned SecondOpcode,
         return true;
       }
   }
+
   // 64 bits
-  else if (SecondOpcode == AArch64::CSELXr) {
+  if (SecondMI.getOpcode() == AArch64::CSELXr) {
     // Assume the 1st instr to be a wildcard if it is unspecified.
-    if (FirstOpcode == AArch64::INSTRUCTION_LIST_END)
+    if (FirstMI == nullptr)
       return true;
 
     if (FirstMI->definesRegister(AArch64::XZR))
-      switch (FirstOpcode) {
+      switch (FirstMI->getOpcode()) {
       case AArch64::SUBSXrs:
         return !AArch64InstrInfo::hasShiftedReg(*FirstMI);
       case AArch64::SUBSXrx:
@@ -250,6 +266,7 @@ static bool isCCSelectPair(unsigned FirstOpcode, unsigned SecondOpcode,
         return true;
       }
   }
+
   return false;
 }
 
@@ -262,28 +279,21 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
                                    const MachineInstr &SecondMI) {
   const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
 
-  // Assume the 1st instr to be a wildcard if it is unspecified.
-  unsigned FirstOpc =
-      FirstMI ? FirstMI->getOpcode()
-              : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
-  unsigned SecondOpc = SecondMI.getOpcode();
-
-  if (ST.hasArithmeticBccFusion() &&
-      isArithmeticBccPair(FirstOpc, SecondOpc, FirstMI))
+  // All checking functions assume that the 1st instr is a wildcard if it is
+  // unspecified.
+  if (ST.hasArithmeticBccFusion() && isArithmeticBccPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasArithmeticCbzFusion() &&
-      isArithmeticCbzPair(FirstOpc, SecondOpc, FirstMI))
+  if (ST.hasArithmeticCbzFusion() && isArithmeticCbzPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseAES() && isAESPair(FirstOpc, SecondOpc))
+  if (ST.hasFuseAES() && isAESPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstOpc, SecondOpc))
+  if (ST.hasFuseCryptoEOR() && isCryptoEORPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseLiterals() &&
-      isLiteralsPair(FirstOpc, SecondOpc, FirstMI, SecondMI))
+  if (ST.hasFuseLiterals() && isLiteralsPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseAddress() && isAddressLdStPair(FirstOpc, SecondOpc, SecondMI))
+  if (ST.hasFuseAddress() && isAddressLdStPair(FirstMI, SecondMI))
     return true;
-  if (ST.hasFuseCCSelect() && isCCSelectPair(FirstOpc, SecondOpc, FirstMI))
+  if (ST.hasFuseCCSelect() && isCCSelectPair(FirstMI, SecondMI))
     return true;
 
   return false;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ff1c1c97988547cde1566563ff4b133a6853f3b3..0bab5c05ba6696dcb50e74a226fd257d7221cfa0 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -43,6 +43,8 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
 const MCPhysReg *
 AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
+    return CSR_Win_AArch64_AAPCS_SaveList;
   if (MF->getFunction().getCallingConv() == CallingConv::GHC)
     // GHC set of callee saved regs is empty as all those regs are
     // used for passing STG regs around
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 1c25a654f242f2358e82bde63d54166ec45cf40e..4653c7af59d0ec1b7c1f5c8b3ed4dbe09da34014 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -30,6 +30,11 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
 public:
   AArch64RegisterInfo(const Triple &TT);
 
+  // FIXME: This should be tablegen'd like getDwarfRegNum is
+  int getSEHRegNum(unsigned i) const {
+    return getEncodingValue(i);
+  }
+
   bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
   bool isAnyArgRegReserved(const MachineFunction &MF) const;
   void emitReservedArgRegCallError(const MachineFunction &MF) const;
diff --git a/lib/Target/AArch64/AArch64SchedExynosM1.td b/lib/Target/AArch64/AArch64SchedExynosM1.td
index ecc68aed1550f987dcdd23a96b2c0a783695a9e3..d566a13dc67dc1193889bac1179c2284f7c85106 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM1.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM1.td
@@ -64,9 +64,10 @@ def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
 //===----------------------------------------------------------------------===//
 // Predicates.
 
-def M1BranchLinkFastPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                            MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M1ShiftLeftFastPred  : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
+def M1BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
+                                        MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M1LdStExtPred    : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>;
+def M1ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
 
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
@@ -85,14 +86,14 @@ def M1WriteAC : SchedWriteRes<[M1UnitALU,
 def M1WriteAD : SchedWriteRes<[M1UnitALU,
                                M1UnitC]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteAX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteA1]>,
-                                   SchedVar<NoSchedPred,         [M1WriteAA]>]>;
+def M1WriteAX : SchedWriteVariant<[SchedVar<M1ShiftExtPred, [M1WriteA1]>,
+                                   SchedVar<NoSchedPred,    [M1WriteAA]>]>;
 def M1WriteC1 : SchedWriteRes<[M1UnitC]>   { let Latency = 1; }
 def M1WriteC2 : SchedWriteRes<[M1UnitC]>   { let Latency = 2; }
 
 def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
-def M1WriteBX : SchedWriteVariant<[SchedVar<M1BranchLinkFastPred, [M1WriteAB]>,
-                                   SchedVar<NoSchedPred,          [M1WriteAC]>]>;
+def M1WriteBX : SchedWriteVariant<[SchedVar<M1BranchLinkPred, [M1WriteAB]>,
+                                   SchedVar<NoSchedPred,      [M1WriteAC]>]>;
 
 def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
 def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; }
@@ -110,10 +111,10 @@ def M1WriteLD : SchedWriteRes<[M1UnitL,
                                            let ResourceCycles = [2, 1]; }
 def M1WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
-def M1WriteLX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,         [M1WriteLC]>]>;
-def M1WriteLY : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteL5]>,
-                                   SchedVar<NoSchedPred,         [M1WriteLD]>]>;
+def M1WriteLX : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M1WriteLC]>]>;
+def M1WriteLY : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M1WriteLD]>]>;
 
 def M1WriteS1 : SchedWriteRes<[M1UnitS]>   { let Latency = 1; }
 def M1WriteS3 : SchedWriteRes<[M1UnitS]>   { let Latency = 3; }
@@ -140,10 +141,10 @@ def M1WriteSD : SchedWriteRes<[M1UnitS,
 def M1WriteSE : SchedWriteRes<[M1UnitS,
                                M1UnitA]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
-def M1WriteSX : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M1WriteSE]>]>;
-def M1WriteSY : SchedWriteVariant<[SchedVar<M1ShiftLeftFastPred, [M1WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M1WriteSB]>]>;
+def M1WriteSX : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteSE]>]>;
+def M1WriteSY : SchedWriteVariant<[SchedVar<M1LdStExtPred, [M1WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M1WriteSB]>]>;
 
 def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
diff --git a/lib/Target/AArch64/AArch64SchedExynosM3.td b/lib/Target/AArch64/AArch64SchedExynosM3.td
index 5e5369a5a7fe11a5b5ddbb7d1ee1b6d57bec3fc5..e61fb611ab245bc2a2b5b52d5f9969f24dce20c9 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -26,9 +26,6 @@ def ExynosM3Model : SchedMachineModel {
   let CompleteModel         =   1; // Use the default model otherwise.
 
   list<Predicate> UnsupportedFeatures = [HasSVE];
-
-  // FIXME: Remove when all errors have been fixed.
-  let FullInstRWOverlapCheck = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -109,15 +106,16 @@ def M3UnitNSHF : ProcResGroup<[M3UnitNSHF0,
 //===----------------------------------------------------------------------===//
 // Predicates.
 
-def M3BranchLinkFastPred  : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
-                                             MI->getOperand(0).isReg() &&
-                                             MI->getOperand(0).getReg() != AArch64::LR}]>;
-def M3ResetFastPred       : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
-def M3RotateRightFastPred : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
-                                              MI->getOpcode() == AArch64::EXTRXrri) &&
-                                             MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
-                                             MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
-def M3ShiftLeftFastPred   : SchedPredicate<[{TII->isExynosShiftLeftFast(*MI)}]>;
+def M3BranchLinkPred : SchedPredicate<[{MI->getOpcode() == AArch64::BLR &&
+                                        MI->getOperand(0).isReg() &&
+                                        MI->getOperand(0).getReg() != AArch64::LR}]>;
+def M3ResetPred      : SchedPredicate<[{TII->isExynosResetFast(*MI)}]>;
+def M3RotatePred     : SchedPredicate<[{(MI->getOpcode() == AArch64::EXTRWrri ||
+                                         MI->getOpcode() == AArch64::EXTRXrri) &&
+                                        MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+                                        MI->getOperand(1).getReg() == MI->getOperand(2).getReg()}]>;
+def M3LdStExtPred    : SchedPredicate<[{TII->isExynosLdStExtFast(*MI)}]>;
+def M3ShiftExtPred   : SchedPredicate<[{TII->isExynosShiftExtFast(*MI)}]>;
 
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
@@ -140,15 +138,15 @@ def M3WriteAD : SchedWriteRes<[M3UnitALU,
                                              let NumMicroOps = 2; }
 def M3WriteC1 : SchedWriteRes<[M3UnitC]>   { let Latency = 1; }
 def M3WriteC2 : SchedWriteRes<[M3UnitC]>   { let Latency = 2; }
-def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetFastPred,     [M3WriteZ0]>,
-                                   SchedVar<M3ShiftLeftFastPred, [M3WriteA1]>,
-                                   SchedVar<NoSchedPred,         [M3WriteAA]>]>;
-def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotateRightFastPred, [M3WriteA1]>,
-                                   SchedVar<NoSchedPred,           [M3WriteAA]>]>;
+def M3WriteAX : SchedWriteVariant<[SchedVar<M3ResetPred,    [M3WriteZ0]>,
+                                   SchedVar<M3ShiftExtPred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,    [M3WriteAA]>]>;
+def M3WriteAY : SchedWriteVariant<[SchedVar<M3RotatePred, [M3WriteA1]>,
+                                   SchedVar<NoSchedPred,  [M3WriteAA]>]>;
 
 def M3WriteB1 : SchedWriteRes<[M3UnitB]> { let Latency = 1; }
-def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkFastPred, [M3WriteAB]>,
-                                   SchedVar<NoSchedPred,          [M3WriteAC]>]>;
+def M3WriteBX : SchedWriteVariant<[SchedVar<M3BranchLinkPred, [M3WriteAB]>,
+                                   SchedVar<NoSchedPred,      [M3WriteAC]>]>;
 
 def M3WriteL4 : SchedWriteRes<[M3UnitL]> { let Latency = 4; }
 def M3WriteL5 : SchedWriteRes<[M3UnitL]> { let Latency = 5; }
@@ -168,8 +166,8 @@ def M3WriteLD : SchedWriteRes<[M3UnitA,
 def M3WriteLH : SchedWriteRes<[]>        { let Latency = 5;
                                            let NumMicroOps = 0; }
 
-def M3WriteLX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteL5]>,
-                                   SchedVar<NoSchedPred,         [M3WriteLB]>]>;
+def M3WriteLX : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteL5]>,
+                                   SchedVar<NoSchedPred,   [M3WriteLB]>]>;
 
 def M3WriteS1 : SchedWriteRes<[M3UnitS]>   { let Latency = 1; }
 def M3WriteSA : SchedWriteRes<[M3UnitA,
@@ -183,10 +181,10 @@ def M3WriteSC : SchedWriteRes<[M3UnitA,
                                M3UnitS]>   { let Latency = 2;
                                              let NumMicroOps = 2; }
 
-def M3WriteSX : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M3WriteSB]>]>;
-def M3WriteSY : SchedWriteVariant<[SchedVar<M3ShiftLeftFastPred, [M3WriteS1]>,
-                                   SchedVar<NoSchedPred,         [M3WriteSC]>]>;
+def M3WriteSX : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M3WriteSB]>]>;
+def M3WriteSY : SchedWriteVariant<[SchedVar<M3LdStExtPred, [M3WriteS1]>,
+                                   SchedVar<NoSchedPred,   [M3WriteSC]>]>;
 
 def M3ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
                                       SchedVar<NoSchedPred,   [ReadDefault]>]>;
@@ -484,8 +482,8 @@ def M3WriteAES     : SchedWriteRes<[M3UnitNCRY]>  { let Latency = 1; }
 def M3ReadAES      : SchedReadAdvance<1, [M3WriteAES]>;
 def M3ReadFMAC     : SchedReadAdvance<1, [M3WriteFMAC4,
                                           M3WriteFMAC5]>;
-def M3WriteMOVI    : SchedWriteVariant<[SchedVar<M3ResetFastPred, [M3WriteZ0]>,
-                                        SchedVar<NoSchedPred,     [M3WriteNALU1]>]>;
+def M3WriteMOVI    : SchedWriteVariant<[SchedVar<M3ResetPred, [M3WriteZ0]>,
+                                        SchedVar<NoSchedPred, [M3WriteNALU1]>]>;
 def M3ReadNMUL     : SchedReadAdvance<1, [M3WriteNMUL3]>;
 
 // Branch instructions
@@ -588,7 +586,7 @@ def : InstRW<[M3WriteSA,
 // ASIMD instructions.
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]ABAL?v")>;
 def : InstRW<[M3WriteNMSC1], (instregex "^[SU]ABDL?v")>;
-def : InstRW<[M3WriteNMSC1], (instregex "^(SQ)?(ABS|NEG)v")>;
+def : InstRW<[M3WriteNMSC1], (instregex "^((SQ)?ABS|SQNEG)v")>;
 def : InstRW<[M3WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Pv")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]H(ADD|SUB)v")>;
@@ -597,7 +595,6 @@ def : InstRW<[M3WriteNMSC3], (instregex "^R?(ADD|SUB)HN2?v")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]Q(ADD|SUB)v")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^(SU|US)QADDv")>;
 def : InstRW<[M3WriteNMSC3], (instregex "^[SU]RHADDv")>;
-def : InstRW<[M3WriteNMSC3], (instregex "^[SU]?ADDL?Vv")>;
 def : InstRW<[M3WriteNMSC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
 def : InstRW<[M3WriteNALU1], (instregex "^CMTSTv")>;
 def : InstRW<[M3WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index c181f4016b60a6df33ee4100084cde7cfb762fb0..49d737bea6a6dc2c544b40106c9c4776a6beb77e 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -67,16 +67,30 @@ void AArch64Subtarget::initializeProperties() {
   // this in the future so we can specify it together with the subtarget
   // features.
   switch (ARMProcFamily) {
+  case Others:
+    break;
+  case CortexA35:
+    break;
+  case CortexA53:
+    PrefFunctionAlignment = 3;
+    break;
+  case CortexA55:
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    PrefFunctionAlignment = 4;
+    break;
+  case CortexA72:
+  case CortexA73:
+  case CortexA75:
+    PrefFunctionAlignment = 4;
+    break;
   case Cyclone:
     CacheLineSize = 64;
     PrefetchDistance = 280;
     MinPrefetchStride = 2048;
     MaxPrefetchIterationsAhead = 3;
     break;
-  case CortexA57:
-    MaxInterleaveFactor = 4;
-    PrefFunctionAlignment = 4;
-    break;
   case ExynosM1:
     MaxInterleaveFactor = 4;
     MaxJumpTableSize = 8;
@@ -98,11 +112,6 @@ void AArch64Subtarget::initializeProperties() {
     MinPrefetchStride = 2048;
     MaxPrefetchIterationsAhead = 8;
     break;
-  case Saphira:
-    MaxInterleaveFactor = 4;
-    // FIXME: remove this to enable 64-bit SLP if performance looks good.
-    MinVectorRegisterBitWidth = 128;
-    break;
   case Kryo:
     MaxInterleaveFactor = 4;
     VectorInsertExtractBaseCost = 2;
@@ -113,6 +122,11 @@ void AArch64Subtarget::initializeProperties() {
     // FIXME: remove this to enable 64-bit SLP if performance looks good.
     MinVectorRegisterBitWidth = 128;
     break;
+  case Saphira:
+    MaxInterleaveFactor = 4;
+    // FIXME: remove this to enable 64-bit SLP if performance looks good.
+    MinVectorRegisterBitWidth = 128;
+    break;
   case ThunderX2T99:
     CacheLineSize = 64;
     PrefFunctionAlignment = 3;
@@ -134,17 +148,6 @@ void AArch64Subtarget::initializeProperties() {
     // FIXME: remove this to enable 64-bit SLP if performance looks good.
     MinVectorRegisterBitWidth = 128;
     break;
-  case CortexA35: break;
-  case CortexA53:
-    PrefFunctionAlignment = 3;
-    break;
-  case CortexA55: break;
-  case CortexA72:
-  case CortexA73:
-  case CortexA75:
-    PrefFunctionAlignment = 4;
-    break;
-  case Others: break;
   }
 }
 
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index abe1980740e8affb5340b5c133946cbc39bf6de2..8bf7c1654081e10c69f64daced9852d6cea57422 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -142,6 +142,7 @@ protected:
   bool HasFuseLiterals = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
+  bool Force32BitJumpTables = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
@@ -292,6 +293,7 @@ public:
   }
 
   bool useRSqrt() const { return UseRSqrt; }
+  bool force32BitJumpTables() const { return Force32BitJumpTables; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
     return VectorInsertExtractBaseCost;
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index e183288d8df2cd1a1c0ee731f69aae2a2cccb6b9..2f3f87d02b787206cffe5026a871eedc648abdf7 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
@@ -123,6 +124,10 @@ static cl::opt<bool>
     BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
                      cl::desc("Relax out of range conditional branches"));
 
+static cl::opt<bool> EnableCompressJumpTables(
+    "aarch64-enable-compress-jump-tables", cl::Hidden, cl::init(true),
+    cl::desc("Use smallest entry possible for jump tables"));
+
 // FIXME: Unify control over GlobalMerge.
 static cl::opt<cl::boolOrDefault>
     EnableGlobalMerge("aarch64-enable-global-merge", cl::Hidden,
@@ -158,6 +163,7 @@ extern "C" void LLVMInitializeAArch64Target() {
   initializeAArch64AdvSIMDScalarPass(*PR);
   initializeAArch64BranchTargetsPass(*PR);
   initializeAArch64CollectLOHPass(*PR);
+  initializeAArch64CompressJumpTablesPass(*PR);
   initializeAArch64ConditionalComparesPass(*PR);
   initializeAArch64ConditionOptimizerPass(*PR);
   initializeAArch64DeadRegisterDefinitionsPass(*PR);
@@ -258,6 +264,16 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
     this->Options.NoTrapAfterNoreturn = true;
   }
 
+  if (getMCAsmInfo()->usesWindowsCFI()) {
+    // Unwinding can get confused if the last instruction in an
+    // exception-handling region (function, funclet, try block, etc.)
+    // is a call.
+    //
+    // FIXME: We could elide the trap if the next instruction would be in
+    // the same region anyway.
+    this->Options.TrapUnreachable = true;
+  }
+
   // Enable GlobalISel at or below EnableGlobalISelAt0.
   if (getOptLevel() <= EnableGlobalISelAtO)
     setGlobalISel(true);
@@ -546,6 +562,9 @@ void AArch64PassConfig::addPreEmitPass() {
   if (EnableBranchTargets)
     addPass(createAArch64BranchTargetsPass());
 
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableCompressJumpTables)
+    addPass(createAArch64CompressJumpTablesPass());
+
   if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
       TM->getTargetTriple().isOSBinFormatMachO())
     addPass(createAArch64CollectLOHPass());
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 96e751e869716c9cd3e8c2c038c66f455e394c9e..a256cb7c9215645cdbf6e3568eee015da4df010e 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -659,11 +659,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!UseMaskForCond && !UseMaskForGaps && 
+      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -676,7 +679,8 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
 
 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
@@ -945,9 +949,20 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
 
 int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                    Type *SubTp) {
-  if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select ||
-      Kind == TTI::SK_PermuteSingleSrc) {
+  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
+      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
     static const CostTblEntry ShuffleTbl[] = {
+      // Broadcast shuffle kinds can be performed with 'dup'.
+      { TTI::SK_Broadcast, MVT::v8i8,  1 },
+      { TTI::SK_Broadcast, MVT::v16i8, 1 },
+      { TTI::SK_Broadcast, MVT::v4i16, 1 },
+      { TTI::SK_Broadcast, MVT::v8i16, 1 },
+      { TTI::SK_Broadcast, MVT::v2i32, 1 },
+      { TTI::SK_Broadcast, MVT::v4i32, 1 },
+      { TTI::SK_Broadcast, MVT::v2i64, 1 },
+      { TTI::SK_Broadcast, MVT::v2f32, 1 },
+      { TTI::SK_Broadcast, MVT::v4f32, 1 },
+      { TTI::SK_Broadcast, MVT::v2f64, 1 },
       // Transpose shuffle kinds can be performed with 'trn1/trn2' and
       // 'zip1/zip2' instructions.
       { TTI::SK_Transpose, MVT::v8i8,  1 },
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c056a7d2428b206d885a4293d645aa3b71964bc5..08c1a8924220fd00085ffad86f1bd75db99774d0 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,9 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   bool
   shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index c57ebeb854cdf0b10fe10d6d42cb05cb95a0d376..58190686c7947c2b60b350c35d202fbb2ce34121 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -34,6 +34,7 @@ add_llvm_target(AArch64CodeGen
   AArch64FastISel.cpp
   AArch64A53Fix835769.cpp
   AArch64FrameLowering.cpp
+  AArch64CompressJumpTables.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
   AArch64ISelDAGToDAG.cpp
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index c0ef8b6702868053a5bd11c0ce06aceaaa68427c..9a7e34b0aeb10f14b798a2b70b5a54ed01fe4268 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -60,16 +60,6 @@ void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
   OS << "\t.inst\t0x" << Twine::utohexstr(Inst) << "\n";
 }
 
-class AArch64TargetELFStreamer : public AArch64TargetStreamer {
-private:
-  AArch64ELFStreamer &getStreamer();
-
-  void emitInst(uint32_t Inst) override;
-
-public:
-  AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
-};
-
 /// Extend the generic ELFStreamer class so that it can emit mapping symbols at
 /// the appropriate points in the object files. These symbols are defined in the
 /// AArch64 ELF ABI:
@@ -85,8 +75,6 @@ public:
 /// by MachO. Beware!
 class AArch64ELFStreamer : public MCELFStreamer {
 public:
-  friend class AArch64TargetELFStreamer;
-
   AArch64ELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB,
                      std::unique_ptr<MCObjectWriter> OW,
                      std::unique_ptr<MCCodeEmitter> Emitter)
@@ -154,6 +142,11 @@ public:
     MCELFStreamer::EmitValueImpl(Value, Size, Loc);
   }
 
+  void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
+                                  SMLoc Loc) override {
+    EmitDataMappingSymbol();
+    MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
+  }
 private:
   enum ElfMappingSymbol {
     EMS_None,
@@ -192,6 +185,8 @@ private:
 
 } // end anonymous namespace
 
+namespace llvm {
+
 AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
   return static_cast<AArch64ELFStreamer &>(Streamer);
 }
@@ -200,8 +195,6 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
   getStreamer().emitInst(Inst);
 }
 
-namespace llvm {
-
 MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
                                                  MCInstPrinter *InstPrint,
@@ -221,14 +214,4 @@ MCELFStreamer *createAArch64ELFStreamer(MCContext &Context,
   return S;
 }
 
-MCTargetStreamer *
-createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
-  const Triple &TT = STI.getTargetTriple();
-  if (TT.isOSBinFormatELF())
-    return new AArch64TargetELFStreamer(S);
-  if (TT.isOSBinFormatCOFF())
-    return new AArch64TargetWinCOFFStreamer(S);
-  return nullptr;
-}
-
 } // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index ebb49121c1bfa9953aa7229bb482cb0e0a640595..0e486b9392316a78688c597bcbed616f34b94d54 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -115,6 +115,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
 
   CommentString = ";";
   ExceptionsType = ExceptionHandling::WinEH;
+  WinEHEncodingType = WinEH::EncodingType::Itanium;
 }
 
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index dee964df26356fcc59c71c5358082199cbd6666c..a6b8d963bef905f67e9b9a90bd74dca36c8e80a4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -13,6 +13,7 @@
 
 #include "AArch64TargetStreamer.h"
 #include "llvm/MC/ConstantPools.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 
 using namespace llvm;
 
@@ -52,3 +53,17 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) {
 
   getStreamer().EmitBytes(StringRef(Buffer, 4));
 }
+
+namespace llvm {
+
+MCTargetStreamer *
+createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new AArch64TargetELFStreamer(S);
+  if (TT.isOSBinFormatCOFF())
+    return new AArch64TargetWinCOFFStreamer(S);
+  return nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
index 51432830f7951c29c255c27419797d61c1f6ef6a..73fb9baea3e335a3f378e25a8714783589e6ffb7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h
@@ -12,6 +12,10 @@
 
 #include "llvm/MC/MCStreamer.h"
 
+namespace {
+class AArch64ELFStreamer;
+}
+
 namespace llvm {
 
 class AArch64TargetStreamer : public MCTargetStreamer {
@@ -33,10 +37,75 @@ public:
   /// Callback used to implement the .inst directive.
   virtual void emitInst(uint32_t Inst);
 
+  virtual void EmitARM64WinCFIAllocStack(unsigned Size) {}
+  virtual void EmitARM64WinCFISaveFPLR(int Offset) {}
+  virtual void EmitARM64WinCFISaveFPLRX(int Offset) {}
+  virtual void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) {}
+  virtual void EmitARM64WinCFISetFP() {}
+  virtual void EmitARM64WinCFIAddFP(unsigned Size) {}
+  virtual void EmitARM64WinCFINop() {}
+  virtual void EmitARM64WinCFIPrologEnd() {}
+  virtual void EmitARM64WinCFIEpilogStart() {}
+  virtual void EmitARM64WinCFIEpilogEnd() {}
+
 private:
   std::unique_ptr<AssemblerConstantPools> ConstantPools;
 };
 
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+  AArch64ELFStreamer &getStreamer();
+
+  void emitInst(uint32_t Inst) override;
+
+public:
+  AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
+class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
+private:
+  // True if we are processing SEH directives in an epilogue.
+  bool InEpilogCFI = false;
+
+  // Symbol of the current epilog for which we are processing SEH directives.
+  MCSymbol *CurrentEpilog = nullptr;
+public:
+  AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
+    : AArch64TargetStreamer(S) {}
+
+  // The unwind codes on ARM64 Windows are documented at
+  // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+  void EmitARM64WinCFIAllocStack(unsigned Size) override;
+  void EmitARM64WinCFISaveFPLR(int Offset) override;
+  void EmitARM64WinCFISaveFPLRX(int Offset) override;
+  void EmitARM64WinCFISaveReg(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveRegX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveRegP(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveRegPX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFReg(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFRegX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFRegP(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISaveFRegPX(unsigned Reg, int Offset) override;
+  void EmitARM64WinCFISetFP() override;
+  void EmitARM64WinCFIAddFP(unsigned Size) override;
+  void EmitARM64WinCFINop() override;
+  void EmitARM64WinCFIPrologEnd() override;
+  void EmitARM64WinCFIEpilogStart() override;
+  void EmitARM64WinCFIEpilogEnd() override;
+private:
+  void EmitARM64WinUnwindCode(unsigned UnwindCode, int Reg, int Offset);
+};
+
+MCTargetStreamer *
+createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 9871dc553bed3ca4d2748a35363ae07d529de9b4..b828ab832e9d38637533e7e60edf06270d59cd3d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -11,31 +11,184 @@
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
 
 using namespace llvm;
 
 namespace {
 
 class AArch64WinCOFFStreamer : public MCWinCOFFStreamer {
-public:
-  friend class AArch64TargetWinCOFFStreamer;
+  Win64EH::ARM64UnwindEmitter EHStreamer;
 
+public:
   AArch64WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
                          std::unique_ptr<MCCodeEmitter> CE,
                          std::unique_ptr<MCObjectWriter> OW)
       : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
 
+  void EmitWinEHHandlerData(SMLoc Loc) override;
+  void EmitWindowsUnwindTables() override;
   void FinishImpl() override;
 };
 
+void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
+  MCStreamer::EmitWinEHHandlerData(Loc);
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo());
+}
+
+void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
+  if (!getNumWinFrameInfos())
+    return;
+  EHStreamer.Emit(*this);
+}
+
 void AArch64WinCOFFStreamer::FinishImpl() {
   EmitFrames(nullptr);
+  EmitWindowsUnwindTables();
 
   MCWinCOFFStreamer::FinishImpl();
 }
 } // end anonymous namespace
 
 namespace llvm {
+
+// Helper function to common out unwind code setup for those codes that can
+// belong to both prolog and epilog.
+// There are three types of Windows ARM64 SEH codes.  They can
+// 1) take no operands: SEH_Nop, SEH_PrologEnd, SEH_EpilogStart, SEH_EpilogEnd
+// 2) take an offset: SEH_StackAlloc, SEH_SaveFPLR, SEH_SaveFPLR_X
+// 3) take a register and an offset/size: all others
+void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode,
+                                                          int Reg,
+                                                          int Offset) {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+  MCSymbol *Label = S.EmitCFILabel();
+  auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset);
+  if (InEpilogCFI)
+    CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
+  else
+    CurFrame->Instructions.push_back(Inst);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAllocStack(unsigned Size) {
+  unsigned Op = Win64EH::UOP_AllocSmall;
+  if (Size >= 16384)
+    Op = Win64EH::UOP_AllocLarge;
+  else if (Size >= 512)
+    Op = Win64EH::UOP_AllocMedium;
+  EmitARM64WinUnwindCode(Op, -1, Size);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLR(int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLR, -1, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFPLRX(int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFPLRX, -1, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveReg(unsigned Reg,
+                                                          int Offset) {
+  assert(Offset >= 0 && Offset <= 504 &&
+        "Offset for save reg should be >= 0 && <= 504");
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveReg, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegX(unsigned Reg,
+                                                           int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegP(unsigned Reg,
+                                                           int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegP, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveRegPX(unsigned Reg,
+                                                            int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveRegPX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFReg(unsigned Reg,
+                                                           int Offset) {
+  assert(Offset >= 0 && Offset <= 504 &&
+        "Offset for save reg should be >= 0 && <= 504");
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFReg, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegX(unsigned Reg,
+                                                            int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegP(unsigned Reg,
+                                                            int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegP, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISaveFRegPX(unsigned Reg,
+                                                             int Offset) {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SaveFRegPX, Reg, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFISetFP() {
+  EmitARM64WinUnwindCode(Win64EH::UOP_SetFP, -1, 0);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIAddFP(unsigned Offset) {
+  assert(Offset <= 2040 && "UOP_AddFP must have offset <= 2040");
+  EmitARM64WinUnwindCode(Win64EH::UOP_AddFP, -1, Offset);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFINop() {
+  EmitARM64WinUnwindCode(Win64EH::UOP_Nop, -1, 0);
+}
+
+// The functions below handle opcodes that can end up in either a prolog or
+// an epilog, but not both.
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+
+  MCSymbol *Label = S.EmitCFILabel();
+  CurFrame->PrologEnd = Label;
+  WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
+  auto it = CurFrame->Instructions.begin();
+  CurFrame->Instructions.insert(it, Inst);
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+
+  InEpilogCFI = true;
+  CurrentEpilog = S.EmitCFILabel();
+}
+
+void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
+  auto &S = getStreamer();
+  WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
+  if (!CurFrame)
+    return;
+
+  InEpilogCFI = false;
+  MCSymbol *Label = S.EmitCFILabel();
+  WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
+  CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
+  CurrentEpilog = nullptr;
+}
+
 MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
     MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
     std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
index c05422163584028770b46846d8ad924d039ab90a..ed265a876ab3cf906a8c2bea29c08cfca267b5b0 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -17,20 +17,6 @@
 #include "AArch64TargetStreamer.h"
 #include "llvm/MC/MCWinCOFFStreamer.h"
 
-namespace {
-class AArch64WinCOFFStreamer;
-
-class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
-private:
-  AArch64WinCOFFStreamer &getStreamer();
-
-public:
-  AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
-    : AArch64TargetStreamer(S) {}
-};
-
-} // end anonymous namespace
-
 namespace llvm {
 
 MCWinCOFFStreamer *createAArch64WinCOFFStreamer(
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 5e8a402fb6ef28e1e40ce5035256bebbed1e1196..07e5d97dff90b148fdbe23905ae9f190e075dce2 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -77,6 +77,10 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
+ModulePass *createAMDGPUFixFunctionBitcastsPass();
+void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &);
+extern char &AMDGPUFixFunctionBitcastsID;
+
 FunctionPass *createAMDGPULowerKernelArgumentsPass();
 void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
 extern char &AMDGPULowerKernelArgumentsID;
@@ -194,6 +198,8 @@ extern char &AMDGPUUnifyDivergentExitNodesID;
 
 ImmutablePass *createAMDGPUAAWrapperPass();
 void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+ImmutablePass *createAMDGPUExternalAAWrapperPass();
+void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);
 
 void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
 
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index dd9c16a94352d87396b65fa881238fae312edf78..96a8029773d0b5ce53900fd4c4625d76de127346 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -266,13 +266,10 @@ def FeatureDLInsts : SubtargetFeature<"dl-insts",
   "Has deep learning instructions"
 >;
 
-def FeatureD16PreservesUnusedBits : SubtargetFeature<
-  "d16-preserves-unused-bits",
-  "D16PreservesUnusedBits",
+def FeatureSRAMECC : SubtargetFeature<"sram-ecc",
+  "EnableSRAMECC",
   "true",
-  "If present, then instructions defined by HasD16LoadStore predicate preserve "
-  "unused bits. Otherwise instructions defined by HasD16LoadStore predicate "
-  "zero unused bits."
+  "Enable SRAM ECC"
 >;
 
 //===------------------------------------------------------------===//
@@ -327,12 +324,6 @@ def FeatureEnableHugePrivateBuffer : SubtargetFeature<
   "Enable private/scratch buffer sizes greater than 128 GB"
 >;
 
-def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
-  "EnableVGPRSpilling",
-  "true",
-  "Enable spilling of VGPRs to scratch memory"
->;
-
 def FeatureDumpCode : SubtargetFeature <"DumpCode",
   "DumpCode",
   "true",
@@ -530,28 +521,32 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
 def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
   [FeatureGFX9,
    FeatureMadMixInsts,
-   FeatureLDSBankCount32,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureLDSBankCount32]>;
 
 def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
   [FeatureGFX9,
    FeatureMadMixInsts,
    FeatureLDSBankCount32,
-   FeatureXNACK,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureXNACK]>;
 
 def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
   [FeatureGFX9,
    FeatureLDSBankCount32,
-   FeatureFmaMixInsts,
-   FeatureD16PreservesUnusedBits]>;
+   FeatureFmaMixInsts]>;
 
 def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
   [FeatureGFX9,
    HalfRate64Ops,
    FeatureFmaMixInsts,
    FeatureLDSBankCount32,
-   FeatureDLInsts]>;
+   FeatureDLInsts,
+   FeatureSRAMECC]>;
+
+def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9,
+  [FeatureGFX9,
+   FeatureMadMixInsts,
+   FeatureLDSBankCount32,
+   FeatureXNACK]>;
 
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
@@ -683,8 +678,9 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
 def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
   AssemblerPredicate<"!FeatureUnpackedD16VMem">;
 
-def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
-  AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
+def D16PreservesUnusedBits :
+  Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">,
+  AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
 
 def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
 def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 81df0c628a2bb261fd93d854db9d18483bf0302e..73709ba13643ee06673f64a380b5c93f2d29de4e 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -34,14 +34,22 @@ using namespace llvm;
 
 // Register this pass...
 char AMDGPUAAWrapperPass::ID = 0;
+char AMDGPUExternalAAWrapper::ID = 0;
 
 INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
                 "AMDGPU Address space based Alias Analysis", false, true)
 
+INITIALIZE_PASS(AMDGPUExternalAAWrapper, "amdgpu-aa-wrapper",
+                "AMDGPU Address space based Alias Analysis Wrapper", false, true)
+
 ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
   return new AMDGPUAAWrapperPass();
 }
 
+ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() {
+  return new AMDGPUExternalAAWrapper();
+}
+
 void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 9a507d004d0f52a6f7aea604e126545c88fd656e..d76c9fc481995de8ea655e3f7b8a743acf1c640b 100644
--- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -96,6 +96,19 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 };
 
+// Wrapper around ExternalAAWrapperPass so that the default constructor gets the
+// callback.
+class AMDGPUExternalAAWrapper : public ExternalAAWrapperPass {
+public:
+  static char ID;
+
+  AMDGPUExternalAAWrapper() : ExternalAAWrapperPass(
+    [](Pass &P, Function &, AAResults &AAR) {
+      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+        AAR.addAAResult(WrapperPass->getResult());
+    }) {}
+};
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 7e6a406b1e34fbeb4202883f384cd045d3a05ec3..d07c0516c27265e14e6c0dfe84a2ef72f71fff64 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -116,9 +116,16 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
 }
 
 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
-  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
-      TM.getTargetTriple().getOS() == Triple::AMDHSA)
-    return;
+  if (IsaInfo::hasCodeObjectV3(getSTI())) {
+    std::string ExpectedTarget;
+    raw_string_ostream ExpectedTargetOS(ExpectedTarget);
+    IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
+
+    getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
+
+    if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
+      return;
+  }
 
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
       TM.getTargetTriple().getOS() != Triple::AMDPAL)
@@ -1001,7 +1008,6 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                          const SIProgramInfo &CurrentProgramInfo) {
-  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
 
@@ -1022,10 +1028,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->EmitIntValue(RsrcReg, 4);
     OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
-    if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
-      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
-      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
-    }
+    OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
+    OutStreamer->EmitIntValue(
+        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 7af13f83401939b50a3175892ab10ce9c043898a..644e4fd558badf42e524570e0428ef0ceaf2c654 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -53,6 +53,7 @@ private:
   const DataLayout *DL;
   DominatorTree *DT;
   bool HasDPP;
+  bool IsPixelShader;
 
   void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                       unsigned ValIdx, bool ValDivergent) const;
@@ -96,6 +97,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
   HasDPP = ST.hasDPP();
+  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
 
   visit(F);
 
@@ -215,6 +217,31 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // Start building just before the instruction.
   IRBuilder<> B(&I);
 
+  // If we are in a pixel shader, because of how we have to mask out helper
+  // lane invocations, we need to record the entry and exit BB's.
+  BasicBlock *PixelEntryBB = nullptr;
+  BasicBlock *PixelExitBB = nullptr;
+
+  // If we're optimizing an atomic within a pixel shader, we need to wrap the
+  // entire atomic operation in a helper-lane check. We do not want any helper
+  // lanes that are around only for the purposes of derivatives to take part
+  // in any cross-lane communication, and we use a branch on whether the lane is
+  // live to do this.
+  if (IsPixelShader) {
+    // Record I's original position as the entry block.
+    PixelEntryBB = I.getParent();
+
+    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
+    Instruction *const NonHelperTerminator =
+        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+    // Record I's new position as the exit block.
+    PixelExitBB = I.getParent();
+
+    I.moveBefore(NonHelperTerminator);
+    B.SetInsertPoint(&I);
+  }
+
   Type *const Ty = I.getType();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
   Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
@@ -398,8 +425,18 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // first lane, to get our lane's index into the atomic result.
   Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
 
-  // Replace the original atomic instruction with the new one.
-  I.replaceAllUsesWith(Result);
+  if (IsPixelShader) {
+    // Need a final PHI to reconverge to above the helper lane branch mask.
+    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+    PHINode *const PHI = B.CreatePHI(Ty, 2);
+    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+    PHI->addIncoming(Result, I.getParent());
+    I.replaceAllUsesWith(PHI);
+  } else {
+    // Replace the original atomic instruction with the new one.
+    I.replaceAllUsesWith(Result);
+  }
 
   // And delete the original.
   I.eraseFromParent();
diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6e2a981d33968889370d2062eaaf8cedde700457
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -0,0 +1,63 @@
+//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Promote indirect (bitcast) calls to direct calls when they are statically
+/// known to be direct. Required when InstCombine is not run (e.g. at OptNone)
+/// because AMDGPU does not support indirect calls.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-fix-function-bitcasts"
+
+namespace {
+class AMDGPUFixFunctionBitcasts final
+    : public ModulePass,
+      public InstVisitor<AMDGPUFixFunctionBitcasts> {
+
+  bool runOnModule(Module &M) override;
+
+  bool Modified;
+
+public:
+  void visitCallSite(CallSite CS) {
+    if (CS.getCalledFunction())
+      return;
+    auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+    if (Callee && isLegalToPromote(CS, Callee)) {
+      promoteCall(CS, Callee);
+      Modified = true;
+    }
+  }
+
+  static char ID;
+  AMDGPUFixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUFixFunctionBitcasts::ID = 0;
+char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID;
+INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE,
+                "Fix function bitcasts for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() {
+  return new AMDGPUFixFunctionBitcasts();
+}
+
+bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) {
+  Modified = false;
+  visit(M);
+  return Modified;
+}
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4010b77172c08329efa0f4a0b74c9aee32eb1199..025e2de742d6e7ffe77194c5e3be654c0186c1d1 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -978,8 +978,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
 
   // default case
 
-  // FIXME: This is broken on SI where we still need to check if the base
-  // pointer is positive here.
   Base = Addr;
   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ae6b925800b20f9c96867119e5260bc522e5a7c4..ad0a9e388af894f4ca8d2f6f7e49b8b8950726e0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -552,6 +552,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FMAD:
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case ISD::FSIN:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -665,6 +667,18 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
   EVT OldVT = N->getValueType(0);
   unsigned OldSize = OldVT.getStoreSizeInBits();
 
+  MemSDNode *MN = cast<MemSDNode>(N);
+  unsigned AS = MN->getAddressSpace();
+  // Do not shrink an aligned scalar load to sub-dword.
+  // Scalar engine cannot do sub-dword loads.
+  if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
+      (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+       (isa<LoadSDNode>(N) &&
+        AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
+      AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
+    return false;
+
   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
   // extloads, so doing one requires using a buffer_load. In cases where we
   // still couldn't use a scalar load, using the wider load shouldn't really
@@ -3512,6 +3526,10 @@ static unsigned inverseMinMax(unsigned Opc) {
     return ISD::FMINNUM;
   case ISD::FMINNUM:
     return ISD::FMAXNUM;
+  case ISD::FMAXNUM_IEEE:
+    return ISD::FMINNUM_IEEE;
+  case ISD::FMINNUM_IEEE:
+    return ISD::FMAXNUM_IEEE;
   case AMDGPUISD::FMAX_LEGACY:
     return AMDGPUISD::FMIN_LEGACY;
   case AMDGPUISD::FMIN_LEGACY:
@@ -3617,6 +3635,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case AMDGPUISD::FMAX_LEGACY:
   case AMDGPUISD::FMIN_LEGACY: {
     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
@@ -3797,9 +3817,10 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       if (Src.getValueType() == MVT::i64) {
         SDLoc SL(N);
         uint64_t CVal = C->getZExtValue();
-        return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
-                           DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                           DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
       }
     }
 
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 92d8991e58273f0b1c2db5c53e6bea1add9e7c55..0d22cb2e3e20bcba352aafb7216ce88bc8ecaab3 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -360,6 +360,7 @@ enum NodeType : unsigned {
   SIN_HW,
   FMAX_LEGACY,
   FMIN_LEGACY,
+
   FMAX3,
   SMAX3,
   UMAX3,
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index a5f9a85f50d3d022a4f6727be28ebebcabccdf10..945c9acd379a5f3ba39d1803441f5c50504dfcb8 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -44,7 +44,7 @@ ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
               cl::desc("Cost of alloca argument"));
 
 // If the amount of scratch memory to eliminate exceeds our ability to allocate
-// it into registers we gain nothing by agressively inlining functions for that
+// it into registers we gain nothing by aggressively inlining functions for that
 // heuristic.
 static cl::opt<unsigned>
 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 7442a59e594f1a2b17e0e8980d66bb6bf866c71a..82644be2656384cd2add9d63a505037e5dc66322 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -62,18 +62,10 @@ def AMDGPULoopOp : SDTypeProfile<0, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
 >;
 
-def AMDGPUBreakOp : SDTypeProfile<1, 1,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
->;
-
 def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
   [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
 >;
 
-def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
->;
-
 def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
 >;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index ab00b1d6326c1dbcc0d68771fcedfbb8c298c9a0..b7d1575ca898fc55254532018543dddc26ec4340 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -152,8 +152,14 @@ def smax_oneuse : HasOneUseBinOp<smax>;
 def smin_oneuse : HasOneUseBinOp<smin>;
 def umax_oneuse : HasOneUseBinOp<umax>;
 def umin_oneuse : HasOneUseBinOp<umin>;
+
 def fminnum_oneuse : HasOneUseBinOp<fminnum>;
 def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+
+def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
+def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
+
+
 def and_oneuse : HasOneUseBinOp<and>;
 def or_oneuse : HasOneUseBinOp<or>;
 def xor_oneuse : HasOneUseBinOp<xor>;
@@ -837,3 +843,25 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat <
   (AMDGPUrcp (fsqrt vt:$src)),
   (RsqInst $src)
 >;
+
+// Instructions which select to the same v_min_f*
+def fminnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee node:$src0, node:$src1),
+   (fminnum node:$src0, node:$src1)]
+>;
+
+// Instructions which select to the same v_max_f*
+def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee node:$src0, node:$src1),
+   (fmaxnum node:$src0, node:$src1)]
+>;
+
+def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fminnum_ieee_oneuse node:$src0, node:$src1),
+   (fminnum_oneuse node:$src0, node:$src1)]
+>;
+
+def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
+  [(fmaxnum_ieee_oneuse node:$src0, node:$src1),
+   (fmaxnum_oneuse node:$src0, node:$src1)]
+>;
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index fe9e4ca0ca4c7727253ad21ed0c55081c1063dd8..5d087c0991844ae19ad0732c72f124734aa1111f 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -70,6 +70,11 @@ static cl::opt<bool> DisablePromoteAllocaToVector(
   cl::desc("Disable promote alloca to vector"),
   cl::init(false));
 
+static cl::opt<bool> DisablePromoteAllocaToLDS(
+  "disable-promote-alloca-to-lds",
+  cl::desc("Disable promote alloca to LDS"),
+  cl::init(false));
+
 // FIXME: This can create globals so should be a module pass.
 class AMDGPUPromoteAlloca : public FunctionPass {
 private:
@@ -323,6 +328,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
     // Currently only handle the case where the Pointer Operand is a GEP.
     // Also we could not vectorize volatile or atomic loads.
     LoadInst *LI = cast<LoadInst>(Inst);
+    if (isa<AllocaInst>(User) &&
+        LI->getPointerOperandType() == User->getType() &&
+        isa<VectorType>(LI->getType()))
+      return true;
     return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
   }
   case Instruction::BitCast:
@@ -332,6 +341,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
     // since it should be canonical form, the User should be a GEP.
     // Also we could not vectorize volatile or atomic stores.
     StoreInst *SI = cast<StoreInst>(Inst);
+    if (isa<AllocaInst>(User) &&
+        SI->getPointerOperandType() == User->getType() &&
+        isa<VectorType>(SI->getValueOperand()->getType()))
+      return true;
     return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
   }
   default:
@@ -346,7 +359,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     return false;
   }
 
-  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
+  Type *AT = Alloca->getAllocatedType();
+  SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
 
   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 
@@ -393,7 +407,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     }
   }
 
-  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+  VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
+  if (!VectorTy)
+    VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
 
   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                     << *VectorTy << '\n');
@@ -403,6 +419,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
+      if (Inst->getType() == AT)
+        break;
+
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
@@ -415,9 +434,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
       break;
     }
     case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
-
       StoreInst *SI = cast<StoreInst>(Inst);
+      if (SI->getValueOperand()->getType() == AT)
+        break;
+
+      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
       Value *Ptr = SI->getPointerOperand();
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
@@ -706,6 +727,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
   if (tryPromoteAllocaToVector(&I))
     return true; // Promoted to vector.
 
+  if (DisablePromoteAllocaToLDS)
+    return false;
+
   const Function &ContainingFunction = *I.getParent()->getParent();
   CallingConv::ID CC = ContainingFunction.getCallingConv();
 
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8314b4a490ff6e6476244b43c887397a81d77d9a..05b714f924b938f630c72b9b42fd2690610bb959 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -351,7 +351,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_SHL:
     if (isSALUMapping(MI))
       return getDefaultMappingSOP(MI);
-    // Fall-through
+    LLVM_FALLTHROUGH;
 
   case AMDGPU::G_FADD:
   case AMDGPU::G_FPTOSI:
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d34834329b573422267f31d003b3725674e3d266..f1acd72b03a2086eebd069e2052466bade548a05 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -171,7 +171,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     DebuggerEmitPrologue(false),
 
     EnableHugePrivateBuffer(false),
-    EnableVGPRSpilling(false),
     EnableLoadStoreOpt(false),
     EnableUnsafeDSOffsetFolding(false),
     EnableSIScheduler(false),
@@ -199,7 +198,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     HasDPP(false),
     HasR128A16(false),
     HasDLInsts(false),
-    D16PreservesUnusedBits(false),
+    EnableSRAMECC(false),
     FlatAddressSpace(false),
     FlatInstOffsets(false),
     FlatGlobalInsts(false),
@@ -480,10 +479,6 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
     Policy.ShouldTrackLaneMasks = true;
 }
 
-bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
-  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
-}
-
 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     if (SGPRs <= 80)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index fb39dc4493cfdd0063c9437e903b3c395d6060fc..8b1cb23c6722af19e6fe9472e3aeae84d4ff6971 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -263,6 +263,7 @@ public:
     ISAVersion9_0_2,
     ISAVersion9_0_4,
     ISAVersion9_0_6,
+    ISAVersion9_0_9,
   };
 
   enum TrapHandlerAbi {
@@ -321,7 +322,6 @@ protected:
 
   // Used as options.
   bool EnableHugePrivateBuffer;
-  bool EnableVGPRSpilling;
   bool EnableLoadStoreOpt;
   bool EnableUnsafeDSOffsetFolding;
   bool EnableSIScheduler;
@@ -353,7 +353,7 @@ protected:
   bool HasDPP;
   bool HasR128A16;
   bool HasDLInsts;
-  bool D16PreservesUnusedBits;
+  bool EnableSRAMECC;
   bool FlatAddressSpace;
   bool FlatInstOffsets;
   bool FlatGlobalInsts;
@@ -515,6 +515,10 @@ public:
     return FMA;
   }
 
+  bool hasSwap() const {
+    return GFX9Insts;
+  }
+
   TrapHandlerAbi getTrapHandlerAbi() const {
     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
   }
@@ -675,8 +679,8 @@ public:
     return HasDLInsts;
   }
 
-  bool d16PreservesUnusedBits() const {
-    return D16PreservesUnusedBits;
+  bool isSRAMECCEnabled() const {
+    return EnableSRAMECC;
   }
 
   // Scratch is allocated in 256 dword per wave blocks for the entire
@@ -743,8 +747,6 @@ public:
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
 
-  bool isVGPRSpillingEnabled(const Function &F) const;
-
   unsigned getMaxNumUserSGPRs() const {
     return 16;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ef54100a9c42a7461ba252856f01336feb35a1d2..403dace533a335181d7e0509a86ff821cd1145b9 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -166,6 +166,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIOptimizeExecMaskingPreRAPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
+  initializeAMDGPUFixFunctionBitcastsPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
@@ -191,6 +192,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIFormMemoryClausesPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
+  initializeAMDGPUExternalAAWrapperPass(*PR);
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
@@ -339,13 +341,6 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     FSAttr.getValueAsString();
 }
 
-static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
-  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
-      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
-        AAR.addAAResult(WrapperPass->getResult());
-      });
-}
-
 /// Predicate for Internalize pass.
 static bool mustPreserveGV(const GlobalValue &GV) {
   if (const Function *F = dyn_cast<Function>(&GV))
@@ -611,6 +606,11 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&PatchableFunctionID);
 
   addPass(createAtomicExpandPass());
+
+  // This must occur before inlining, as the inliner will not look through
+  // bitcast calls.
+  addPass(createAMDGPUFixFunctionBitcastsPass());
+
   addPass(createAMDGPULowerIntrinsicsPass());
 
   // Function calls are not supported, so make sure we inline everything.
@@ -811,8 +811,8 @@ bool GCNPassConfig::addILPOpts() {
 
 bool GCNPassConfig::addInstSelector() {
   AMDGPUPassConfig::addInstSelector();
-  addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSILowerI1CopiesPass());
   return false;
 }
 
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 7b2dc3494ab66e8cbd1a2b53628231dbdc1cf42a..e48b73b0f1e793d95d11dc9e2b687b859f828650 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -17,8 +17,6 @@ def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [],
 def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
 def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
 def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
-def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">;
-def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">;
 
 class MubufLoad <SDPatternOperator op> : PatFrag <
   (ops node:$ptr), (op node:$ptr), [{
@@ -658,11 +656,10 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
   let AsmMatchConverter = "cvtMubufAtomicReturn";
 }
 
-multiclass MUBUF_Pseudo_Atomics <string opName,
-                                 RegisterClass vdataClass,
-                                 ValueType vdataType,
-                                 SDPatternOperator atomic> {
-
+multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
+                                        RegisterClass vdataClass,
+                                        ValueType vdataType,
+                                        SDPatternOperator atomic> {
   def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
                 MUBUFAddr64Table <0, NAME>;
   def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
@@ -670,7 +667,12 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
   def _OFFEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn,  vdataClass>;
   def _IDXEN  : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn,  vdataClass>;
   def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+}
 
+multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
+                                     RegisterClass vdataClass,
+                                     ValueType vdataType,
+                                     SDPatternOperator atomic> {
   def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
     [(set vdataType:$vdata,
      (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc),
@@ -688,6 +690,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
   def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
 }
 
+multiclass MUBUF_Pseudo_Atomics <string opName,
+                                 RegisterClass vdataClass,
+                                 ValueType vdataType,
+                                 SDPatternOperator atomic> :
+  MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType, atomic>,
+  MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+
 
 //===----------------------------------------------------------------------===//
 // MUBUF Instructions
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 5af27cd1d8cabaf7b742c8426b24217a5ab18803..3c87dc188270364f66e0986062de9f6d6f8e706f 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInstrInfo.cpp
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index cdc6ab9412e610cce0b7fa06d3dfa710ccbdc762..31d2ebef481d20430213c207eeb718e5fc1d057d 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -728,7 +728,9 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
               (i1 0))
 >;
 
-let OtherPredicates = [LDSRequiresM0Init] in {
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+let OtherPredicates = [LDSRequiresM0Init, isCIVI] in {
 def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
 def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
 }
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 43130dfcae9975ce8d4448665443192dcb3a6e9c..18e8b8a1c2d1a6e6fd29bc5cda3f4bdcb5f51f5f 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -275,7 +275,7 @@ multiclass FLAT_Atomic_Pseudo<
        AtomicNoRet <opName, 1>;
 }
 
-multiclass FLAT_Global_Atomic_Pseudo<
+multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
   string opName,
   RegisterClass vdst_rc,
   ValueType vt,
@@ -292,16 +292,6 @@ multiclass FLAT_Global_Atomic_Pseudo<
     let PseudoInstr = NAME;
   }
 
-  def _RTN : FLAT_AtomicRet_Pseudo <opName,
-    (outs vdst_rc:$vdst),
-      (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
-    " $vdst, $vaddr, $vdata, off$offset glc$slc",
-    [(set vt:$vdst,
-      (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
-      AtomicNoRet <opName, 1> {
-    let has_saddr = 1;
-  }
-
   def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
@@ -311,6 +301,25 @@ multiclass FLAT_Global_Atomic_Pseudo<
     let enabled_saddr = 1;
     let PseudoInstr = NAME#"_SADDR";
   }
+}
+
+multiclass FLAT_Global_Atomic_Pseudo_RTN<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> {
+
+  def _RTN : FLAT_AtomicRet_Pseudo <opName,
+    (outs vdst_rc:$vdst),
+      (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
+    " $vdst, $vaddr, $vdata, off$offset glc$slc",
+    [(set vt:$vdst,
+      (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+      AtomicNoRet <opName, 1> {
+    let has_saddr = 1;
+  }
 
   def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
     (outs vdst_rc:$vdst),
@@ -323,6 +332,16 @@ multiclass FLAT_Global_Atomic_Pseudo<
   }
 }
 
+multiclass FLAT_Global_Atomic_Pseudo<
+  string opName,
+  RegisterClass vdst_rc,
+  ValueType vt,
+  SDPatternOperator atomic = null_frag,
+  ValueType data_vt = vt,
+  RegisterClass data_rc = vdst_rc> :
+    FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>,
+    FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>;
+
 class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
   (ops node:$ptr, node:$value),
   (atomic_op node:$ptr, node:$value),
diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td
index d76acfa24f9010e685b873d7a6d7e2f31c893461..b8142a4e4ff888f159f450c1b864cc671132de0a 100644
--- a/lib/Target/AMDGPU/GCNProcessors.td
+++ b/lib/Target/AMDGPU/GCNProcessors.td
@@ -156,3 +156,8 @@ def : ProcessorModel<"gfx904", SIQuarterSpeedModel,
 def : ProcessorModel<"gfx906", SIQuarterSpeedModel,
   [FeatureISAVersion9_0_6]
 >;
+
+def : ProcessorModel<"gfx909", SIQuarterSpeedModel,
+  [FeatureISAVersion9_0_9]
+>;
+
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 3f6ab244c34bac335c0990370c1cc32235ac2a2a..225bf5b7816bf8ab0712f32cf8fcf4999dd9b295 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -83,6 +83,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904;  break;
   case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906;  break;
+  case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909;  break;
   case ELF::EF_AMDGPU_MACH_NONE:          AK = GK_NONE;    break;
   }
 
@@ -129,6 +130,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
   case GK_GFX902:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902;
   case GK_GFX904:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904;
   case GK_GFX906:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906;
+  case GK_GFX909:  return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909;
   case GK_NONE:    return ELF::EF_AMDGPU_MACH_NONE;
   }
 
@@ -345,6 +347,10 @@ AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
   if (AMDGPU::hasXNACK(STI))
     EFlags |= ELF::EF_AMDGPU_XNACK;
 
+  EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC;
+  if (AMDGPU::hasSRAMECC(STI))
+    EFlags |= ELF::EF_AMDGPU_SRAM_ECC;
+
   MCA.setELFHeaderEFlags(EFlags);
 }
 
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 8864aabb063f7bfdb0a2ebe2986fc0d1b7f4d8a2..e2a0f05d2b34d4d0e0fc3110e2a481285a8f8a7b 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1685,14 +1685,15 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
 static SDValue CompactSwizzlableVector(
   SelectionDAG &DAG, SDValue VectorEntry,
   DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-    VectorEntry.getOperand(0),
-    VectorEntry.getOperand(1),
-    VectorEntry.getOperand(2),
-    VectorEntry.getOperand(3)
-  };
+
+  SDLoc DL(VectorEntry);
+  EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+  SDValue NewBldVec[4];
+  for (unsigned i = 0; i < 4; i++)
+    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+                               DAG.getIntPtrConstant(i, DL));
 
   for (unsigned i = 0; i < 4; i++) {
     if (NewBldVec[i].isUndef())
@@ -1727,15 +1728,17 @@ static SDValue CompactSwizzlableVector(
 
 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
-  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
-  SDValue NewBldVec[4] = {
-      VectorEntry.getOperand(0),
-      VectorEntry.getOperand(1),
-      VectorEntry.getOperand(2),
-      VectorEntry.getOperand(3)
-  };
-  bool isUnmovable[4] = { false, false, false, false };
+
+  SDLoc DL(VectorEntry);
+  EVT EltTy = VectorEntry.getValueType().getVectorElementType();
+
+  SDValue NewBldVec[4];
+  bool isUnmovable[4] = {false, false, false, false};
+  for (unsigned i = 0; i < 4; i++)
+    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
+                               DAG.getIntPtrConstant(i, DL));
+
   for (unsigned i = 0; i < 4; i++) {
     RemapSwizzle[i] = i;
     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
@@ -1766,7 +1769,6 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                             SelectionDAG &DAG,
                                             const SDLoc &DL) const {
-  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
   // Old -> New swizzle values
   DenseMap<unsigned, unsigned> SwizzleRemap;
 
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 478a473a51b61d80ea9f2fcaff5c08a96c3dfb5a..7769a35aadcee6ad114ef25c4881a84dc1b5cc0e 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -236,6 +236,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
       // MI will become a KILL, don't considers it in scheduling
       return AluDiscarded;
     }
+    break;
   default:
     break;
   }
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index c52313f84ef0675af7212af5a84535678ebe6cae..90f430d5ca4c6c16bb946951db76dee704c68025 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -66,9 +66,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Function *If;
   Function *Else;
-  Function *Break;
   Function *IfBreak;
-  Function *ElseBreak;
   Function *Loop;
   Function *EndCf;
 
@@ -95,8 +93,7 @@ class SIAnnotateControlFlow : public FunctionPass {
 
   Value *
   handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
-                      BranchInst *Term,
-                      SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);
+                      BranchInst *Term);
 
   void handleLoop(BranchInst *Term);
 
@@ -149,9 +146,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
 
   If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
   Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
-  Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
   IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
-  ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
   Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
   EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
   return false;
@@ -227,76 +222,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
 
 /// Recursively handle the condition leading to a loop
 Value *SIAnnotateControlFlow::handleLoopCondition(
-    Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
-    SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
-  // Only search through PHI nodes which are inside the loop.  If we try this
-  // with PHI nodes that are outside of the loop, we end up inserting new PHI
-  // nodes outside of the loop which depend on values defined inside the loop.
-  // This will break the module with
-  // 'Instruction does not dominate all users!' errors.
-  PHINode *Phi = nullptr;
-  if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
-    BasicBlock *Parent = Phi->getParent();
-    PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
-    Value *Ret = NewPhi;
-
-    // Handle all non-constant incoming values first
-    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-      Value *Incoming = Phi->getIncomingValue(i);
-      BasicBlock *From = Phi->getIncomingBlock(i);
-      if (isa<ConstantInt>(Incoming)) {
-        NewPhi->addIncoming(Broken, From);
-        continue;
-      }
-
-      Phi->setIncomingValue(i, BoolFalse);
-      Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
-                                          Term, LoopPhiConditions);
-      NewPhi->addIncoming(PhiArg, From);
-    }
-
-    BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
-
-    for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-      Value *Incoming = Phi->getIncomingValue(i);
-      if (Incoming != BoolTrue)
-        continue;
-
-      BasicBlock *From = Phi->getIncomingBlock(i);
-      if (From == IDom) {
-        // We're in the following situation:
-        //   IDom/From
-        //      |   \
-        //      |   If-block
-        //      |   /
-        //     Parent
-        // where we want to break out of the loop if the If-block is not taken.
-        // Due to the depth-first traversal, there should be an end.cf
-        // intrinsic in Parent, and we insert an else.break before it.
-        //
-        // Note that the end.cf need not be the first non-phi instruction
-        // of parent, particularly when we're dealing with a multi-level
-        // break, but it should occur within a group of intrinsic calls
-        // at the beginning of the block.
-        CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt());
-        while (OldEnd && OldEnd->getCalledFunction() != EndCf)
-          OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode());
-        if (OldEnd && OldEnd->getCalledFunction() == EndCf) {
-          Value *Args[] = { OldEnd->getArgOperand(0), NewPhi };
-          Ret = CallInst::Create(ElseBreak, Args, "", OldEnd);
-          continue;
-        }
-      }
-
-      TerminatorInst *Insert = From->getTerminator();
-      Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
-      NewPhi->setIncomingValue(i, PhiArg);
-    }
-
-    LoopPhiConditions.push_back(WeakTrackingVH(Phi));
-    return Ret;
-  }
-
+    Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) {
   if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
     BasicBlock *Parent = Inst->getParent();
     Instruction *Insert;
@@ -335,21 +261,15 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
   BasicBlock *Target = Term->getSuccessor(1);
   PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
 
-  SmallVector<WeakTrackingVH, 8> LoopPhiConditions;
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
-  Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
+  Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
 
   for (BasicBlock *Pred : predecessors(Target))
     Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
 
   Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
 
-  for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) {
-    if (PHINode *Cond = cast_or_null<PHINode>(Val))
-      eraseIfUnused(Cond);
-  }
-
   push(Term->getSuccessor(0), Arg);
 }
 
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index ed52278e441b766250120e825e1af7d1cb4d0ff6..809f5bab46932c9e2683fa77f7c2ecbf9a3aeda4 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy,
 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);
+  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+         TRI.hasVGPRs(SrcRC);
 }
 
 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
+  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
+         TRI.hasVGPRs(DstRC);
 }
 
 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
@@ -327,9 +329,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
     switch (DefInstr->getOpcode()) {
     default:
       break;
-    case AMDGPU::SI_BREAK:
     case AMDGPU::SI_IF_BREAK:
-    case AMDGPU::SI_ELSE_BREAK:
       return true;
     case AMDGPU::PHI:
       if (phiHasBreakDef(*DefInstr, MRI, Visited))
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 062232873961abe3189c80e14a242d4fd64a5937..254f1362f1f0f3cabed56208b33b3243ddd6e5c7 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__MINGW32__)
 // Provide M_PI.
 #define _USE_MATH_DEFINES
 #endif
@@ -384,8 +384,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (Subtarget->hasBFE())
     setHasExtractBitsInsn(true);
 
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
+  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
+  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
+
+
+  // These are really only legal for ieee_mode functions. We should be avoiding
+  // them for functions that don't have ieee_mode enabled, so just say they are
+  // legal.
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+
 
   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
@@ -474,8 +486,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // F16 - VOP2 Actions.
     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
-    setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+
     setOperationAction(ISD::FDIV, MVT::f16, Custom);
 
     // F16 - VOP3 Actions.
@@ -558,6 +569,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // This isn't really legal, but this avoids the legalizer unrolling it (and
     // allows matching fneg (fabs x) patterns)
     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
   }
 
   if (Subtarget->hasVOP3PInsts()) {
@@ -575,8 +597,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
+
     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
 
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
@@ -596,6 +620,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
+
     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
     setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
@@ -634,6 +662,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
   setTargetDAGCombine(ISD::FMAXNUM);
+  setTargetDAGCombine(ISD::FMINNUM_IEEE);
+  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
   setTargetDAGCombine(ISD::FMA);
   setTargetDAGCombine(ISD::SMIN);
   setTargetDAGCombine(ISD::SMAX);
@@ -649,7 +679,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
-  setTargetDAGCombine(ISD::BUILD_VECTOR);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -1156,7 +1185,7 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-SITargetLowering::getPreferredVectorAction(EVT VT) const {
+SITargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
     return TypeSplitVector;
 
@@ -3580,6 +3609,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FNEG:
   case ISD::FCANONICALIZE:
     return splitUnaryVectorOp(Op, DAG);
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+    return lowerFMINNUM_FMAXNUM(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
@@ -3590,10 +3622,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
   case ISD::FADD:
   case ISD::FMUL:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
   }
   return SDValue();
@@ -4048,6 +4080,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
 }
 
+SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+  // FIXME: Assert during eslection that this is only selected for
+  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
+  // mode functions, but this happens to be OK since it's only done in cases
+  // where there is known no sNaN.
+  if (IsIEEEMode)
+    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
+
+  if (VT == MVT::v4f16)
+    return splitBinaryVectorOp(Op, DAG);
+  return Op;
+}
+
 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
@@ -4677,9 +4726,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
   // Check for 16 bit addresses and pack if true.
   unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
   MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
-  if (VAddrVT.getScalarType() == MVT::f16 &&
+  const MVT VAddrScalarVT = VAddrVT.getScalarType();
+  if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
       ST->hasFeature(AMDGPU::FeatureR128A16)) {
     IsA16 = true;
+    const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
     for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
       SDValue AddrLo, AddrHi;
       // Push back extra arguments.
@@ -4698,7 +4749,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
           AddrHi = Op.getOperand(i + 1);
           i++;
         }
-        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
+        AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
                              {AddrLo, AddrHi});
         AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
       }
@@ -4934,12 +4985,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::r600_read_tgid_z:
     return getPreloadedValue(DAG, *MFI, VT,
                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
-  case Intrinsic::amdgcn_workitem_id_x: {
+  case Intrinsic::amdgcn_workitem_id_x:
   case Intrinsic::r600_read_tidig_x:
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                           SDLoc(DAG.getEntryNode()),
                           MFI->getArgInfo().WorkItemIDX);
-  }
   case Intrinsic::amdgcn_workitem_id_y:
   case Intrinsic::r600_read_tidig_y:
     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
@@ -6251,6 +6301,17 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
     if (NumElements > 2)
       return SplitVectorLoad(Op, DAG);
+
+    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // address is negative, then the instruction is incorrectly treated as
+    // out-of-bounds even if base + offsets is in bounds. Split vectorized
+    // loads here to avoid emitting ds_read2_b32. We may re-combine the
+    // load later in the SILoadStoreOptimizer.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+        NumElements == 2 && MemVT.getStoreSize() == 8 &&
+        Load->getAlignment() < 8) {
+      return SplitVectorLoad(Op, DAG);
+    }
   }
   return SDValue();
 }
@@ -6653,6 +6714,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
     if (NumElements > 2)
       return SplitVectorStore(Op, DAG);
+
+    // SI has a hardware bug in the LDS / GDS boounds checking: if the base
+    // address is negative, then the instruction is incorrectly treated as
+    // out-of-bounds even if base + offsets is in bounds. Split vectorized
+    // stores here to avoid emitting ds_write2_b32. We may re-combine the
+    // store later in the SILoadStoreOptimizer.
+    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+        NumElements == 2 && VT.getStoreSize() == 8 &&
+        Store->getAlignment() < 8) {
+      return SplitVectorStore(Op, DAG);
+    }
+
     return SDValue();
   } else {
     llvm_unreachable("unhandled address space");
@@ -7457,37 +7530,32 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
+  case ISD::FMINNUM_IEEE:
+  case ISD::FMAXNUM_IEEE:
   case AMDGPUISD::CLAMP:
   case AMDGPUISD::FMED3:
   case AMDGPUISD::FMAX3:
   case AMDGPUISD::FMIN3: {
     // FIXME: Shouldn't treat the generic operations different based these.
-    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
-    if (IsIEEEMode) {
-      // snans will be quieted, so we only need to worry about denormals.
-      if (Subtarget->supportsMinMaxDenormModes() ||
-          denormalsEnabledForType(Op.getValueType()))
-        return true;
-
-      // Flushing may be required.
-      // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
-      // targets need to check their input recursively.
-      return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-             isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
-    }
+    // However, we aren't really required to flush the result from
+    // minnum/maxnum..
 
+    // snans will be quieted, so we only need to worry about denormals.
     if (Subtarget->supportsMinMaxDenormModes() ||
-        denormalsEnabledForType(Op.getValueType())) {
-      // Only quieting may be necessary.
-      return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
-             DAG.isKnownNeverSNaN(Op.getOperand(1));
+        denormalsEnabledForType(Op.getValueType()))
+      return true;
+
+    // Flushing may be required.
+    // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
+    // targets need to check their input recursively.
+
+    // FIXME: Does this apply with clamp? It's implemented with max.
+    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
+        return false;
     }
 
-    // Flushing and quieting may be necessary
-    // With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
-    // needs to be quieted.
-    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
-           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
+    return true;
   }
   case ISD::SELECT: {
     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
@@ -7514,6 +7582,21 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     // Could be anything.
     return false;
 
+  case ISD::BITCAST: {
+    // Hack round the mess we make when legalizing extract_vector_elt
+    SDValue Src = Op.getOperand(0);
+    if (Src.getValueType() == MVT::i16 &&
+        Src.getOpcode() == ISD::TRUNCATE) {
+      SDValue TruncSrc = Src.getOperand(0);
+      if (TruncSrc.getValueType() == MVT::i32 &&
+          TruncSrc.getOpcode() == ISD::BITCAST &&
+          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
+        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
+      }
+    }
+
+    return false;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IntrinsicID
       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -7539,7 +7622,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 }
 
 // Constant fold canonicalize.
-
 SDValue SITargetLowering::getCanonicalConstantFP(
   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
   // Flush denormals to 0 if not enabled.
@@ -7635,18 +7717,40 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
     }
   }
 
+  unsigned SrcOpc = N0.getOpcode();
+
+  // If it's free to do so, push canonicalizes further up the source, which may
+  // find a canonical source.
+  //
+  // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for
+  // sNaNs.
+  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
+    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+    if (CRHS && N0.hasOneUse()) {
+      SDLoc SL(N);
+      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
+                                   N0.getOperand(0));
+      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
+      DCI.AddToWorklist(Canon0.getNode());
+
+      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
+    }
+  }
+
   return isCanonicalized(DAG, N0) ? N0 : SDValue();
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   switch (Opc) {
   case ISD::FMAXNUM:
+  case ISD::FMAXNUM_IEEE:
     return AMDGPUISD::FMAX3;
   case ISD::SMAX:
     return AMDGPUISD::SMAX3;
   case ISD::UMAX:
     return AMDGPUISD::UMAX3;
   case ISD::FMINNUM:
+  case ISD::FMINNUM_IEEE:
     return AMDGPUISD::FMIN3;
   case ISD::SMIN:
     return AMDGPUISD::SMIN3;
@@ -7813,6 +7917,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
 
   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
       (VT == MVT::f32 || VT == MVT::f64 ||
@@ -7931,7 +8036,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
     case ISD::SMIN:
     case ISD::SMAX:
     case ISD::FMAXNUM:
-    case ISD::FMINNUM: {
+    case ISD::FMINNUM:
+    case ISD::FMAXNUM_IEEE:
+    case ISD::FMINNUM_IEEE: {
       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                                  Vec.getOperand(0), Idx);
       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
@@ -7985,48 +8092,6 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
   return SDValue();
 }
 
-static bool convertBuildVectorCastElt(SelectionDAG &DAG,
-                                      SDValue &Lo, SDValue &Hi) {
-  if (Hi.getOpcode() == ISD::BITCAST &&
-      Hi.getOperand(0).getValueType() == MVT::f16 &&
-      (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
-    Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
-    Hi = Hi.getOperand(0);
-    return true;
-  }
-
-  return false;
-}
-
-SDValue SITargetLowering::performBuildVectorCombine(
-  SDNode *N, DAGCombinerInfo &DCI) const {
-  SDLoc SL(N);
-
-  if (!isTypeLegal(MVT::v2i16))
-    return SDValue();
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-
-  if (VT == MVT::v2i16) {
-    SDValue Lo = N->getOperand(0);
-    SDValue Hi = N->getOperand(1);
-
-    // v2i16 build_vector (const|undef), (bitcast f16:$x)
-    // -> bitcast (v2f16 build_vector const|undef, $x
-    if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
-      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
-      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
-    }
-
-    if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
-      SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
-      return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
-    }
-  }
-
-  return SDValue();
-}
-
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0,
                                           const SDNode *N1) const {
@@ -8531,13 +8596,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performSetCCCombine(N, DCI);
   case ISD::FMAXNUM:
   case ISD::FMINNUM:
+  case ISD::FMAXNUM_IEEE:
+  case ISD::FMINNUM_IEEE:
   case ISD::SMAX:
   case ISD::SMIN:
   case ISD::UMAX:
   case ISD::UMIN:
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY: {
-    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
+    if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMinMaxCombine(N, DCI);
     break;
@@ -8633,8 +8700,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
-  case ISD::BUILD_VECTOR:
-    return performBuildVectorCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -8693,7 +8758,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 
     // Set which texture component corresponds to the lane.
     unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+    for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
       Comp = countTrailingZeros(Dmask);
       Dmask &= ~(1 << Comp);
     }
@@ -9256,3 +9321,17 @@ bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
     return false;
   }
 }
+
+bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
+                                                    const SelectionDAG &DAG,
+                                                    bool SNaN,
+                                                    unsigned Depth) const {
+  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
+    if (Subtarget->enableDX10Clamp())
+      return true; // Clamped to 0.
+    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
+  }
+
+  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
+                                                            SNaN, Depth);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 6c02688483b4a5b7c8f3057ab2a7671e82585bbd..73fa05ea58f51e063b40ba419ad823ad7345d7ef 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -108,6 +108,7 @@ private:
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
                              SelectionDAG &DAG) const;
@@ -151,7 +152,6 @@ private:
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
@@ -232,7 +232,7 @@ public:
   bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                         Type *Ty) const override;
@@ -344,6 +344,11 @@ public:
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
   bool denormalsEnabledForType(EVT VT) const;
+
+  bool isKnownNeverNaNForTargetNode(SDValue Op,
+                                    const SelectionDAG &DAG,
+                                    bool SNaN = false,
+                                    unsigned Depth = 0) const override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 819b1b9fcd75c871c7e0be623848158076fe8365..eb39984f795910e38acf036c3856bd5c968d7456 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -382,8 +382,6 @@ private:
 
   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
 
-  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
-
   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
   bool ForceEmitZeroWaitcnts;
@@ -410,13 +408,6 @@ public:
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
-    // The waitcnt information is copied because it changes as the block is
-    // traversed.
-    KillWaitBrackets.push_back(
-        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
-  }
-
   bool isForceEmitWaitcnt() const {
     for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
          T = (enum InstCounterType)(T + 1))
@@ -889,24 +880,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   // Start with an assumption that there is no need to emit.
   unsigned int EmitWaitcnt = 0;
 
-  // No need to wait before phi. If a phi-move exists, then the wait should
-  // has been inserted before the move. If a phi-move does not exist, then
-  // wait should be inserted before the real use. The same is true for
-  // sc-merge. It is not a coincident that all these cases correspond to the
-  // instructions that are skipped in the assembling loop.
-  bool NeedLineMapping = false; // TODO: Check on this.
-
   // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
   bool ForceEmitZeroWaitcnt = false;
 
   setForceEmitWaitcnt();
   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
-  if (MI.isDebugInstr() &&
-      // TODO: any other opcode?
-      !NeedLineMapping) {
+  if (MI.isDebugInstr())
     return;
-  }
 
   // See if an s_waitcnt is forced at block entry, or is needed at
   // program end.
@@ -1150,7 +1131,6 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
   if (EmitWaitcnt || IsForceEmitWaitcnt) {
     int CntVal[NUM_INST_CNTS];
 
-    bool UseDefaultWaitcntStrategy = true;
     if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
       // Force all waitcnts to 0.
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1160,10 +1140,7 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
       CntVal[VM_CNT] = 0;
       CntVal[EXP_CNT] = 0;
       CntVal[LGKM_CNT] = 0;
-      UseDefaultWaitcntStrategy = false;
-    }
-
-    if (UseDefaultWaitcntStrategy) {
+    } else {
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
         if (EmitWaitcnt & CNT_MASK(T)) {
@@ -1187,95 +1164,89 @@ void SIInsertWaitcnts::generateWaitcntInstBefore(
       }
     }
 
-    // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
-      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
-      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-      if (!OldWaitcnt ||
-          (AMDGPU::decodeVmcnt(IV, Imm) !=
-                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
-          (AMDGPU::decodeExpcnt(IV, Imm) !=
-           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
-          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
-           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
-        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
-        if (ContainingLoop) {
-          MachineBasicBlock *TBB = ContainingLoop->getHeader();
-          BlockWaitcntBrackets *ScoreBracket =
-              BlockWaitcntBracketsMap[TBB].get();
-          if (!ScoreBracket) {
-            assert(!BlockVisitedSet.count(TBB));
-            BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>(ST);
-            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
-          }
-          ScoreBracket->setRevisitLoop(true);
-          LLVM_DEBUG(dbgs()
-                         << "set-revisit2: Block"
-                         << ContainingLoop->getHeader()->getNumber() << '\n';);
+    MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+    int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+    if (!OldWaitcnt ||
+        (AMDGPU::decodeVmcnt(IV, Imm) !=
+         (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
+        (AMDGPU::decodeExpcnt(IV, Imm) !=
+         (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
+        (AMDGPU::decodeLgkmcnt(IV, Imm) !=
+         (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
+      MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+      if (ContainingLoop) {
+        MachineBasicBlock *TBB = ContainingLoop->getHeader();
+        BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+        if (!ScoreBracket) {
+          assert(!BlockVisitedSet.count(TBB));
+          BlockWaitcntBracketsMap[TBB] =
+              llvm::make_unique<BlockWaitcntBrackets>(ST);
+          ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
         }
+        ScoreBracket->setRevisitLoop(true);
+        LLVM_DEBUG(dbgs() << "set-revisit2: Block"
+                          << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
+    }
 
-      // Update an existing waitcount, or make a new one.
-      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+    // Update an existing waitcount, or make a new one.
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                       ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                       ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                       ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
-      // We don't remove waitcnts that existed prior to the waitcnt
-      // pass. Check if the waitcnt to-be-inserted can be avoided
-      // or if the prev waitcnt can be updated.
-      bool insertSWaitInst = true;
-      for (MachineBasicBlock::iterator I = MI.getIterator(),
-                                       B = MI.getParent()->begin();
-           insertSWaitInst && I != B; --I) {
-        if (I == MI.getIterator())
-          continue;
+    // We don't remove waitcnts that existed prior to the waitcnt
+    // pass. Check if the waitcnt to-be-inserted can be avoided
+    // or if the prev waitcnt can be updated.
+    bool insertSWaitInst = true;
+    for (MachineBasicBlock::iterator I = MI.getIterator(),
+                                     B = MI.getParent()->begin();
+         insertSWaitInst && I != B; --I) {
+      if (I == MI.getIterator())
+        continue;
 
-        switch (I->getOpcode()) {
-        case AMDGPU::S_WAITCNT:
-          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
-            insertSWaitInst = false;
-          else if (!OldWaitcnt) {
-            OldWaitcnt = &*I;
-            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
-          }
-          break;
-        // TODO: skip over instructions which never require wait.
+      switch (I->getOpcode()) {
+      case AMDGPU::S_WAITCNT:
+        if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+          insertSWaitInst = false;
+        else if (!OldWaitcnt) {
+          OldWaitcnt = &*I;
+          Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
         }
         break;
+        // TODO: skip over instructions which never require wait.
       }
-      if (insertSWaitInst) {
-        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
-          if (ForceEmitZeroWaitcnts)
-            LLVM_DEBUG(
-                dbgs()
-                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
-          if (IsForceEmitWaitcnt)
-            LLVM_DEBUG(dbgs()
-                       << "Force emit a s_waitcnt due to debug counter\n");
-
-          OldWaitcnt->getOperand(0).setImm(Enc);
-          if (!OldWaitcnt->getParent())
-            MI.getParent()->insert(MI, OldWaitcnt);
-
-          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
-                            << "Old Instr: " << MI << '\n'
-                            << "New Instr: " << *OldWaitcnt << '\n');
-        } else {
-            auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                               MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+      break;
+    }
+    if (insertSWaitInst) {
+      if (OldWaitcnt) {
+        assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT);
+        if (ForceEmitZeroWaitcnts)
+          LLVM_DEBUG(dbgs()
+                     << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+        if (IsForceEmitWaitcnt)
+          LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
+
+        OldWaitcnt->getOperand(0).setImm(Enc);
+        if (!OldWaitcnt->getParent())
+          MI.getParent()->insert(MI, OldWaitcnt);
+
+        LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
+                          << "Old Instr: " << MI << '\n'
+                          << "New Instr: " << *OldWaitcnt << '\n');
+      } else {
+        auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+                                 MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                              .addImm(Enc);
-            TrackedWaitcntSet.insert(SWaitInst);
+        TrackedWaitcntSet.insert(SWaitInst);
 
-            LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                              << "Old Instr: " << MI << '\n'
-                              << "New Instr: " << *SWaitInst << '\n');
-        }
+        LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                          << "Old Instr: " << MI << '\n'
+                          << "New Instr: " << *SWaitInst << '\n');
       }
+    }
 
-      if (CntVal[EXP_CNT] == 0) {
-        ScoreBrackets->setMixedExpTypes(false);
-      }
+    if (CntVal[EXP_CNT] == 0) {
+      ScoreBrackets->setMixedExpTypes(false);
     }
   }
 }
@@ -1425,24 +1396,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
     MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
   }
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  // Also handle kills for exit block.
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        int Span = KillWaitBrackets[I]->getScoreUB(T) -
-                   KillWaitBrackets[I]->getScoreLB(T);
-        MaxPending[T] = std::max(MaxPending[T], Span);
-        Span = KillWaitBrackets[I]->pendingFlat(T) -
-               KillWaitBrackets[I]->getScoreLB(T);
-        MaxFlat[T] = std::max(MaxFlat[T], Span);
-      }
-
-      MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
-    }
-  }
-
   // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
@@ -1460,18 +1413,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
     MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
   }
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
-                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
-      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
-      int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
-                    KillWaitBrackets[I]->getScoreLB(EXP_CNT);
-      MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
-    }
-  }
-
 #if 0
   // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
   // TODO: how does LC distinguish between function entry and main entry?
@@ -1551,60 +1492,6 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
     }
   }
 
-  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
-  // Set the register scoreboard.
-  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
-    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
-      // Now merge the gpr_reg_score information.
-      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
-           T = (enum InstCounterType)(T + 1)) {
-        int PredLB = KillWaitBrackets[I]->getScoreLB(T);
-        int PredUB = KillWaitBrackets[I]->getScoreUB(T);
-        if (PredLB < PredUB) {
-          int PredScale = MaxPending[T] - PredUB;
-          // Merge vgpr scores.
-          for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
-            int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
-            if (PredRegScore <= PredLB)
-              continue;
-            int NewRegScore = PredScale + PredRegScore;
-            ScoreBrackets->setRegScore(
-                J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
-          }
-          // Also need to merge sgpr scores for lgkm_cnt.
-          if (T == LGKM_CNT) {
-            for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
-              int PredRegScore =
-                  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
-              if (PredRegScore <= PredLB)
-                continue;
-              int NewRegScore = PredScale + PredRegScore;
-              ScoreBrackets->setRegScore(
-                  J + NUM_ALL_VGPRS, LGKM_CNT,
-                  std::max(
-                      ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
-                      NewRegScore));
-            }
-          }
-        }
-      }
-
-      // Also merge the WaitEvent information.
-      ForAllWaitEventType(W) {
-        enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
-        int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
-        if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
-          int NewEventUB =
-              MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
-          if (NewEventUB > 0) {
-            ScoreBrackets->setEventUB(
-                W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
-          }
-        }
-      }
-    }
-  }
-
   // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
   // sequencing predecessors, because changes to EXEC require waitcnts due to
   // the delayed nature of these operations.
@@ -1701,13 +1588,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
       continue;
     }
 
-    // Kill instructions generate a conditional branch to the endmain block.
-    // Merge the current waitcnt state into the endmain block information.
-    // TODO: Are there other flavors of KILL instruction?
-    if (Inst.getOpcode() == AMDGPU::KILL) {
-      addKillWaitBracket(ScoreBrackets);
-    }
-
     bool VCCZBugWorkAround = false;
     if (readsVCCZ(Inst) &&
         (!VCCZBugHandledSet.count(&Inst))) {
@@ -1871,7 +1751,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   LoopWaitcntDataMap.clear();
   BlockWaitcntProcessedSet.clear();
 
-  // Walk over the blocks in reverse post-dominator order, inserting
+  // Walk over the blocks in reverse post order, inserting
   // s_waitcnt where needed.
   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
   bool Modified = false;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 61a0030aea2163a65bb768f1276ddf0d92005f8b..562428ef37c011a2badf1df50d485186b0ed46e8 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -908,16 +908,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
-    LLVMContext &Ctx = MF->getFunction().getContext();
-    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
-                  " spill register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
-      .addReg(SrcReg);
-
-    return;
-  }
-
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
@@ -1010,15 +1000,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MF->getFunction())) {
-    LLVMContext &Ctx = MF->getFunction().getContext();
-    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
-                  " restore register");
-    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
-
-    return;
-  }
-
   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
 
   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
@@ -1555,8 +1536,9 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
   //   buzz;
 
   RS->enterBasicBlockEnd(MBB);
-  unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
-                                       MachineBasicBlock::iterator(GetPC), 0);
+  unsigned Scav = RS->scavengeRegisterBackwards(
+    AMDGPU::SReg_64RegClass,
+    MachineBasicBlock::iterator(GetPC), false, 0);
   MRI.replaceRegWith(PCReg, Scav);
   MRI.clearVirtRegs();
   RS->setRegUsed(Scav);
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index c6043ea1c243b485b784d5b66d48b2f202c1ce8f..9714203d3d7053d7d0b3aa838d02e1e1cef47c13 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -264,14 +264,6 @@ def SI_END_CF : CFPseudoInstSI <
   let mayStore = 1;
 }
 
-def SI_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src),
-  [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
-  let Size = 4;
-  let isAsCheapAsAMove = 1;
-  let isReMaterializable = 1;
-}
-
 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
@@ -280,14 +272,6 @@ def SI_IF_BREAK : CFPseudoInstSI <
   let isReMaterializable = 1;
 }
 
-def SI_ELSE_BREAK : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
-  [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
-  let Size = 4;
-  let isAsCheapAsAMove = 1;
-  let isReMaterializable = 1;
-}
-
 let Uses = [EXEC] in {
 
 multiclass PseudoInstKill <dag ins> {
@@ -1327,11 +1311,21 @@ def : GCNPat <
   (S_XOR_B64 $src0, $src1)
 >;
 
+def : GCNPat <
+  (i1 (sub i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
 let AddedComplexity = 1 in {
 def : GCNPat <
   (i1 (add i1:$src0, (i1 -1))),
   (S_NOT_B64 $src0)
 >;
+
+def : GCNPat <
+  (i1 (sub i1:$src0, (i1 -1))),
+  (S_NOT_B64 $src0)
+>;
 }
 
 def : GCNPat <
@@ -1635,10 +1629,11 @@ def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
 // This matches 16 permutations of
 // max(min(x, y), min(max(x, y), z))
 class FPMed3Pat<ValueType vt,
+                //SDPatternOperator max, SDPatternOperator min,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
                                            (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
@@ -1646,10 +1641,10 @@ class FPMed3Pat<ValueType vt,
 
 class FP16Med3Pat<ValueType vt,
                 Instruction med3Inst> : GCNPat<
-  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
-           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
-                                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+  (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+           (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                                     (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
                            (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
   (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
 >;
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index ad30317c344c33036f7551e7ffdce09d205a0c29..1aa1feebbdae654304bc2535313822c098e24e85 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -85,9 +85,7 @@ private:
 
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
-  void emitBreak(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
-  void emitElseBreak(MachineInstr &MI);
   void emitLoop(MachineInstr &MI);
   void emitEndCf(MachineInstr &MI);
 
@@ -329,20 +327,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
 }
 
-void SILowerControlFlow::emitBreak(MachineInstr &MI) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  const DebugLoc &DL = MI.getDebugLoc();
-  unsigned Dst = MI.getOperand(0).getReg();
-
-  MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
-                         .addReg(AMDGPU::EXEC)
-                         .add(MI.getOperand(1));
-
-  if (LIS)
-    LIS->ReplaceMachineInstrInMaps(MI, *Or);
-  MI.eraseFromParent();
-}
-
 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -384,11 +368,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
-  // Lowered in the same way as emitIfBreak above.
-  emitIfBreak(MI);
-}
-
 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -515,18 +494,10 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
         emitElse(MI);
         break;
 
-      case AMDGPU::SI_BREAK:
-        emitBreak(MI);
-        break;
-
       case AMDGPU::SI_IF_BREAK:
         emitIfBreak(MI);
         break;
 
-      case AMDGPU::SI_ELSE_BREAK:
-        emitElseBreak(MI);
-        break;
-
       case AMDGPU::SI_LOOP:
         emitLoop(MI);
         break;
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index ecc6cff407e18f7d2be0bdf74ea6c6016e21a8dc..eb038bb5d5fcf5561439bf19ddec9dd65f58a89d 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -5,37 +5,61 @@
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-/// i1 values are usually inserted by the CFG Structurize pass and they are
-/// unique in that they can be copied from VALU to SALU registers.
-/// This is not possible for any other value type.  Since there are no
-/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
-///
 //===----------------------------------------------------------------------===//
 //
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
+// and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "si-i1-copies"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPULaneDominator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 
+#define DEBUG_TYPE "si-i1-copies"
+
 using namespace llvm;
 
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
 namespace {
 
 class SILowerI1Copies : public MachineFunctionPass {
 public:
   static char ID;
 
+private:
+  MachineFunction *MF = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+
+  DenseSet<unsigned> ConstrainRegs;
+
 public:
   SILowerI1Copies() : MachineFunctionPass(ID) {
     initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
@@ -47,14 +71,337 @@ public:
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  void lowerCopiesFromI1();
+  void lowerPhis();
+  void lowerCopiesToI1();
+  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+  MachineBasicBlock::iterator
+  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+  bool isLaneMaskReg(unsigned Reg) const {
+    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+               ST->getWavefrontSize();
+  }
+};
+
+/// Helper class that determines the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+///  - Determine all basic blocks which, starting from the incoming blocks,
+///    a wave may reach before entering the def block (the block containing the
+///    phi).
+///  - If an incoming block has no predecessors in this set, we can take the
+///    incoming value as a scalar lane mask as-is.
+///  -- A special case of this is when the def block has a self-loop.
+///  - Otherwise, the incoming value needs to be merged with a previously
+///    defined lane mask.
+///  - If there is a path into the set of reachable blocks that does _not_ go
+///    through an incoming block where we can take the scalar lane mask as-is,
+///    we need to invent an available value for the SSAUpdater. Choices are
+///    0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class PhiIncomingAnalysis {
+  MachinePostDominatorTree &PDT;
+
+  // For each reachable basic block, whether it is a source in the induced
+  // subgraph of the CFG.
+  DenseMap<MachineBasicBlock *, bool> ReachableMap;
+  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}
+
+  /// Returns whether \p MBB is a source in the induced subgraph of reachable
+  /// blocks.
+  bool isSource(MachineBasicBlock &MBB) const {
+    return ReachableMap.find(&MBB)->second;
+  }
+
+  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+  void analyze(MachineBasicBlock &DefBlock,
+               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+    assert(Stack.empty());
+    ReachableMap.clear();
+    ReachableOrdered.clear();
+    Predecessors.clear();
+
+    // Insert the def block first, so that it acts as an end point for the
+    // traversal.
+    ReachableMap.try_emplace(&DefBlock, false);
+    ReachableOrdered.push_back(&DefBlock);
+
+    for (MachineBasicBlock *MBB : IncomingBlocks) {
+      if (MBB == &DefBlock) {
+        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+        continue;
+      }
+
+      ReachableMap.try_emplace(MBB, false);
+      ReachableOrdered.push_back(MBB);
+
+      // If this block has a divergent terminator and the def block is its
+      // post-dominator, the wave may first visit the other successors.
+      bool Divergent = false;
+      for (MachineInstr &MI : MBB->terminators()) {
+        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+            MI.getOpcode() == AMDGPU::SI_IF ||
+            MI.getOpcode() == AMDGPU::SI_ELSE ||
+            MI.getOpcode() == AMDGPU::SI_LOOP) {
+          Divergent = true;
+          break;
+        }
+      }
+
+      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+        for (MachineBasicBlock *Succ : MBB->successors())
+          Stack.push_back(Succ);
+      }
+    }
+
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!ReachableMap.try_emplace(MBB, false).second)
+        continue;
+      ReachableOrdered.push_back(MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors())
+        Stack.push_back(Succ);
+    }
+
+    for (MachineBasicBlock *MBB : ReachableOrdered) {
+      bool HaveReachablePred = false;
+      for (MachineBasicBlock *Pred : MBB->predecessors()) {
+        if (ReachableMap.count(Pred)) {
+          HaveReachablePred = true;
+        } else {
+          Stack.push_back(Pred);
+        }
+      }
+      if (!HaveReachablePred)
+        ReachableMap[MBB] = true;
+      if (HaveReachablePred) {
+        for (MachineBasicBlock *UnreachablePred : Stack) {
+          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+            Predecessors.push_back(UnreachablePred);
+        }
+      }
+      Stack.clear();
+    }
+  }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+///  A-+-+
+///  | | |
+///  B-+ |
+///  |   |
+///  C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+///       relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class LoopFinder {
+  MachineDominatorTree &DT;
+  MachinePostDominatorTree &PDT;
+
+  // All visited / reachable block, tagged by level (level 0 is the def block,
+  // level 1 are all blocks reachable including but not going through the def
+  // block's IPDOM, etc.).
+  DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+  // Nearest common dominator of all visited blocks by level (level 0 is the
+  // def block). Used for seeding the SSAUpdater.
+  SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+  // Post-dominator of all visited blocks.
+  MachineBasicBlock *VisitedPostDom = nullptr;
+
+  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+  // reachable without going through the IPDOM of the def block (if the IPDOM
+  // itself has an edge to the def block, the loop level is 2), etc.
+  unsigned FoundLoopLevel = ~0u;
+
+  MachineBasicBlock *DefBlock = nullptr;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+      : DT(DT), PDT(PDT) {}
+
+  void initialize(MachineBasicBlock &MBB) {
+    Visited.clear();
+    CommonDominators.clear();
+    Stack.clear();
+    NextLevel.clear();
+    VisitedPostDom = nullptr;
+    FoundLoopLevel = ~0u;
+
+    DefBlock = &MBB;
+  }
+
+  /// Check whether a backward edge can be reached without going through the
+  /// given \p PostDom of the def block.
+  ///
+  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+  unsigned findLoop(MachineBasicBlock *PostDom) {
+    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+    if (!VisitedPostDom)
+      advanceLevel();
+
+    unsigned Level = 0;
+    while (PDNode->getBlock() != PostDom) {
+      if (PDNode->getBlock() == VisitedPostDom)
+        advanceLevel();
+      PDNode = PDNode->getIDom();
+      Level++;
+      if (FoundLoopLevel == Level)
+        return Level;
+    }
+
+    return 0;
+  }
+
+  /// Add undef values dominating the loop and the optionally given additional
+  /// blocks, so that the SSA updater doesn't have to search all the way to the
+  /// function entry.
+  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
+    assert(LoopLevel < CommonDominators.size());
+
+    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+    for (MachineBasicBlock *MBB : Blocks)
+      Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+    } else {
+      // The dominator is part of the loop or the given blocks, so add the
+      // undef value to unreachable predecessors instead.
+      for (MachineBasicBlock *Pred : Dom->predecessors()) {
+        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+      }
+    }
+  }
+
+private:
+  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+                   ArrayRef<MachineBasicBlock *> Blocks) const {
+    auto DomIt = Visited.find(&MBB);
+    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+      return true;
+
+    if (llvm::find(Blocks, &MBB) != Blocks.end())
+      return true;
+
+    return false;
+  }
+
+  void advanceLevel() {
+    MachineBasicBlock *VisitedDom;
+
+    if (!VisitedPostDom) {
+      VisitedPostDom = DefBlock;
+      VisitedDom = DefBlock;
+      Stack.push_back(DefBlock);
+    } else {
+      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+      VisitedDom = CommonDominators.back();
+
+      for (unsigned i = 0; i < NextLevel.size();) {
+        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+          Stack.push_back(NextLevel[i]);
+
+          NextLevel[i] = NextLevel.back();
+          NextLevel.pop_back();
+        } else {
+          i++;
+        }
+      }
+    }
+
+    unsigned Level = CommonDominators.size();
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!PDT.dominates(VisitedPostDom, MBB))
+        NextLevel.push_back(MBB);
+
+      Visited[MBB] = Level;
+      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors()) {
+        if (Succ == DefBlock) {
+          if (MBB == VisitedPostDom)
+            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+          else
+            FoundLoopLevel = std::min(FoundLoopLevel, Level);
+          continue;
+        }
+
+        if (Visited.try_emplace(Succ, ~0u).second) {
+          if (MBB == VisitedPostDom)
+            NextLevel.push_back(Succ);
+          else
+            Stack.push_back(Succ);
+        }
+      }
+    }
+
+    CommonDominators.push_back(VisitedDom);
+  }
 };
 
 } // End anonymous namespace.
 
-INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,
-                "SI Lower i1 Copies", false, false)
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                    false)
 
 char SILowerI1Copies::ID = 0;
 
@@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
   return new SILowerI1Copies();
 }
 
-bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+static unsigned createLaneMaskReg(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+  MachineFunction &MF = *MBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
+  unsigned UndefReg = createLaneMaskReg(MF);
+  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+          UndefReg);
+  return UndefReg;
+}
 
-  std::vector<unsigned> I1Defs;
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+  MF = &TheMF;
+  MRI = &MF->getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                                  BI != BE; ++BI) {
+  ST = &MF->getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
 
-    MachineBasicBlock &MBB = *BI;
-    MachineBasicBlock::iterator I, Next;
-    for (I = MBB.begin(); I != MBB.end(); I = Next) {
-      Next = std::next(I);
-      MachineInstr &MI = *I;
+  lowerCopiesFromI1();
+  lowerPhis();
+  lowerCopiesToI1();
 
-      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
-        unsigned Reg = MI.getOperand(0).getReg();
-        const TargetRegisterClass *RC = MRI.getRegClass(Reg);
-        if (RC == &AMDGPU::VReg_1RegClass)
-          MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
-        continue;
-      }
+  for (unsigned Reg : ConstrainRegs)
+    MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
+  ConstrainRegs.clear();
+
+  return true;
+}
 
+void SILowerI1Copies::lowerCopiesFromI1() {
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
       if (MI.getOpcode() != AMDGPU::COPY)
         continue;
 
-      const MachineOperand &Dst = MI.getOperand(0);
-      const MachineOperand &Src = MI.getOperand(1);
-
-      if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||
-          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
         continue;
 
-      const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());
-      const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());
+      if (isLaneMaskReg(DstReg) ||
+          (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+           MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
+        continue;
 
+      // Copy into a 32-bit vector register.
+      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
       DebugLoc DL = MI.getDebugLoc();
-      MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
-      if (DstRC == &AMDGPU::VReg_1RegClass &&
-          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
-        I1Defs.push_back(Dst.getReg());
-
-        if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {
-          if (DefInst->getOperand(1).isImm()) {
-            I1Defs.push_back(Dst.getReg());
-
-            int64_t Val = DefInst->getOperand(1).getImm();
-            assert(Val == 0 || Val == -1);
-
-            BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
-                .add(Dst)
-                .addImm(Val);
-            MI.eraseFromParent();
-            continue;
+
+      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+      assert(!MI.getOperand(0).getSubReg());
+
+      ConstrainRegs.insert(SrcReg);
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addImm(0)
+          .addImm(-1)
+          .addReg(SrcReg);
+      DeadCopies.push_back(&MI);
+    }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+void SILowerI1Copies::lowerPhis() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  PhiIncomingAnalysis PIA(*PDT);
+  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+  SmallVector<unsigned, 4> IncomingRegs;
+  SmallVector<unsigned, 4> IncomingUpdated;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB.phis()) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+
+      // Collect incoming values.
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        assert(i + 1 < MI.getNumOperands());
+        unsigned IncomingReg = MI.getOperand(i).getReg();
+        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+          IncomingReg = IncomingDef->getOperand(1).getReg();
+          assert(isLaneMaskReg(IncomingReg));
+          assert(!IncomingDef->getOperand(1).getSubReg());
+        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+          continue;
+        } else {
+          assert(IncomingDef->isPHI());
+        }
+
+        IncomingBlocks.push_back(IncomingMBB);
+        IncomingRegs.push_back(IncomingReg);
+      }
+
+      // Phis in a loop that are observed outside the loop receive a simple but
+      // conservatively correct treatment.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+      SSAUpdater.Initialize(DstReg);
+
+      if (FoundLoopLevel) {
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                       IncomingUpdated.back());
+        }
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+        }
+      } else {
+        // The phi is not observed from outside a loop. Use a more accurate
+        // lowering.
+        PIA.analyze(MBB, IncomingBlocks);
+
+        for (MachineBasicBlock *MBB : PIA.predecessors())
+          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          if (PIA.isSource(IMBB)) {
+            IncomingUpdated.push_back(0);
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+          } else {
+            IncomingUpdated.push_back(createLaneMaskReg(*MF));
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
           }
         }
 
-        unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
-            .add(Src);
-        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
-            .add(Dst)
-            .addImm(0)
-            .addImm(-1)
-            .addReg(TmpSrc);
-        MI.eraseFromParent();
-      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
-                 SrcRC == &AMDGPU::VReg_1RegClass) {
-        if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
-            DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
-            DefInst->getOperand(1).getImm() == 0 &&
-            DefInst->getOperand(2).getImm() != 0 &&
-            DefInst->getOperand(3).isReg() &&
-            TargetRegisterInfo::isVirtualRegister(
-              DefInst->getOperand(3).getReg()) &&
-            TRI->getCommonSubClass(
-              MRI.getRegClass(DefInst->getOperand(3).getReg()),
-              &AMDGPU::SGPR_64RegClass) &&
-            AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
-              .add(Dst)
-              .addReg(AMDGPU::EXEC)
-              .add(DefInst->getOperand(3));
-        } else {
-          BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
-              .add(Dst)
-              .add(Src)
-              .addImm(0);
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          if (!IncomingUpdated[i])
+            continue;
+
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
         }
-        MI.eraseFromParent();
+      }
+
+      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+      if (NewReg != DstReg) {
+        MRI->replaceRegWith(NewReg, DstReg);
+
+        // Ensure that DstReg has a single def and mark the old PHI node for
+        // deletion.
+        MI.getOperand(0).setReg(NewReg);
+        DeadPhis.push_back(&MI);
+      }
+
+      IncomingBlocks.clear();
+      IncomingRegs.clear();
+      IncomingUpdated.clear();
+    }
+
+    for (MachineInstr *MI : DeadPhis)
+      MI->eraseFromParent();
+    DeadPhis.clear();
+  }
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+          MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
+          MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
+        continue;
+
+      if (MRI->use_empty(DstReg)) {
+        DeadCopies.push_back(&MI);
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+      MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+        continue;
+
+      DebugLoc DL = MI.getDebugLoc();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      assert(!MI.getOperand(1).getSubReg());
+
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          !isLaneMaskReg(SrcReg)) {
+        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+        unsigned TmpReg = createLaneMaskReg(*MF);
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+            .addReg(SrcReg)
+            .addImm(0);
+        MI.getOperand(1).setReg(TmpReg);
+        SrcReg = TmpReg;
+      }
+
+      // Defs in a loop that are observed outside the loop must be transformed
+      // into appropriate bit manipulation.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+      if (FoundLoopLevel) {
+        SSAUpdater.Initialize(DstReg);
+        SSAUpdater.AddAvailableValue(&MBB, DstReg);
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+        buildMergeLaneMasks(MBB, MI, DL, DstReg,
+                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+        DeadCopies.push_back(&MI);
       }
     }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
   }
+}
 
-  for (unsigned Reg : I1Defs)
-    MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+  const MachineInstr *MI;
+  for (;;) {
+    MI = MRI->getUniqueVRegDef(Reg);
+    if (MI->getOpcode() != AMDGPU::COPY)
+      break;
+
+    Reg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return false;
+    if (!isLaneMaskReg(Reg))
+      return false;
+  }
+
+  if (MI->getOpcode() != AMDGPU::S_MOV_B64)
+    return false;
+
+  if (!MI->getOperand(1).isImm())
+    return false;
+
+  int64_t Imm = MI->getOperand(1).getImm();
+  if (Imm == 0) {
+    Val = false;
+    return true;
+  }
+  if (Imm == -1) {
+    Val = true;
+    return true;
+  }
 
   return false;
 }
+
+static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
+  Def = false;
+  Use = false;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
+      if (MO.isUse())
+        Use = true;
+      else
+        Def = true;
+    }
+  }
+}
+
+/// Return a point at the end of the given \p MBB to insert SALU instructions
+/// for lane mask calculation. Take terminators and SCC into account.
+MachineBasicBlock::iterator
+SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+  auto InsertionPt = MBB.getFirstTerminator();
+  bool TerminatorsUseSCC = false;
+  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
+    bool DefsSCC;
+    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
+    if (TerminatorsUseSCC || DefsSCC)
+      break;
+  }
+
+  if (!TerminatorsUseSCC)
+    return InsertionPt;
+
+  while (InsertionPt != MBB.begin()) {
+    InsertionPt--;
+
+    bool DefSCC, UseSCC;
+    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
+    if (DefSCC)
+      return InsertionPt;
+  }
+
+  // We should have at least seen an IMPLICIT_DEF or COPY
+  llvm_unreachable("SCC used by terminator but no def in block");
+}
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          const DebugLoc &DL, unsigned DstReg,
+                                          unsigned PrevReg, unsigned CurReg) {
+  bool PrevVal;
+  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+  bool CurVal;
+  bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+  if (PrevConstant && CurConstant) {
+    if (PrevVal == CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
+    } else if (CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
+    } else {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
+          .addReg(AMDGPU::EXEC)
+          .addImm(-1);
+    }
+    return;
+  }
+
+  unsigned PrevMaskedReg = 0;
+  unsigned CurMaskedReg = 0;
+  if (!PrevConstant) {
+    if (CurConstant && CurVal) {
+      PrevMaskedReg = PrevReg;
+    } else {
+      PrevMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
+          .addReg(PrevReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+  if (!CurConstant) {
+    // TODO: check whether CurReg is already masked by EXEC
+    if (PrevConstant && PrevVal) {
+      CurMaskedReg = CurReg;
+    } else {
+      CurMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
+          .addReg(CurReg)
+          .addReg(AMDGPU::EXEC);
+    }
+  }
+
+  if (PrevConstant && !PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(CurMaskedReg);
+  } else if (CurConstant && !CurVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(PrevMaskedReg);
+  } else if (PrevConstant && PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
+        .addReg(CurMaskedReg)
+        .addReg(AMDGPU::EXEC);
+  } else {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
+        .addReg(PrevMaskedReg)
+        .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
+  }
+}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ee1ff85523adebc1e0e1e459115a4afcf47c0530..181cc41bd5ff743eedcabac51500599af1afd11c 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -117,7 +117,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   }
 
   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  bool MaySpill = ST.isVGPRSpillingEnabled(F);
   bool HasStackObjects = FrameInfo.hasStackObjects();
 
   if (isEntryFunction()) {
@@ -126,21 +125,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (WorkItemIDZ)
       WorkItemIDY = true;
 
-    if (HasStackObjects || MaySpill) {
-      PrivateSegmentWaveByteOffset = true;
+    PrivateSegmentWaveByteOffset = true;
 
     // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
     if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
         (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
-      ArgInfo.PrivateSegmentWaveByteOffset
-        = ArgDescriptor::createRegister(AMDGPU::SGPR5);
-    }
+      ArgInfo.PrivateSegmentWaveByteOffset =
+          ArgDescriptor::createRegister(AMDGPU::SGPR5);
   }
 
   bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
   if (isAmdHsaOrMesa) {
-    if (HasStackObjects || MaySpill)
-      PrivateSegmentBuffer = true;
+    PrivateSegmentBuffer = true;
 
     if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
       DispatchPtr = true;
@@ -151,8 +147,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     if (F.hasFnAttribute("amdgpu-dispatch-id"))
       DispatchID = true;
   } else if (ST.isMesaGfxShader(F)) {
-    if (HasStackObjects || MaySpill)
-      ImplicitBufferPtr = true;
+    ImplicitBufferPtr = true;
   }
 
   if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index d37ad077dd65da93943f88875ed50ca9b94f7f32..015773b110420505693912faa7d137630926ca3f 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -212,6 +212,169 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
   }
 }
 
+// This is the same as MachineInstr::readsRegister/modifiesRegister except
+// it takes subregs into account.
+static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
+                          unsigned Reg, unsigned SubReg,
+                          const SIRegisterInfo &TRI) {
+  for (const MachineOperand &MO : R) {
+    if (!MO.isReg())
+      continue;
+
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+        TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (TRI.regsOverlap(Reg, MO.getReg()))
+        return true;
+    } else if (MO.getReg() == Reg &&
+               TargetRegisterInfo::isVirtualRegister(Reg)) {
+      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
+                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
+      if (Overlap.any())
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool instReadsReg(const MachineInstr *MI,
+                         unsigned Reg, unsigned SubReg,
+                         const SIRegisterInfo &TRI) {
+  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
+}
+
+static bool instModifiesReg(const MachineInstr *MI,
+                            unsigned Reg, unsigned SubReg,
+                            const SIRegisterInfo &TRI) {
+  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
+}
+
+static TargetInstrInfo::RegSubRegPair
+getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
+                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
+  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
+    } else {
+      LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
+      Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+    }
+  }
+  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
+}
+
+// Match:
+// mov t, x
+// mov x, y
+// mov y, t
+//
+// =>
+//
+// mov t, x (t is potentially dead and move eliminated)
+// v_swap_b32 x, y
+//
+// Returns next valid instruction pointer if was able to create v_swap_b32.
+//
+// This shall not be done too early not to prevent possible folding which may
+// remove matched moves, and this should prefereably be done before RA to
+// release saved registers and also possibly after RA which can insert copies
+// too.
+//
+// This is really just a generic peephole that is not a canocical shrinking,
+// although requirements match the pass placement and it reduces code size too.
+static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
+                               const SIInstrInfo *TII) {
+  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+         MovT.getOpcode() == AMDGPU::COPY);
+
+  unsigned T = MovT.getOperand(0).getReg();
+  unsigned Tsub = MovT.getOperand(0).getSubReg();
+  MachineOperand &Xop = MovT.getOperand(1);
+
+  if (!Xop.isReg())
+    return nullptr;
+  unsigned X = Xop.getReg();
+  unsigned Xsub = Xop.getSubReg();
+
+  unsigned Size = TII->getOpSize(MovT, 0) / 4;
+
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  if (!TRI.isVGPR(MRI, X))
+    return nullptr;
+
+  for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
+    if (YTop.getSubReg() != Tsub)
+      continue;
+
+    MachineInstr &MovY = *YTop.getParent();
+    if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+         MovY.getOpcode() != AMDGPU::COPY) ||
+        MovY.getOperand(1).getSubReg() != Tsub)
+      continue;
+
+    unsigned Y = MovY.getOperand(0).getReg();
+    unsigned Ysub = MovY.getOperand(0).getSubReg();
+
+    if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
+      continue;
+
+    MachineInstr *MovX = nullptr;
+    auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
+    for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
+      if (instReadsReg(&*I, X, Xsub, TRI) ||
+          instModifiesReg(&*I, Y, Ysub, TRI) ||
+          instModifiesReg(&*I, T, Tsub, TRI) ||
+          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
+        MovX = nullptr;
+        break;
+      }
+      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
+        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
+          MovX = nullptr;
+          break;
+        }
+        continue;
+      }
+      if (MovX ||
+          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+           I->getOpcode() != AMDGPU::COPY) ||
+          I->getOperand(0).getReg() != X ||
+          I->getOperand(0).getSubReg() != Xsub) {
+        MovX = nullptr;
+        break;
+      }
+      MovX = &*I;
+    }
+
+    if (!MovX || I == E)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
+
+    for (unsigned I = 0; I < Size; ++I) {
+      TargetInstrInfo::RegSubRegPair X1, Y1;
+      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
+      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
+      BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(),
+                TII->get(AMDGPU::V_SWAP_B32))
+        .addDef(X1.Reg, 0, X1.SubReg)
+        .addDef(Y1.Reg, 0, Y1.SubReg)
+        .addReg(Y1.Reg, 0, Y1.SubReg)
+        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
+    }
+    MovX->eraseFromParent();
+    MovY.eraseFromParent();
+    MachineInstr *Next = &*std::next(MovT.getIterator());
+    if (MRI.use_nodbg_empty(T))
+      MovT.eraseFromParent();
+    else
+      Xop.setIsKill(false);
+
+    return Next;
+  }
+
+  return nullptr;
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -252,6 +415,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         }
       }
 
+      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
+                           MI.getOpcode() == AMDGPU::COPY)) {
+        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
+          Next = NextMI->getIterator();
+          continue;
+        }
+      }
+
       // Combine adjacent s_nops to use the immediate operand encoding how long
       // to wait.
       //
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b242345c52ab4de596c127e177c025df944bc7fb..634ec8fcc3d11ad657327d32445e3a89f4aef55f 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -152,6 +152,8 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
 
   if (hasXNACK(*STI))
     Stream << "+xnack";
+  if (hasSRAMECC(*STI))
+    Stream << "+sram-ecc";
 
   Stream.flush();
 }
@@ -593,6 +595,10 @@ bool hasXNACK(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
 }
 
+bool hasSRAMECC(const MCSubtargetInfo &STI) {
+  return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC];
+}
+
 bool hasMIMG_R128(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
 }
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index da004a6a841a0114f3bb4bac0a66c291b294e4a2..d45f42498692ede8e565d1c8c63fc540493ba473 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -342,6 +342,7 @@ inline bool isKernel(CallingConv::ID CC) {
 }
 
 bool hasXNACK(const MCSubtargetInfo &STI);
+bool hasSRAMECC(const MCSubtargetInfo &STI);
 bool hasMIMG_R128(const MCSubtargetInfo &STI);
 bool hasPackedD16(const MCSubtargetInfo &STI);
 
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
deleted file mode 100644
index 1924f71f11c84c7028b51af97199bb8dc346bf95..0000000000000000000000000000000000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// MBB A lane-dominates MBB B if
-// 1. A dominates B in the usual sense, i.e. every path from the entry to B
-//    goes through A, and
-// 2. whenever B executes, every active lane during that execution of B was
-//    also active during the most recent execution of A.
-//
-// The simplest example where A dominates B but does not lane-dominate it is
-// where A is a loop:
-//
-//     |
-//     +--+
-//     A  |
-//     +--+
-//     |
-//     B
-//
-// Unfortunately, the second condition is not fully captured by the control
-// flow graph when it is unstructured (as may happen when branch conditions are
-// uniform).
-//
-// The following replacement of the second condition is a conservative
-// approximation. It is an equivalent condition when the CFG is fully
-// structured:
-//
-// 2'. every cycle in the CFG that contains A also contains B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPULaneDominator.h"
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-
-namespace llvm {
-
-namespace AMDGPU {
-
-// Given machine basic blocks A and B where A dominates B, check whether
-// A lane-dominates B.
-//
-// The check is conservative, i.e. there can be false-negatives.
-bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
-  // Check whether A is reachable from itself without going through B.
-  DenseSet<MachineBasicBlock *> Reachable;
-  SmallVector<MachineBasicBlock *, 8> Stack;
-
-  Stack.push_back(A);
-  do {
-    MachineBasicBlock *MBB = Stack.back();
-    Stack.pop_back();
-
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      if (Succ == A)
-        return false;
-      if (Succ != B && Reachable.insert(Succ).second)
-        Stack.push_back(Succ);
-    }
-  } while (!Stack.empty());
-
-  return true;
-}
-
-} // namespace AMDGPU
-
-} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
deleted file mode 100644
index 4f33a89a364bd1ec8f8c4dbee9d735909a66669c..0000000000000000000000000000000000000000
--- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
-
-namespace llvm {
-
-class MachineBasicBlock;
-
-namespace AMDGPU {
-
-bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
-
-} // end namespace AMDGPU
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index c5ed32e46821bc672290101269ba708cd6a540fb..01b80ebe8d3dc6bbdc622f58e99532bf4a9ad9d4 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
-  AMDGPULaneDominator.cpp
   )
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index e9d12ba83f3a3c33c507ceabcc5cc6a093a236e8..db031be7e558ed9d3b46ae5a7f9be9aaf5ed213a 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -393,8 +393,8 @@ defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>,
 defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
 defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
 defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
-defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
-defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
+defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
+defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
 defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
 defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
 defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
@@ -556,8 +556,8 @@ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
 defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
 defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
 defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
-defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
-defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
+defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
+defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
 defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
 defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
 defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 96b233b5a3865c47eca964c1a142163050eaf375..51bee3efeb2c4b923d87cd324aece67117c47672 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -295,8 +295,8 @@ let SchedRW = [WriteDoubleAdd] in {
 def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
 def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
 def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
-def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
-def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
+def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
+def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
 } // End SchedRW = [WriteDoubleAdd]
 
 let SchedRW = [WriteQuarterRate32] in {
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 41e21c116a962bb673b0ec83c4d23bc44f47e0f2..2efd28b9cd8bc1b0c76ca85253a4adff3abaa79f 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -48,8 +48,8 @@ def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_
 
 def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
 def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
-def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
-def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
 
 def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
 def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
@@ -287,6 +287,15 @@ foreach Type = ["U", "I"] in
                       (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
     (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
 
+// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
+// in the compile time. Directly handle the pattern generated by the FE here.
+foreach Type = ["U", "I"] in
+  def : GCNPat <
+    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
+                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
+                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
+    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
+
 } // End SubtargetPredicate = HasDLInsts
 
 multiclass VOP3P_Real_vi<bits<10> op> {
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index fc23495ebf3abd2d1e5b2d23284ede053f5bb333..b71a09828bce0273fe12577ceaf88e4d218ced37 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -1043,7 +1043,7 @@ def : ProcessorModel<"cortex-a57",  CortexA57Model,     [ARMv8a, ProcA57,
                                                          FeatureAvoidPartialCPSR,
                                                          FeatureCheapPredicableCPSR]>;
 
-def : ProcNoItin<"cortex-a72",                          [ARMv8a, ProcA72,
+def : ProcessorModel<"cortex-a72",  CortexA57Model,     [ARMv8a, ProcA72,
                                                          FeatureHWDivThumb,
                                                          FeatureHWDivARM,
                                                          FeatureCrypto,
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 0d1908ada7fc70b4f6158704fa8d0a47c231f986..bbebed59c8512ffa32346204210941e233385bfb 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2199,6 +2199,7 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
   {ARM::tSUBSi8, ARM::tSUBi8},
   {ARM::tSUBSrr, ARM::tSUBrr},
   {ARM::tSBCS, ARM::tSBC},
+  {ARM::tRSBS, ARM::tRSB},
 
   {ARM::t2ADDSri, ARM::t2ADDri},
   {ARM::t2ADDSrr, ARM::t2ADDrr},
@@ -2963,6 +2964,8 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
   for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
     OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
 
+  MI->clearRegisterDeads(ARM::CPSR);
+
   return true;
 }
 
diff --git a/lib/Target/ARM/ARMCodeGenPrepare.cpp b/lib/Target/ARM/ARMCodeGenPrepare.cpp
index fb9fad472d9b2b02bbfcd233ba0e56af9374c106..0bd1f9ca63918d253fe44f11c67198da294ad357 100644
--- a/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -110,33 +110,43 @@ namespace {
 class IRPromoter {
   SmallPtrSet<Value*, 8> NewInsts;
   SmallVector<Instruction*, 4> InstsToRemove;
+  DenseMap<Value*, Type*> TruncTysMap;
+  SmallPtrSet<Value*, 8> Promoted;
   Module *M = nullptr;
   LLVMContext &Ctx;
+  IntegerType *ExtTy = nullptr;
+  IntegerType *OrigTy = nullptr;
+
+  void PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
+                         SmallPtrSetImpl<Instruction*> &SafeToPromote);
+  void ExtendSources(SmallPtrSetImpl<Value*> &Sources);
+  void PromoteTree(SmallPtrSetImpl<Value*> &Visited,
+                   SmallPtrSetImpl<Value*> &Sources,
+                   SmallPtrSetImpl<Instruction*> &Sinks,
+                   SmallPtrSetImpl<Instruction*> &SafeToPromote);
+  void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
+                     SmallPtrSetImpl<Instruction*> &Sinks);
+  void Cleanup(SmallPtrSetImpl<Instruction*> &Sinks);
 
 public:
-  IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+  IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
+                          ExtTy(Type::getInt32Ty(Ctx)) { }
 
-  void Cleanup() {
-    for (auto *I : InstsToRemove) {
-      LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
-      I->dropAllReferences();
-      I->eraseFromParent();
-    }
-    InstsToRemove.clear();
-    NewInsts.clear();
-  }
 
   void Mutate(Type *OrigTy,
               SmallPtrSetImpl<Value*> &Visited,
               SmallPtrSetImpl<Value*> &Sources,
-              SmallPtrSetImpl<Instruction*> &Sinks);
+              SmallPtrSetImpl<Instruction*> &Sinks,
+              SmallPtrSetImpl<Instruction*> &SafeToPromote);
 };
 
 class ARMCodeGenPrepare : public FunctionPass {
   const ARMSubtarget *ST = nullptr;
   IRPromoter *Promoter = nullptr;
   std::set<Value*> AllVisited;
+  SmallPtrSet<Instruction*, 8> SafeToPromote;
 
+  bool isSafeOverflow(Instruction *I);
   bool isSupportedValue(Value *V);
   bool isLegalToPromote(Value *V);
   bool TryToPromote(Value *V);
@@ -241,8 +251,8 @@ static bool isSink(Value *V) {
 }
 
 /// Return whether the instruction can be promoted within any modifications to
-/// it's operands or result.
-static bool isSafeOverflow(Instruction *I) {
+/// its operands or result.
+bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
   // FIXME Do we need NSW too?
   if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
     return true;
@@ -383,15 +393,7 @@ static bool isPromotedResultSafe(Value *V) {
   if (generateSignBits(V))
     return false;
 
-  // If I is only being used by something that will require its value to be
-  // truncated, then we don't care about the promoted result.
-  auto *I = cast<Instruction>(V);
-  if (I->hasOneUse() && isSink(*I->use_begin()))
-    return true;
-
-  if (isa<OverflowingBinaryOperator>(I))
-    return isSafeOverflow(I);
-  return true;
+  return !isa<OverflowingBinaryOperator>(V);
 }
 
 /// Return the intrinsic for the instruction that can perform the same
@@ -414,73 +416,105 @@ static Intrinsic::ID getNarrowIntrinsic(Instruction *I) {
   llvm_unreachable("unhandled opcode for narrow intrinsic");
 }
 
-void IRPromoter::Mutate(Type *OrigTy,
-                        SmallPtrSetImpl<Value*> &Visited,
-                        SmallPtrSetImpl<Value*> &Sources,
-                        SmallPtrSetImpl<Instruction*> &Sinks) {
+static void ReplaceAllUsersOfWith(Value *From, Value *To) {
+  SmallVector<Instruction*, 4> Users;
+  Instruction *InstTo = dyn_cast<Instruction>(To);
+  for (Use &U : From->uses()) {
+    auto *User = cast<Instruction>(U.getUser());
+    if (InstTo && User->isIdenticalTo(InstTo))
+      continue;
+    Users.push_back(User);
+  }
+
+  for (auto *U : Users)
+    U->replaceUsesOfWith(From, To);
+}
+
+void
+IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
+                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   IRBuilder<> Builder{Ctx};
-  Type *ExtTy = Type::getInt32Ty(M->getContext());
-  SmallPtrSet<Value*, 8> Promoted;
-  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
-             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+  // First step is to prepare the instructions for mutation. Most constants
+  // just need to be zero extended into their new type, but complications arise
+  // because:
+  // - For nuw binary operators, negative immediates would need sign extending;
+  //   however, instead we'll change them to positive and zext them. We can do
+  //   this because:
+  //   > The operators that can wrap are: add, sub, mul and shl.
+  //   > shl interprets its second operand as unsigned and if the first operand
+  //     is an immediate, it will need zext to be nuw.
+  //   > I'm assuming mul has to interpret immediates as unsigned for nuw.
+  //   > Which leaves the nuw add and sub to be handled; as with shl, if an
+  //     immediate is used as operand 0, it will need zext to be nuw.
+  // - We also allow add and sub to safely overflow in certain circumstances
+  //   and only when the value (operand 0) is being decreased.
+  //
+  // For adds and subs, that are either nuw or safely wrap and use a negative
+  // immediate as operand 1, we create an equivalent instruction using a
+  // positive immediate. That positive immediate can then be zext along with
+  // all the other immediates later.
+  for (auto *V : Visited) {
+    if (!isa<Instruction>(V))
+      continue;
 
-  // Cache original types.
-  DenseMap<Value*, Type*> TruncTysMap;
-  for (auto *V : Visited)
-    TruncTysMap[V] = V->getType();
+    auto *I = cast<Instruction>(V);
+    if (SafeToPromote.count(I)) {
 
-  auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
-    SmallVector<Instruction*, 4> Users;
-    Instruction *InstTo = dyn_cast<Instruction>(To);
-    for (Use &U : From->uses()) {
-      auto *User = cast<Instruction>(U.getUser());
-      if (InstTo && User->isIdenticalTo(InstTo))
+      if (!isa<OverflowingBinaryOperator>(I))
         continue;
-      Users.push_back(User);
-    }
 
-    for (auto *U : Users)
-      U->replaceUsesOfWith(From, To);
-  };
+      if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        if (!Const->isNegative())
+          break;
 
-  auto FixConst = [&](ConstantInt *Const, Instruction *I) {
-    Constant *NewConst = isSafeOverflow(I) && Const->isNegative() ?
-      ConstantExpr::getSExt(Const, ExtTy) :
-      ConstantExpr::getZExt(Const, ExtTy);
-    I->replaceUsesOfWith(Const, NewConst);
-  };
+        unsigned Opc = I->getOpcode();
+        if (Opc != Instruction::Add && Opc != Instruction::Sub)
+          continue;
 
-  auto InsertDSPIntrinsic = [&](Instruction *I) {
-    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
-               << *I << "\n");
-    Function *DSPInst =
-      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
-    Builder.SetInsertPoint(I);
-    Builder.SetCurrentDebugLocation(I->getDebugLoc());
-    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
-    CallInst *Call = Builder.CreateCall(DSPInst, Args);
-    ReplaceAllUsersOfWith(I, Call);
-    InstsToRemove.push_back(I);
-    NewInsts.insert(Call);
-    TruncTysMap[Call] = OrigTy;
-  };
+        LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
+        auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
+        Builder.SetInsertPoint(I);
+        Value *NewVal = Opc == Instruction::Sub ?
+          Builder.CreateAdd(I->getOperand(0), NewConst) :
+          Builder.CreateSub(I->getOperand(0), NewConst);
+        LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
+
+        if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
+          NewInst->copyIRFlags(I);
+          NewInsts.insert(NewInst);
+        }
+        InstsToRemove.push_back(I);
+        I->replaceAllUsesWith(NewVal);
+      }
+    }
+  }
+  for (auto *I : NewInsts)
+    Visited.insert(I);
+}
+
+void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
+  IRBuilder<> Builder{Ctx};
 
   auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
+    assert(V->getType() != ExtTy && "zext already extends to i32");
     LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
     Builder.SetInsertPoint(InsertPt);
     if (auto *I = dyn_cast<Instruction>(V))
       Builder.SetCurrentDebugLocation(I->getDebugLoc());
-    auto *ZExt = cast<Instruction>(Builder.CreateZExt(V, ExtTy));
-    if (isa<Argument>(V))
-      ZExt->moveBefore(InsertPt);
-    else
-      ZExt->moveAfter(InsertPt);
+
+    Value *ZExt = Builder.CreateZExt(V, ExtTy);
+    if (auto *I = dyn_cast<Instruction>(ZExt)) {
+      if (isa<Argument>(V))
+        I->moveBefore(InsertPt);
+      else
+        I->moveAfter(InsertPt);
+      NewInsts.insert(I);
+    }
     ReplaceAllUsersOfWith(V, ZExt);
-    NewInsts.insert(ZExt);
     TruncTysMap[ZExt] = TruncTysMap[V];
   };
 
-  // First, insert extending instructions between the sources and their users.
+  // Now, insert extending instructions between the sources and their users.
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
   for (auto V : Sources) {
     LLVM_DEBUG(dbgs() << " - " << *V << "\n");
@@ -494,9 +528,17 @@ void IRPromoter::Mutate(Type *OrigTy,
     }
     Promoted.insert(V);
   }
+}
 
+void IRPromoter::PromoteTree(SmallPtrSetImpl<Value*> &Visited,
+                             SmallPtrSetImpl<Value*> &Sources,
+                             SmallPtrSetImpl<Instruction*> &Sinks,
+                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
-  // Then mutate the types of the instructions within the tree. Here we handle
+
+  IRBuilder<> Builder{Ctx};
+
+  // Mutate the types of the instructions within the tree. Here we handle
   // constant operands.
   for (auto *V : Visited) {
     if (Sources.count(V))
@@ -511,9 +553,10 @@ void IRPromoter::Mutate(Type *OrigTy,
       if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
         continue;
 
-      if (auto *Const = dyn_cast<ConstantInt>(Op))
-        FixConst(Const, I);
-      else if (isa<UndefValue>(Op))
+      if (auto *Const = dyn_cast<ConstantInt>(Op)) {
+        Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
+        I->setOperand(i, NewConst);
+      } else if (isa<UndefValue>(Op))
         I->setOperand(i, UndefValue::get(ExtTy));
     }
 
@@ -523,20 +566,42 @@ void IRPromoter::Mutate(Type *OrigTy,
     }
   }
 
-  // Now we need to remove any zexts that have become unnecessary, as well
-  // as insert any intrinsics.
+  // Finally, any instructions that should be promoted but haven't yet been,
+  // need to be handled using intrinsics.
   for (auto *V : Visited) {
-    if (Sources.count(V))
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
       continue;
 
-    if (!shouldPromote(V) || isPromotedResultSafe(V))
+    if (Sources.count(I) || Sinks.count(I))
       continue;
 
+    if (!shouldPromote(I) || SafeToPromote.count(I) || NewInsts.count(I))
+      continue;
+  
     assert(EnableDSP && "DSP intrinisc insertion not enabled!");
 
     // Replace unsafe instructions with appropriate intrinsic calls.
-    InsertDSPIntrinsic(cast<Instruction>(V));
+    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+               << *I << "\n");
+    Function *DSPInst =
+      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
+    Builder.SetInsertPoint(I);
+    Builder.SetCurrentDebugLocation(I->getDebugLoc());
+    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+    CallInst *Call = Builder.CreateCall(DSPInst, Args);
+    ReplaceAllUsersOfWith(I, Call);
+    InstsToRemove.push_back(I);
+    NewInsts.insert(Call);
+    TruncTysMap[Call] = OrigTy;
   }
+}
+
+void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
+                               SmallPtrSetImpl<Instruction*> &Sinks) {
+  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
+
+  IRBuilder<> Builder{Ctx};
 
   auto InsertTrunc = [&](Value *V) -> Instruction* {
     if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
@@ -553,12 +618,12 @@ void IRPromoter::Mutate(Type *OrigTy,
     LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for "
                << *V << "\n");
     Builder.SetInsertPoint(cast<Instruction>(V));
-    auto *Trunc = cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
-    NewInsts.insert(Trunc);
+    auto *Trunc = dyn_cast<Instruction>(Builder.CreateTrunc(V, TruncTy));
+    if (Trunc)
+      NewInsts.insert(Trunc);
     return Trunc;
   };
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
   // Fix up any stores or returns that use the results of the promoted
   // chain.
   for (auto I : Sinks) {
@@ -584,6 +649,87 @@ void IRPromoter::Mutate(Type *OrigTy,
       }
     }
   }
+
+}
+
+void IRPromoter::Cleanup(SmallPtrSetImpl<Instruction*> &Sinks) {
+  // Some zext sinks will now have become redundant, along with their trunc
+  // operands, so remove them.
+  for (auto I : Sinks) {
+    if (auto *ZExt = dyn_cast<ZExtInst>(I)) {
+      if (ZExt->getDestTy() != ExtTy)
+        continue;
+
+      Value *Src = ZExt->getOperand(0);
+      if (ZExt->getSrcTy() == ZExt->getDestTy()) {
+        LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary zext\n");
+        ReplaceAllUsersOfWith(ZExt, Src);
+        InstsToRemove.push_back(ZExt);
+        continue;
+      }
+
+      // For any truncs that we insert to handle zexts, we can replace the
+      // result of the zext with the input to the trunc.
+      if (NewInsts.count(Src) && isa<TruncInst>(Src)) {
+        auto *Trunc = cast<TruncInst>(Src);
+        assert(Trunc->getOperand(0)->getType() == ExtTy &&
+               "expected inserted trunc to be operating on i32");
+        LLVM_DEBUG(dbgs() << "ARM CGP: Replacing zext with trunc operand: "
+                   << *Trunc->getOperand(0));
+        ReplaceAllUsersOfWith(ZExt, Trunc->getOperand(0));
+        InstsToRemove.push_back(ZExt);
+      }
+    }
+  }
+
+  for (auto *I : InstsToRemove) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n");
+    I->dropAllReferences();
+    I->eraseFromParent();
+  }
+
+  InstsToRemove.clear();
+  NewInsts.clear();
+  TruncTysMap.clear();
+  Promoted.clear();
+}
+
+void IRPromoter::Mutate(Type *OrigTy,
+                        SmallPtrSetImpl<Value*> &Visited,
+                        SmallPtrSetImpl<Value*> &Sources,
+                        SmallPtrSetImpl<Instruction*> &Sinks,
+                        SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
+             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+
+  assert(isa<IntegerType>(OrigTy) && "expected integer type");
+  this->OrigTy = cast<IntegerType>(OrigTy);
+  assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() &&
+         "original type not smaller than extended type");
+
+  // Cache original types.
+  for (auto *V : Visited)
+    TruncTysMap[V] = V->getType();
+
+  // Convert adds and subs using negative immediates to equivalent instructions
+  // that use positive constants.
+  PrepareConstants(Visited, SafeToPromote);
+
+  // Insert zext instructions between sources and their users.
+  ExtendSources(Sources);
+
+  // Promote visited instructions, mutating their types in place. Also insert
+  // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
+  // promote.
+  PromoteTree(Visited, Sources, Sinks, SafeToPromote);
+
+  // Insert trunc instructions for use by calls, stores etc...
+  TruncateSinks(Sources, Sinks);
+
+  // Finally, remove unecessary zexts and truncs, delete old instructions and
+  // clear the data structures.
+  Cleanup(Sinks);
+
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
   LLVM_DEBUG(dbgs();
              for (auto *V : Sources)
@@ -651,11 +797,20 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) {
 /// smaller than the targeted promoted type. Check that we're not trying to
 /// promote something larger than our base 'TypeSize' type.
 bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
-  if (isPromotedResultSafe(V))
-    return true;
 
   auto *I = dyn_cast<Instruction>(V);
   if (!I)
+    return true;
+
+  if (SafeToPromote.count(I))
+   return true;
+
+  if (isPromotedResultSafe(V) || isSafeOverflow(I)) {
+    SafeToPromote.insert(I);
+    return true;
+  }
+
+  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
     return false;
 
   // If promotion is not safe, can we use a DSP instruction to natively
@@ -666,9 +821,6 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
   if (ST->isThumb() && !ST->hasThumb2())
     return false;
 
-  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
-    return false;
-
   // TODO
   // Would it be profitable? For Thumb code, these parallel DSP instructions
   // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
@@ -680,6 +832,7 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
         return false;
     }
   }
+  LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n");
   return true;
 }
 
@@ -689,6 +842,8 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   if (TypeSize > 16 || TypeSize < 8)
     return false;
 
+  SafeToPromote.clear();
+
   if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
     return false;
 
@@ -698,9 +853,8 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   SetVector<Value*> WorkList;
   SmallPtrSet<Value*, 8> Sources;
   SmallPtrSet<Instruction*, 4> Sinks;
-  WorkList.insert(V);
   SmallPtrSet<Value*, 16> CurrentVisited;
-  CurrentVisited.clear();
+  WorkList.insert(V);
 
   // Return true if V was added to the worklist as a supported instruction,
   // if it was already visited, or if we don't need to explore it (e.g.
@@ -783,7 +937,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) {
   if (ToPromote < 2)
     return false;
 
-  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks);
+  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote);
   return true;
 }
 
@@ -825,9 +979,8 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) {
         }
       }
     }
-    Promoter->Cleanup();
     LLVM_DEBUG(if (verifyFunction(F, &dbgs())) {
-                dbgs();
+                dbgs() << F;
                 report_fatal_error("Broken function after type promotion");
                });
   }
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index b35a16b8a1e9e1509905cf4c04ddd034ffc29d45..eecd0a10dc7d5b6b089538e45fea485dcf1e0505 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1030,10 +1030,10 @@ static void addExclusiveRegPair(MachineInstrBuilder &MIB, MachineOperand &Reg,
   if (IsThumb) {
     unsigned RegLo = TRI->getSubReg(Reg.getReg(), ARM::gsub_0);
     unsigned RegHi = TRI->getSubReg(Reg.getReg(), ARM::gsub_1);
-    MIB.addReg(RegLo, Flags | getKillRegState(Reg.isDead()));
-    MIB.addReg(RegHi, Flags | getKillRegState(Reg.isDead()));
+    MIB.addReg(RegLo, Flags);
+    MIB.addReg(RegHi, Flags);
   } else
-    MIB.addReg(Reg.getReg(), Flags | getKillRegState(Reg.isDead()));
+    MIB.addReg(Reg.getReg(), Flags);
 }
 
 /// Expand a 64-bit CMP_SWAP to an ldrexd/strexd loop.
@@ -1103,7 +1103,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
   //     bne .Lloadcmp
   unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
   MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
-  addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
+  unsigned Flags = getKillRegState(New.isDead());
+  addExclusiveRegPair(MIB, New, Flags, IsThumb, TRI);
   MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
 
   unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index bfff368a8fea17d91e050cee1b040dd8d72b4b1e..56d2e510cb719761d1b0ed3200179a491148902a 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -669,8 +669,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
-    setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
-    setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
+    setOperationAction(ISD::CTPOP,      MVT::v1i64, Custom);
+    setOperationAction(ISD::CTPOP,      MVT::v2i64, Custom);
 
     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
@@ -956,6 +956,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 
   // Use the default implementation.
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
@@ -1142,14 +1143,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   if (Subtarget->hasNEON()) {
     // vmin and vmax aren't available in a scalar form, so we use
     // a NEON instruction with an undef lane instead.
-    setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
 
     if (Subtarget->hasFullFP16()) {
       setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
@@ -1157,10 +1158,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
       setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
 
-      setOperationAction(ISD::FMINNAN, MVT::v4f16, Legal);
-      setOperationAction(ISD::FMAXNAN, MVT::v4f16, Legal);
-      setOperationAction(ISD::FMINNAN, MVT::v8f16, Legal);
-      setOperationAction(ISD::FMAXNAN, MVT::v8f16, Legal);
+      setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
+      setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
+      setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
+      setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
     }
   }
 
@@ -3171,9 +3172,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
 
 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-    GV = GA->getBaseObject();
-  return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
-         isa<Function>(GV);
+    if (!(GV = GA->getBaseObject()))
+      return false;
+  if (const auto *V = dyn_cast<GlobalVariable>(GV))
+    return V->isConstant();
+  return isa<Function>(GV);
 }
 
 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
@@ -3405,7 +3408,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                          Op.getOperand(1), Op.getOperand(2));
     }
     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
-      ? ISD::FMINNAN : ISD::FMAXNAN;
+      ? ISD::FMINIMUM : ISD::FMAXIMUM;
     return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
   }
@@ -5409,10 +5412,6 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
 
     // Compute with: cttz(x) = ctpop(lsb - 1)
 
-    // Since we can only compute the number of bits in a byte with vcnt.8, we
-    // have to gather the result with pairwise addition (vpaddl) for i16, i32,
-    // and i64.
-
     // Compute LSB - 1.
     SDValue Bits;
     if (ElemTy == MVT::i64) {
@@ -5425,32 +5424,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                                 DAG.getTargetConstant(1, dl, ElemTy));
       Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
     }
-
-    // Count #bits with vcnt.8.
-    EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
-    SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits);
-    SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8);
-
-    // Gather the #bits with vpaddl (pairwise add.)
-    EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
-    SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit,
-        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
-        Cnt8);
-    if (ElemTy == MVT::i16)
-      return Cnt16;
-
-    EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
-    SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit,
-        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
-        Cnt16);
-    if (ElemTy == MVT::i32)
-      return Cnt32;
-
-    assert(ElemTy == MVT::i64);
-    SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
-        DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32),
-        Cnt32);
-    return Cnt64;
+    return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
   }
 
   if (!ST->hasV6T2Ops())
@@ -5460,112 +5434,37 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
 }
 
-/// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
-/// for each 16-bit element from operand, repeated.  The basic idea is to
-/// leverage vcnt to get the 8-bit counts, gather and add the results.
-///
-/// Trace for v4i16:
-/// input    = [v0    v1    v2    v3   ] (vi 16-bit element)
-/// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
-/// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi)
-/// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6]
-///            [b0 b1 b2 b3 b4 b5 b6 b7]
-///           +[b1 b0 b3 b2 b5 b4 b7 b6]
-/// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0,
-/// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
-static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  SDLoc DL(N);
-
-  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
-  SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
-  SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0);
-  SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1);
-  SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2);
-  return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3);
-}
-
-/// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
-/// bit-count for each 16-bit element from the operand.  We need slightly
-/// different sequencing for v4i16 and v8i16 to stay within NEON's available
-/// 64/128-bit registers.
-///
-/// Trace for v4i16:
-/// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
-/// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
-/// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
-/// v4i16:Extracted = [k0    k1    k2    k3    ]
-static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
+static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
-  SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
-  if (VT.is64BitVector()) {
-    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended,
-                       DAG.getIntPtrConstant(0, DL));
-  } else {
-    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
-                                    BitCounts, DAG.getIntPtrConstant(0, DL));
-    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted);
-  }
-}
-
-/// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
-/// bit-count for each 32-bit element from the operand.  The idea here is
-/// to split the vector into 16-bit elements, leverage the 16-bit count
-/// routine, and then combine the results.
-///
-/// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged):
-/// input    = [v0    v1    ] (vi: 32-bit elements)
-/// Bitcast  = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
-/// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
-/// vrev: N0 = [k1 k0 k3 k2 ]
-///            [k0 k1 k2 k3 ]
-///       N1 =+[k1 k0 k3 k2 ]
-///            [k0 k2 k1 k3 ]
-///       N2 =+[k1 k3 k0 k2 ]
-///            [k0    k2    k1    k3    ]
-/// Extended =+[k1    k3    k0    k2    ]
-///            [k0    k2    ]
-/// Extracted=+[k1    k3    ]
-///
-static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  SDLoc DL(N);
+  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
+  assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
+          VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
+         "Unexpected type for custom ctpop lowering");
 
-  EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
+  SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
+  Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
 
-  SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0));
-  SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG);
-  SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16);
-  SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0);
-  SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1);
+  // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
+  unsigned EltSize = 8;
+  unsigned NumElts = VT.is64BitVector() ? 8 : 16;
+  while (EltSize != VT.getScalarSizeInBits()) {
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
+                                  TLI.getPointerTy(DAG.getDataLayout())));
+    Ops.push_back(Res);
 
-  if (VT.is64BitVector()) {
-    SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended,
-                       DAG.getIntPtrConstant(0, DL));
-  } else {
-    SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2,
-                                    DAG.getIntPtrConstant(0, DL));
-    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted);
+    EltSize *= 2;
+    NumElts /= 2;
+    MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
+    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
   }
-}
-
-static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
-                          const ARMSubtarget *ST) {
-  EVT VT = N->getValueType(0);
-
-  assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
-  assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
-          VT == MVT::v4i16 || VT == MVT::v8i16) &&
-         "Unexpected type for custom ctpop lowering");
 
-  if (VT.getVectorElementType() == MVT::i32)
-    return lowerCTPOP32BitElements(N, DAG);
-  else
-    return lowerCTPOP16BitElements(N, DAG);
+  return Res;
 }
 
 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
@@ -9261,6 +9160,42 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
   return ContBB;
 }
 
+// The CPSR operand of SelectItr might be missing a kill marker
+// because there were multiple uses of CPSR, and ISel didn't know
+// which to mark. Figure out whether SelectItr should have had a
+// kill marker, and set it if it should. Returns the correct kill
+// marker value.
+static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
+                                   MachineBasicBlock* BB,
+                                   const TargetRegisterInfo* TRI) {
+  // Scan forward through BB for a use/def of CPSR.
+  MachineBasicBlock::iterator miI(std::next(SelectItr));
+  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(ARM::CPSR))
+      return false;
+    if (mi.definesRegister(ARM::CPSR))
+      break; // Should have kill-flag - update below.
+  }
+
+  // If we hit the end of the block, check whether CPSR is live into a
+  // successor.
+  if (miI == BB->end()) {
+    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                          sEnd = BB->succ_end();
+         sItr != sEnd; ++sItr) {
+      MachineBasicBlock* succ = *sItr;
+      if (succ->isLiveIn(ARM::CPSR))
+        return false;
+    }
+  }
+
+  // We found a def, or hit the end of the basic block and CPSR wasn't live
+  // out. SelectMI should have a kill flag on CPSR.
+  SelectItr->addRegisterKilled(ARM::CPSR, TRI);
+  return true;
+}
+
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -9360,6 +9295,14 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     F->insert(It, copy0MBB);
     F->insert(It, sinkMBB);
 
+    // Check whether CPSR is live past the tMOVCCr_pseudo.
+    const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+    if (!MI.killsRegister(ARM::CPSR) &&
+        !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
+      copy0MBB->addLiveIn(ARM::CPSR);
+      sinkMBB->addLiveIn(ARM::CPSR);
+    }
+
     // Transfer the remainder of BB and its successor edges to sinkMBB.
     sinkMBB->splice(sinkMBB->begin(), BB,
                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index e1a077ef1667a793ac14d905d3b49b619f1a1ec3..76f8414e8f0b87ef2bbded872d1f7952ed7f5f43 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -221,6 +221,7 @@ def HasV4T           : Predicate<"Subtarget->hasV4TOps()">,
 def NoV4T            : Predicate<"!Subtarget->hasV4TOps()">;
 def HasV5T           : Predicate<"Subtarget->hasV5TOps()">,
                                  AssemblerPredicate<"HasV5TOps", "armv5t">;
+def NoV5T            : Predicate<"!Subtarget->hasV5TOps()">;
 def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
                                  AssemblerPredicate<"HasV5TEOps", "armv5te">;
 def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
@@ -357,21 +358,22 @@ let RecomputePerFunction = 1 in {
   def DontUseMovt      : Predicate<"!Subtarget->useMovt(*MF)">;
   def UseMovtInPic     : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
   def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
-  def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
+
+  def UseFPVMLx: Predicate<"((Subtarget->useFPVMLx() &&"
+                           "  TM.Options.AllowFPOpFusion != FPOpFusion::Fast) ||"
+                           "MF->getFunction().optForMinSize())">;
 }
 def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
 
 // Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
-// But only select them if more precision in FP computation is allowed.
+// But only select them if more precision in FP computation is allowed, and when
+// they are not slower than a mul + add sequence.
 // Do not use them for Darwin platforms.
 def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast && "
                                  " Subtarget->hasVFP4()) && "
-                                 "!Subtarget->isTargetDarwin()">;
-def DontUseFusedMAC  : Predicate<"!(TM.Options.AllowFPOpFusion =="
-                                 " FPOpFusion::Fast &&"
-                                 " Subtarget->hasVFP4()) || "
-                                 "Subtarget->isTargetDarwin()">;
+                                 "!Subtarget->isTargetDarwin() &&"
+                                 "Subtarget->useFPVMLx()">;
 
 def HasFastVGETLNi32 : Predicate<"!Subtarget->hasSlowVGETLNi32()">;
 def HasSlowVGETLNi32 : Predicate<"Subtarget->hasSlowVGETLNi32()">;
@@ -2199,6 +2201,9 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
   let Inst = 0xe7ffdefe;
 }
 
+def : Pat<(debugtrap), (BKPT 0)>, Requires<[IsARM, HasV5T]>;
+def : Pat<(debugtrap), (UDF 254)>, Requires<[IsARM, NoV5T]>;
+
 // Address computation and loads and stores in PIC mode.
 let isNotDuplicable = 1 in {
 def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index a7bb32d31f6f2e76bd691cc2648616c2488e9117..96986e74415b1f91cb05dbfe9ca14c1d5b39a6e6 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4402,16 +4402,16 @@ defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
 def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
                           v2f32, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
                           v4f32, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLAhd   : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16",
                           v4f16, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 def  VMLAhq   : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16",
                           v8f16, fmul_su, fadd_mlx>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
                               IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
 def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
@@ -4632,16 +4632,16 @@ defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
 def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
                           v2f32, fmul_su, fsub_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
                           v4f32, fmul_su, fsub_mlx>,
-                Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, UseFPVMLx]>;
 def  VMLShd   : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16",
                           v4f16, fmul, fsub>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 def  VMLShq   : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16",
                           v8f16, fmul, fsub>,
-                Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>;
+                Requires<[HasNEON, HasFullFP16, UseFPVMLx]>;
 defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
                               IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
 def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
@@ -5521,17 +5521,17 @@ defm VMAXu    : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm,
                            "vmax", "u", umax, 1>;
 def  VMAXfd   : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmax", "f32",
-                        v2f32, v2f32, fmaxnan, 1>;
+                        v2f32, v2f32, fmaximum, 1>;
 def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmax", "f32",
-                        v4f32, v4f32, fmaxnan, 1>;
+                        v4f32, v4f32, fmaximum, 1>;
 def  VMAXhd   : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmax", "f16",
-                        v4f16, v4f16, fmaxnan, 1>,
+                        v4f16, v4f16, fmaximum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 def  VMAXhq   : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmax", "f16",
-                        v8f16, v8f16, fmaxnan, 1>,
+                        v8f16, v8f16, fmaximum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 
 // VMAXNM
@@ -5563,17 +5563,17 @@ defm VMINu    : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm,
                            "vmin", "u", umin, 1>;
 def  VMINfd   : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmin", "f32",
-                        v2f32, v2f32, fminnan, 1>;
+                        v2f32, v2f32, fminimum, 1>;
 def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmin", "f32",
-                        v4f32, v4f32, fminnan, 1>;
+                        v4f32, v4f32, fminimum, 1>;
 def  VMINhd   : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND,
                         "vmin", "f16",
-                        v4f16, v4f16, fminnan, 1>,
+                        v4f16, v4f16, fminimum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 def  VMINhq   : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmin", "f16",
-                        v8f16, v8f16, fminnan, 1>,
+                        v8f16, v8f16, fminimum, 1>,
                 Requires<[HasNEON, HasFullFP16]>;
 
 // VMINNM
@@ -7084,19 +7084,19 @@ def : N3VSPat<fadd, VADDfd>;
 def : N3VSPat<fsub, VSUBfd>;
 def : N3VSPat<fmul, VMULfd>;
 def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
-      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
 def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
-      Requires<[HasNEON, UseNEONForFP, UseFPVMLx, DontUseFusedMAC]>;
+      Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
 def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
       Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
 def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
       Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>;
 def : N2VSPat<fabs, VABSfd>;
 def : N2VSPat<fneg, VNEGfd>;
-def : N3VSPatFP16<fmaxnan, VMAXhd>, Requires<[HasFullFP16]>;
-def : N3VSPatFP16<fminnan, VMINhd>, Requires<[HasFullFP16]>;
-def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>;
-def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>;
+def : N3VSPatFP16<fmaximum, VMAXhd>, Requires<[HasFullFP16]>;
+def : N3VSPatFP16<fminimum, VMINhd>, Requires<[HasFullFP16]>;
+def : N3VSPat<fmaximum, VMAXfd>, Requires<[HasNEON]>;
+def : N3VSPat<fminimum, VMINfd>, Requires<[HasNEON]>;
 def : NVCVTFIPat<fp_to_sint, VCVTf2sd>;
 def : NVCVTFIPat<fp_to_uint, VCVTf2ud>;
 def : NVCVTIFPat<sint_to_fp, VCVTs2fd>;
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 88aab47a79bfd22f20a933c1fd121f2f715db124..3c153625b01684be5ff8fd55963738c71d47af3f 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1343,6 +1343,12 @@ let hasPostISelHook = 1, Defs = [CPSR] in {
                                                            tGPR:$Rm))]>,
                 Requires<[IsThumb1Only]>,
                 Sched<[WriteALU]>;
+
+  def tRSBS   : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn),
+                            2, IIC_iALUr,
+                            [(set tGPR:$Rd, CPSR, (ARMsubc 0, tGPR:$Rn))]>,
+                Requires<[IsThumb1Only]>,
+                Sched<[WriteALU]>;
 }
 
 // Sign-extend byte
@@ -1380,6 +1386,9 @@ def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
   let Inst{7-0} = imm8;
 }
 
+def : Pat<(debugtrap), (tBKPT 0)>, Requires<[IsThumb, HasV5T]>;
+def : Pat<(debugtrap), (tUDF 254)>, Requires<[IsThumb, NoV5T]>;
+
 def t__brkdiv0 : TI<(outs), (ins), IIC_Br, "__brkdiv0",
                     [(int_arm_undefined 249)]>, Encoding16,
     Requires<[IsThumb, IsWindows]> {
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index b4e28b90747ad431c130cdb622bf9b09d9f4db27..b58730c452f741a06b41893ce9f3e5ee177e7ab8 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -1814,7 +1814,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
                  [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
               Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
@@ -1823,7 +1823,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
                   [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
                                            SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
               Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1836,17 +1836,17 @@ def VMLAH : AHbI<0b11100, 0b00, 0, 0,
                   [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
                                            HPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasFullFP16,UseFPVMLx]>;
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
 def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
           (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
 
 
 def VMLSD : ADbI<0b11100, 0b00, 1, 0,
@@ -1855,7 +1855,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
                  [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
               Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
@@ -1864,7 +1864,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
                   [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
                                            SPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+              Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
               Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1877,17 +1877,17 @@ def VMLSH : AHbI<0b11100, 0b00, 1, 0,
                   [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
                                            HPR:$Sdin))]>,
               RegConstraint<"$Sdin = $Sd">,
-              Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasFullFP16,UseFPVMLx]>;
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
           (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
                   (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1895,7 +1895,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
                   [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
                 RegConstraint<"$Ddin = $Dd">,
-                Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+                Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
                 Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
@@ -1904,7 +1904,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
                   [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
                                            SPR:$Sdin))]>,
                 RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
                 Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1917,29 +1917,29 @@ def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
                   [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
                                            HPR:$Sdin))]>,
                 RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+                Requires<[HasFullFP16,UseFPVMLx]>;
 
 // (-(a * b) - dst) -> -(dst + (a * b))
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
           (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
           (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
           (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 // (-dst - (a * b)) -> -(dst + (a * b))
 def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
           (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
           (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
           (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
                   (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1947,7 +1947,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
                   [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                            (f64 DPR:$Ddin)))]>,
                RegConstraint<"$Ddin = $Dd">,
-               Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+               Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>,
                Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
 
 def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
@@ -1955,7 +1955,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
                   IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
              [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
                          RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+                Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>,
              Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
@@ -1967,17 +1967,17 @@ def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
                   IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
              [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
                          RegConstraint<"$Sdin = $Sd">,
-                Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
+                Requires<[HasFullFP16,UseFPVMLx]>;
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
           (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
           (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
-          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
 def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
           (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
-          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
 
 //===----------------------------------------------------------------------===//
 // Fused FP Multiply-Accumulate Operations.
diff --git a/lib/Target/ARM/ARMMacroFusion.cpp b/lib/Target/ARM/ARMMacroFusion.cpp
index d11fe9d5c502dccfc575c0a00432010eb0072cb8..df1da9d8e4744ac2a4fad78a64dac00fa8d0d5ef 100644
--- a/lib/Target/ARM/ARMMacroFusion.cpp
+++ b/lib/Target/ARM/ARMMacroFusion.cpp
@@ -23,20 +23,13 @@ namespace llvm {
 static bool isAESPair(const MachineInstr *FirstMI,
                       const MachineInstr &SecondMI) {
   // Assume the 1st instr to be a wildcard if it is unspecified.
-  unsigned FirstOpcode =
-      FirstMI ? FirstMI->getOpcode()
-              : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
-  unsigned SecondOpcode = SecondMI.getOpcode();
-
-  switch(SecondOpcode) {
+  switch(SecondMI.getOpcode()) {
   // AES encode.
   case ARM::AESMC :
-    return FirstOpcode == ARM::AESE ||
-           FirstOpcode == ARM::INSTRUCTION_LIST_END;
+    return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESE;
   // AES decode.
   case ARM::AESIMC:
-    return FirstOpcode == ARM::AESD ||
-           FirstOpcode == ARM::INSTRUCTION_LIST_END;
+    return FirstMI == nullptr || FirstMI->getOpcode() == ARM::AESD;
   }
 
   return false;
@@ -46,15 +39,8 @@ static bool isAESPair(const MachineInstr *FirstMI,
 static bool isLiteralsPair(const MachineInstr *FirstMI,
                            const MachineInstr &SecondMI) {
   // Assume the 1st instr to be a wildcard if it is unspecified.
-  unsigned FirstOpcode =
-      FirstMI ? FirstMI->getOpcode()
-              : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
-  unsigned SecondOpcode = SecondMI.getOpcode();
-
-  // 32 bit immediate.
-  if ((FirstOpcode == ARM::INSTRUCTION_LIST_END ||
-       FirstOpcode == ARM::MOVi16) &&
-      SecondOpcode == ARM::MOVTi16)
+  if ((FirstMI == nullptr || FirstMI->getOpcode() == ARM::MOVi16) &&
+      SecondMI.getOpcode() == ARM::MOVTi16)
     return true;
 
   return false;
diff --git a/lib/Target/ARM/ARMParallelDSP.cpp b/lib/Target/ARM/ARMParallelDSP.cpp
index 050a76413cfdb5a383606cb21f30142af1c75a2b..3ab9298c1108ca00b60b5752b3b6619080e62b6d 100644
--- a/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/lib/Target/ARM/ARMParallelDSP.cpp
@@ -71,7 +71,7 @@ namespace {
     virtual ~OpChain() = default;
 
     void SetMemoryLocations() {
-      const auto Size = MemoryLocation::UnknownSize;
+      const auto Size = LocationSize::unknown();
       for (auto *V : AllValues) {
         if (auto *I = dyn_cast<Instruction>(V)) {
           if (I->mayWriteToMemory())
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 1b0d162f7262f95a45f4fc2e27dae16f9a24fff3..f72bb8632eb78a5826e8ce775bfd8939430c3abb 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -400,10 +400,29 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only handle costs of reverse and select shuffles for now.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Select)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind == TTI::SK_Broadcast) {
+    static const CostTblEntry NEONDupTbl[] = {
+        // VDUP handles these cases.
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i8,  1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
 
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    if (const auto *Entry = CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+
+    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  }
   if (Kind == TTI::SK_Reverse) {
     static const CostTblEntry NEONShuffleTbl[] = {
         // Reverse shuffle cost one instruction if we are shuffling within a
@@ -412,6 +431,8 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
         {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
         {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
         {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i8,  1},
 
         {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
         {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
@@ -542,14 +563,17 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
   // vldN/vstN doesn't support vector types of i64/f64 element.
   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 
-  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
+  if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
+      !UseMaskForCond && !UseMaskForGaps) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -562,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 7d14bd7c256bd4e560ce8b28bce4c4a58889fd50..2dd143d48a15c5a661c374f42e567a35091f0c8f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,9 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index ec4b6c9a77778279efacd43040e8b266e56c4714..5720af7d8df6167c046997d2064f74ecaca9142b 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -1222,7 +1222,7 @@ isReMaterializable = 1 in
   // ldd Rd,   P+q
   // ldd Rd+1, P+q+1
   let Constraints = "@earlyclobber $dst" in
-  def LDDWRdPtrQ : Pseudo<(outs DREGS:$dst),
+  def LDDWRdPtrQ : Pseudo<(outs DREGS_WITHOUT_Z_WORKAROUND:$dst),
                           (ins memri:$memri),
                           "lddw\t$dst, $memri",
                           [(set i16:$dst, (load addr:$memri))]>,
diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp
index d171a620760ec8c68c2a82ecd6ed0ecdf82fa264..808a85e459c1bbaee5f703e2811ed6f013e1056c 100644
--- a/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -152,6 +152,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   if (MI.getOpcode() == AVR::FRMIDX) {
     MI.setDesc(TII.get(AVR::MOVWRdRr));
     MI.getOperand(FIOperandNum).ChangeToRegister(AVR::R29R28, false);
+    MI.RemoveOperand(2);
 
     assert(Offset > 0 && "Invalid offset");
 
diff --git a/lib/Target/AVR/AVRRegisterInfo.td b/lib/Target/AVR/AVRRegisterInfo.td
index 8162f12052be5aa22bf962d8368152ae92de28c3..d55252bcac46d1f91d01e1c0cbb02b52c121eddc 100644
--- a/lib/Target/AVR/AVRRegisterInfo.td
+++ b/lib/Target/AVR/AVRRegisterInfo.td
@@ -157,6 +157,26 @@ def DREGS : RegisterClass<"AVR", [i16], 8,
     R9R8, R7R6, R5R4, R3R2, R1R0
   )>;
 
+// The 16-bit DREGS register class, excluding the Z pointer register.
+//
+// This is used by instructions which cause high pointer register
+// contention which leads to an assertion in the register allocator.
+//
+// There is no technical reason why instructions that use this class
+// cannot use Z; it's simply a workaround a regalloc bug.
+//
+// More information can be found in PR39553.
+def DREGS_WITHOUT_Z_WORKAROUND : RegisterClass<"AVR", [i16], 8,
+  (
+    // Return value and arguments.
+    add R25R24, R19R18, R21R20, R23R22,
+    // Scratch registers.
+    R27R26,
+    // Callee saved registers.
+    R29R28, R17R16, R15R14, R13R12, R11R10,
+    R9R8, R7R6, R5R4, R3R2, R1R0
+  )>;
+
 // 16-bit register class for immediate instructions.
 def DLDREGS : RegisterClass<"AVR", [i16], 8,
   (
diff --git a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 496f2befde58be61e76aea0b4a655fdcb052f140..8890fb8adf4de0e85688c23c1e8e567461ae3704 100644
--- a/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -357,8 +357,8 @@ BPFAsmParser::parseOperandAsOperator(OperandVector &Operands) {
   case AsmToken::Plus: {
     if (getLexer().peekTok().is(AsmToken::Integer))
       return MatchOperand_NoMatch;
+    LLVM_FALLTHROUGH;
   }
-  // Fall through.
 
   case AsmToken::Equal:
   case AsmToken::Greater:
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 59377f4f359dd31f840c9e650fd6674a7e02e602..3536aa81fb2156da6baa85fd8ae6c67f46df27d7 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -73,4 +73,3 @@ add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(TargetInfo)
-
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 69e263a425f869a48e8f80dd4a917414b6098e75..8853dd6d550943db24d4bac73132eaa4eeb362b7 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -323,31 +323,27 @@ class Proc<string Name, SchedMachineModel Model,
  : ProcessorModel<Name, Model, Features>;
 
 def : Proc<"generic", HexagonModelV60,
-           [ArchV4, ArchV5, ArchV55, ArchV60,
+           [ArchV5, ArchV55, ArchV60,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
-def : Proc<"hexagonv4",  HexagonModelV4,
-           [ArchV4,
-            FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
-            FeaturePackets, FeatureSmallData]>;
-def : Proc<"hexagonv5",  HexagonModelV4,
-           [ArchV4, ArchV5,
+def : Proc<"hexagonv5",  HexagonModelV5,
+           [ArchV5,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv55", HexagonModelV55,
-           [ArchV4, ArchV5, ArchV55,
+           [ArchV5, ArchV55,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv60", HexagonModelV60,
-           [ArchV4, ArchV5, ArchV55, ArchV60,
+           [ArchV5, ArchV55, ArchV60,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv62", HexagonModelV62,
-           [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62,
+           [ArchV5, ArchV55, ArchV60, ArchV62,
             FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
             FeaturePackets, FeatureSmallData]>;
 def : Proc<"hexagonv65", HexagonModelV65,
-           [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
+           [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
             FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
             FeatureNVS, FeaturePackets, FeatureSmallData]>;
 
diff --git a/lib/Target/Hexagon/HexagonConstExtenders.cpp b/lib/Target/Hexagon/HexagonConstExtenders.cpp
index 6b48384c737f2235693b90141de52d6288f3ea48..ba9f638796eb3c6d7ec2fbcb5a0444330bb9a087 100644
--- a/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -376,7 +376,7 @@ namespace {
     using IndexList = SetVector<unsigned>;
     using ExtenderInit = std::pair<ExtValue, ExtExpr>;
     using AssignmentMap = std::map<ExtenderInit, IndexList>;
-    using LocDefMap = std::map<Loc, IndexList>;
+    using LocDefList = std::vector<std::pair<Loc, IndexList>>;
 
     const HexagonInstrInfo *HII = nullptr;
     const HexagonRegisterInfo *HRI = nullptr;
@@ -399,7 +399,7 @@ namespace {
     void assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
                      AssignmentMap &IMap);
     void calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
-                            LocDefMap &Defs);
+                            LocDefList &Defs);
     Register insertInitializer(Loc DefL, const ExtenderInit &ExtI);
     bool replaceInstrExact(const ExtDesc &ED, Register ExtR);
     bool replaceInstrExpr(const ExtDesc &ED, const ExtenderInit &ExtI,
@@ -731,7 +731,12 @@ bool HCE::ExtRoot::operator< (const HCE::ExtRoot &ER) const {
     case MachineOperand::MO_ExternalSymbol:
       return StringRef(V.SymbolName) < StringRef(ER.V.SymbolName);
     case MachineOperand::MO_GlobalAddress:
-      return V.GV->getGUID() < ER.V.GV->getGUID();
+      // Do not use GUIDs, since they depend on the source path. Moving the
+      // source file to a different directory could cause different GUID
+      // values for a pair of given symbols. These symbols could then compare
+      // "less" in one directory, but "greater" in another.
+      assert(!V.GV->getName().empty() && !ER.V.GV->getName().empty());
+      return V.GV->getName() < ER.V.GV->getName();
     case MachineOperand::MO_BlockAddress: {
       const BasicBlock *ThisB = V.BA->getBasicBlock();
       const BasicBlock *OtherB = ER.V.BA->getBasicBlock();
@@ -783,6 +788,7 @@ HCE::ExtValue::operator MachineOperand() const {
       return MachineOperand::CreateCPI(V.ImmVal, Offset, TF);
     case MachineOperand::MO_JumpTableIndex:
       assert(Offset == 0);
+      return MachineOperand::CreateJTI(V.ImmVal, TF);
     default:
       llvm_unreachable("Unhandled kind");
  }
@@ -1202,6 +1208,7 @@ void HCE::recordExtender(MachineInstr &MI, unsigned OpNum) {
       case Hexagon::S4_subaddi:       // (__: ## - Rs<<0)
         ED.Expr.Rs = MI.getOperand(OpNum+1);
         ED.Expr.Neg = true;
+        break;
       default:                        // (__: ## + __<<_)
         break;
     }
@@ -1236,9 +1243,13 @@ void HCE::collectInstr(MachineInstr &MI) {
 
 void HCE::collect(MachineFunction &MF) {
   Extenders.clear();
-  for (MachineBasicBlock &MBB : MF)
+  for (MachineBasicBlock &MBB : MF) {
+    // Skip unreachable blocks.
+    if (MBB.getNumber() == -1)
+      continue;
     for (MachineInstr &MI : MBB)
       collectInstr(MI);
+  }
 }
 
 void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
@@ -1463,7 +1474,7 @@ void HCE::assignInits(const ExtRoot &ER, unsigned Begin, unsigned End,
 }
 
 void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
-      LocDefMap &Defs) {
+      LocDefList &Defs) {
   if (Refs.empty())
     return;
 
@@ -1510,7 +1521,7 @@ void HCE::calculatePlacement(const ExtenderInit &ExtI, const IndexList &Refs,
     It = DomB->getFirstTerminator();
   }
   Loc DefLoc(DomB, It);
-  Defs.emplace(DefLoc, Refs);
+  Defs.emplace_back(DefLoc, Refs);
 }
 
 HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
@@ -1880,7 +1891,7 @@ bool HCE::replaceInstr(unsigned Idx, Register ExtR, const ExtenderInit &ExtI) {
 }
 
 bool HCE::replaceExtenders(const AssignmentMap &IMap) {
-  LocDefMap Defs;
+  LocDefList Defs;
   bool Changed = false;
 
   for (const std::pair<ExtenderInit,IndexList> &P : IMap) {
@@ -1947,8 +1958,23 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
   AssignmentMap IMap;
 
   collect(MF);
-  llvm::sort(Extenders, [](const ExtDesc &A, const ExtDesc &B) {
-    return ExtValue(A) < ExtValue(B);
+  llvm::sort(Extenders, [this](const ExtDesc &A, const ExtDesc &B) {
+    ExtValue VA(A), VB(B);
+    if (VA != VB)
+      return VA < VB;
+    const MachineInstr *MA = A.UseMI;
+    const MachineInstr *MB = B.UseMI;
+    if (MA == MB) {
+      // If it's the same instruction, compare operand numbers.
+      return A.OpNum < B.OpNum;
+    }
+
+    const MachineBasicBlock *BA = MA->getParent();
+    const MachineBasicBlock *BB = MB->getParent();
+    assert(BA->getNumber() != -1 && BB->getNumber() != -1);
+    if (BA != BB)
+      return BA->getNumber() < BB->getNumber();
+    return MDT->dominates(MA, MB);
   });
 
   bool Changed = false;
diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 8f22a71dc1f30eb5dbbb2f0c323b97a284057fd8..fa192391313e20ffae860970b88c39fee888f538 100644
--- a/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -2463,6 +2463,7 @@ APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX,
     case Hexagon::A4_cmpheqi:    // s8
     case Hexagon::C4_cmpneqi:   // s8
       Signed = true;
+      break;
     case Hexagon::A4_cmpbeqi:    // u8
       break;
     case Hexagon::C2_cmpgtui:      // u9
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index fccde96d8a32c4073fd2cdb282b4c717afe3fd79..28965b69e284161d48b4f87421573c7f6ffca617 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -555,8 +555,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
     if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
       continue;
 
-    // Check that the two instructions are combinable. V4 allows more
-    // instructions to be merged into a combine.
+    // Check that the two instructions are combinable.
     // The order matters because in a A2_tfrsi we might can encode a int8 as
     // the hi reg operand but only a uint6 as the low reg operand.
     if ((IsI2LowReg && !areCombinableOperations(TRI, I1, *I2, AllowC64)) ||
diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h
index dc75f8f634008b99d600b7f6e6a8167711224f4b..1bcf40220619f0e2ece0da2e36122c01139937c5 100644
--- a/lib/Target/Hexagon/HexagonDepArch.h
+++ b/lib/Target/Hexagon/HexagonDepArch.h
@@ -15,7 +15,7 @@
 #define HEXAGON_DEP_ARCH_H
 namespace llvm {
 namespace Hexagon {
-enum class ArchEnum { V4,V5,V55,V60,V62,V65 };
+enum class ArchEnum { NoArch,Generic,V5,V55,V60,V62,V65 };
 } // namespace Hexagon
 } // namespace llvm;
 #endif // HEXAGON_DEP_ARCH_H
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
index 3594379aa841fcd090c5da7e3033931e725a4fbc..ce7956926101b92fe369ddf0115e99bfb4e4e6b4 100644
--- a/lib/Target/Hexagon/HexagonDepArch.td
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -18,7 +18,4 @@ def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "Hexagon::ArchEnum::V
 def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">;
 def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
 def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">;
-def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "Hexagon::ArchEnum::V4", "Enable Hexagon V4 architecture">;
-def HasV4 : Predicate<"HST->hasV4Ops()">, AssemblerPredicate<"ArchV4">;
 def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">;
-def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">;
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
index 5c9ed271cea009766942038173f9f7084665d6a0..0b5efda933da3f27d9b589f9daa2dda60dd52543 100644
--- a/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -991,7 +991,7 @@ def A2_roundsat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = round($Rss32):sat",
-tc_c2f7d806, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_c2f7d806, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000110;
 let hasNewValue = 1;
@@ -3314,7 +3314,7 @@ def A5_vaddhubs : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rd32 = vaddhub($Rss32,$Rtt32):sat",
-tc_2b6f77c6, TypeS_3op>, Enc_d2216a, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_3op>, Enc_d2216a {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000001010;
@@ -4059,7 +4059,7 @@ def F2_conv_d2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_d2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4069,7 +4069,7 @@ def F2_conv_d2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_d2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000010;
 let hasNewValue = 1;
@@ -4081,7 +4081,7 @@ def F2_conv_df2d : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2d($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4091,7 +4091,7 @@ def F2_conv_df2d_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2d($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4101,7 +4101,7 @@ def F2_conv_df2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000000;
 let hasNewValue = 1;
@@ -4113,7 +4113,7 @@ def F2_conv_df2ud : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2ud($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4123,7 +4123,7 @@ def F2_conv_df2ud_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_df2ud($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000111;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4133,7 +4133,7 @@ def F2_conv_df2uw : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2uw($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -4145,7 +4145,7 @@ def F2_conv_df2uw_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2uw($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000101;
 let hasNewValue = 1;
@@ -4157,7 +4157,7 @@ def F2_conv_df2w : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2w($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000100;
 let hasNewValue = 1;
@@ -4169,7 +4169,7 @@ def F2_conv_df2w_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_df2w($Rss32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000111;
 let hasNewValue = 1;
@@ -4181,7 +4181,7 @@ def F2_conv_sf2d : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2d($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000100;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4191,7 +4191,7 @@ def F2_conv_sf2d_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2d($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000110;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4201,7 +4201,7 @@ def F2_conv_sf2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4211,7 +4211,7 @@ def F2_conv_sf2ud : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2ud($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4221,7 +4221,7 @@ def F2_conv_sf2ud_chop : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_sf2ud($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000101;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4231,7 +4231,7 @@ def F2_conv_sf2uw : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2uw($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011011;
 let hasNewValue = 1;
@@ -4243,7 +4243,7 @@ def F2_conv_sf2uw_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2uw($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001011011;
 let hasNewValue = 1;
@@ -4255,7 +4255,7 @@ def F2_conv_sf2w : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2w($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011100;
 let hasNewValue = 1;
@@ -4267,7 +4267,7 @@ def F2_conv_sf2w_chop : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_sf2w($Rs32):chop",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001011100;
 let hasNewValue = 1;
@@ -4279,7 +4279,7 @@ def F2_conv_ud2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32),
 "$Rdd32 = convert_ud2df($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_b9c5fb {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000000111;
 let isFP = 1;
@@ -4289,7 +4289,7 @@ def F2_conv_ud2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = convert_ud2sf($Rss32)",
-tc_f3eaa14b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10001000001;
 let hasNewValue = 1;
@@ -4301,7 +4301,7 @@ def F2_conv_uw2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_uw2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000001;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4311,7 +4311,7 @@ def F2_conv_uw2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_uw2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011001;
 let hasNewValue = 1;
@@ -4323,7 +4323,7 @@ def F2_conv_w2df : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins IntRegs:$Rs32),
 "$Rdd32 = convert_w2df($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_3a3d62, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_3a3d62 {
 let Inst{13-5} = 0b000000010;
 let Inst{31-21} = 0b10000100100;
 let isFP = 1;
@@ -4333,7 +4333,7 @@ def F2_conv_w2sf : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = convert_w2sf($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011010;
 let hasNewValue = 1;
@@ -4345,7 +4345,7 @@ def F2_dfclass : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
 "$Pd4 = dfclass($Rss32,#$Ii)",
-tc_7a830544, TypeALU64>, Enc_1f19b5, Requires<[HasV5]> {
+tc_7a830544, TypeALU64>, Enc_1f19b5 {
 let Inst{4-2} = 0b100;
 let Inst{13-10} = 0b0000;
 let Inst{31-21} = 0b11011100100;
@@ -4356,7 +4356,7 @@ def F2_dfcmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4368,7 +4368,7 @@ def F2_dfcmpge : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b010000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4380,7 +4380,7 @@ def F2_dfcmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4392,7 +4392,7 @@ def F2_dfcmpuo : HInst<
 (outs PredRegs:$Pd4),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
-tc_1e856f58, TypeALU64>, Enc_fcf7a7, Requires<[HasV5]> {
+tc_1e856f58, TypeALU64>, Enc_fcf7a7 {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11010010111;
@@ -4404,7 +4404,7 @@ def F2_dfimm_n : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u10_0Imm:$Ii),
 "$Rdd32 = dfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101100101;
 let prefersSlot3 = 1;
@@ -4413,7 +4413,7 @@ def F2_dfimm_p : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins u10_0Imm:$Ii),
 "$Rdd32 = dfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_e6c957, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_e6c957 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101100100;
 let prefersSlot3 = 1;
@@ -4422,7 +4422,7 @@ def F2_sfadd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfadd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011000;
@@ -4436,7 +4436,7 @@ def F2_sfclass : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, u5_0Imm:$Ii),
 "$Pd4 = sfclass($Rs32,#$Ii)",
-tc_7a830544, TypeS_2op>, Enc_83ee64, Requires<[HasV5]> {
+tc_7a830544, TypeS_2op>, Enc_83ee64 {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10000101111;
@@ -4447,7 +4447,7 @@ def F2_sfcmpeq : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.eq($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b011000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4459,7 +4459,7 @@ def F2_sfcmpge : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.ge($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4471,7 +4471,7 @@ def F2_sfcmpgt : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.gt($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b100000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4483,7 +4483,7 @@ def F2_sfcmpuo : HInst<
 (outs PredRegs:$Pd4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Pd4 = sfcmp.uo($Rs32,$Rt32)",
-tc_1e856f58, TypeS_3op>, Enc_c2b48e, Requires<[HasV5]> {
+tc_1e856f58, TypeS_3op>, Enc_c2b48e {
 let Inst{7-2} = 0b001000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000111111;
@@ -4495,7 +4495,7 @@ def F2_sffixupd : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sffixupd($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011110;
@@ -4507,7 +4507,7 @@ def F2_sffixupn : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sffixupn($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011110;
@@ -4519,7 +4519,7 @@ def F2_sffixupr : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32),
 "$Rd32 = sffixupr($Rs32)",
-tc_f3eaa14b, TypeS_2op>, Enc_5e2823, Requires<[HasV5]> {
+tc_f3eaa14b, TypeS_2op>, Enc_5e2823 {
 let Inst{13-5} = 0b000000000;
 let Inst{31-21} = 0b10001011101;
 let hasNewValue = 1;
@@ -4530,7 +4530,7 @@ def F2_sffma : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b100;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4544,7 +4544,7 @@ def F2_sffma_lib : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 += sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b110;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4558,7 +4558,7 @@ def F2_sffma_sc : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
 "$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
-tc_038a1342, TypeM>, Enc_437f33, Requires<[HasV5]> {
+tc_038a1342, TypeM>, Enc_437f33 {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111011;
@@ -4572,7 +4572,7 @@ def F2_sffms : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= sfmpy($Rs32,$Rt32)",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4586,7 +4586,7 @@ def F2_sffms_lib : HInst<
 (outs IntRegs:$Rx32),
 (ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rx32 -= sfmpy($Rs32,$Rt32):lib",
-tc_d580173f, TypeM>, Enc_2ae154, Requires<[HasV5]> {
+tc_d580173f, TypeM>, Enc_2ae154 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101111000;
@@ -4600,7 +4600,7 @@ def F2_sfimm_n : HInst<
 (outs IntRegs:$Rd32),
 (ins u10_0Imm:$Ii),
 "$Rd32 = sfmake(#$Ii):neg",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101011001;
 let hasNewValue = 1;
@@ -4611,7 +4611,7 @@ def F2_sfimm_p : HInst<
 (outs IntRegs:$Rd32),
 (ins u10_0Imm:$Ii),
 "$Rd32 = sfmake(#$Ii):pos",
-tc_234a11a5, TypeALU64>, Enc_6c9440, Requires<[HasV5]> {
+tc_234a11a5, TypeALU64>, Enc_6c9440 {
 let Inst{20-16} = 0b00000;
 let Inst{31-22} = 0b1101011000;
 let hasNewValue = 1;
@@ -4622,7 +4622,7 @@ def F2_sfinvsqrta : HInst<
 (outs IntRegs:$Rd32, PredRegs:$Pe4),
 (ins IntRegs:$Rs32),
 "$Rd32,$Pe4 = sfinvsqrta($Rs32)",
-tc_4d99bca9, TypeS_2op>, Enc_890909, Requires<[HasV5]> {
+tc_4d99bca9, TypeS_2op>, Enc_890909 {
 let Inst{13-7} = 0b0000000;
 let Inst{31-21} = 0b10001011111;
 let hasNewValue = 1;
@@ -4634,7 +4634,7 @@ def F2_sfmax : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmax($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011100;
@@ -4648,7 +4648,7 @@ def F2_sfmin : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmin($Rs32,$Rt32)",
-tc_976ddc4f, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_976ddc4f, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011100;
@@ -4662,7 +4662,7 @@ def F2_sfmpy : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfmpy($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011010;
@@ -4676,7 +4676,7 @@ def F2_sfrecipa : HInst<
 (outs IntRegs:$Rd32, PredRegs:$Pe4),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
-tc_9c00ce8d, TypeM>, Enc_a94f3b, Requires<[HasV5]> {
+tc_9c00ce8d, TypeM>, Enc_a94f3b {
 let Inst{7-7} = 0b1;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011111;
@@ -4689,7 +4689,7 @@ def F2_sfsub : HInst<
 (outs IntRegs:$Rd32),
 (ins IntRegs:$Rs32, IntRegs:$Rt32),
 "$Rd32 = sfsub($Rs32,$Rt32)",
-tc_6792d5ff, TypeM>, Enc_5ab2be, Requires<[HasV5]> {
+tc_6792d5ff, TypeM>, Enc_5ab2be {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101011000;
@@ -16981,7 +16981,7 @@ def M4_cmpyi_whc : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b101;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -17007,7 +17007,7 @@ def M4_cmpyr_whc : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, IntRegs:$Rt32),
 "$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_8fd5f294, TypeS_3op>, Enc_3d5b28, Requires<[HasV5]> {
+tc_8fd5f294, TypeS_3op>, Enc_3d5b28 {
 let Inst{7-5} = 0b111;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11000101000;
@@ -17360,7 +17360,7 @@ def M5_vdmacbsu : HInst<
 (outs DoubleRegs:$Rxx32),
 (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
-tc_e913dc32, TypeM>, Enc_88c16c, Requires<[HasV5]> {
+tc_e913dc32, TypeM>, Enc_88c16c {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101010001;
@@ -17372,7 +17372,7 @@ def M5_vdmpybsu : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
 "$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
-tc_8fd5f294, TypeM>, Enc_a56825, Requires<[HasV5]> {
+tc_8fd5f294, TypeM>, Enc_a56825 {
 let Inst{7-5} = 0b001;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b11101000101;
@@ -18207,7 +18207,7 @@ def S2_asr_i_p_rnd : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asr($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Enc_5eac98, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_5eac98 {
 let Inst{7-5} = 0b111;
 let Inst{31-21} = 0b10000000110;
 let prefersSlot3 = 1;
@@ -18216,7 +18216,7 @@ def S2_asr_i_p_rnd_goodsyntax : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
 "$Rdd32 = asrrnd($Rss32,#$Ii)",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op> {
 let isPseudo = 1;
 }
 def S2_asr_i_r : HInst<
@@ -25151,7 +25151,7 @@ def S5_asrhub_rnd_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146 {
 let Inst{7-5} = 0b100;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10001000011;
@@ -25164,7 +25164,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op> {
 let hasNewValue = 1;
 let opNewValue = 0;
 let isPseudo = 1;
@@ -25173,7 +25173,7 @@ def S5_asrhub_sat : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rd32 = vasrhub($Rss32,#$Ii):sat",
-tc_2b6f77c6, TypeS_2op>, Enc_11a146, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_11a146 {
 let Inst{7-5} = 0b101;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10001000011;
@@ -25186,7 +25186,7 @@ def S5_popcountp : HInst<
 (outs IntRegs:$Rd32),
 (ins DoubleRegs:$Rss32),
 "$Rd32 = popcount($Rss32)",
-tc_00afc57e, TypeS_2op>, Enc_90cd8b, Requires<[HasV5]> {
+tc_00afc57e, TypeS_2op>, Enc_90cd8b {
 let Inst{13-5} = 0b000000011;
 let Inst{31-21} = 0b10001000011;
 let hasNewValue = 1;
@@ -25197,7 +25197,7 @@ def S5_vasrhrnd : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii):raw",
-tc_2b6f77c6, TypeS_2op>, Enc_12b6e9, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op>, Enc_12b6e9 {
 let Inst{7-5} = 0b000;
 let Inst{13-12} = 0b00;
 let Inst{31-21} = 0b10000000001;
@@ -25207,7 +25207,7 @@ def S5_vasrhrnd_goodsyntax : HInst<
 (outs DoubleRegs:$Rdd32),
 (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
 "$Rdd32 = vasrh($Rss32,#$Ii):rnd",
-tc_2b6f77c6, TypeS_2op>, Requires<[HasV5]> {
+tc_2b6f77c6, TypeS_2op> {
 let isPseudo = 1;
 }
 def S6_allocframe_to_raw : HInst<
@@ -37007,7 +37007,7 @@ def Y5_l2fetch : HInst<
 (outs),
 (ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
 "l2fetch($Rs32,$Rtt32)",
-tc_daa058fa, TypeST>, Enc_e6abcf, Requires<[HasV5]> {
+tc_daa058fa, TypeST>, Enc_e6abcf {
 let Inst{7-0} = 0b00000000;
 let Inst{13-13} = 0b0;
 let Inst{31-21} = 0b10100110100;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a2598244dab9e9d38ca5035d1f2bbaa76c86d07a..2f3e18c99c54465ec8280b307f864e873b0880bd 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -1708,7 +1708,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
   // register that is entirely undefined.
   LivePhysRegs LPR(HRI);
   LPR.addLiveIns(B);
-  SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
+  SmallVector<std::pair<MCPhysReg, const MachineOperand*>,2> Clobbers;
   for (auto R = B.begin(); R != It; ++R) {
     Clobbers.clear();
     LPR.stepForward(*R, Clobbers);
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9a66aece5798e60ce9841fbf0b29f71f24ccc16b..470b05bda4c6bc9069755410455caed0459c9558 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1547,6 +1547,7 @@ bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits,
         return true;
       }
     }
+    break;
   }
   default:
     break;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index aad457fc0518341842766851af2beede203cbddf..755a8539be7f1d12b65356b6c37990f0eb30be84 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1228,7 +1228,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
                                              const HexagonSubtarget &ST)
     : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)),
       Subtarget(ST) {
-  bool IsV4 = !Subtarget.hasV5Ops();
   auto &HRI = *Subtarget.getRegisterInfo();
 
   setPrefLoopAlignment(4);
@@ -1270,10 +1269,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
   addRegisterClass(MVT::v2i32, &Hexagon::DoubleRegsRegClass);
 
-  if (Subtarget.hasV5Ops()) {
-    addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
-    addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
-  }
+  addRegisterClass(MVT::f32, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass);
 
   //
   // Handling of scalar operations.
@@ -1351,8 +1348,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CTTZ, MVT::i8,  Promote);
   setOperationAction(ISD::CTTZ, MVT::i16, Promote);
 
-  // In V5, popcount can count # of 1s in i64 but returns i32.
-  // On V4 it will be expanded (set later).
+  // Popcount can count # of 1s in i64 but returns i32.
   setOperationAction(ISD::CTPOP, MVT::i8,  Promote);
   setOperationAction(ISD::CTPOP, MVT::i16, Promote);
   setOperationAction(ISD::CTPOP, MVT::i32, Promote);
@@ -1515,57 +1511,28 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ROTL, MVT::i32, Custom);
     setOperationAction(ISD::ROTL, MVT::i64, Custom);
   }
-  if (Subtarget.hasV5Ops()) {
-    setOperationAction(ISD::FMA,  MVT::f64, Expand);
-    setOperationAction(ISD::FADD, MVT::f64, Expand);
-    setOperationAction(ISD::FSUB, MVT::f64, Expand);
-    setOperationAction(ISD::FMUL, MVT::f64, Expand);
-
-    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
-
-    setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
-    setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
-    setOperationAction(ISD::FP_TO_SINT, MVT::i1,  Promote);
-    setOperationAction(ISD::FP_TO_SINT, MVT::i8,  Promote);
-    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i1,  Promote);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
-  } else { // V4
-    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
-    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Expand);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
-    setOperationAction(ISD::FP_TO_SINT, MVT::f64, Expand);
-    setOperationAction(ISD::FP_TO_SINT, MVT::f32, Expand);
-    setOperationAction(ISD::FP_EXTEND,  MVT::f32, Expand);
-    setOperationAction(ISD::FP_ROUND,   MVT::f64, Expand);
-    setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
-
-    setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
-    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
-    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
-
-    // Expand these operations for both f32 and f64:
-    for (unsigned FPExpOpV4 :
-         {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FABS, ISD::FNEG, ISD::FMA}) {
-      setOperationAction(FPExpOpV4, MVT::f32, Expand);
-      setOperationAction(FPExpOpV4, MVT::f64, Expand);
-    }
 
-    for (ISD::CondCode FPExpCCV4 :
-         {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE,
-          ISD::SETUO,  ISD::SETO}) {
-      setCondCodeAction(FPExpCCV4, MVT::f32, Expand);
-      setCondCodeAction(FPExpCCV4, MVT::f64, Expand);
-    }
-  }
+  // V5+.
+  setOperationAction(ISD::FMA,  MVT::f64, Expand);
+  setOperationAction(ISD::FADD, MVT::f64, Expand);
+  setOperationAction(ISD::FSUB, MVT::f64, Expand);
+  setOperationAction(ISD::FMUL, MVT::f64, Expand);
+
+  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i1,  Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i8,  Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i1,  Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
 
   // Handling of indexed loads/stores: default is "expand".
   //
@@ -1601,42 +1568,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setLibcallName(RTLIB::FPTOSINT_F32_I128, "__hexagon_fixsfti");
   setLibcallName(RTLIB::FPTOSINT_F64_I128, "__hexagon_fixdfti");
 
-  if (IsV4) {
-    // Handle single-precision floating point operations on V4.
-    if (FastMath) {
-      setLibcallName(RTLIB::ADD_F32, "__hexagon_fast_addsf3");
-      setLibcallName(RTLIB::SUB_F32, "__hexagon_fast_subsf3");
-      setLibcallName(RTLIB::MUL_F32, "__hexagon_fast_mulsf3");
-      setLibcallName(RTLIB::OGT_F32, "__hexagon_fast_gtsf2");
-      setLibcallName(RTLIB::OLT_F32, "__hexagon_fast_ltsf2");
-      // Double-precision compares.
-      setLibcallName(RTLIB::OGT_F64, "__hexagon_fast_gtdf2");
-      setLibcallName(RTLIB::OLT_F64, "__hexagon_fast_ltdf2");
-    } else {
-      setLibcallName(RTLIB::ADD_F32, "__hexagon_addsf3");
-      setLibcallName(RTLIB::SUB_F32, "__hexagon_subsf3");
-      setLibcallName(RTLIB::MUL_F32, "__hexagon_mulsf3");
-      setLibcallName(RTLIB::OGT_F32, "__hexagon_gtsf2");
-      setLibcallName(RTLIB::OLT_F32, "__hexagon_ltsf2");
-      // Double-precision compares.
-      setLibcallName(RTLIB::OGT_F64, "__hexagon_gtdf2");
-      setLibcallName(RTLIB::OLT_F64, "__hexagon_ltdf2");
-    }
-  }
-
   // This is the only fast library function for sqrtd.
   if (FastMath)
     setLibcallName(RTLIB::SQRT_F64, "__hexagon_fast2_sqrtdf2");
 
   // Prefix is: nothing  for "slow-math",
-  //            "fast2_" for V4 fast-math and V5+ fast-math double-precision
+  //            "fast2_" for V5+ fast-math double-precision
   // (actually, keep fast-math and fast-math2 separate for now)
   if (FastMath) {
     setLibcallName(RTLIB::ADD_F64, "__hexagon_fast_adddf3");
     setLibcallName(RTLIB::SUB_F64, "__hexagon_fast_subdf3");
     setLibcallName(RTLIB::MUL_F64, "__hexagon_fast_muldf3");
     setLibcallName(RTLIB::DIV_F64, "__hexagon_fast_divdf3");
-    // Calling __hexagon_fast2_divsf3 with fast-math on V5 (ok).
     setLibcallName(RTLIB::DIV_F32, "__hexagon_fast_divsf3");
   } else {
     setLibcallName(RTLIB::ADD_F64, "__hexagon_adddf3");
@@ -1646,44 +1589,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setLibcallName(RTLIB::DIV_F32, "__hexagon_divsf3");
   }
 
-  if (Subtarget.hasV5Ops()) {
-    if (FastMath)
-      setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
-    else
-      setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
-  } else {
-    // V4
-    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__hexagon_floatsisf");
-    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__hexagon_floatsidf");
-    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__hexagon_floatdisf");
-    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__hexagon_floatdidf");
-    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__hexagon_floatunsisf");
-    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__hexagon_floatunsidf");
-    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__hexagon_floatundisf");
-    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__hexagon_floatundidf");
-    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__hexagon_fixunssfsi");
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__hexagon_fixunssfdi");
-    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__hexagon_fixunsdfsi");
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__hexagon_fixunsdfdi");
-    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__hexagon_fixsfsi");
-    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__hexagon_fixsfdi");
-    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__hexagon_fixdfsi");
-    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__hexagon_fixdfdi");
-    setLibcallName(RTLIB::FPEXT_F32_F64,    "__hexagon_extendsfdf2");
-    setLibcallName(RTLIB::FPROUND_F64_F32,  "__hexagon_truncdfsf2");
-    setLibcallName(RTLIB::OEQ_F32, "__hexagon_eqsf2");
-    setLibcallName(RTLIB::OEQ_F64, "__hexagon_eqdf2");
-    setLibcallName(RTLIB::OGE_F32, "__hexagon_gesf2");
-    setLibcallName(RTLIB::OGE_F64, "__hexagon_gedf2");
-    setLibcallName(RTLIB::OLE_F32, "__hexagon_lesf2");
-    setLibcallName(RTLIB::OLE_F64, "__hexagon_ledf2");
-    setLibcallName(RTLIB::UNE_F32, "__hexagon_nesf2");
-    setLibcallName(RTLIB::UNE_F64, "__hexagon_nedf2");
-    setLibcallName(RTLIB::UO_F32,  "__hexagon_unordsf2");
-    setLibcallName(RTLIB::UO_F64,  "__hexagon_unorddf2");
-    setLibcallName(RTLIB::O_F32,   "__hexagon_unordsf2");
-    setLibcallName(RTLIB::O_F64,   "__hexagon_unorddf2");
-  }
+  if (FastMath)
+    setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf");
+  else
+    setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
 
   // These cause problems when the shift amount is non-constant.
   setLibcallName(RTLIB::SHL_I128, nullptr);
@@ -1925,12 +1834,12 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,
 }
 
 TargetLoweringBase::LegalizeTypeAction
-HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
+HexagonTargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() == 1)
     return TargetLoweringBase::TypeScalarizeVector;
 
   // Always widen vectors of i1.
-  MVT ElemTy = VT.getSimpleVT().getVectorElementType();
+  MVT ElemTy = VT.getVectorElementType();
   if (ElemTy == MVT::i1)
     return TargetLoweringBase::TypeWidenVector;
 
@@ -3007,7 +2916,7 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 bool HexagonTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-  return Subtarget.hasV5Ops();
+  return true;
 }
 
 /// isLegalAddressingMode - Return true if the addressing mode represented by
@@ -3171,6 +3080,21 @@ HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
   return TargetLowering::findRepresentativeClass(TRI, VT);
 }
 
+bool HexagonTargetLowering::shouldReduceLoadWidth(SDNode *Load,
+      ISD::LoadExtType ExtTy, EVT NewVT) const {
+  auto *L = cast<LoadSDNode>(Load);
+  std::pair<SDValue,int> BO = getBaseAndOffset(L->getBasePtr());
+  // Small-data object, do not shrink.
+  if (BO.first.getOpcode() == HexagonISD::CONST32_GP)
+    return false;
+  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(BO.first)) {
+    auto &HTM = static_cast<const HexagonTargetMachine&>(getTargetMachine());
+    const auto *GO = dyn_cast_or_null<const GlobalObject>(GA->getGlobal());
+    return !GO || !HTM.getObjFileLowering()->isGlobalInSmallSection(GO, HTM);
+  }
+  return true;
+}
+
 Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
       AtomicOrdering Ord) const {
   BasicBlock *BB = Builder.GetInsertBlock();
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 8efb3c9cda5075396439ebcbf58ad61d9246bccd..265c37e6ae61ca065472882823575cd27e4345a5 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -141,7 +141,7 @@ namespace HexagonISD {
         unsigned DefinedValues) const override;
 
     bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
-    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
         const override;
 
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -304,6 +304,9 @@ namespace HexagonISD {
     SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG)
                                      const override;
 
+    bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+                               EVT NewVT) const override;
+
     // Handling of atomic RMW instructions.
     Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
         AtomicOrdering Ord) const override;
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index b931f606ee5535f200a8125fcdd759d040e00bdd..a6400b5d826685bb00d781acd9e66614cb533005 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -1426,7 +1426,8 @@ SDValue
 HexagonTargetLowering::LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const {
   // Sign- and zero-extends are legal.
   assert(Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG);
-  return DAG.getZeroExtendVectorInReg(Op.getOperand(0), SDLoc(Op), ty(Op));
+  return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(Op), ty(Op),
+                     Op.getOperand(0));
 }
 
 SDValue
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 1bb3bc1ea31bb44679752284413c0d3ef7a31322..a1082e7a77760db4d87461ca5a1de264e61737fe 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -194,8 +194,6 @@ class HInst<dag outs, dag ins, string asmstr, InstrItinClass itin, IType type> :
 //                         Instruction Classes Definitions +
 //===----------------------------------------------------------------------===//
 
-// LD Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
 let mayLoad = 1 in
 class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
@@ -205,9 +203,6 @@ class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
 
-// ST Instruction Class in V2/V3 can take SLOT0 only.
-// ST Instruction Class in V4    can take SLOT0 & SLOT1.
-// Definition of the instruction class CHANGED from V2/V3 to V4.
 let mayStore = 1 in
 class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
@@ -235,15 +230,6 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 //                         Instruction Classes Definitions -
 //===----------------------------------------------------------------------===//
 
-//===----------------------------------------------------------------------===//
-// V4 Instruction Format Definitions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrFormatsV4.td"
-
-//===----------------------------------------------------------------------===//
-// V60+ Instruction Format Definitions +
-//===----------------------------------------------------------------------===//
-
+include "HexagonInstrFormatsV5.td"
 include "HexagonInstrFormatsV60.td"
 include "HexagonInstrFormatsV65.td"
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
similarity index 95%
rename from lib/Target/Hexagon/HexagonInstrFormatsV4.td
rename to lib/Target/Hexagon/HexagonInstrFormatsV5.td
index c5fa25995212d08c43079e2092c864bf72f4ee80..482688ab90aa72f6c940a28af61659e309c94d12 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV5.td
@@ -1,4 +1,4 @@
-//==- HexagonInstrFormatsV4.td - Hexagon Instruction Formats --*- tablegen -==//
+//==- HexagonInstrFormatsV5.td - Hexagon Instruction Formats --*- tablegen -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file describes the Hexagon V4 instruction classes in TableGen format.
+// This file describes the Hexagon V5 instruction classes in TableGen format.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index b25e316709c5e388fac03bc8dc182ed8fdd24ff5..206e74983d203aba49b4bf3f3f2ab95d57d2751c 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1398,7 +1398,5 @@ def: T_R_pat<Y2_dczeroa,     int_hexagon_Y2_dczeroa>;
 def: T_RR_pat<Y4_l2fetch,    int_hexagon_Y4_l2fetch>;
 def: T_RP_pat<Y5_l2fetch,    int_hexagon_Y5_l2fetch>;
 
-include "HexagonIntrinsicsV3.td"
-include "HexagonIntrinsicsV4.td"
 include "HexagonIntrinsicsV5.td"
 include "HexagonIntrinsicsV60.td"
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV3.td b/lib/Target/Hexagon/HexagonIntrinsicsV3.td
deleted file mode 100644
index 6152cb098825c52a1f4309aa9a28df804ec3f277..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonIntrinsicsV3.td
+++ /dev/null
@@ -1,27 +0,0 @@
-//=- HexagonIntrinsicsV3.td - Target Description for Hexagon -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V3 Compiler Intrinsics in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-// Vector reduce complex multiply real or imaginary
-def : T_PR_pat <M2_vrcmpys_s1,     int_hexagon_M2_vrcmpys_s1>;
-def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
-def : T_PR_pat <M2_vrcmpys_s1rp,   int_hexagon_M2_vrcmpys_s1rp>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
-
-def: T_RP_pat<A2_addsp,   int_hexagon_A2_addsp>;
-def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
-def: T_PP_pat<A2_minp,    int_hexagon_A2_minp>;
-def: T_PP_pat<A2_minup,   int_hexagon_A2_minup>;
-def: T_PP_pat<A2_maxp,    int_hexagon_A2_maxp>;
-def: T_PP_pat<A2_maxup,   int_hexagon_A2_maxup>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV4.td b/lib/Target/Hexagon/HexagonIntrinsicsV4.td
deleted file mode 100644
index 2affe531515d08106acf1709b5ee8bdf30698a38..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonIntrinsicsV4.td
+++ /dev/null
@@ -1,305 +0,0 @@
-//===- HexagonIntrinsicsV4.td - V4 Instruction intrinsics --*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This is populated based on the following specs:
-// Hexagon V4 Architecture Extensions
-// Application-Level Specification
-// 80-V9418-12 Rev. A
-// June 15, 2010
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
-def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
-def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
-
-//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
-def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
-def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
-def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
-// Rxx[^]=vpmpyh(Rs,Rt)
-def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
-// Rxx^=pmpyw(Rs,Rt)
-def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
-
-//Rxx^=asr(Rss,Rt)
-def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
-//Rxx^=asl(Rss,Rt)
-def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
-//Rxx^=lsr(Rss,Rt)
-def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
-//Rxx^=lsl(Rss,Rt)
-def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
-
-// Multiply and use upper result
-def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
-def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
-def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
-def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
-def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
-
-def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
-def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
-
-// Vector reduce add unsigned halfwords
-def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
-
-def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
-def: T_P_pat<S2_ct0p,  int_hexagon_S2_ct0p>;
-def: T_P_pat<S2_ct1p,  int_hexagon_S2_ct1p>;
-
-def: T_Q_RR_pat<C4_nbitsset,  int_hexagon_C4_nbitsset>;
-def: T_Q_RR_pat<C4_nbitsclr,  int_hexagon_C4_nbitsclr>;
-def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
-
-def : T_Q_PI_pat<A4_vcmpbeqi,     int_hexagon_A4_vcmpbeqi>;
-def : T_Q_PI_pat<A4_vcmpbgti,     int_hexagon_A4_vcmpbgti>;
-def : T_Q_PI_pat<A4_vcmpbgtui,    int_hexagon_A4_vcmpbgtui>;
-def : T_Q_PI_pat<A4_vcmpheqi,     int_hexagon_A4_vcmpheqi>;
-def : T_Q_PI_pat<A4_vcmphgti,     int_hexagon_A4_vcmphgti>;
-def : T_Q_PI_pat<A4_vcmphgtui,    int_hexagon_A4_vcmphgtui>;
-def : T_Q_PI_pat<A4_vcmpweqi,     int_hexagon_A4_vcmpweqi>;
-def : T_Q_PI_pat<A4_vcmpwgti,     int_hexagon_A4_vcmpwgti>;
-def : T_Q_PI_pat<A4_vcmpwgtui,    int_hexagon_A4_vcmpwgtui>;
-def : T_Q_PP_pat<A4_vcmpbeq_any,  int_hexagon_A4_vcmpbeq_any>;
-
-def : T_Q_RR_pat<A4_cmpbeq,   int_hexagon_A4_cmpbeq>;
-def : T_Q_RR_pat<A4_cmpbgt,   int_hexagon_A4_cmpbgt>;
-def : T_Q_RR_pat<A4_cmpbgtu,  int_hexagon_A4_cmpbgtu>;
-def : T_Q_RR_pat<A4_cmpheq,   int_hexagon_A4_cmpheq>;
-def : T_Q_RR_pat<A4_cmphgt,   int_hexagon_A4_cmphgt>;
-def : T_Q_RR_pat<A4_cmphgtu,  int_hexagon_A4_cmphgtu>;
-
-def : T_Q_RI_pat<A4_cmpbeqi,  int_hexagon_A4_cmpbeqi>;
-def : T_Q_RI_pat<A4_cmpbgti,  int_hexagon_A4_cmpbgti>;
-def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
-
-def : T_Q_RI_pat<A4_cmpheqi,  int_hexagon_A4_cmpheqi>;
-def : T_Q_RI_pat<A4_cmphgti,  int_hexagon_A4_cmphgti>;
-def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
-
-def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
-def : T_Q_PR_pat<A4_tlbmatch,    int_hexagon_A4_tlbmatch>;
-
-def : T_RRR_pat <M4_mpyrr_addr,    int_hexagon_M4_mpyrr_addr>;
-def : T_IRR_pat <M4_mpyrr_addi,    int_hexagon_M4_mpyrr_addi>;
-def : T_IRI_pat <M4_mpyri_addi,    int_hexagon_M4_mpyri_addi>;
-def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
-def : T_RRI_pat <M4_mpyri_addr,    int_hexagon_M4_mpyri_addr>;
-def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
-def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
-
-// Complex multiply 32x16
-def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
-def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
-
-def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
-def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
-
-def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
-def : T_PP_pat<A4_ornp,  int_hexagon_A4_ornp>;
-
-// Complex add/sub halfwords/words
-def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
-def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
-def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
-def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
-
-def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
-def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
-
-// Extract bitfield
-def : T_PP_pat  <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
-def : T_RP_pat  <S4_extract_rp, int_hexagon_S4_extract_rp>;
-def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
-def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
-
-// Shift an immediate left by register amount
-def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
-
-// Vector reduce maximum halfwords
-def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
-def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
-
-// Vector reduce maximum words
-def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
-def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
-
-// Vector reduce minimum halfwords
-def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
-def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
-
-// Vector reduce minimum words
-def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
-def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
-
-// Rotate and reduce bytes
-def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
-                                     u2_0ImmPred:$src3),
-           (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
-                                         IntRegs:$src3, u2_0ImmPred:$src4),
-           (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
-                             IntRegs:$src3, u2_0ImmPred:$src4)>;
-
-// Vector conditional negate
-def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
-
-// Logical xor with xor accumulation
-def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
-
-// ALU64 - Vector min/max byte
-def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
-def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
-
-// Shift and add/sub/and/or
-def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
-def : T_IRI_pat <S4_ori_asl_ri,  int_hexagon_S4_ori_asl_ri>;
-def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
-def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
-def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
-def : T_IRI_pat <S4_ori_lsr_ri,  int_hexagon_S4_ori_lsr_ri>;
-def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
-def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
-
-// Split bitfield
-def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
-def : T_RR_pat <A4_bitsplit,  int_hexagon_A4_bitsplit>;
-
-def: T_RR_pat<S4_parity,      int_hexagon_S4_parity>;
-
-def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
-def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
-
-def: T_RI_pat<S4_clbaddi,     int_hexagon_S4_clbaddi>;
-def: T_PI_pat<S4_clbpaddi,    int_hexagon_S4_clbpaddi>;
-def: T_P_pat <S4_clbpnorm,    int_hexagon_S4_clbpnorm>;
-
-//*******************************************************************
-//            ALU32/ALU
-//*******************************************************************
-
-// ALU32 / ALU / Logical Operations.
-def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
-def: T_RR_pat<A4_orn,  int_hexagon_A4_orn>;
-
-//*******************************************************************
-//            ALU32/PERM
-//*******************************************************************
-
-// Combine Words Into Doublewords.
-def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
-def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
-
-//*******************************************************************
-//           ALU32/PRED
-//*******************************************************************
-
-// Compare
-def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
-def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
-
-// Compare To General Register.
-def: T_Q_RR_pat<C4_cmpneq,  int_hexagon_C4_cmpneq>;
-def: T_Q_RR_pat<C4_cmplte,  int_hexagon_C4_cmplte>;
-def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
-
-def: T_RR_pat<A4_rcmpeq,  int_hexagon_A4_rcmpeq>;
-def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
-
-def: T_RI_pat<A4_rcmpeqi,  int_hexagon_A4_rcmpeqi>;
-def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
-
-//*******************************************************************
-//           CR
-//*******************************************************************
-
-// CR / Logical Operations On Predicates.
-def: T_Q_QQQ_pat<C4_and_and,  int_hexagon_C4_and_and>;
-def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
-def: T_Q_QQQ_pat<C4_and_or,   int_hexagon_C4_and_or>;
-def: T_Q_QQQ_pat<C4_and_orn,  int_hexagon_C4_and_orn>;
-def: T_Q_QQQ_pat<C4_or_and,   int_hexagon_C4_or_and>;
-def: T_Q_QQQ_pat<C4_or_andn,  int_hexagon_C4_or_andn>;
-def: T_Q_QQQ_pat<C4_or_or,    int_hexagon_C4_or_or>;
-def: T_Q_QQQ_pat<C4_or_orn,   int_hexagon_C4_or_orn>;
-
-//*******************************************************************
-//           XTYPE/ALU
-//*******************************************************************
-
-// Add And Accumulate.
-
-def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
-def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
-
-
-// XTYPE / ALU / Logical-logical Words.
-def : T_RRR_pat <M4_or_xor,   int_hexagon_M4_or_xor>;
-def : T_RRR_pat <M4_and_xor,  int_hexagon_M4_and_xor>;
-def : T_RRR_pat <M4_or_and,   int_hexagon_M4_or_and>;
-def : T_RRR_pat <M4_and_and,  int_hexagon_M4_and_and>;
-def : T_RRR_pat <M4_xor_and,  int_hexagon_M4_xor_and>;
-def : T_RRR_pat <M4_or_or,    int_hexagon_M4_or_or>;
-def : T_RRR_pat <M4_and_or,   int_hexagon_M4_and_or>;
-def : T_RRR_pat <M4_xor_or,   int_hexagon_M4_xor_or>;
-def : T_RRR_pat <M4_or_andn,  int_hexagon_M4_or_andn>;
-def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
-def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
-
-def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
-def : T_RRI_pat <S4_or_andix,  int_hexagon_S4_or_andix>;
-def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
-
-// Modulo wrap.
-def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
-
-// Arithmetic/Convergent round
-// Rd=[cround|round](Rs,Rt)[:sat]
-// Rd=[cround|round](Rs,#u5)[:sat]
-def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
-def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
-
-def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
-def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
-
-def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
-def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
-
-def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV5.td b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
index 29f67cffcf89139da2857d9e052d797ac8cb5d41..a852394f216076551a649c8c07b97da40a2a6365 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV5.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV5.td
@@ -7,9 +7,314 @@
 //
 //===----------------------------------------------------------------------===//
 
+def : T_PR_pat <M2_vrcmpys_s1,     int_hexagon_M2_vrcmpys_s1>;
+def : T_PPR_pat<M2_vrcmpys_acc_s1, int_hexagon_M2_vrcmpys_acc_s1>;
+def : T_PR_pat <M2_vrcmpys_s1rp,   int_hexagon_M2_vrcmpys_s1rp>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat<M2_vradduh, int_hexagon_M2_vradduh>;
+
+def: T_RP_pat<A2_addsp,   int_hexagon_A2_addsp>;
+def: T_PP_pat<A2_addpsat, int_hexagon_A2_addpsat>;
+def: T_PP_pat<A2_minp,    int_hexagon_A2_minp>;
+def: T_PP_pat<A2_minup,   int_hexagon_A2_minup>;
+def: T_PP_pat<A2_maxp,    int_hexagon_A2_maxp>;
+def: T_PP_pat<A2_maxup,   int_hexagon_A2_maxup>;
+
+// Vector reduce multiply word by signed half (32x16)
+//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyeh_s0, int_hexagon_M4_vrmpyeh_s0>;
+def : T_PP_pat <M4_vrmpyeh_s1, int_hexagon_M4_vrmpyeh_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PP_pat <M4_vrmpyoh_s0, int_hexagon_M4_vrmpyoh_s0>;
+def : T_PP_pat <M4_vrmpyoh_s1, int_hexagon_M4_vrmpyoh_s1>;
+
+//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyeh_acc_s0, int_hexagon_M4_vrmpyeh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyeh_acc_s1, int_hexagon_M4_vrmpyeh_acc_s1>;
+
+//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
+def : T_PPP_pat <M4_vrmpyoh_acc_s0, int_hexagon_M4_vrmpyoh_acc_s0>;
+def : T_PPP_pat <M4_vrmpyoh_acc_s1, int_hexagon_M4_vrmpyoh_acc_s1>;
+
+// Vector multiply halfwords, signed by unsigned
+// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_RR_pat <M2_vmpy2su_s0, int_hexagon_M2_vmpy2su_s0>;
+def : T_RR_pat <M2_vmpy2su_s1, int_hexagon_M2_vmpy2su_s1>;
+
+// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
+def : T_PRR_pat <M2_vmac2su_s0, int_hexagon_M2_vmac2su_s0>;
+def : T_PRR_pat <M2_vmac2su_s1, int_hexagon_M2_vmac2su_s1>;
+
+// Vector polynomial multiply halfwords
+// Rdd=vpmpyh(Rs,Rt)
+def : T_RR_pat <M4_vpmpyh, int_hexagon_M4_vpmpyh>;
+// Rxx[^]=vpmpyh(Rs,Rt)
+def : T_PRR_pat <M4_vpmpyh_acc, int_hexagon_M4_vpmpyh_acc>;
+
+// Polynomial multiply words
+// Rdd=pmpyw(Rs,Rt)
+def : T_RR_pat <M4_pmpyw, int_hexagon_M4_pmpyw>;
+// Rxx^=pmpyw(Rs,Rt)
+def : T_PRR_pat <M4_pmpyw_acc, int_hexagon_M4_pmpyw_acc>;
+
+//Rxx^=asr(Rss,Rt)
+def : T_PPR_pat <S2_asr_r_p_xor, int_hexagon_S2_asr_r_p_xor>;
+//Rxx^=asl(Rss,Rt)
+def : T_PPR_pat <S2_asl_r_p_xor, int_hexagon_S2_asl_r_p_xor>;
+//Rxx^=lsr(Rss,Rt)
+def : T_PPR_pat <S2_lsr_r_p_xor, int_hexagon_S2_lsr_r_p_xor>;
+//Rxx^=lsl(Rss,Rt)
+def : T_PPR_pat <S2_lsl_r_p_xor, int_hexagon_S2_lsl_r_p_xor>;
+
+// Multiply and use upper result
+def : T_RR_pat <M2_mpysu_up, int_hexagon_M2_mpysu_up>;
+def : T_RR_pat <M2_mpy_up_s1, int_hexagon_M2_mpy_up_s1>;
+def : T_RR_pat <M2_hmmpyh_s1, int_hexagon_M2_hmmpyh_s1>;
+def : T_RR_pat <M2_hmmpyl_s1, int_hexagon_M2_hmmpyl_s1>;
+def : T_RR_pat <M2_mpy_up_s1_sat, int_hexagon_M2_mpy_up_s1_sat>;
+
+def : T_PP_pat <A2_vaddub, int_hexagon_A2_vaddb_map>;
+def : T_PP_pat <A2_vsubub, int_hexagon_A2_vsubb_map>;
+
+// Vector reduce add unsigned halfwords
+def : T_PP_pat <M2_vraddh, int_hexagon_M2_vraddh>;
+
+def: T_P_pat<S2_brevp, int_hexagon_S2_brevp>;
+def: T_P_pat<S2_ct0p,  int_hexagon_S2_ct0p>;
+def: T_P_pat<S2_ct1p,  int_hexagon_S2_ct1p>;
+
+def: T_Q_RR_pat<C4_nbitsset,  int_hexagon_C4_nbitsset>;
+def: T_Q_RR_pat<C4_nbitsclr,  int_hexagon_C4_nbitsclr>;
+def: T_Q_RI_pat<C4_nbitsclri, int_hexagon_C4_nbitsclri>;
+
+def : T_Q_PI_pat<A4_vcmpbeqi,     int_hexagon_A4_vcmpbeqi>;
+def : T_Q_PI_pat<A4_vcmpbgti,     int_hexagon_A4_vcmpbgti>;
+def : T_Q_PI_pat<A4_vcmpbgtui,    int_hexagon_A4_vcmpbgtui>;
+def : T_Q_PI_pat<A4_vcmpheqi,     int_hexagon_A4_vcmpheqi>;
+def : T_Q_PI_pat<A4_vcmphgti,     int_hexagon_A4_vcmphgti>;
+def : T_Q_PI_pat<A4_vcmphgtui,    int_hexagon_A4_vcmphgtui>;
+def : T_Q_PI_pat<A4_vcmpweqi,     int_hexagon_A4_vcmpweqi>;
+def : T_Q_PI_pat<A4_vcmpwgti,     int_hexagon_A4_vcmpwgti>;
+def : T_Q_PI_pat<A4_vcmpwgtui,    int_hexagon_A4_vcmpwgtui>;
+def : T_Q_PP_pat<A4_vcmpbeq_any,  int_hexagon_A4_vcmpbeq_any>;
+
+def : T_Q_RR_pat<A4_cmpbeq,   int_hexagon_A4_cmpbeq>;
+def : T_Q_RR_pat<A4_cmpbgt,   int_hexagon_A4_cmpbgt>;
+def : T_Q_RR_pat<A4_cmpbgtu,  int_hexagon_A4_cmpbgtu>;
+def : T_Q_RR_pat<A4_cmpheq,   int_hexagon_A4_cmpheq>;
+def : T_Q_RR_pat<A4_cmphgt,   int_hexagon_A4_cmphgt>;
+def : T_Q_RR_pat<A4_cmphgtu,  int_hexagon_A4_cmphgtu>;
+
+def : T_Q_RI_pat<A4_cmpbeqi,  int_hexagon_A4_cmpbeqi>;
+def : T_Q_RI_pat<A4_cmpbgti,  int_hexagon_A4_cmpbgti>;
+def : T_Q_RI_pat<A4_cmpbgtui, int_hexagon_A4_cmpbgtui>;
+
+def : T_Q_RI_pat<A4_cmpheqi,  int_hexagon_A4_cmpheqi>;
+def : T_Q_RI_pat<A4_cmphgti,  int_hexagon_A4_cmphgti>;
+def : T_Q_RI_pat<A4_cmphgtui, int_hexagon_A4_cmphgtui>;
+
+def : T_Q_RP_pat<A4_boundscheck, int_hexagon_A4_boundscheck>;
+def : T_Q_PR_pat<A4_tlbmatch,    int_hexagon_A4_tlbmatch>;
+
+def : T_RRR_pat <M4_mpyrr_addr,    int_hexagon_M4_mpyrr_addr>;
+def : T_IRR_pat <M4_mpyrr_addi,    int_hexagon_M4_mpyrr_addi>;
+def : T_IRI_pat <M4_mpyri_addi,    int_hexagon_M4_mpyri_addi>;
+def : T_RIR_pat <M4_mpyri_addr_u2, int_hexagon_M4_mpyri_addr_u2>;
+def : T_RRI_pat <M4_mpyri_addr,    int_hexagon_M4_mpyri_addr>;
+def : T_RRR_pat <M4_mac_up_s1_sat, int_hexagon_M4_mac_up_s1_sat>;
+def : T_RRR_pat <M4_nac_up_s1_sat, int_hexagon_M4_nac_up_s1_sat>;
+
+// Complex multiply 32x16
+def : T_PR_pat <M4_cmpyi_wh, int_hexagon_M4_cmpyi_wh>;
+def : T_PR_pat <M4_cmpyr_wh, int_hexagon_M4_cmpyr_wh>;
+
+def : T_PR_pat <M4_cmpyi_whc, int_hexagon_M4_cmpyi_whc>;
+def : T_PR_pat <M4_cmpyr_whc, int_hexagon_M4_cmpyr_whc>;
+
+def : T_PP_pat<A4_andnp, int_hexagon_A4_andnp>;
+def : T_PP_pat<A4_ornp,  int_hexagon_A4_ornp>;
+
+// Complex add/sub halfwords/words
+def : T_PP_pat <S4_vxaddsubw, int_hexagon_S4_vxaddsubw>;
+def : T_PP_pat <S4_vxsubaddw, int_hexagon_S4_vxsubaddw>;
+def : T_PP_pat <S4_vxaddsubh, int_hexagon_S4_vxaddsubh>;
+def : T_PP_pat <S4_vxsubaddh, int_hexagon_S4_vxsubaddh>;
+
+def : T_PP_pat <S4_vxaddsubhr, int_hexagon_S4_vxaddsubhr>;
+def : T_PP_pat <S4_vxsubaddhr, int_hexagon_S4_vxsubaddhr>;
+
+// Extract bitfield
+def : T_PP_pat  <S4_extractp_rp, int_hexagon_S4_extractp_rp>;
+def : T_RP_pat  <S4_extract_rp, int_hexagon_S4_extract_rp>;
+def : T_PII_pat <S4_extractp, int_hexagon_S4_extractp>;
+def : T_RII_pat <S4_extract, int_hexagon_S4_extract>;
+
+// Vector conditional negate
+// Rdd=vcnegh(Rss,Rt)
+def : T_PR_pat <S2_vcnegh, int_hexagon_S2_vcnegh>;
+
+// Shift an immediate left by register amount
+def : T_IR_pat<S4_lsli, int_hexagon_S4_lsli>;
+
+// Vector reduce maximum halfwords
+def : T_PPR_pat <A4_vrmaxh, int_hexagon_A4_vrmaxh>;
+def : T_PPR_pat <A4_vrmaxuh, int_hexagon_A4_vrmaxuh>;
+
+// Vector reduce maximum words
+def : T_PPR_pat <A4_vrmaxw, int_hexagon_A4_vrmaxw>;
+def : T_PPR_pat <A4_vrmaxuw, int_hexagon_A4_vrmaxuw>;
+
+// Vector reduce minimum halfwords
+def : T_PPR_pat <A4_vrminh, int_hexagon_A4_vrminh>;
+def : T_PPR_pat <A4_vrminuh, int_hexagon_A4_vrminuh>;
+
+// Vector reduce minimum words
+def : T_PPR_pat <A4_vrminw, int_hexagon_A4_vrminw>;
+def : T_PPR_pat <A4_vrminuw, int_hexagon_A4_vrminuw>;
+
+// Rotate and reduce bytes
+def : Pat <(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2,
+                                     u2_0ImmPred:$src3),
+           (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred:$src3)>;
+
+// Rotate and reduce bytes with accumulation
+// Rxx+=vrcrotate(Rss,Rt,#u2)
+def : Pat <(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                                         IntRegs:$src3, u2_0ImmPred:$src4),
+           (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2,
+                             IntRegs:$src3, u2_0ImmPred:$src4)>;
+
+// Vector conditional negate
+def : T_PPR_pat<S2_vrcnegh, int_hexagon_S2_vrcnegh>;
+
+// Logical xor with xor accumulation
+def : T_PPP_pat<M4_xor_xacc, int_hexagon_M4_xor_xacc>;
+
+// ALU64 - Vector min/max byte
+def : T_PP_pat <A2_vminb, int_hexagon_A2_vminb>;
+def : T_PP_pat <A2_vmaxb, int_hexagon_A2_vmaxb>;
+
+// Shift and add/sub/and/or
+def : T_IRI_pat <S4_andi_asl_ri, int_hexagon_S4_andi_asl_ri>;
+def : T_IRI_pat <S4_ori_asl_ri,  int_hexagon_S4_ori_asl_ri>;
+def : T_IRI_pat <S4_addi_asl_ri, int_hexagon_S4_addi_asl_ri>;
+def : T_IRI_pat <S4_subi_asl_ri, int_hexagon_S4_subi_asl_ri>;
+def : T_IRI_pat <S4_andi_lsr_ri, int_hexagon_S4_andi_lsr_ri>;
+def : T_IRI_pat <S4_ori_lsr_ri,  int_hexagon_S4_ori_lsr_ri>;
+def : T_IRI_pat <S4_addi_lsr_ri, int_hexagon_S4_addi_lsr_ri>;
+def : T_IRI_pat <S4_subi_lsr_ri, int_hexagon_S4_subi_lsr_ri>;
+
+// Split bitfield
+def : T_RI_pat <A4_bitspliti, int_hexagon_A4_bitspliti>;
+def : T_RR_pat <A4_bitsplit,  int_hexagon_A4_bitsplit>;
+
+def: T_RR_pat<S4_parity,      int_hexagon_S4_parity>;
+
+def: T_Q_RI_pat<S4_ntstbit_i, int_hexagon_S4_ntstbit_i>;
+def: T_Q_RR_pat<S4_ntstbit_r, int_hexagon_S4_ntstbit_r>;
+
+def: T_RI_pat<S4_clbaddi,     int_hexagon_S4_clbaddi>;
+def: T_PI_pat<S4_clbpaddi,    int_hexagon_S4_clbpaddi>;
+def: T_P_pat <S4_clbpnorm,    int_hexagon_S4_clbpnorm>;
+
+//*******************************************************************
+//            ALU32/ALU
+//*******************************************************************
+
+// ALU32 / ALU / Logical Operations.
+def: T_RR_pat<A4_andn, int_hexagon_A4_andn>;
+def: T_RR_pat<A4_orn,  int_hexagon_A4_orn>;
+
+//*******************************************************************
+//            ALU32/PERM
+//*******************************************************************
+
+// Combine Words Into Doublewords.
+def: T_RI_pat<A4_combineri, int_hexagon_A4_combineri, s32_0ImmPred>;
+def: T_IR_pat<A4_combineir, int_hexagon_A4_combineir, s32_0ImmPred>;
+
+//*******************************************************************
+//           ALU32/PRED
+//*******************************************************************
+
+// Compare
+def : T_Q_RI_pat<C4_cmpneqi, int_hexagon_C4_cmpneqi, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmpltei, int_hexagon_C4_cmpltei, s32_0ImmPred>;
+def : T_Q_RI_pat<C4_cmplteui, int_hexagon_C4_cmplteui, u32_0ImmPred>;
+
+// Compare To General Register.
+def: T_Q_RR_pat<C4_cmpneq,  int_hexagon_C4_cmpneq>;
+def: T_Q_RR_pat<C4_cmplte,  int_hexagon_C4_cmplte>;
+def: T_Q_RR_pat<C4_cmplteu, int_hexagon_C4_cmplteu>;
+
+def: T_RR_pat<A4_rcmpeq,  int_hexagon_A4_rcmpeq>;
+def: T_RR_pat<A4_rcmpneq, int_hexagon_A4_rcmpneq>;
+
+def: T_RI_pat<A4_rcmpeqi,  int_hexagon_A4_rcmpeqi>;
+def: T_RI_pat<A4_rcmpneqi, int_hexagon_A4_rcmpneqi>;
+
+//*******************************************************************
+//           CR
+//*******************************************************************
+
+// CR / Logical Operations On Predicates.
+def: T_Q_QQQ_pat<C4_and_and,  int_hexagon_C4_and_and>;
+def: T_Q_QQQ_pat<C4_and_andn, int_hexagon_C4_and_andn>;
+def: T_Q_QQQ_pat<C4_and_or,   int_hexagon_C4_and_or>;
+def: T_Q_QQQ_pat<C4_and_orn,  int_hexagon_C4_and_orn>;
+def: T_Q_QQQ_pat<C4_or_and,   int_hexagon_C4_or_and>;
+def: T_Q_QQQ_pat<C4_or_andn,  int_hexagon_C4_or_andn>;
+def: T_Q_QQQ_pat<C4_or_or,    int_hexagon_C4_or_or>;
+def: T_Q_QQQ_pat<C4_or_orn,   int_hexagon_C4_or_orn>;
+
+//*******************************************************************
+//           XTYPE/ALU
+//*******************************************************************
+
+// Add And Accumulate.
+
+def : T_RRI_pat <S4_addaddi, int_hexagon_S4_addaddi>;
+def : T_RIR_pat <S4_subaddi, int_hexagon_S4_subaddi>;
+
+
+// XTYPE / ALU / Logical-logical Words.
+def : T_RRR_pat <M4_or_xor,   int_hexagon_M4_or_xor>;
+def : T_RRR_pat <M4_and_xor,  int_hexagon_M4_and_xor>;
+def : T_RRR_pat <M4_or_and,   int_hexagon_M4_or_and>;
+def : T_RRR_pat <M4_and_and,  int_hexagon_M4_and_and>;
+def : T_RRR_pat <M4_xor_and,  int_hexagon_M4_xor_and>;
+def : T_RRR_pat <M4_or_or,    int_hexagon_M4_or_or>;
+def : T_RRR_pat <M4_and_or,   int_hexagon_M4_and_or>;
+def : T_RRR_pat <M4_xor_or,   int_hexagon_M4_xor_or>;
+def : T_RRR_pat <M4_or_andn,  int_hexagon_M4_or_andn>;
+def : T_RRR_pat <M4_and_andn, int_hexagon_M4_and_andn>;
+def : T_RRR_pat <M4_xor_andn, int_hexagon_M4_xor_andn>;
+
+def : T_RRI_pat <S4_or_andi, int_hexagon_S4_or_andi>;
+def : T_RRI_pat <S4_or_andix,  int_hexagon_S4_or_andix>;
+def : T_RRI_pat <S4_or_ori, int_hexagon_S4_or_ori>;
+
+// Modulo wrap.
+def : T_RR_pat <A4_modwrapu, int_hexagon_A4_modwrapu>;
+
+// Arithmetic/Convergent round
+// Rd=[cround|round](Rs,Rt)[:sat]
+// Rd=[cround|round](Rs,#u5)[:sat]
+def : T_RI_pat <A4_cround_ri, int_hexagon_A4_cround_ri>;
+def : T_RR_pat <A4_cround_rr, int_hexagon_A4_cround_rr>;
+
+def : T_RI_pat <A4_round_ri, int_hexagon_A4_round_ri>;
+def : T_RR_pat <A4_round_rr, int_hexagon_A4_round_rr>;
+
+def : T_RI_pat <A4_round_ri_sat, int_hexagon_A4_round_ri_sat>;
+def : T_RR_pat <A4_round_rr_sat, int_hexagon_A4_round_rr_sat>;
+
+def : T_P_pat <A2_roundsat, int_hexagon_A2_roundsat>;
+
 //Rdd[+]=vrmpybsu(Rss,Rtt)
 //Rdd[+]=vrmpybuu(Rss,Rtt)
-let Predicates = [HasV5]  in {
 def : T_PP_pat  <M5_vrmpybsu, int_hexagon_M5_vrmpybsu>;
 def : T_PP_pat  <M5_vrmpybuu, int_hexagon_M5_vrmpybuu>;
 
@@ -31,7 +336,6 @@ def : T_PRR_pat <M5_vmacbuu, int_hexagon_M5_vmacbuu>;
 
 // Rd=vaddhub(Rss,Rtt):sat
 def : T_PP_pat <A5_vaddhubs, int_hexagon_A5_vaddhubs>;
-}
 
 def : T_FF_pat<F2_sfadd, int_hexagon_F2_sfadd>;
 def : T_FF_pat<F2_sfsub, int_hexagon_F2_sfsub>;
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index f9ed039092333b13b9c18b15bee51c4c4930f568..702d68fad9bf0832065298ef84a63787f17058b4 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -1970,7 +1970,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
   // Get the location that may be stored across the loop.  Since the access
   // is strided positively through memory, we say that the modified location
   // starts at the pointer and has infinite size.
-  LocationSize AccessSize = MemoryLocation::UnknownSize;
+  LocationSize AccessSize = LocationSize::unknown();
 
   // If the loop iterates a fixed number of times, we can refine the access
   // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
@@ -2360,7 +2360,7 @@ bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
   auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
     return DT->dominates(BB, EB);
   };
-  if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
+  if (!all_of(ExitBlocks, DominatedByBB))
     return false;
 
   bool MadeChange = false;
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index ebfe21bd17d7b44b3ccad0fcc1e28ab255b986a2..908ce24136c766c1108a44438b47ef0f8a24586e 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -105,6 +105,7 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) {
   default:
     if (!ResourcesModel->canReserveResources(*SU->getInstr()))
       return false;
+    break;
   case TargetOpcode::EXTRACT_SUBREG:
   case TargetOpcode::INSERT_SUBREG:
   case TargetOpcode::SUBREG_TO_REG:
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index 2f5033a20af87479996581223f81c2b6caf1af03..ddf5a9ca3645f958c1c11638078548f04e7b2a5a 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -257,6 +257,23 @@ class pf2<SDNode Op> : PatFrag<(ops node:$a, node:$b), (Op node:$a, node:$b)>;
 class Not2<PatFrag P>
   : PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;
 
+// If there is a constant operand that feeds the and/or instruction,
+// do not generate the compound instructions.
+// It is not always profitable, as some times we end up with a transfer.
+// Check the below example.
+// ra = #65820; rb = lsr(rb, #8); rc ^= and (rb, ra)
+// Instead this is preferable.
+// ra = and (#65820, lsr(ra, #8)); rb = xor(rb, ra)
+class Su_ni1<PatFrag Op>
+  : PatFrag<Op.Operands, !head(Op.Fragments), [{
+            if (hasOneUse(N)){
+              // Check if Op1 is an immediate operand.
+              SDValue Op1 = N->getOperand(1);
+              return !dyn_cast<ConstantSDNode>(Op1);
+            }
+            return false;}],
+            Op.OperandTransform>;
+
 class Su<PatFrag Op>
   : PatFrag<Op.Operands, !head(Op.Fragments), [{ return hasOneUse(N); }],
             Op.OperandTransform>;
@@ -348,38 +365,34 @@ def ToI32: OutPatFrag<(ops node:$V), (A2_tfrsi $V)>;
 // --(2) Type cast -------------------------------------------------------
 //
 
-let Predicates = [HasV5] in {
-  def: OpR_R_pat<F2_conv_sf2df,      pf1<fpextend>,   f64, F32>;
-  def: OpR_R_pat<F2_conv_df2sf,      pf1<fpround>,    f32, F64>;
+def: OpR_R_pat<F2_conv_sf2df,      pf1<fpextend>,   f64, F32>;
+def: OpR_R_pat<F2_conv_df2sf,      pf1<fpround>,    f32, F64>;
 
-  def: OpR_R_pat<F2_conv_w2sf,       pf1<sint_to_fp>, f32, I32>;
-  def: OpR_R_pat<F2_conv_d2sf,       pf1<sint_to_fp>, f32, I64>;
-  def: OpR_R_pat<F2_conv_w2df,       pf1<sint_to_fp>, f64, I32>;
-  def: OpR_R_pat<F2_conv_d2df,       pf1<sint_to_fp>, f64, I64>;
+def: OpR_R_pat<F2_conv_w2sf,       pf1<sint_to_fp>, f32, I32>;
+def: OpR_R_pat<F2_conv_d2sf,       pf1<sint_to_fp>, f32, I64>;
+def: OpR_R_pat<F2_conv_w2df,       pf1<sint_to_fp>, f64, I32>;
+def: OpR_R_pat<F2_conv_d2df,       pf1<sint_to_fp>, f64, I64>;
 
-  def: OpR_R_pat<F2_conv_uw2sf,      pf1<uint_to_fp>, f32, I32>;
-  def: OpR_R_pat<F2_conv_ud2sf,      pf1<uint_to_fp>, f32, I64>;
-  def: OpR_R_pat<F2_conv_uw2df,      pf1<uint_to_fp>, f64, I32>;
-  def: OpR_R_pat<F2_conv_ud2df,      pf1<uint_to_fp>, f64, I64>;
+def: OpR_R_pat<F2_conv_uw2sf,      pf1<uint_to_fp>, f32, I32>;
+def: OpR_R_pat<F2_conv_ud2sf,      pf1<uint_to_fp>, f32, I64>;
+def: OpR_R_pat<F2_conv_uw2df,      pf1<uint_to_fp>, f64, I32>;
+def: OpR_R_pat<F2_conv_ud2df,      pf1<uint_to_fp>, f64, I64>;
 
-  def: OpR_R_pat<F2_conv_sf2w_chop,  pf1<fp_to_sint>, i32, F32>;
-  def: OpR_R_pat<F2_conv_df2w_chop,  pf1<fp_to_sint>, i32, F64>;
-  def: OpR_R_pat<F2_conv_sf2d_chop,  pf1<fp_to_sint>, i64, F32>;
-  def: OpR_R_pat<F2_conv_df2d_chop,  pf1<fp_to_sint>, i64, F64>;
+def: OpR_R_pat<F2_conv_sf2w_chop,  pf1<fp_to_sint>, i32, F32>;
+def: OpR_R_pat<F2_conv_df2w_chop,  pf1<fp_to_sint>, i32, F64>;
+def: OpR_R_pat<F2_conv_sf2d_chop,  pf1<fp_to_sint>, i64, F32>;
+def: OpR_R_pat<F2_conv_df2d_chop,  pf1<fp_to_sint>, i64, F64>;
 
-  def: OpR_R_pat<F2_conv_sf2uw_chop, pf1<fp_to_uint>, i32, F32>;
-  def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>;
-  def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>;
-  def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>;
-}
+def: OpR_R_pat<F2_conv_sf2uw_chop, pf1<fp_to_uint>, i32, F32>;
+def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>;
+def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>;
+def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>;
 
 // Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
-let Predicates = [HasV5] in {
-  def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
-  def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
-  def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
-  def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
-}
+def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
+def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
+def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
+def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
 
 multiclass Cast_pat<ValueType Ta, ValueType Tb, RegisterClass RC> {
   def: Pat<(Tb (bitconvert (Ta RC:$Rs))), (Tb RC:$Rs)>;
@@ -582,31 +595,29 @@ def: OpR_RR_pat<A2_vcmpwgtu,  RevCmp<setult>, v2i1, V2I32>;
 def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         i1,   V2I32>;
 def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         v2i1, V2I32>;
 
-let Predicates = [HasV5] in {
-  def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
-
-  def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
-}
+def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
+def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
+
+def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
+def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
 
 // Avoid C4_cmpneqi, C4_cmpltei, C4_cmplteui, since they cannot form compounds.
 
@@ -729,32 +740,28 @@ class Cmpud<InstHexagon MI>:  T3<C2_or,  F2_dfcmpuo, MI>;
 class Cmpufn<InstHexagon MI>: T3<C2_orn, F2_sfcmpuo, MI>;
 class Cmpudn<InstHexagon MI>: T3<C2_orn, F2_dfcmpuo, MI>;
 
-let Predicates = [HasV5] in {
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>,  setueq,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  setuge,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  setugt,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  RevCmp<setule>, i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  RevCmp<setult>, i1, F32>;
-  def: OpmR_RR_pat<Cmpufn<F2_sfcmpeq>, setune,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>,  setueq,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  setuge,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  setugt,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  RevCmp<setule>, i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  RevCmp<setult>, i1, F32>;
+def: OpmR_RR_pat<Cmpufn<F2_sfcmpeq>, setune,         i1, F32>;
 
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpeq>,  setueq,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  setuge,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  setugt,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
-  def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
-}
+def: OpmR_RR_pat<Cmpud<F2_dfcmpeq>,  setueq,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  setuge,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  setugt,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
+def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
 
-let Predicates = [HasV5] in {
-  def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
-  def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
 
-  def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
-  def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
 
-  def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
-  def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
-}
+def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
+def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
 
 
 // --(6) Select ----------------------------------------------------------
@@ -784,27 +791,25 @@ def: Pat<(select I1:$Pu, I64:$Rs, I64:$Rt),
          (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
                    (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
 
-let Predicates = [HasV5] in {
-  def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
-           (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
-  def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
-           (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-  def: Pat<(select I1:$Pu, F32:$Rs, F32:$Rt),
-           (C2_mux I1:$Pu, F32:$Rs, F32:$Rt)>;
-  def: Pat<(select I1:$Pu, F64:$Rs, F64:$Rt),
-           (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
-                     (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
+def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
+def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
+         (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
+def: Pat<(select I1:$Pu, F32:$Rs, F32:$Rt),
+         (C2_mux I1:$Pu, F32:$Rs, F32:$Rt)>;
+def: Pat<(select I1:$Pu, F64:$Rs, F64:$Rt),
+         (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
+                   (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
 
-  def: Pat<(select (i1 (setult F32:$Ra, F32:$Rb)), F32:$Rs, F32:$Rt),
-           (C2_mux (F2_sfcmpgt F32:$Rb, F32:$Ra), F32:$Rs, F32:$Rt)>;
-  def: Pat<(select (i1 (setult F64:$Ra, F64:$Rb)), F64:$Rs, F64:$Rt),
-           (C2_vmux (F2_dfcmpgt F64:$Rb, F64:$Ra), F64:$Rs, F64:$Rt)>;
+def: Pat<(select (i1 (setult F32:$Ra, F32:$Rb)), F32:$Rs, F32:$Rt),
+         (C2_mux (F2_sfcmpgt F32:$Rb, F32:$Ra), F32:$Rs, F32:$Rt)>;
+def: Pat<(select (i1 (setult F64:$Ra, F64:$Rb)), F64:$Rs, F64:$Rt),
+         (C2_vmux (F2_dfcmpgt F64:$Rb, F64:$Ra), F64:$Rs, F64:$Rt)>;
 
-  def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
-           (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
-  def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
-           (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-}
+def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
+def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
+         (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
 
 def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt),
          (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
@@ -872,7 +877,7 @@ let AddedComplexity = 200 in {
   defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>;
 }
 
-let AddedComplexity = 100, Predicates = [HasV5] in {
+let AddedComplexity = 100 in {
   defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>;
   defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>;
   defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>;
@@ -997,7 +1002,7 @@ let Predicates = [HasV60] in {
 def: Pat<(sra (add (sra I32:$Rs, u5_0ImmPred:$u5), 1), (i32 1)),
          (S2_asr_i_r_rnd I32:$Rs, imm:$u5)>;
 def: Pat<(sra (add (sra I64:$Rs, u6_0ImmPred:$u6), 1), (i32 1)),
-         (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5]>;
+         (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>;
 
 // Prefer S2_addasl_rrri over S2_asl_i_r_acc.
 let AddedComplexity = 120 in
@@ -1174,17 +1179,15 @@ def: Pat<(not  I32:$Rs), (A2_subri -1, I32:$Rs)>;
 def: Pat<(not  I64:$Rs), (A2_notp  I64:$Rs)>;
 def: Pat<(ineg I64:$Rs), (A2_negp  I64:$Rs)>;
 
-let Predicates = [HasV5] in {
-  def: Pat<(fabs F32:$Rs), (S2_clrbit_i    F32:$Rs, 31)>;
-  def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
+def: Pat<(fabs F32:$Rs), (S2_clrbit_i    F32:$Rs, 31)>;
+def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
 
-  def: Pat<(fabs F64:$Rs),
-           (Combinew (S2_clrbit_i (HiReg $Rs), 31),
-                     (i32 (LoReg $Rs)))>;
-  def: Pat<(fneg F64:$Rs),
-           (Combinew (S2_togglebit_i (HiReg $Rs), 31),
-                     (i32 (LoReg $Rs)))>;
-}
+def: Pat<(fabs F64:$Rs),
+         (Combinew (S2_clrbit_i (HiReg $Rs), 31),
+                   (i32 (LoReg $Rs)))>;
+def: Pat<(fneg F64:$Rs),
+         (Combinew (S2_togglebit_i (HiReg $Rs), 31),
+                   (i32 (LoReg $Rs)))>;
 
 def: Pat<(add I32:$Rs, anyimm:$s16),   (A2_addi   I32:$Rs,  imm:$s16)>;
 def: Pat<(or  I32:$Rs, anyimm:$s10),   (A2_orir   I32:$Rs,  imm:$s10)>;
@@ -1250,13 +1253,11 @@ def: OpR_RR_pat<C2_and,       Mul,        v2i1,  V2I1>;
 def: OpR_RR_pat<C2_and,       Mul,        v4i1,  V4I1>;
 def: OpR_RR_pat<C2_and,       Mul,        v8i1,  V8I1>;
 
-let Predicates = [HasV5] in {
-  def: OpR_RR_pat<F2_sfadd,     pf2<fadd>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfsub,     pf2<fsub>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
-  def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
-}
+def: OpR_RR_pat<F2_sfadd,     pf2<fadd>,    f32, F32>;
+def: OpR_RR_pat<F2_sfsub,     pf2<fsub>,    f32, F32>;
+def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
+def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
+def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
 
 // In expressions like a0*b0 + a1*b1 + ..., prefer to generate multiply-add,
 // over add-add with individual multiplies as inputs.
@@ -1336,16 +1337,16 @@ def: Pat<(mul I32:$Rs, n8_0ImmPred:$n8),
 def: Pat<(add Sext64:$Rs, I64:$Rt),
          (A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>;
 
-def: AccRRR_pat<M4_and_and,   And, Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_and_or,    And, Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_and,    Or,  Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_or,     Or,  Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_and,   Xor, Su<And>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_or,    Xor, Su<Or>,        I32,  I32,  I32>;
-def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,       I32,  I32,  I32>;
-def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,       I64,  I64,  I64>;
+def: AccRRR_pat<M4_and_and,   And, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_or,    And, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_and,    Or,  Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_or,     Or,  Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_and,   Xor, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_or,    Xor, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,      I64,  I64,  I64>;
 
 // For dags like (or (and (not _), _), (shl _, _)) where the "or" with
 // one argument matches the patterns below, and with the other argument
@@ -1489,14 +1490,12 @@ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Ry, I32:$Rs)),
          (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
 
 
-let Predicates = [HasV5] in {
-  def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
-           (F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
-  def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
-           (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-  def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
-           (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-}
+def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
+         (F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
+def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
+         (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
+def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
+         (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
 
 
 def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
@@ -1523,14 +1522,12 @@ def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
 
 // Multiplies two v4i8 vectors.
 def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
-         (S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>,
-     Requires<[HasV5]>;
+         (S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>;
 
 // Multiplies two v8i8 vectors.
 def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
          (Combinew (S2_vtrunehb (M5_vmpybuu (HiReg $Rs), (HiReg $Rt))),
-                   (S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>,
-     Requires<[HasV5]>;
+                   (S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>;
 
 
 // --(10) Bit ------------------------------------------------------------
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
index fd7466349ecd5b8d9c9f4dfe286e1013615c98e8..6935e3b7bebca915214d196d2bdb25a7ae515037 100644
--- a/lib/Target/Hexagon/HexagonPseudo.td
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -208,6 +208,7 @@ class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops,
     let isPredicable = 0;  // !if(isPred, 0, 1);
     let isPredicated = 0;  // isPred;
     let isPredicatedFalse = isFalse;
+    let Itinerary = itin;
 }
 
 def PS_call_nr : Call_nr<24, 0, 0, (ins s32_0Imm:$Ii), J2_call.Itinerary>;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2e11f875c0f97ccc4df66dbae6c351e585900c59..545def45a1c307d669f7da63ac80cddf4b4c19b5 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -118,18 +118,7 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 
   bool HasEHReturn = MF->getInfo<HexagonMachineFunctionInfo>()->hasEHReturn();
 
-  switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) {
-  case Hexagon::ArchEnum::V4:
-  case Hexagon::ArchEnum::V5:
-  case Hexagon::ArchEnum::V55:
-  case Hexagon::ArchEnum::V60:
-  case Hexagon::ArchEnum::V62:
-  case Hexagon::ArchEnum::V65:
-    return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
-  }
-
-  llvm_unreachable("Callee saved registers requested for unknown architecture "
-                   "version");
+  return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
 }
 
 
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index a1dfb66017a5c0ac31b1cd7ab318297e3dceabd2..fa4f9ca639ccf209ef2e51716c55465c952c0517 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -57,10 +57,10 @@ include "HexagonDepIICScalar.td"
 include "HexagonDepIICHVX.td"
 
 //===----------------------------------------------------------------------===//
-// V4 Machine Info +
+// V5 Machine Info +
 //===----------------------------------------------------------------------===//
 
-include "HexagonScheduleV4.td"
+include "HexagonScheduleV5.td"
 
 // V55 Machine Info +
 include "HexagonScheduleV55.td"
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV5.td
similarity index 70%
rename from lib/Target/Hexagon/HexagonScheduleV4.td
rename to lib/Target/Hexagon/HexagonScheduleV5.td
index 69b704a805b82d5ba85906c429ed8e09f9a8428b..9a893f6dde027c11e14e698a5c2114207c27d6cd 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV5.td
@@ -1,4 +1,4 @@
-//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//=-HexagonScheduleV5.td - HexagonV5 Scheduling Definitions --*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,8 +10,8 @@
 def LD_tc_ld_SLOT01 : InstrItinClass;
 def ST_tc_st_SLOT01 : InstrItinClass;
 
-class HexagonV4PseudoItin {
-  list<InstrItinData> V4PseudoItin_list = [
+class HexagonV5PseudoItin {
+  list<InstrItinData> V5PseudoItin_list = [
     InstrItinData<PSEUDO,     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
     InstrItinData<PSEUDOM,    [InstrStage<1, [SLOT2, SLOT3], 0>,
                                InstrStage<1, [SLOT2, SLOT3]>]>,
@@ -20,27 +20,27 @@ class HexagonV4PseudoItin {
   ];
 }
 
-def HexagonV4ItinList : DepScalarItinV4, HexagonV4PseudoItin {
-  list<InstrItinData> V4Itin_list = [
+def HexagonV5ItinList : DepScalarItinV5, HexagonV5PseudoItin {
+  list<InstrItinData> V5Itin_list = [
     InstrItinData<LD_tc_ld_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>,
     InstrItinData<ST_tc_st_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>
   ];
   list<InstrItinData> ItinList =
-    !listconcat(V4Itin_list, DepScalarItinV4_list, V4PseudoItin_list);
+    !listconcat(V5Itin_list, DepScalarItinV5_list, V5PseudoItin_list);
 }
 
-def HexagonItinerariesV4 :
+def HexagonItinerariesV5 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP],
-                           [Hex_FWD], HexagonV4ItinList.ItinList>;
+                           [Hex_FWD], HexagonV5ItinList.ItinList>;
 
-def HexagonModelV4 : SchedMachineModel {
+def HexagonModelV5 : SchedMachineModel {
   // Max issue per cycle == bundle width.
   let IssueWidth = 4;
-  let Itineraries = HexagonItinerariesV4;
+  let Itineraries = HexagonItinerariesV5;
   let LoadLatency = 1;
   let CompleteModel = 0;
 }
 
 //===----------------------------------------------------------------------===//
-// Hexagon V4 Resource Definitions -
+// Hexagon V5 Resource Definitions -
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index c41f0d3c085c8bab21361327854c9381130d9d75..55de25120943b304da4227e170bdb04ac4f624d5 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -63,7 +63,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
   auto &HST = Fn.getSubtarget<HexagonSubtarget>();
   auto &HTM = static_cast<const HexagonTargetMachine&>(Fn.getTarget());
   auto &TLOF = *HTM.getObjFileLowering();
-  if (HST.useSmallData() && TLOF.isSmallDataEnabled())
+  if (HST.useSmallData() && TLOF.isSmallDataEnabled(HTM))
     return false;
 
   const TargetInstrInfo *TII = HST.getInstrInfo();
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 0686d6eb6118689c491f0b1bdf02d1a818c49581..68e276be0f691c15ce3831024e6eaf71afb83bda 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -93,7 +93,6 @@ HexagonSubtarget &
 HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
   static std::map<StringRef, Hexagon::ArchEnum> CpuTable{
       {"generic", Hexagon::ArchEnum::V60},
-      {"hexagonv4", Hexagon::ArchEnum::V4},
       {"hexagonv5", Hexagon::ArchEnum::V5},
       {"hexagonv55", Hexagon::ArchEnum::V55},
       {"hexagonv60", Hexagon::ArchEnum::V60},
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index dc8d173a5057d67b0390f86e3664d78a7d48d7a3..eaae4db6ba90a6ce502d6abcbbc27eaecdd9b044 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -59,7 +59,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
 
 public:
   Hexagon::ArchEnum HexagonArchVersion;
-  Hexagon::ArchEnum HexagonHVXVersion = Hexagon::ArchEnum::V4;
+  Hexagon::ArchEnum HexagonHVXVersion = Hexagon::ArchEnum::NoArch;
   CodeGenOpt::Level OptLevel;
   /// True if the target should use Back-Skip-Back scheduling. This is the
   /// default for V60.
@@ -158,7 +158,9 @@ public:
   bool useNewValueStores() const { return UseNewValueStores; }
   bool useSmallData() const { return UseSmallData; }
 
-  bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::V4; }
+  bool useHVXOps() const {
+    return HexagonHVXVersion > Hexagon::ArchEnum::NoArch;
+  }
   bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; }
   bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; }
 
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index e771f383dffafc32f3a3fe947ac9b9f25589df8b..386cd14c827beaf3e6cb7e03dd8b332f5162a3cf 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -199,6 +199,11 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
 /// section.
 bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
       const TargetMachine &TM) const {
+  if (!isSmallDataEnabled(TM)) {
+    LLVM_DEBUG(dbgs() << "Small data is not available.\n");
+    return false;
+  }
+
   // Only global variables, not functions.
   LLVM_DEBUG(dbgs() << "Checking if value is in small-data, -G"
                     << SmallDataThreshold << ": \"" << GO->getName() << "\": ");
@@ -263,8 +268,9 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
   return true;
 }
 
-bool HexagonTargetObjectFile::isSmallDataEnabled() const {
-  return SmallDataThreshold > 0;
+bool HexagonTargetObjectFile::isSmallDataEnabled(const TargetMachine &TM)
+    const {
+  return SmallDataThreshold > 0 && !TM.isPositionIndependent();
 }
 
 unsigned HexagonTargetObjectFile::getSmallDataSize() const {
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index eff44f097e03f58677bcd2b92d264ed20c99eb90..18863630fde23d5a9812604642fa80e63ae4162b 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -29,7 +29,7 @@ namespace llvm {
     bool isGlobalInSmallSection(const GlobalObject *GO,
                                 const TargetMachine &TM) const;
 
-    bool isSmallDataEnabled() const;
+    bool isSmallDataEnabled(const TargetMachine &TM) const;
 
     unsigned getSmallDataSize() const;
 
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 4d0e7dc52e80f50ca63ed4cb7eba443f28aa80e1..c942f645aa88706859acc84210b952c12aaddf5e 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -54,7 +54,7 @@ bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
     return false;
   if (ST.isHVXVectorType(VecVT.getSimpleVT()))
     return true;
-  auto Action = TLI.getPreferredVectorAction(VecVT);
+  auto Action = TLI.getPreferredVectorAction(VecVT.getSimpleVT());
   return Action == TargetLoweringBase::TypeWidenVector;
 }
 
@@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      unsigned Alignment, unsigned AddressSpace) {
-  if (Indices.size() != Factor)
+      unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+      bool UseMaskForGaps) {
+  if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
   return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 2c03cd268ff4baffd22e110aa595490ddae583b4..5c6f85584ec2f45a1d9d7828cca4f12d29d0473b 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,8 @@ public:
             bool VariableMask, unsigned Alignment);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
-            unsigned AddressSpace);
+            unsigned AddressSpace, bool UseMaskForCond = false,
+            bool UseMaskForGaps = false);
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
             const Instruction *I);
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index a896700df1b07110541f1ba3c44ae68938876111..722699907ca04aa71411dc5ac2732cbd2900a623 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -768,7 +768,7 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
 
   // Make sure that for non-POST_INC stores:
   // 1. The only use of reg is DepReg and no other registers.
-  //    This handles V4 base+index registers.
+  //    This handles base+index registers.
   //    The following store can not be dot new.
   //    Eg.   r0 = add(r0, #3)
   //          memw(r1+r0<<#2) = r0
@@ -838,11 +838,7 @@ static bool isImplicitDependency(const MachineInstr &I, bool CheckDef,
   return false;
 }
 
-// Check to see if an instruction can be dot new
-// There are three kinds.
-// 1. dot new on predicate - V2/V3/V4
-// 2. dot new on stores NV/ST - V4
-// 3. dot new on jump NV/J - V4 -- This is generated in a pass.
+// Check to see if an instruction can be dot new.
 bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
       const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
       const TargetRegisterClass* RC) {
@@ -1075,9 +1071,6 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
   if (MI.isInlineAsm() && !ScheduleInlineAsm)
     return true;
 
-  // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints:
-  // trap, pause, barrier, icinva, isync, and syncht are solo instructions.
-  // They must not be grouped with other instructions in a packet.
   if (isSchedBarrier(MI))
     return true;
 
@@ -1289,8 +1282,8 @@ bool HexagonPacketizerList::hasRegMaskDependence(const MachineInstr &I,
   return false;
 }
 
-bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
-                                                    const MachineInstr &J) {
+bool HexagonPacketizerList::hasDualStoreDependence(const MachineInstr &I,
+                                                   const MachineInstr &J) {
   bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
   bool StoreI = I.mayStore(), StoreJ = J.mayStore();
   if ((SysI && StoreJ) || (SysJ && StoreI))
@@ -1343,10 +1336,10 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
   if (Dependence)
     return false;
 
-  // V4 allows dual stores. It does not allow second store, if the first
-  // store is not in SLOT0. New value store, new value jump, dealloc_return
-  // and memop always take SLOT0. Arch spec 3.4.4.2.
-  Dependence = hasV4SpecificDependence(I, J);
+  // Dual-store does not allow second store, if the first store is not
+  // in SLOT0. New value store, new value jump, dealloc_return and memop
+  // always take SLOT0. Arch spec 3.4.4.2.
+  Dependence = hasDualStoreDependence(I, J);
   if (Dependence)
     return false;
 
@@ -1505,10 +1498,10 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     }
 
     // For Order dependences:
-    // 1. On V4 or later, volatile loads/stores can be packetized together,
-    //    unless other rules prevent is.
+    // 1. Volatile loads/stores can be packetized together, unless other
+    //    rules prevent is.
     // 2. Store followed by a load is not allowed.
-    // 3. Store followed by a store is only valid on V4 or later.
+    // 3. Store followed by a store is valid.
     // 4. Load followed by any memory operation is allowed.
     if (DepType == SDep::Order) {
       if (!PacketizeVolatiles) {
@@ -1555,7 +1548,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       continue;
     }
 
-    // For V4, special case ALLOCFRAME. Even though there is dependency
+    // Special case for ALLOCFRAME: even though there is dependency
     // between ALLOCFRAME and subsequent store, allow it to be packetized
     // in a same packet. This implies that the store is using the caller's
     // SP. Hence, offset needs to be updated accordingly.
@@ -1575,6 +1568,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
             if (GlueAllocframeStore)
               continue;
           }
+          break;
         default:
           break;
       }
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index d54dd7050e1f228ed228053f54bc31db291afe35..ca70cf967a46fa994fd12bc34fea78f3efdcdd65 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -149,7 +149,7 @@ protected:
   bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
   bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
   bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
-  bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
+  bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
   bool producesStall(const MachineInstr &MI);
 };
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index b208a36681242d44438bf5d436a030a0f3ffe1b0..f0654d612b4bea8cf28a619d1624fc2a92801f6f 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -127,6 +127,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x4;
     }
+    break;
   case HexagonII::HSIG_L2:
     switch (Gb) {
     default:
@@ -138,6 +139,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x5;
     }
+    break;
   case HexagonII::HSIG_S1:
     switch (Gb) {
     default:
@@ -151,6 +153,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x6;
     }
+    break;
   case HexagonII::HSIG_S2:
     switch (Gb) {
     default:
@@ -166,6 +169,7 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x7;
     }
+    break;
   case HexagonII::HSIG_A:
     switch (Gb) {
     default:
@@ -173,11 +177,13 @@ unsigned HexagonMCInstrInfo::iClassOfDuplexPair(unsigned Ga, unsigned Gb) {
     case HexagonII::HSIG_A:
       return 0x3;
     }
+    break;
   case HexagonII::HSIG_Compound:
     switch (Gb) {
     case HexagonII::HSIG_Compound:
       return 0xFFFFFFFF;
     }
+    break;
   }
   return 0xFFFFFFFF;
 }
@@ -634,8 +640,7 @@ bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
       return false;
   }
 
-  if (STI.getCPU().equals_lower("hexagonv4") ||
-      STI.getCPU().equals_lower("hexagonv5") ||
+  if (STI.getCPU().equals_lower("hexagonv5") ||
       STI.getCPU().equals_lower("hexagonv55") ||
       STI.getCPU().equals_lower("hexagonv60")) {
     // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index b211a81524fb93b5e932d730dff9a42f0ad9820c..8f3c09e7204f5407e40462f8c781dcbba6160543 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -61,8 +61,6 @@ cl::opt<bool> llvm::HexagonDisableDuplex
    cl::desc("Disable looking for duplex instructions for Hexagon"));
 
 namespace { // These flags are to be deprecated
-cl::opt<bool> MV4("mv4", cl::Hidden, cl::desc("Build for Hexagon V4"),
-                  cl::init(false));
 cl::opt<bool> MV5("mv5", cl::Hidden, cl::desc("Build for Hexagon V5"),
                   cl::init(false));
 cl::opt<bool> MV55("mv55", cl::Hidden, cl::desc("Build for Hexagon V55"),
@@ -83,18 +81,18 @@ cl::opt<Hexagon::ArchEnum>
         clEnumValN(Hexagon::ArchEnum::V62, "v62", "Build for HVX v62"),
         clEnumValN(Hexagon::ArchEnum::V65, "v65", "Build for HVX v65"),
         // Sentinal for no value specified
-        clEnumValN(Hexagon::ArchEnum::V5, "", "")),
+        clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
       // Sentinal for flag not present
-      cl::init(Hexagon::ArchEnum::V4), cl::ValueOptional);
+      cl::init(Hexagon::ArchEnum::NoArch), cl::ValueOptional);
+
 static cl::opt<bool>
-  DisableHVX("mno-hvx", cl::Hidden, cl::desc("Disable Hexagon Vector eXtensions"));
+  DisableHVX("mno-hvx", cl::Hidden,
+             cl::desc("Disable Hexagon Vector eXtensions"));
 
 
 static StringRef DefaultArch = "hexagonv60";
 
 static StringRef HexagonGetArchVariant() {
-  if (MV4)
-    return "hexagonv4";
   if (MV5)
     return "hexagonv5";
   if (MV55)
@@ -123,7 +121,7 @@ StringRef Hexagon_MC::selectHexagonCPU(StringRef CPU) {
   return ArchV;
 }
 
-unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV4FU::SLOT3; }
+unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV5FU::SLOT3; }
 
 namespace {
 
@@ -279,6 +277,7 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
     Result.push_back(FS);
 
   switch (EnableHVX) {
+  case Hexagon::ArchEnum::V5:
   case Hexagon::ArchEnum::V55:
     break;
   case Hexagon::ArchEnum::V60:
@@ -290,14 +289,14 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
   case Hexagon::ArchEnum::V65:
     Result.push_back("+hvxv65");
     break;
-  case Hexagon::ArchEnum::V5:{
+  case Hexagon::ArchEnum::Generic:{
     Result.push_back(StringSwitch<StringRef>(CPU)
              .Case("hexagonv60", "+hvxv60")
              .Case("hexagonv62", "+hvxv62")
              .Case("hexagonv65", "+hvxv65"));
     break;
   }
-  case Hexagon::ArchEnum::V4:
+  case Hexagon::ArchEnum::NoArch:
     // Sentinal if -mhvx isn't specified
     break;
   }
@@ -307,15 +306,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
 
 static bool isCPUValid(std::string CPU)
 {
-  std::vector<std::string> table
-  {
-    "generic",
-    "hexagonv4",
-    "hexagonv5",
-    "hexagonv55",
-    "hexagonv60",
-    "hexagonv62",
-    "hexagonv65",
+  std::vector<std::string> table {
+    "generic",    "hexagonv5",  "hexagonv55", "hexagonv60",
+    "hexagonv62", "hexagonv65",
   };
 
   return std::find(table.begin(), table.end(), CPU) != table.end();
@@ -336,8 +329,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
   // Make sure that +hvx-length turns hvx on, and that "hvx" alone
   // turns on hvxvNN, corresponding to the existing ArchVNN.
   FeatureBitset FB = S;
-  unsigned CpuArch = ArchV4;
-  for (unsigned F : {ArchV65, ArchV62, ArchV60, ArchV55, ArchV5, ArchV4}) {
+  unsigned CpuArch = ArchV5;
+  for (unsigned F : {ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
     if (!FB.test(F))
       continue;
     CpuArch = F;
@@ -402,7 +395,6 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
 
 unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
   static std::map<StringRef,unsigned> ElfFlags = {
-    {"hexagonv4",  ELF::EF_HEXAGON_MACH_V4},
     {"hexagonv5",  ELF::EF_HEXAGON_MACH_V5},
     {"hexagonv55", ELF::EF_HEXAGON_MACH_V55},
     {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
diff --git a/lib/Target/MSP430/AsmParser/CMakeLists.txt b/lib/Target/MSP430/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bb484898afa0d26699bbb5aa09aa7b5fca8e41b8
--- /dev/null
+++ b/lib/Target/MSP430/AsmParser/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMMSP430AsmParser
+  MSP430AsmParser.cpp
+)
diff --git a/lib/Target/MSP430/AsmParser/LLVMBuild.txt b/lib/Target/MSP430/AsmParser/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..58f67c07db1849e4d377970f8dd2e99e54fd0cbf
--- /dev/null
+++ b/lib/Target/MSP430/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- lib/Target/MSP430/AsmParser/LLVMBuild.txt ----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MSP430AsmParser
+parent = MSP430
+required_libraries = MC MCParser MSP430Desc MSP430Info Support
+add_to_library_groups = MSP430
diff --git a/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f7d1860e9a9388d88dedc1dab8aed6c7569d928
--- /dev/null
+++ b/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -0,0 +1,562 @@
+//===- MSP430AsmParser.cpp - Parse MSP430 assembly to MCInst instructions -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430RegisterInfo.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define DEBUG_TYPE "msp430-asm-parser"
+
+namespace llvm {
+
+/// Parses MSP430 assembly from a stream.
+class MSP430AsmParser : public MCTargetAsmParser {
+  const MCSubtargetInfo &STI;
+  MCAsmParser &Parser;
+  const MCRegisterInfo *MRI;
+
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+
+  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+                        SMLoc NameLoc, OperandVector &Operands) override;
+
+  bool ParseDirective(AsmToken DirectiveID) override;
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
+
+  bool parseJccInstruction(ParseInstructionInfo &Info, StringRef Name,
+                           SMLoc NameLoc, OperandVector &Operands);
+
+  bool ParseOperand(OperandVector &Operands);
+
+  bool ParseLiteralValues(unsigned Size, SMLoc L);
+
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  /// @name Auto-generated Matcher Functions
+  /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "MSP430GenAsmMatcher.inc"
+
+  /// }
+
+public:
+  MSP430AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+                  const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI, MII), STI(STI), Parser(Parser) {
+    MCAsmParserExtension::Initialize(Parser);
+    MRI = getContext().getRegisterInfo();
+
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+  }
+};
+
+/// A parsed MSP430 assembly operand.
+class MSP430Operand : public MCParsedAsmOperand {
+  typedef MCParsedAsmOperand Base;
+
+  enum KindTy {
+    k_Imm,
+    k_Reg,
+    k_Tok,
+    k_Mem,
+    k_IndReg,
+    k_PostIndReg
+  } Kind;
+
+  struct Memory {
+    unsigned Reg;
+    const MCExpr *Offset;
+  };
+  union {
+    const MCExpr *Imm;
+    unsigned      Reg;
+    StringRef     Tok;
+    Memory        Mem;
+  };
+
+  SMLoc Start, End;
+
+public:
+  MSP430Operand(StringRef Tok, SMLoc const &S)
+      : Base(), Kind(k_Tok), Tok(Tok), Start(S), End(S) {}
+  MSP430Operand(KindTy Kind, unsigned Reg, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(Kind), Reg(Reg), Start(S), End(E) {}
+  MSP430Operand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Imm), Imm(Imm), Start(S), End(E) {}
+  MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S, SMLoc const &E)
+      : Base(), Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {}
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert((Kind == k_Reg || Kind == k_IndReg || Kind == k_PostIndReg) &&
+        "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    Inst.addOperand(MCOperand::createReg(Reg));
+  }
+
+  void addExprOperand(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediate when possible
+    if (!Expr)
+      Inst.addOperand(MCOperand::createImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::createImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::createExpr(Expr));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Imm && "Unexpected operand kind");
+    assert(N == 1 && "Invalid number of operands!");
+
+    addExprOperand(Inst, Imm);
+  }
+
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert(Kind == k_Mem && "Unexpected operand kind");
+    assert(N == 2 && "Invalid number of operands");
+
+    Inst.addOperand(MCOperand::createReg(Mem.Reg));
+    addExprOperand(Inst, Mem.Offset);
+  }
+
+  bool isReg() const        { return Kind == k_Reg; }
+  bool isImm() const        { return Kind == k_Imm; }
+  bool isToken() const      { return Kind == k_Tok; }
+  bool isMem() const        { return Kind == k_Mem; }
+  bool isIndReg() const     { return Kind == k_IndReg; }
+  bool isPostIndReg() const { return Kind == k_PostIndReg; }
+
+  bool isCGImm() const {
+    if (Kind != k_Imm)
+      return false;
+
+    int64_t Val;
+    if (!Imm->evaluateAsAbsolute(Val))
+      return false;
+    
+    if (Val == 0 || Val == 1 || Val == 2 || Val == 4 || Val == 8 || Val == -1)
+      return true;
+
+    return false;
+  }
+
+  StringRef getToken() const {
+    assert(Kind == k_Tok && "Invalid access!");
+    return Tok;
+  }
+
+  unsigned getReg() const {
+    assert(Kind == k_Reg && "Invalid access!");
+    return Reg;
+  }
+
+  void setReg(unsigned RegNo) {
+    assert(Kind == k_Reg && "Invalid access!");
+    Reg = RegNo;
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateToken(StringRef Str, SMLoc S) {
+    return make_unique<MSP430Operand>(Str, S);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateReg(unsigned RegNum, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(k_Reg, RegNum, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateImm(const MCExpr *Val, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(Val, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateMem(unsigned RegNum,
+                                                  const MCExpr *Val,
+                                                  SMLoc S, SMLoc E) {
+    return make_unique<MSP430Operand>(RegNum, Val, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreateIndReg(unsigned RegNum, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(k_IndReg, RegNum, S, E);
+  }
+
+  static std::unique_ptr<MSP430Operand> CreatePostIndReg(unsigned RegNum, SMLoc S,
+                                                  SMLoc E) {
+    return make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E);
+  }
+
+  SMLoc getStartLoc() const { return Start; }
+  SMLoc getEndLoc() const { return End; }
+
+  virtual void print(raw_ostream &O) const {
+    switch (Kind) {
+    case k_Tok:
+      O << "Token " << Tok;
+      break;
+    case k_Reg:
+      O << "Register " << Reg;
+      break;
+    case k_Imm:
+      O << "Immediate " << *Imm;
+      break;
+    case k_Mem:
+      O << "Memory ";
+      O << *Mem.Offset << "(" << Reg << ")";
+      break;
+    case k_IndReg:
+      O << "RegInd " << Reg;
+      break;
+    case k_PostIndReg:
+      O << "PostInc " << Reg;
+      break;
+    }
+  }
+};
+
+bool MSP430AsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
+                                              OperandVector &Operands,
+                                              MCStreamer &Out,
+                                              uint64_t &ErrorInfo,
+                                              bool MatchingInlineAsm) {
+  MCInst Inst;
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+
+  switch (MatchResult) {
+  case Match_Success:
+    Inst.setLoc(Loc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  case Match_MnemonicFail:
+    return Error(Loc, "invalid instruction mnemonic");
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = Loc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(ErrorLoc, "too few operands for instruction");
+
+      ErrorLoc = ((MSP430Operand &)*Operands[ErrorInfo]).getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = Loc;
+    }
+    return Error(ErrorLoc, "invalid operand for instruction");
+  }
+  default:
+    return true;
+  }
+}
+
+// Auto-generated by TableGen
+static unsigned MatchRegisterName(StringRef Name);
+static unsigned MatchRegisterAltName(StringRef Name);
+
+bool MSP430AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                    SMLoc &EndLoc) {
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    auto Name = getLexer().getTok().getIdentifier().lower();
+    RegNo = MatchRegisterName(Name);
+    if (RegNo == MSP430::NoRegister) {
+      RegNo = MatchRegisterAltName(Name);
+      if (RegNo == MSP430::NoRegister)
+        return true;
+    }
+
+    AsmToken const &T = getParser().getTok();
+    StartLoc = T.getLoc();
+    EndLoc = T.getEndLoc();
+    getLexer().Lex(); // eat register token
+
+    return false;
+  }
+
+  return Error(StartLoc, "invalid register name");
+}
+
+bool MSP430AsmParser::parseJccInstruction(ParseInstructionInfo &Info,
+                                          StringRef Name, SMLoc NameLoc,
+                                          OperandVector &Operands) {
+  if (!Name.startswith_lower("j"))
+    return true;
+
+  auto CC = Name.drop_front().lower();
+  unsigned CondCode;
+  if (CC == "ne" || CC == "nz")
+    CondCode = MSP430CC::COND_NE;
+  else if (CC == "eq" || CC == "z")
+    CondCode = MSP430CC::COND_E;
+  else if (CC == "lo" || CC == "nc")
+    CondCode = MSP430CC::COND_LO;
+  else if (CC == "hs" || CC == "c")
+    CondCode = MSP430CC::COND_HS;
+  else if (CC == "n")
+    CondCode = MSP430CC::COND_N;
+  else if (CC == "ge")
+    CondCode = MSP430CC::COND_GE;
+  else if (CC == "l")
+    CondCode = MSP430CC::COND_L;
+  else if (CC == "mp")
+    CondCode = MSP430CC::COND_NONE;
+  else
+    return Error(NameLoc, "unknown instruction");
+
+  if (CondCode == (unsigned)MSP430CC::COND_NONE)
+    Operands.push_back(MSP430Operand::CreateToken("jmp", NameLoc));
+  else {
+    Operands.push_back(MSP430Operand::CreateToken("j", NameLoc));
+    const MCExpr *CCode = MCConstantExpr::create(CondCode, getContext());
+    Operands.push_back(MSP430Operand::CreateImm(CCode, SMLoc(), SMLoc()));
+  }
+
+  // Skip optional '$' sign.
+  if (getLexer().getKind() == AsmToken::Dollar)
+    getLexer().Lex(); // Eat '$'
+
+  const MCExpr *Val;
+  SMLoc ExprLoc = getLexer().getLoc();
+  if (getParser().parseExpression(Val))
+    return Error(ExprLoc, "expected expression operand");
+
+  int64_t Res;
+  if (Val->evaluateAsAbsolute(Res))
+    if (Res < -512 || Res > 511)
+      return Error(ExprLoc, "invalid jump offset");
+
+  Operands.push_back(MSP430Operand::CreateImm(Val, ExprLoc,
+    getLexer().getLoc()));
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    getParser().eatToEndOfStatement();
+    return Error(Loc, "unexpected token");
+  }
+
+  getParser().Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MSP430AsmParser::ParseInstruction(ParseInstructionInfo &Info,
+                                       StringRef Name, SMLoc NameLoc,
+                                       OperandVector &Operands) {
+  // Drop .w suffix
+  if (Name.endswith_lower(".w"))
+    Name = Name.drop_back(2);
+
+  if (!parseJccInstruction(Info, Name, NameLoc, Operands))
+    return false;
+
+  // First operand is instruction mnemonic
+  Operands.push_back(MSP430Operand::CreateToken(Name, NameLoc));
+
+  // If there are no more operands, then finish
+  if (getLexer().is(AsmToken::EndOfStatement))
+    return false;
+
+  // Parse first operand
+  if (ParseOperand(Operands))
+    return true;
+
+  // Parse second operand if any
+  if (getLexer().is(AsmToken::Comma)) {
+    getLexer().Lex(); // Eat ','
+    if (ParseOperand(Operands))
+      return true;
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    SMLoc Loc = getLexer().getLoc();
+    getParser().eatToEndOfStatement();
+    return Error(Loc, "unexpected token");
+  }
+
+  getParser().Lex(); // Consume the EndOfStatement.
+  return false;
+}
+
+bool MSP430AsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal.lower() == ".long") {
+    ParseLiteralValues(4, DirectiveID.getLoc());
+  } else if (IDVal.lower() == ".word" || IDVal.lower() == ".short") {
+    ParseLiteralValues(2, DirectiveID.getLoc());
+  } else if (IDVal.lower() == ".byte") {
+    ParseLiteralValues(1, DirectiveID.getLoc());
+  }
+  return true;
+}
+
+bool MSP430AsmParser::ParseOperand(OperandVector &Operands) {
+  switch (getLexer().getKind()) {
+    default: return true;
+    case AsmToken::Identifier: {
+      // try rN
+      unsigned RegNo;
+      SMLoc StartLoc, EndLoc;
+      if (!ParseRegister(RegNo, StartLoc, EndLoc)) {
+        Operands.push_back(MSP430Operand::CreateReg(RegNo, StartLoc, EndLoc));
+        return false;
+      }
+      LLVM_FALLTHROUGH;
+    }
+    case AsmToken::Integer:
+    case AsmToken::Plus:
+    case AsmToken::Minus: {
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      const MCExpr *Val;
+      // Try constexpr[(rN)]
+      if (!getParser().parseExpression(Val)) {
+        unsigned RegNo = MSP430::PC;
+        SMLoc EndLoc = getParser().getTok().getLoc();
+        // Try (rN)
+        if (getLexer().getKind() == AsmToken::LParen) {
+          getLexer().Lex(); // Eat '('
+          SMLoc RegStartLoc;
+          if (ParseRegister(RegNo, RegStartLoc, EndLoc))
+            return true;
+          if (getLexer().getKind() != AsmToken::RParen)
+            return true;
+          EndLoc = getParser().getTok().getEndLoc();
+          getLexer().Lex(); // Eat ')'
+        }
+        Operands.push_back(MSP430Operand::CreateMem(RegNo, Val, StartLoc,
+          EndLoc));
+        return false;
+      }
+      return true;
+    }
+    case AsmToken::Amp: {
+      // Try &constexpr
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      getLexer().Lex(); // Eat '&'
+      const MCExpr *Val;
+      if (!getParser().parseExpression(Val)) {
+        SMLoc EndLoc = getParser().getTok().getLoc();
+        Operands.push_back(MSP430Operand::CreateMem(MSP430::SR, Val, StartLoc,
+          EndLoc));
+        return false;
+      }
+      return true;
+    }
+    case AsmToken::At: {
+      // Try @rN[+]
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      getLexer().Lex(); // Eat '@'
+      unsigned RegNo;
+      SMLoc RegStartLoc, EndLoc;
+      if (ParseRegister(RegNo, RegStartLoc, EndLoc))
+        return true;
+      if (getLexer().getKind() == AsmToken::Plus) {
+        Operands.push_back(MSP430Operand::CreatePostIndReg(RegNo, StartLoc, EndLoc));
+        getLexer().Lex(); // Eat '+'
+        return false;
+      }
+      Operands.push_back(MSP430Operand::CreateIndReg(RegNo, StartLoc, EndLoc));
+      return false;
+    }
+    case AsmToken::Hash:
+      // Try #constexpr
+      SMLoc StartLoc = getParser().getTok().getLoc();
+      getLexer().Lex(); // Eat '#'
+      const MCExpr *Val;
+      if (!getParser().parseExpression(Val)) {
+        SMLoc EndLoc = getParser().getTok().getLoc();
+        Operands.push_back(MSP430Operand::CreateImm(Val, StartLoc, EndLoc));
+        return false;
+      }
+      return true;
+  }
+}
+
+bool MSP430AsmParser::ParseLiteralValues(unsigned Size, SMLoc L) {
+  auto parseOne = [&]() -> bool {
+    const MCExpr *Value;
+    if (getParser().parseExpression(Value))
+      return true;
+    getParser().getStreamer().EmitValue(Value, Size, L);
+    return false;
+  };
+  return (parseMany(parseOne));
+}
+
+extern "C" void LLVMInitializeMSP430AsmParser() {
+  RegisterMCAsmParser<MSP430AsmParser> X(getTheMSP430Target());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MSP430GenAsmMatcher.inc"
+
+static unsigned convertGR16ToGR8(unsigned Reg) {
+  switch (Reg) {
+  default:
+    llvm_unreachable("Unknown GR16 register");
+  case MSP430::PC:  return MSP430::PCB;
+  case MSP430::SP:  return MSP430::SPB;
+  case MSP430::SR:  return MSP430::SRB;
+  case MSP430::CG:  return MSP430::CGB;
+  case MSP430::FP:  return MSP430::FPB;
+  case MSP430::R5:  return MSP430::R5B;
+  case MSP430::R6:  return MSP430::R6B;
+  case MSP430::R7:  return MSP430::R7B;
+  case MSP430::R8:  return MSP430::R8B;
+  case MSP430::R9:  return MSP430::R9B;
+  case MSP430::R10: return MSP430::R10B;
+  case MSP430::R11: return MSP430::R11B;
+  case MSP430::R12: return MSP430::R12B;
+  case MSP430::R13: return MSP430::R13B;
+  case MSP430::R14: return MSP430::R14B;
+  case MSP430::R15: return MSP430::R15B;
+  }
+}
+
+unsigned MSP430AsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp,
+                                                     unsigned Kind) {
+  MSP430Operand &Op = static_cast<MSP430Operand &>(AsmOp);
+
+  if (!Op.isReg())
+    return Match_InvalidOperand;
+
+  unsigned Reg = Op.getReg();
+  bool isGR16 =
+      MSP430MCRegisterClasses[MSP430::GR16RegClassID].contains(Reg);
+
+  if (isGR16 && (Kind == MCK_GR8)) {
+    Op.setReg(convertGR16ToGR8(Reg));
+    return Match_Success;
+  }
+
+  return Match_InvalidOperand;
+}
+
+} // end of namespace llvm
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index 3facfd526a53501ace7665253645faedff9cefa0..2a0848fb3082156a109bd8bbd3fe09d15c39bbaa 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -1,9 +1,12 @@
 set(LLVM_TARGET_DEFINITIONS MSP430.td)
 
+tablegen(LLVM MSP430GenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM MSP430GenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM MSP430GenCallingConv.inc -gen-callingconv)
 tablegen(LLVM MSP430GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM MSP430GenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM MSP430GenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM MSP430GenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM MSP430GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM MSP430GenSubtargetInfo.inc -gen-subtarget)
 
@@ -26,3 +29,5 @@ add_llvm_target(MSP430CodeGen
 add_subdirectory(InstPrinter)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(TargetInfo)
+add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
diff --git a/lib/Target/MSP430/Disassembler/CMakeLists.txt b/lib/Target/MSP430/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bc33b9067726f622dc1358543c3913d03fa336a8
--- /dev/null
+++ b/lib/Target/MSP430/Disassembler/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_library(LLVMMSP430Disassembler
+  MSP430Disassembler.cpp
+  )
diff --git a/lib/Target/MSP430/Disassembler/LLVMBuild.txt b/lib/Target/MSP430/Disassembler/LLVMBuild.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8af9cd9c2224ed1b52e28325c8e99cc8c8ffb71f
--- /dev/null
+++ b/lib/Target/MSP430/Disassembler/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;====- lib/Target/MSP430/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = MSP430Disassembler
+parent = MSP430
+required_libraries = MCDisassembler MSP430Info Support
+add_to_library_groups = MSP430
diff --git a/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a66b4ed7f2d150fa2df1e1b081715fe1b17f0c8
--- /dev/null
+++ b/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -0,0 +1,375 @@
+//===-- MSP430Disassembler.cpp - Disassembler for MSP430 ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430Disassembler class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class MSP430Disassembler : public MCDisassembler {
+  DecodeStatus getInstructionI(MCInst &MI, uint64_t &Size,
+                               ArrayRef<uint8_t> Bytes, uint64_t Address,
+                               raw_ostream &VStream,
+                               raw_ostream &CStream) const;
+
+  DecodeStatus getInstructionII(MCInst &MI, uint64_t &Size,
+                                ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                raw_ostream &VStream,
+                                raw_ostream &CStream) const;
+
+  DecodeStatus getInstructionCJ(MCInst &MI, uint64_t &Size,
+                                ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                raw_ostream &VStream,
+                                raw_ostream &CStream) const;
+
+public:
+  MSP430Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+      : MCDisassembler(STI, Ctx) {}
+
+  DecodeStatus getInstruction(MCInst &MI, uint64_t &Size,
+                              ArrayRef<uint8_t> Bytes, uint64_t Address,
+                              raw_ostream &VStream,
+                              raw_ostream &CStream) const override;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createMSP430Disassembler(const Target &T,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new MSP430Disassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeMSP430Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(getTheMSP430Target(),
+                                         createMSP430Disassembler);
+}
+
+static const unsigned GR8DecoderTable[] = {
+  MSP430::PCB,  MSP430::SPB,  MSP430::SRB,  MSP430::CGB,
+  MSP430::FPB,  MSP430::R5B,  MSP430::R6B,  MSP430::R7B,
+  MSP430::R8B,  MSP430::R9B,  MSP430::R10B, MSP430::R11B,
+  MSP430::R12B, MSP430::R13B, MSP430::R14B, MSP430::R15B
+};
+
+static DecodeStatus DecodeGR8RegisterClass(MCInst &MI, uint64_t RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GR8DecoderTable[RegNo];
+  MI.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static const unsigned GR16DecoderTable[] = {
+  MSP430::PC,  MSP430::SP,  MSP430::SR,  MSP430::CG,
+  MSP430::FP,  MSP430::R5,  MSP430::R6,  MSP430::R7,
+  MSP430::R8,  MSP430::R9,  MSP430::R10, MSP430::R11,
+  MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+};
+
+static DecodeStatus DecodeGR16RegisterClass(MCInst &MI, uint64_t RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = GR16DecoderTable[RegNo];
+  MI.addOperand(MCOperand::createReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
+                                const void *Decoder);
+
+static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+#include "MSP430GenDisassemblerTables.inc"
+
+static DecodeStatus DecodeCGImm(MCInst &MI, uint64_t Bits, uint64_t Address,
+                                const void *Decoder) {
+  int64_t Imm;
+  switch (Bits) {
+  default:
+    llvm_unreachable("Invalid immediate value");
+  case 0x22: Imm =  4; break;
+  case 0x32: Imm =  8; break;
+  case 0x03: Imm =  0; break;
+  case 0x13: Imm =  1; break;
+  case 0x23: Imm =  2; break;
+  case 0x33: Imm = -1; break;
+  }
+  MI.addOperand(MCOperand::createImm(Imm));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemOperand(MCInst &MI, uint64_t Bits,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  unsigned Reg = Bits & 15;
+  unsigned Imm = Bits >> 4;
+
+  if (DecodeGR16RegisterClass(MI, Reg, Address, Decoder) !=
+      MCDisassembler::Success)
+    return MCDisassembler::Fail;
+  
+  MI.addOperand(MCOperand::createImm((int16_t)Imm));
+  return MCDisassembler::Success;
+}
+
+enum AddrMode {
+  amInvalid = 0,
+  amRegister,
+  amIndexed,
+  amIndirect,
+  amIndirectPost,
+  amSymbolic,
+  amImmediate,
+  amAbsolute,
+  amConstant
+};
+
+static AddrMode DecodeSrcAddrMode(unsigned Rs, unsigned As) {
+  switch (Rs) {
+  case 0:
+    if (As == 1) return amSymbolic;
+    if (As == 2) return amInvalid;
+    if (As == 3) return amImmediate;
+    break;
+  case 2:
+    if (As == 1) return amAbsolute;
+    if (As == 2) return amConstant;
+    if (As == 3) return amConstant;
+    break;
+  case 3:
+    return amConstant;
+  default:
+    break;
+  }
+  switch (As) {
+  case 0: return amRegister;
+  case 1: return amIndexed;
+  case 2: return amIndirect;
+  case 3: return amIndirectPost;
+  default:
+    llvm_unreachable("As out of range");
+  }
+}
+
+static AddrMode DecodeSrcAddrModeI(unsigned Insn) {
+  unsigned Rs = fieldFromInstruction(Insn, 8, 4);
+  unsigned As = fieldFromInstruction(Insn, 4, 2);
+  return DecodeSrcAddrMode(Rs, As);
+}
+
+static AddrMode DecodeSrcAddrModeII(unsigned Insn) {
+  unsigned Rs = fieldFromInstruction(Insn, 0, 4);
+  unsigned As = fieldFromInstruction(Insn, 4, 2);
+  return DecodeSrcAddrMode(Rs, As);
+}
+
+static AddrMode DecodeDstAddrMode(unsigned Insn) {
+  unsigned Rd = fieldFromInstruction(Insn, 0, 4);
+  unsigned Ad = fieldFromInstruction(Insn, 7, 1);
+  switch (Rd) {
+  case 0: return Ad ? amSymbolic : amRegister;
+  case 2: return Ad ? amAbsolute : amRegister;
+  default:
+    break;
+  }
+  return Ad ? amIndexed : amRegister;
+}
+
+static const uint8_t *getDecoderTable(AddrMode SrcAM, unsigned Words) {
+  assert(0 < Words && Words < 4 && "Incorrect number of words");
+  switch (SrcAM) {
+  default:
+    llvm_unreachable("Invalid addressing mode");
+  case amRegister:
+    assert(Words < 3 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableAlpha32 : DecoderTableAlpha16;
+  case amConstant:
+    assert(Words < 3 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableBeta32 : DecoderTableBeta16;
+  case amIndexed:
+  case amSymbolic:
+  case amImmediate:
+  case amAbsolute:
+    assert(Words > 1 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableGamma32 : DecoderTableGamma48;
+  case amIndirect:
+  case amIndirectPost:
+    assert(Words < 3 && "Incorrect number of words");
+    return Words == 2 ? DecoderTableDelta32 : DecoderTableDelta16;
+  }
+}
+
+DecodeStatus MSP430Disassembler::getInstructionI(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &VStream,
+                                                 raw_ostream &CStream) const {
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  AddrMode SrcAM = DecodeSrcAddrModeI(Insn);
+  AddrMode DstAM = DecodeDstAddrMode(Insn);
+  if (SrcAM == amInvalid || DstAM == amInvalid) {
+    Size = 2; // skip one word and let disassembler to try further
+    return MCDisassembler::Fail;
+  }
+
+  unsigned Words = 1;
+  switch (SrcAM) {
+  case amIndexed:
+  case amSymbolic:
+  case amImmediate:
+  case amAbsolute:
+    Insn |= (uint64_t)support::endian::read16le(Bytes.data() + 2) << 16;
+    ++Words;
+    break;
+  default:
+    break;
+  }
+  switch (DstAM) {
+  case amIndexed:
+  case amSymbolic:
+  case amAbsolute:
+    Insn |= (uint64_t)support::endian::read16le(Bytes.data() + Words * 2)
+        << (Words * 16);
+    ++Words;
+    break;
+  default:
+    break;
+  }
+
+  DecodeStatus Result = decodeInstruction(getDecoderTable(SrcAM, Words), MI,
+                                          Insn, Address, this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = Words * 2;
+    return Result;
+  }
+
+  Size = 2;
+  return DecodeStatus::Fail;
+}
+
+DecodeStatus MSP430Disassembler::getInstructionII(MCInst &MI, uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address,
+                                                  raw_ostream &VStream,
+                                                  raw_ostream &CStream) const {
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  AddrMode SrcAM = DecodeSrcAddrModeII(Insn);
+  if (SrcAM == amInvalid) {
+    Size = 2; // skip one word and let disassembler to try further
+    return MCDisassembler::Fail;
+  }
+
+  unsigned Words = 1;
+  switch (SrcAM) {
+  case amIndexed:
+  case amSymbolic:
+  case amImmediate:
+  case amAbsolute:
+    Insn |= (uint64_t)support::endian::read16le(Bytes.data() + 2) << 16;
+    ++Words;
+    break;
+  default:
+    break;
+  }
+
+  const uint8_t *DecoderTable = Words == 2 ? DecoderTable32 : DecoderTable16;
+  DecodeStatus Result = decodeInstruction(DecoderTable, MI, Insn, Address,
+                                          this, STI);
+  if (Result != MCDisassembler::Fail) {
+    Size = Words * 2;
+    return Result;
+  }
+
+  Size = 2;
+  return DecodeStatus::Fail;
+}
+
+static MSP430CC::CondCodes getCondCode(unsigned Cond) {
+  switch (Cond) {
+  case 0: return MSP430CC::COND_NE;
+  case 1: return MSP430CC::COND_E;
+  case 2: return MSP430CC::COND_LO;
+  case 3: return MSP430CC::COND_HS;
+  case 4: return MSP430CC::COND_N;
+  case 5: return MSP430CC::COND_GE;
+  case 6: return MSP430CC::COND_L;
+  case 7: return MSP430CC::COND_NONE;
+  default:
+    llvm_unreachable("Cond out of range");
+  }
+}
+
+DecodeStatus MSP430Disassembler::getInstructionCJ(MCInst &MI, uint64_t &Size,
+                                                  ArrayRef<uint8_t> Bytes,
+                                                  uint64_t Address,
+                                                  raw_ostream &VStream,
+                                                  raw_ostream &CStream) const {
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  unsigned Cond = fieldFromInstruction(Insn, 10, 3);
+  unsigned Offset = fieldFromInstruction(Insn, 0, 10);
+
+  MI.addOperand(MCOperand::createImm(SignExtend32(Offset, 10)));
+
+  if (Cond == 7)
+    MI.setOpcode(MSP430::JMP);
+  else {
+    MI.setOpcode(MSP430::JCC);
+    MI.addOperand(MCOperand::createImm(getCondCode(Cond)));
+  }
+
+  Size = 2;
+  return DecodeStatus::Success;
+}
+
+DecodeStatus MSP430Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                ArrayRef<uint8_t> Bytes,
+                                                uint64_t Address,
+                                                raw_ostream &VStream,
+                                                raw_ostream &CStream) const {
+  if (Bytes.size() < 2) {
+    Size = 0;
+    return MCDisassembler::Fail;
+  }
+
+  uint64_t Insn = support::endian::read16le(Bytes.data());
+  unsigned Opc = fieldFromInstruction(Insn, 13, 3);
+  switch (Opc) {
+  case 0:
+    return getInstructionII(MI, Size, Bytes, Address, VStream, CStream);
+  case 1:
+    return getInstructionCJ(MI, Size, Bytes, Address, VStream, CStream);
+  default:
+    return getInstructionI(MI, Size, Bytes, Address, VStream, CStream);
+  }
+}
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
index be6d1a84a377518277a69dc97e58350480b95b28..4d62547bc65bedd46218788f61ffb8fbaa04b3b9 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -16,28 +16,34 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
-
 // Include the auto-generated portion of the assembly writer.
+#define PRINT_ALIAS_INSTR
 #include "MSP430GenAsmWriter.inc"
 
 void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                   StringRef Annot, const MCSubtargetInfo &STI) {
-  printInstruction(MI, O);
+  if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
   printAnnotation(O, Annot);
 }
 
 void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
                                              raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isImm())
-    O << Op.getImm();
-  else {
+  if (Op.isImm()) {
+    int64_t Imm = Op.getImm() * 2 + 2;
+    O << "$";
+    if (Imm >= 0)
+      O << '+';
+    O << Imm;
+  } else {
     assert(Op.isExpr() && "unknown pcrel immediate operand");
     Op.getExpr()->print(O, &MAI);
   }
@@ -72,7 +78,7 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
   // vs
   //   mov.w glb(r1), r2
   // Otherwise (!) msp430-as will silently miscompile the output :(
-  if (!Base.getReg())
+  if (Base.getReg() == MSP430::SR)
     O << '&';
 
   if (Disp.isExpr())
@@ -83,10 +89,23 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
   }
 
   // Print register base field
-  if (Base.getReg())
+  if ((Base.getReg() != MSP430::SR) &&
+      (Base.getReg() != MSP430::PC))
     O << '(' << getRegisterName(Base.getReg()) << ')';
 }
 
+void MSP430InstPrinter::printIndRegOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  O << "@" << getRegisterName(Base.getReg());
+}
+
+void MSP430InstPrinter::printPostIndRegOperand(const MCInst *MI, unsigned OpNo,
+                                               raw_ostream &O) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  O << "@" << getRegisterName(Base.getReg()) << "+";
+}
+
 void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
   unsigned CC = MI->getOperand(OpNo).getImm();
@@ -112,5 +131,8 @@ void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo,
   case MSP430CC::COND_L:
    O << 'l';
    break;
+  case MSP430CC::COND_N:
+   O << 'n';
+   break;
   }
 }
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index 72afec18becb2e3e7c9b9dcf5500a39855c63374..cd02c4fa645a2a7cd5583c469f017979afce0e41 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -28,13 +28,20 @@ namespace llvm {
 
     // Autogenerated by tblgen.
     void printInstruction(const MCInst *MI, raw_ostream &O);
+    bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+    void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                 unsigned PrintMethodIdx, raw_ostream &O);
     static const char *getRegisterName(unsigned RegNo);
 
+private:
     void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                       const char *Modifier = nullptr);
     void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
     void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                             const char *Modifier = nullptr);
+    void printIndRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+    void printPostIndRegOperand(const MCInst *MI, unsigned OpNo,
+                                raw_ostream &O);
     void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
   };
diff --git a/lib/Target/MSP430/LLVMBuild.txt b/lib/Target/MSP430/LLVMBuild.txt
index 51d9702ac56005825037f7711e5759dd32f3d4ec..0cbd1851777bba24ead74a35771233debfd21fc0 100644
--- a/lib/Target/MSP430/LLVMBuild.txt
+++ b/lib/Target/MSP430/LLVMBuild.txt
@@ -16,13 +16,15 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
 
 [component_0]
 type = TargetGroup
 name = MSP430
 parent = Target
+has_asmparser = 1
 has_asmprinter = 1
+has_disassembler = 1
 
 [component_1]
 type = Library
diff --git a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
index 0f3ebd303924154092f218e2efd9371ad6f16b15..a2f468779f50d634f4391855cd4402aeeb9b9755 100644
--- a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,8 @@
 add_llvm_library(LLVMMSP430Desc
-  MSP430MCTargetDesc.cpp
+  MSP430AsmBackend.cpp
+  MSP430ELFObjectWriter.cpp
+  MSP430ELFStreamer.cpp
   MSP430MCAsmInfo.cpp
+  MSP430MCCodeEmitter.cpp
+  MSP430MCTargetDesc.cpp
   )
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd69a9d8d795a2465ccbc2894a7611448162383a
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -0,0 +1,178 @@
+//===-- MSP430AsmBackend.cpp - MSP430 Assembler Backend -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MSP430FixupKinds.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class MSP430AsmBackend : public MCAsmBackend {
+  uint8_t OSABI;
+
+  uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
+                            MCContext &Ctx) const;
+
+public:
+  MSP430AsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI)
+      : MCAsmBackend(support::little), OSABI(OSABI) {}
+  ~MSP430AsmBackend() override {}
+
+  void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                  const MCValue &Target, MutableArrayRef<char> Data,
+                  uint64_t Value, bool IsResolved,
+                  const MCSubtargetInfo *STI) const override;
+
+  std::unique_ptr<MCObjectTargetWriter>
+  createObjectTargetWriter() const override {
+    return createMSP430ELFObjectWriter(OSABI);
+  }
+
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+
+  bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, bool Resolved,
+                                    uint64_t Value,
+                                    const MCRelaxableFragment *DF,
+                                    const MCAsmLayout &Layout,
+                                    const bool WasForced) const override {
+    return false;
+  }
+
+  unsigned getNumFixupKinds() const override {
+    return MSP430::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+    const static MCFixupKindInfo Infos[MSP430::NumTargetFixupKinds] = {
+      // This table must be in the same order of enum in MSP430FixupKinds.h.
+      //
+      // name            offset bits flags
+      {"fixup_32",            0, 32, 0},
+      {"fixup_10_pcrel",      0, 10, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_16",            0, 16, 0},
+      {"fixup_16_pcrel",      0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_16_byte",       0, 16, 0},
+      {"fixup_16_pcrel_byte", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_2x_pcrel",      0, 10, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_rl_pcrel",      0, 16, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_8",             0,  8, 0},
+      {"fixup_sym_diff",      0, 32, 0},
+    };
+    static_assert((array_lengthof(Infos)) == MSP430::NumTargetFixupKinds,
+                  "Not all fixup kinds added to Infos array");
+  
+    if (Kind < FirstTargetFixupKind)
+      return MCAsmBackend::getFixupKindInfo(Kind);
+  
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  bool mayNeedRelaxation(const MCInst &Inst,
+                         const MCSubtargetInfo &STI) const override {
+    return false;
+  }
+
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
+};
+
+uint64_t MSP430AsmBackend::adjustFixupValue(const MCFixup &Fixup,
+                                            uint64_t Value,
+                                            MCContext &Ctx) const {
+  unsigned Kind = Fixup.getKind();
+  switch (Kind) {
+  case MSP430::fixup_10_pcrel: {
+    if (Value & 0x1)
+      Ctx.reportError(Fixup.getLoc(), "fixup value must be 2-byte aligned");
+
+    // Offset is signed
+    int16_t Offset = Value;
+    // Jumps are in words
+    Offset >>= 1;
+    // PC points to the next instruction so decrement by one
+    --Offset;
+
+    if (Offset < -512 || Offset > 511)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+
+    // Mask 10 bits
+    Offset &= 0x3ff;
+
+    return Offset;
+  }
+  default:
+    return Value;
+  }
+}
+
+void MSP430AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+                                  const MCValue &Target,
+                                  MutableArrayRef<char> Data,
+                                  uint64_t Value, bool IsResolved,
+                                  const MCSubtargetInfo *STI) const {
+  Value = adjustFixupValue(Fixup, Value, Asm.getContext());
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+  if (!Value)
+    return; // Doesn't change encoding.
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  unsigned NumBytes = alignTo(Info.TargetSize + Info.TargetOffset, 8) / 8;
+
+  assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i) {
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+  }
+}
+
+bool MSP430AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
+  if ((Count % 2) != 0)
+    return false;
+
+  // The canonical nop on MSP430 is mov #0, r3
+  uint64_t NopCount = Count / 2;
+  while (NopCount--)
+    OS.write("\x03\x43", 2);
+
+  return true;
+}
+
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createMSP430MCAsmBackend(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             const MCRegisterInfo &MRI,
+                                             const MCTargetOptions &Options) {
+  return new MSP430AsmBackend(STI, ELF::ELFOSABI_STANDALONE);
+}
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30d077b5b58831f1b200bc136364f1f395e2ef25
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -0,0 +1,59 @@
+//===-- MSP430ELFObjectWriter.cpp - MSP430 ELF Writer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MSP430FixupKinds.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class MSP430ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  MSP430ELFObjectWriter(uint8_t OSABI)
+    : MCELFObjectTargetWriter(false, OSABI, ELF::EM_MSP430,
+                              /*HasRelocationAddend*/ true) {}
+
+  ~MSP430ELFObjectWriter() override {}
+
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override {
+    // Translate fixup kind to ELF relocation type.
+    switch ((unsigned)Fixup.getKind()) {
+    case FK_Data_1:                   return ELF::R_MSP430_8;
+    case FK_Data_2:                   return ELF::R_MSP430_16;
+    case FK_Data_4:                   return ELF::R_MSP430_32;
+    case MSP430::fixup_32:            return ELF::R_MSP430_32;
+    case MSP430::fixup_10_pcrel:      return ELF::R_MSP430_10_PCREL;
+    case MSP430::fixup_16:            return ELF::R_MSP430_16;
+    case MSP430::fixup_16_pcrel:      return ELF::R_MSP430_16_PCREL;
+    case MSP430::fixup_16_byte:       return ELF::R_MSP430_16_BYTE;
+    case MSP430::fixup_16_pcrel_byte: return ELF::R_MSP430_16_PCREL_BYTE;
+    case MSP430::fixup_2x_pcrel:      return ELF::R_MSP430_2X_PCREL;
+    case MSP430::fixup_rl_pcrel:      return ELF::R_MSP430_RL_PCREL;
+    case MSP430::fixup_8:             return ELF::R_MSP430_8;
+    case MSP430::fixup_sym_diff:      return ELF::R_MSP430_SYM_DIFF;
+    default:
+      llvm_unreachable("Invalid fixup kind");
+    }
+  }
+};
+} // end of anonymous namespace
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createMSP430ELFObjectWriter(uint8_t OSABI) {
+  return llvm::make_unique<MSP430ELFObjectWriter>(OSABI);
+}
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9449cb2780249bcc7f63093db27ac1c3b56c8209
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -0,0 +1,81 @@
+//===-- MSP430ELFStreamer.cpp - MSP430 ELF Target Streamer Methods --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides MSP430 specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+class MSP430TargetELFStreamer : public MCTargetStreamer {
+public:
+  MCELFStreamer &getStreamer();
+  MSP430TargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+};
+
+// This part is for ELF object output.
+MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S,
+                                                 const MCSubtargetInfo &STI)
+    : MCTargetStreamer(S) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned EFlags = MCA.getELFHeaderEFlags();
+  MCA.setELFHeaderEFlags(EFlags);
+
+  // Emit build attributes section according to
+  // MSP430 EABI (slaa534.pdf, part 13).
+  MCSection *AttributeSection = getStreamer().getContext().getELFSection(
+      ".MSP430.attributes", ELF::SHT_MSP430_ATTRIBUTES, 0);
+  Streamer.SwitchSection(AttributeSection);
+
+  // Format version.
+  Streamer.EmitIntValue(0x41, 1);
+  // Subsection length.
+  Streamer.EmitIntValue(22, 4);
+  // Vendor name string, zero-terminated.
+  Streamer.EmitBytes("mspabi");
+  Streamer.EmitIntValue(0, 1);
+
+  // Attribute vector scope tag. 1 stands for the entire file.
+  Streamer.EmitIntValue(1, 1);
+  // Attribute vector length.
+  Streamer.EmitIntValue(11, 4);
+  // OFBA_MSPABI_Tag_ISA(4) = 1, MSP430
+  Streamer.EmitIntValue(4, 1);
+  Streamer.EmitIntValue(1, 1);
+  // OFBA_MSPABI_Tag_Code_Model(6) = 1, Small
+  Streamer.EmitIntValue(6, 1);
+  Streamer.EmitIntValue(1, 1);
+  // OFBA_MSPABI_Tag_Data_Model(8) = 1, Small
+  Streamer.EmitIntValue(8, 1);
+  Streamer.EmitIntValue(1, 1);
+}
+
+MCELFStreamer &MSP430TargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(Streamer);
+}
+
+MCTargetStreamer *
+createMSP430ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new MSP430TargetELFStreamer(S, STI);
+  return nullptr;
+}
+
+} // namespace llvm
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
new file mode 100644
index 0000000000000000000000000000000000000000..1eb6a27594235abeacf838876bc21bf0c5595e0d
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430FixupKinds.h
@@ -0,0 +1,53 @@
+//===-- MSP430FixupKinds.h - MSP430 Specific Fixup Entries ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430FIXUPKINDS_H
+#define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430FIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+#undef MSP430
+
+namespace llvm {
+namespace MSP430 {
+
+// This table must be in the same order of
+// MCFixupKindInfo Infos[MSP430::NumTargetFixupKinds]
+// in MSP430AsmBackend.cpp.
+//
+enum Fixups {
+  // A 32 bit absolute fixup.
+  fixup_32 = FirstTargetFixupKind,
+  // A 10 bit PC relative fixup.
+  fixup_10_pcrel,
+  // A 16 bit absolute fixup.
+  fixup_16,
+  // A 16 bit PC relative fixup.
+  fixup_16_pcrel,
+  // A 16 bit absolute fixup for byte operations.
+  fixup_16_byte,
+  // A 16 bit PC relative fixup for command address.
+  fixup_16_pcrel_byte,
+  // A 10 bit PC relative fixup for complicated polymorphs.
+  fixup_2x_pcrel,
+  // A 16 bit relaxable fixup.
+  fixup_rl_pcrel,
+  // A 8 bit absolute fixup.
+  fixup_8,
+  // A 32 bit symbol difference fixup.
+  fixup_sym_diff,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace MSP430
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ba9f7d7a9a5e86ee6be15f2a8ed4f61fd5caa33a
--- /dev/null
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCCodeEmitter.cpp
@@ -0,0 +1,212 @@
+//===-- MSP430MCCodeEmitter.cpp - Convert MSP430 code to machine code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MCTargetDesc/MSP430MCTargetDesc.h"
+#include "MCTargetDesc/MSP430FixupKinds.h"
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "mccodeemitter"
+
+namespace llvm {
+
+class MSP430MCCodeEmitter : public MCCodeEmitter {
+  MCContext &Ctx;
+  MCInstrInfo const &MCII;
+
+  // Offset keeps track of current word number being emitted
+  // inside a particular instruction.
+  mutable unsigned Offset;
+
+  /// TableGen'erated function for getting the binary encoding for an
+  /// instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// Returns the binary encoding of operands.
+  ///
+  /// If an operand requires relocation, the relocation is recorded
+  /// and zero is returned.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
+
+  unsigned getMemOpValue(const MCInst &MI, unsigned Op,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const;
+
+  unsigned getPCRelImmOpValue(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  unsigned getCGImmOpValue(const MCInst &MI, unsigned Op,
+                           SmallVectorImpl<MCFixup> &Fixups,
+                           const MCSubtargetInfo &STI) const;
+
+  unsigned getCCOpValue(const MCInst &MI, unsigned Op,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const;
+
+public:
+  MSP430MCCodeEmitter(MCContext &ctx, MCInstrInfo const &MCII)
+      : Ctx(ctx), MCII(MCII) {}
+
+  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const override;
+};
+
+void MSP430MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+  // Get byte count of instruction.
+  unsigned Size = Desc.getSize();
+
+  // Initialize fixup offset
+  Offset = 2;
+
+  uint64_t BinaryOpCode = getBinaryCodeForInstr(MI, Fixups, STI);
+  const uint16_t *Words = reinterpret_cast<uint16_t const *>(&BinaryOpCode);
+  size_t WordCount = Size / 2;
+
+  for (size_t i = 0; i < WordCount; ++i) {
+    uint16_t Word = Words[i];
+    support::endian::write(OS, Word, support::little);
+  }
+}
+
+unsigned MSP430MCCodeEmitter::getMachineOpValue(const MCInst &MI,
+                                                const MCOperand &MO,
+                                                SmallVectorImpl<MCFixup> &Fixups,
+                                                const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+  if (MO.isImm()) {
+    Offset += 2;
+    return MO.getImm();
+  }
+
+  assert(MO.isExpr() && "Expected expr operand");
+  Fixups.push_back(MCFixup::create(Offset, MO.getExpr(),
+      static_cast<MCFixupKind>(MSP430::fixup_16_byte), MI.getLoc()));
+  Offset += 2;
+  return 0;
+}
+
+unsigned MSP430MCCodeEmitter::getMemOpValue(const MCInst &MI, unsigned Op,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO1 = MI.getOperand(Op);
+  assert(MO1.isReg() && "Register operand expected");
+  unsigned Reg = Ctx.getRegisterInfo()->getEncodingValue(MO1.getReg());
+
+  const MCOperand &MO2 = MI.getOperand(Op + 1);
+  if (MO2.isImm()) {
+    Offset += 2;
+    return (MO2.getImm() << 4) | Reg;
+  }
+
+  assert(MO2.isExpr() && "Expr operand expected");
+  MSP430::Fixups FixupKind;
+  switch (Reg) {
+  case 0:
+    FixupKind = MSP430::fixup_16_pcrel_byte;
+    break;
+  case 2:
+    FixupKind = MSP430::fixup_16_byte;
+    break;
+  default:
+    FixupKind = MSP430::fixup_16_byte;
+    break;
+  }
+  Fixups.push_back(MCFixup::create(Offset, MO2.getExpr(),
+    static_cast<MCFixupKind>(FixupKind), MI.getLoc()));
+  Offset += 2;
+  return Reg;
+}
+
+unsigned MSP430MCCodeEmitter::getPCRelImmOpValue(const MCInst &MI, unsigned Op,
+                                                 SmallVectorImpl<MCFixup> &Fixups,
+                                                 const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  if (MO.isImm())
+    return MO.getImm();
+
+  assert(MO.isExpr() && "Expr operand expected");
+  Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+    static_cast<MCFixupKind>(MSP430::fixup_10_pcrel), MI.getLoc()));
+  return 0;
+}
+
+unsigned MSP430MCCodeEmitter::getCGImmOpValue(const MCInst &MI, unsigned Op,
+                                              SmallVectorImpl<MCFixup> &Fixups,
+                                              const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  assert(MO.isImm() && "Expr operand expected");
+  
+  int64_t Imm = MO.getImm();
+  switch (Imm) {
+  default:
+    llvm_unreachable("Invalid immediate value");
+  case 4:  return 0x22;
+  case 8:  return 0x32;
+  case 0:  return 0x03;
+  case 1:  return 0x13;
+  case 2:  return 0x23;
+  case -1: return 0x33;
+  }
+}
+
+unsigned MSP430MCCodeEmitter::getCCOpValue(const MCInst &MI, unsigned Op,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(Op);
+  assert(MO.isImm() && "Immediate operand expected");
+  switch (MO.getImm()) {
+  case MSP430CC::COND_NE: return 0;
+  case MSP430CC::COND_E:  return 1;
+  case MSP430CC::COND_LO: return 2;
+  case MSP430CC::COND_HS: return 3;
+  case MSP430CC::COND_N:  return 4;
+  case MSP430CC::COND_GE: return 5;
+  case MSP430CC::COND_L:  return 6;
+  default:
+    llvm_unreachable("Unknown condition code");
+  }
+}
+
+MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx) {
+  return new MSP430MCCodeEmitter(Ctx, MCII);
+}
+
+#include "MSP430GenMCCodeEmitter.inc"
+
+} // end of namespace llvm
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 8c715500f38b1243e1f6bcea0e1c33e2a9b515c1..b21145d3904a33e5981058b907bc476e3a74b458 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -58,22 +58,15 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
 }
 
 extern "C" void LLVMInitializeMSP430TargetMC() {
-  // Register the MC asm info.
-  RegisterMCAsmInfo<MSP430MCAsmInfo> X(getTheMSP430Target());
+  Target &T = getTheMSP430Target();
 
-  // Register the MC instruction info.
-  TargetRegistry::RegisterMCInstrInfo(getTheMSP430Target(),
-                                      createMSP430MCInstrInfo);
-
-  // Register the MC register info.
-  TargetRegistry::RegisterMCRegInfo(getTheMSP430Target(),
-                                    createMSP430MCRegisterInfo);
-
-  // Register the MC subtarget info.
-  TargetRegistry::RegisterMCSubtargetInfo(getTheMSP430Target(),
-                                          createMSP430MCSubtargetInfo);
-
-  // Register the MCInstPrinter.
-  TargetRegistry::RegisterMCInstPrinter(getTheMSP430Target(),
-                                        createMSP430MCInstPrinter);
+  RegisterMCAsmInfo<MSP430MCAsmInfo> X(T);
+  TargetRegistry::RegisterMCInstrInfo(T, createMSP430MCInstrInfo);
+  TargetRegistry::RegisterMCRegInfo(T, createMSP430MCRegisterInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(T, createMSP430MCSubtargetInfo);
+  TargetRegistry::RegisterMCInstPrinter(T, createMSP430MCInstPrinter);
+  TargetRegistry::RegisterMCCodeEmitter(T, createMSP430MCCodeEmitter);
+  TargetRegistry::RegisterMCAsmBackend(T, createMSP430MCAsmBackend);
+  TargetRegistry::RegisterObjectTargetStreamer(
+      T, createMSP430ObjectTargetStreamer);
 }
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index b901c5f0979420302ff12d467f480f9141df44ff..e484c79c9ee91cc82b26fdc275c8ce02f8ae0f68 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -15,12 +15,39 @@
 #define LLVM_LIB_TARGET_MSP430_MCTARGETDESC_MSP430MCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
+#include <memory>
 
 namespace llvm {
 class Target;
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCInstrInfo;
+class MCSubtargetInfo;
+class MCRegisterInfo;
+class MCContext;
+class MCTargetOptions;
+class MCObjectTargetWriter;
+class MCStreamer;
+class MCTargetStreamer;
 
 Target &getTheMSP430Target();
 
+/// Creates a machine code emitter for MSP430.
+MCCodeEmitter *createMSP430MCCodeEmitter(const MCInstrInfo &MCII,
+                                         const MCRegisterInfo &MRI,
+                                         MCContext &Ctx);
+
+MCAsmBackend *createMSP430MCAsmBackend(const Target &T,
+                                       const MCSubtargetInfo &STI,
+                                       const MCRegisterInfo &MRI,
+                                       const MCTargetOptions &Options);
+
+MCTargetStreamer *
+createMSP430ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
+
+std::unique_ptr<MCObjectTargetWriter>
+createMSP430ELFObjectWriter(uint8_t OSABI);
+
 } // End llvm namespace
 
 // Defines symbolic names for MSP430 registers.
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
index 796f25233123a9572684e57a49544003ada0dfce..7a5314a10844b7dba692cff79de1f6c0087600e7 100644
--- a/lib/Target/MSP430/MSP430.h
+++ b/lib/Target/MSP430/MSP430.h
@@ -27,6 +27,8 @@ namespace MSP430CC {
     COND_LO = 3,  // aka COND_NC
     COND_GE = 4,
     COND_L  = 5,
+    COND_N  = 6,  // jump if negative
+    COND_NONE,    // unconditional
 
     COND_INVALID = -1
   };
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
index 203864dd40650ee3830c2a46779250ac1aea9e67..8fa99dc13dd5c7fb5e963bc0e4ad436c8d275ab2 100644
--- a/lib/Target/MSP430/MSP430.td
+++ b/lib/Target/MSP430/MSP430.td
@@ -64,11 +64,29 @@ include "MSP430InstrInfo.td"
 
 def MSP430InstrInfo : InstrInfo;
 
+//===---------------------------------------------------------------------===//
+// Assembly Printers
+//===---------------------------------------------------------------------===//
+
+def MSP430AsmWriter : AsmWriter {
+  string AsmWriterClassName = "InstPrinter";
+}
+
+//===---------------------------------------------------------------------===//
+// Assembly Parsers
+//===---------------------------------------------------------------------===//
+
+def MSP430AsmParser : AsmParser {
+  let AllowDuplicateRegisterNames = 1;
+  let ShouldEmitMatchRegisterAltName = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Target Declaration
 //===----------------------------------------------------------------------===//
 
 def MSP430 : Target {
   let InstructionSet = MSP430InstrInfo;
+  let AssemblyParsers = [MSP430AsmParser];
 }
 
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index b196c013902c0251501f2b97fbc6801922ef1053..7a1998ad355d1d8615e3bf8c167b4bf0cee58922 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -98,6 +98,7 @@ namespace {
     MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
         : SelectionDAGISel(TM, OptLevel) {}
 
+  private:
     StringRef getPassName() const override {
       return "MSP430 DAG->DAG Pattern Instruction Selection";
     }
@@ -112,8 +113,9 @@ namespace {
     // Include the pieces autogenerated from the target description.
   #include "MSP430GenDAGISel.inc"
 
-  private:
+    // Main method to transform nodes into machine nodes.
     void Select(SDNode *N) override;
+
     bool tryIndexedLoad(SDNode *Op);
     bool tryIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2, unsigned Opc8,
                          unsigned Opc16);
@@ -250,11 +252,9 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
   if (MatchAddress(N, AM))
     return false;
 
-  EVT VT = N.getValueType();
-  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase)
     if (!AM.Base.Reg.getNode())
-      AM.Base.Reg = CurDAG->getRegister(0, VT);
-  }
+      AM.Base.Reg = CurDAG->getRegister(MSP430::SR, MVT::i16);
 
   Base = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase)
              ? CurDAG->getTargetFrameIndex(
@@ -336,10 +336,10 @@ bool MSP430DAGToDAGISel::tryIndexedLoad(SDNode *N) {
   unsigned Opcode = 0;
   switch (VT.SimpleTy) {
   case MVT::i8:
-    Opcode = MSP430::MOV8rm_POST;
+    Opcode = MSP430::MOV8rp;
     break;
   case MVT::i16:
-    Opcode = MSP430::MOV16rm_POST;
+    Opcode = MSP430::MOV16rp;
     break;
   default:
     return false;
@@ -412,47 +412,47 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) {
     break;
   case ISD::ADD:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+                        MSP430::ADD8rp, MSP430::ADD16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+                             MSP430::ADD8rp, MSP430::ADD16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::SUB:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+                        MSP430::SUB8rp, MSP430::SUB16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::AND:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+                        MSP430::AND8rp, MSP430::AND16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+                             MSP430::AND8rp, MSP430::AND16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::OR:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+                        MSP430::BIS8rp, MSP430::BIS16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+                             MSP430::BIS8rp, MSP430::BIS16rp))
       return;
 
     // Other cases are autogenerated.
     break;
   case ISD::XOR:
     if (tryIndexedBinOp(Node, Node->getOperand(0), Node->getOperand(1),
-                        MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+                        MSP430::XOR8rp, MSP430::XOR16rp))
       return;
     else if (tryIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
-                             MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+                             MSP430::XOR8rp, MSP430::XOR16rp))
       return;
 
     // Other cases are autogenerated.
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index f5b2bda5d1e462e13371ca7ffca888097e94244d..ac93d7efc2b97a3ebeb833c45a2a67ead08d9ff4 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -940,18 +940,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
 
   // Expand non-constant shifts to loops:
   if (!isa<ConstantSDNode>(N->getOperand(1)))
-    switch (Opc) {
-    default: llvm_unreachable("Invalid shift opcode!");
-    case ISD::SHL:
-      return DAG.getNode(MSP430ISD::SHL, dl,
-                         VT, N->getOperand(0), N->getOperand(1));
-    case ISD::SRA:
-      return DAG.getNode(MSP430ISD::SRA, dl,
-                         VT, N->getOperand(0), N->getOperand(1));
-    case ISD::SRL:
-      return DAG.getNode(MSP430ISD::SRL, dl,
-                         VT, N->getOperand(0), N->getOperand(1));
-    }
+    return Op;
 
   uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
 
@@ -963,7 +952,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
   if (Opc == ISD::SRL && ShiftAmount) {
     // Emit a special goodness here:
     // srl A, 1 => clrc; rrc A
-    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    Victim = DAG.getNode(MSP430ISD::RRCL, dl, VT, Victim);
     ShiftAmount -= 1;
   }
 
@@ -1342,15 +1331,14 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MSP430ISD::RRA:                return "MSP430ISD::RRA";
   case MSP430ISD::RLA:                return "MSP430ISD::RLA";
   case MSP430ISD::RRC:                return "MSP430ISD::RRC";
+  case MSP430ISD::RRCL:               return "MSP430ISD::RRCL";
   case MSP430ISD::CALL:               return "MSP430ISD::CALL";
   case MSP430ISD::Wrapper:            return "MSP430ISD::Wrapper";
   case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
   case MSP430ISD::CMP:                return "MSP430ISD::CMP";
   case MSP430ISD::SETCC:              return "MSP430ISD::SETCC";
   case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
-  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
-  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
-  case MSP430ISD::SRL:                return "MSP430ISD::SRL";
+  case MSP430ISD::DADD:               return "MSP430ISD::DADD";
   }
   return nullptr;
 }
@@ -1397,33 +1385,49 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
   const TargetInstrInfo &TII = *F->getSubtarget().getInstrInfo();
 
   unsigned Opc;
+  bool ClearCarry = false;
   const TargetRegisterClass * RC;
   switch (MI.getOpcode()) {
   default: llvm_unreachable("Invalid shift opcode!");
   case MSP430::Shl8:
-   Opc = MSP430::SHL8r1;
-   RC = &MSP430::GR8RegClass;
-   break;
+    Opc = MSP430::ADD8rr;
+    RC = &MSP430::GR8RegClass;
+    break;
   case MSP430::Shl16:
-   Opc = MSP430::SHL16r1;
-   RC = &MSP430::GR16RegClass;
-   break;
+    Opc = MSP430::ADD16rr;
+    RC = &MSP430::GR16RegClass;
+    break;
   case MSP430::Sra8:
-   Opc = MSP430::SAR8r1;
-   RC = &MSP430::GR8RegClass;
-   break;
+    Opc = MSP430::RRA8r;
+    RC = &MSP430::GR8RegClass;
+    break;
   case MSP430::Sra16:
-   Opc = MSP430::SAR16r1;
-   RC = &MSP430::GR16RegClass;
-   break;
+    Opc = MSP430::RRA16r;
+    RC = &MSP430::GR16RegClass;
+    break;
   case MSP430::Srl8:
-   Opc = MSP430::SAR8r1c;
-   RC = &MSP430::GR8RegClass;
-   break;
+    ClearCarry = true;
+    Opc = MSP430::RRC8r;
+    RC = &MSP430::GR8RegClass;
+    break;
   case MSP430::Srl16:
-   Opc = MSP430::SAR16r1c;
-   RC = &MSP430::GR16RegClass;
-   break;
+    ClearCarry = true;
+    Opc = MSP430::RRC16r;
+    RC = &MSP430::GR16RegClass;
+    break;
+  case MSP430::Rrcl8:
+  case MSP430::Rrcl16: {
+    BuildMI(*BB, MI, dl, TII.get(MSP430::BIC16rc), MSP430::SR)
+      .addReg(MSP430::SR).addImm(1);
+    unsigned SrcReg = MI.getOperand(1).getReg();
+    unsigned DstReg = MI.getOperand(0).getReg();
+    unsigned RrcOpc = MI.getOpcode() == MSP430::Rrcl16
+                    ? MSP430::RRC16r : MSP430::RRC8r;
+    BuildMI(*BB, MI, dl, TII.get(RrcOpc), DstReg)
+      .addReg(SrcReg);
+    MI.eraseFromParent(); // The pseudo instruction is gone now.
+    return BB;
+  }
   }
 
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1476,8 +1480,16 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
   BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
     .addReg(ShiftAmtSrcReg).addMBB(BB)
     .addReg(ShiftAmtReg2).addMBB(LoopBB);
-  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
-    .addReg(ShiftReg);
+  if (ClearCarry)
+    BuildMI(LoopBB, dl, TII.get(MSP430::BIC16rc), MSP430::SR)
+      .addReg(MSP430::SR).addImm(1);
+  if (Opc == MSP430::ADD8rr || Opc == MSP430::ADD16rr)
+    BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+      .addReg(ShiftReg)
+      .addReg(ShiftReg);
+  else
+    BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+      .addReg(ShiftReg);
   BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
     .addReg(ShiftAmtReg).addImm(1);
   BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
@@ -1499,9 +1511,10 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                   MachineBasicBlock *BB) const {
   unsigned Opc = MI.getOpcode();
 
-  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
-      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
-      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+  if (Opc == MSP430::Shl8  || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8  || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8  || Opc == MSP430::Srl16 ||
+      Opc == MSP430::Rrcl8 || Opc == MSP430::Rrcl16)
     return EmitShiftInstr(MI, BB);
 
   const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 842d03df32fc1758786b0c1cfa6b596b6a01680e..731bc1406711f307ced5406cd5e1c0b7b7481453 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -36,6 +36,9 @@ namespace llvm {
       /// Y = RRC X, rotate right via carry
       RRC,
 
+      /// Rotate right via carry, carry gets cleared beforehand by clrc
+      RRCL,
+
       /// CALL - These operations represent an abstract call
       /// instruction, which includes a bunch of information.
       CALL,
@@ -61,8 +64,9 @@ namespace llvm {
       /// is condition code and operand 4 is flag operand.
       SELECT_CC,
 
-      /// SHL, SRA, SRL - Non-constant shifts.
-      SHL, SRA, SRL
+      /// DADD - Decimal addition with carry
+      /// TODO Nothing generates a node of this type yet.
+      DADD,
     };
   }
 
diff --git a/lib/Target/MSP430/MSP430InstrFormats.td b/lib/Target/MSP430/MSP430InstrFormats.td
index a9e87dad0cd8b6d1bda42ce4cf3bc2f7b0027f26..e2e4503db20c48c35b1a0d43697e5a7b8c4895a8 100644
--- a/lib/Target/MSP430/MSP430InstrFormats.td
+++ b/lib/Target/MSP430/MSP430InstrFormats.td
@@ -11,201 +11,431 @@
 //  Describe MSP430 instructions format here
 //
 
-// Format specifies the encoding used by the instruction.  This is part of the
-// ad-hoc solution used to emit machine instruction encodings by our machine
-// code emitter.
-class Format<bits<2> val> {
-  bits<2> Value = val;
-}
-
-def PseudoFrm   : Format<0>;
-def SingleOpFrm : Format<1>;
-def DoubleOpFrm : Format<2>;
-def CondJumpFrm : Format<3>;
-
 class SourceMode<bits<2> val> {
   bits<2> Value = val;
 }
 
-def SrcReg      : SourceMode<0>;
-def SrcMem      : SourceMode<1>;
-def SrcIndReg   : SourceMode<2>;
-def SrcPostInc  : SourceMode<3>;
-def SrcImm      : SourceMode<3>;
+def SrcReg      : SourceMode<0>; // r
+def SrcMem      : SourceMode<1>; // m
+def SrcIndReg   : SourceMode<2>; // n
+def SrcPostInc  : SourceMode<3>; // p
+def SrcImm      : SourceMode<3>; // i
+//  SrcCGImm    : SourceMode< >; // c
 
 class DestMode<bit val> {
   bit Value = val;
 }
 
-def DstReg      : DestMode<0>;
-def DstMem      : DestMode<1>;
-
-class SizeVal<bits<3> val> {
-  bits<3> Value = val;
-}
-
-def SizeUnknown : SizeVal<0>; // Unknown / unset size
-def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
-def Size2Bytes  : SizeVal<2>;
-def Size4Bytes  : SizeVal<3>;
-def Size6Bytes  : SizeVal<4>;
+def DstReg      : DestMode<0>;   // r
+def DstMem      : DestMode<1>;   // m
 
 // Generic MSP430 Format
-class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
-                 string asmstr> : Instruction {
-  field bits<16> Inst;
+class MSP430Inst<dag outs, dag ins, int size, string asmstr> : Instruction {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   let Namespace = "MSP430";
 
   dag OutOperandList = outs;
   dag InOperandList  = ins;
 
-  Format Form = f;
-  SizeVal Sz = sz;
-
-  // Define how we want to layout our TargetSpecific information field... This
-  // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
-  let TSFlags{1-0} = Form.Value;
-  let TSFlags{4-2} = Sz.Value;
-
-  let AsmString   = asmstr;
+  let AsmString = asmstr;
+  let Size = size;
 }
 
-// FIXME: Create different classes for different addressing modes.
-
 // MSP430 Double Operand (Format I) Instructions
-class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+class IForm<bits<4> opcode, DestMode ad, bit bw, SourceMode as, int size,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+  : MSP430Inst<outs, ins, size, asmstr> {
   let Pattern = pattern;
 
-  DestMode ad = dest;
-  SourceMode as = src;
-  
-  let Inst{12-15} = opcode;
+  bits<4> rs;
+  bits<4> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = rs;
   let Inst{7}     = ad.Value;
   let Inst{6}     = bw;
-  let Inst{4-5}   = as.Value;
+  let Inst{5-4}   = as.Value;
+  let Inst{3-0}   = rd;
 }
 
 // 8 bit IForm instructions
-class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, int size,
              dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+  : IForm<opcode, dest, 1, src, size, outs, ins, asmstr, pattern>;
 
 class I8rr<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstReg, SrcReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+}
 
 class I8ri<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstReg, SrcImm, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  let Inst{31-16} = imm;
+  let rs = 0b0000;
+}
+
+class I8rc<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<4> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstReg.Value;
+  let Inst{6}     = 1;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = rd;
+}
 
 class I8rm<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstReg, SrcMem, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
+
+class I8rn<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcIndReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
+
+class I8rp<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcPostInc, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
 
 class I8mr<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstMem, SrcReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 class I8mi<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstMem, SrcImm, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  bits<20> dst;
+  let rs = 0b0000;
+  let Inst{31-16} = imm;
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I8mc<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 4, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<20> dst;
+
+  let Inst{31-16} = dst{19-4};
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstMem.Value;
+  let Inst{6}     = 1;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = dst{3-0};
+}
 
 class I8mm<bits<4> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm8<opcode, DstMem, SrcMem, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  bits<20> dst;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I8mn<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcIndReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
+
+class I8mp<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcPostInc, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 // 16 bit IForm instructions
-class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, int size,
               dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+  : IForm<opcode, dest, 0, src, size, outs, ins, asmstr, pattern>;
 
 class I16rr<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstReg, SrcReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+}
 
 class I16ri<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstReg, SrcImm, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  let Inst{31-16} = imm;
+  let rs = 0b0000;
+}
+
+class I16rc<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<4> rd;
+
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstReg.Value;
+  let Inst{6}     = 0;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = rd;
+}
 
 class I16rm<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstReg, SrcMem, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
+
+class I16rn<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcIndReg, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
+
+class I16rp<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcPostInc, 2, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+}
 
 class I16mr<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstMem, SrcReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Alpha";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 class I16mi<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstMem, SrcImm, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<16> imm;
+  bits<20> dst;
+  let Inst{31-16} = imm;
+  let rs = 0b0000;
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I16mc<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 4, asmstr> {
+  let DecoderNamespace = "Beta";
+  let Pattern = pattern;
+
+  bits<6> imm;
+  bits<20> dst;
+
+  let Inst{31-16} = dst{19-4};
+  let Inst{15-12} = opcode;
+  let Inst{11-8}  = imm{3-0};
+  let Inst{7}     = DstMem.Value;
+  let Inst{6}     = 0;
+  let Inst{5-4}   = imm{5-4};
+  let Inst{3-0}   = dst{3-0};
+}
 
 class I16mm<bits<4> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+  : IForm16<opcode, DstMem, SrcMem, 6, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Gamma";
+  bits<20> src;
+  bits<20> dst;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+  let rd = dst{3-0};
+  let Inst{47-32} = dst{19-4};
+}
+
+class I16mn<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcIndReg, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
+
+class I16mp<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcPostInc, 4, outs, ins, asmstr, pattern> {
+  let DecoderNamespace = "Delta";
+  bits<20> dst;
+  let rd = dst{3-0};
+  let Inst{31-16} = dst{19-4};
+}
 
 // MSP430 Single Operand (Format II) Instructions
-class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+class IIForm<bits<3> opcode, bit bw, SourceMode as, int size,
              dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+  : MSP430Inst<outs, ins, size, asmstr> {
   let Pattern = pattern;
-  
-  SourceMode as = src;
 
-  let Inst{7-15} = opcode;
-  let Inst{6}    = bw;
-  let Inst{4-5}  = as.Value;
+  bits<4> rs;
+
+  let Inst{15-10} = 0b000100;
+  let Inst{9-7}   = opcode;
+  let Inst{6}     = bw;
+  let Inst{5-4}   = as.Value;
+  let Inst{3-0}   = rs;
 }
 
 // 8 bit IIForm instructions
-class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+class IIForm8<bits<3> opcode, SourceMode src, int size,
               dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+  : IIForm<opcode, 1, src, size, outs, ins, asmstr, pattern>;
+
+class II8r<bits<3> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcReg, 2, outs, ins, asmstr, pattern>;
 
-class II8r<bits<9> opcode,
+class II8m<bits<3> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IIForm8<opcode, SrcMem, 4, outs, ins, asmstr, pattern> {
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
 
-class II8m<bits<9> opcode,
+class II8i<bits<3> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IIForm8<opcode, SrcImm, 4, outs, ins, asmstr, pattern> {
+  bits<16> imm;
+  let rs = 0b0000;
+  let Inst{31-16} = imm;
+}
 
-class II8i<bits<9> opcode,
+class II8c<bits<3> opcode,
            dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let Pattern = pattern;
+
+  bits<6> imm;
+
+  let Inst{15-10} = 0b000100;
+  let Inst{9-7}   = opcode;
+  let Inst{6}     = 1;
+  let Inst{5-0}   = imm;
+}
+
+class II8n<bits<3> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcIndReg, 2, outs, ins, asmstr, pattern>;
+
+class II8p<bits<3> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcPostInc, 2, outs, ins, asmstr, pattern>;
 
 // 16 bit IIForm instructions
-class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+class IIForm16<bits<3> opcode, SourceMode src, int size,
                dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+  : IIForm<opcode, 0, src, size, outs, ins, asmstr, pattern>;
 
-class II16r<bits<9> opcode,
+class II16r<bits<3> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+  : IIForm16<opcode, SrcReg, 2, outs, ins, asmstr, pattern>;
 
-class II16m<bits<9> opcode,
+class II16m<bits<3> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IIForm16<opcode, SrcMem, 4, outs, ins, asmstr, pattern> {
+  bits<20> src;
+  let rs = src{3-0};
+  let Inst{31-16} = src{19-4};
+}
 
-class II16i<bits<9> opcode,
+class II16i<bits<3> opcode,
             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+  : IIForm16<opcode, SrcImm, 4, outs, ins, asmstr, pattern> {
+  bits<16> imm;
+  let rs = 0b0000;
+  let Inst{31-16} = imm;
+}
+
+class II16c<bits<3> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
+  let Pattern = pattern;
+
+  bits<6> imm;
+
+  let Inst{15-10} = 0b000100;
+  let Inst{9-7}   = opcode;
+  let Inst{6}     = 0;
+  let Inst{5-0}   = imm;
+}
+
+class II16n<bits<3> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcIndReg, 2, outs, ins, asmstr, pattern>;
+
+class II16p<bits<3> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcPostInc, 2, outs, ins, asmstr, pattern>;
 
 // MSP430 Conditional Jumps Instructions
-class CJForm<bits<3> opcode, bits<3> cond,
-             dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+class CJForm<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, 2, asmstr> {
   let Pattern = pattern;
   
-  let Inst{13-15} = opcode;
-  let Inst{10-12} = cond;
+  bits<3> cond;
+  bits<10> dst;
+
+  let Inst{15-13} = 0b001;
+  let Inst{12-10} = cond;
+  let Inst{9-0} = dst;
 }
 
 // Pseudo instructions
 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
-  : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+  : MSP430Inst<outs, ins, 0, asmstr> {
   let Pattern = pattern;
-  let Inst{15-0} = 0;
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index dd1b30a3e470a4e1887fb58f597572798e258b87..c136933a51bcfd6395486e4ba26368fa03fcb54f 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -301,35 +301,20 @@ unsigned MSP430InstrInfo::insertBranch(MachineBasicBlock &MBB,
 unsigned MSP430InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   const MCInstrDesc &Desc = MI.getDesc();
 
-  switch (Desc.TSFlags & MSP430II::SizeMask) {
-  default:
-    switch (Desc.getOpcode()) {
-    default: llvm_unreachable("Unknown instruction size!");
-    case TargetOpcode::CFI_INSTRUCTION:
-    case TargetOpcode::EH_LABEL:
-    case TargetOpcode::IMPLICIT_DEF:
-    case TargetOpcode::KILL:
-    case TargetOpcode::DBG_VALUE:
-      return 0;
-    case TargetOpcode::INLINEASM: {
-      const MachineFunction *MF = MI.getParent()->getParent();
-      const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
-      return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
-                                    *MF->getTarget().getMCAsmInfo());
-    }
-    }
-  case MSP430II::SizeSpecial:
-    switch (MI.getOpcode()) {
-    default: llvm_unreachable("Unknown instruction size!");
-    case MSP430::SAR8r1c:
-    case MSP430::SAR16r1c:
-      return 4;
-    }
-  case MSP430II::Size2Bytes:
-    return 2;
-  case MSP430II::Size4Bytes:
-    return 4;
-  case MSP430II::Size6Bytes:
-    return 6;
+  switch (Desc.getOpcode()) {
+  case TargetOpcode::CFI_INSTRUCTION:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+  case TargetOpcode::DBG_VALUE:
+    return 0;
+  case TargetOpcode::INLINEASM: {
+    const MachineFunction *MF = MI.getParent()->getParent();
+    const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+    return TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
+                                  *MF->getTarget().getMCAsmInfo());
   }
+  }
+
+  return Desc.getSize();
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index 45357f54c9c67327128b643a6b01c4e5b0a9c7e3..fee3bea9b8d692f5a377ce37cdaf396cea83c910 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -24,22 +24,6 @@ namespace llvm {
 
 class MSP430Subtarget;
 
-/// MSP430II - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace MSP430II {
-  enum {
-    SizeShift   = 2,
-    SizeMask    = 7 << SizeShift,
-
-    SizeUnknown = 0 << SizeShift,
-    SizeSpecial = 1 << SizeShift,
-    Size2Bytes  = 2 << SizeShift,
-    Size4Bytes  = 3 << SizeShift,
-    Size6Bytes  = 4 << SizeShift
-  };
-}
-
 class MSP430InstrInfo : public MSP430GenInstrInfo {
   const MSP430RegisterInfo RI;
   virtual void anchor();
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index cec43040f60d4c177898bf0ac94be2b939ffabcd..3ed17374a2d87e70e674853c3dd920258b8c38aa 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -34,8 +34,9 @@ def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
 def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                                   SDTCisSameAs<1, 2>, 
                                                   SDTCisVT<3, i8>]>;
-def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
-                                                  SDTCisI8<2>]>;
+def SDT_MSP430DAdd         : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                                  SDTCisSameAs<0, 2>,
+                                                  SDTCisInt<0>]>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Specific Node Definitions.
@@ -48,6 +49,7 @@ def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
 def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
 def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
 def MSP430rrc     : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+def MSP430rrcl    : SDNode<"MSP430ISD::RRCL", SDTIntUnaryOp, []>;
 
 def MSP430call    : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
                      [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
@@ -63,33 +65,88 @@ def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC,
                             [SDNPHasChain, SDNPInGlue]>;
 def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC,
                             [SDNPInGlue]>;
-def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
-def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
-def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+def MSP430dadd    : SDNode<"MSP430ISD::DADD", SDT_MSP430DAdd, []>;
 
 //===----------------------------------------------------------------------===//
 // MSP430 Operand Definitions.
 //===----------------------------------------------------------------------===//
 
+def MemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+}
+
 // Address operands
 def memsrc : Operand<i16> {
   let PrintMethod = "printSrcMemOperand";
   let MIOperandInfo = (ops GR16, i16imm);
+  let ParserMatchClass = MemAsmOperand;
+  let EncoderMethod = "getMemOpValue";
+  let DecoderMethod = "DecodeMemOperand";
 }
 
 def memdst : Operand<i16> {
   let PrintMethod = "printSrcMemOperand";
   let MIOperandInfo = (ops GR16, i16imm);
+  let ParserMatchClass = MemAsmOperand;
+  let EncoderMethod = "getMemOpValue";
+  let DecoderMethod = "DecodeMemOperand";
+}
+
+def IndRegAsmOperand : AsmOperandClass {
+  let Name = "IndReg";
+  let RenderMethod = "addRegOperands";
+}
+
+def indreg : Operand<i16> {
+  let PrintMethod = "printIndRegOperand";
+  let MIOperandInfo = (ops GR16);
+  let ParserMatchClass = IndRegAsmOperand;
+  let DecoderMethod = "DecodeGR16RegisterClass";
+}
+
+def PostIndRegAsmOperand : AsmOperandClass {
+  let Name = "PostIndReg";
+  let RenderMethod = "addRegOperands";
+}
+
+def postreg : Operand<i16> {
+  let PrintMethod = "printPostIndRegOperand";
+  let MIOperandInfo = (ops GR16);
+  let ParserMatchClass = PostIndRegAsmOperand;
+  let DecoderMethod = "DecodeGR16RegisterClass";
 }
 
 // Short jump targets have OtherVT type and are printed as pcrel imm values.
 def jmptarget : Operand<OtherVT> {
   let PrintMethod = "printPCRelImmOperand";
+  let EncoderMethod = "getPCRelImmOpValue";
 }
 
 // Operand for printing out a condition code.
 def cc : Operand<i8> {
   let PrintMethod = "printCCOperand";
+  let EncoderMethod = "getCCOpValue";
+}
+
+def CGImmAsmOperand : AsmOperandClass {
+  let Name = "CGImm";
+  let RenderMethod = "addImmOperands";
+}
+
+def cg8imm : Operand<i8>,
+             ImmLeaf<i8, [{return Imm == 0 || Imm == 1 || Imm == 2 ||
+                                  Imm == 4 || Imm == 8 || Imm == -1;}]> {
+  let ParserMatchClass = CGImmAsmOperand;
+  let EncoderMethod = "getCGImmOpValue";
+  let DecoderMethod = "DecodeCGImm";
+}
+
+def cg16imm : Operand<i16>,
+              ImmLeaf<i16, [{return Imm == 0 || Imm == 1 || Imm == 2 ||
+                                    Imm == 4 || Imm == 8 || Imm == -1;}]> {
+  let ParserMatchClass = CGImmAsmOperand;
+  let EncoderMethod = "getCGImmOpValue";
+  let DecoderMethod = "DecodeCGImm";
 }
 
 //===----------------------------------------------------------------------===//
@@ -102,6 +159,7 @@ def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
 // Pattern Fragments
 def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
 def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+def bic : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, (not node:$rhs))>;
 def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
   return N->hasOneUse();
 }]>;
@@ -113,21 +171,21 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
 // pointer before prolog-epilog rewriting occurs.
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber SR.
-let Defs = [SP, SR], Uses = [SP] in {
+let isCodeGenOnly = 1, Defs = [SP, SR], Uses = [SP] in {
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
-                              "#ADJCALLSTACKDOWN",
+                              "#ADJCALLSTACKDOWN $amt1 $amt2",
                               [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
 def ADJCALLSTACKUP   : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
-                              "#ADJCALLSTACKUP",
+                              "#ADJCALLSTACKUP $amt1 $amt2",
                               [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
 }
 
-let Defs = [SR], Uses = [SP] in {
+let isCodeGenOnly = 1, Defs = [SR], Uses = [SP] in {
 def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset),
                       "# ADDframe PSEUDO", []>;
 }
 
-let usesCustomInserter = 1 in {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
   let Uses = [SR] in {
   def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
                         "# Select8 PSEUDO",
@@ -141,38 +199,45 @@ let usesCustomInserter = 1 in {
   let Defs = [SR] in {
   def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Shl8 PSEUDO",
-                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+                        [(set GR8:$dst, (shl GR8:$src, GR8:$cnt))]>;
   def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
                         "# Shl16 PSEUDO",
-                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+                        [(set GR16:$dst, (shl GR16:$src, GR8:$cnt))]>;
   def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Sra8 PSEUDO",
-                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+                        [(set GR8:$dst, (sra GR8:$src, GR8:$cnt))]>;
   def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
                         "# Sra16 PSEUDO",
-                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+                        [(set GR16:$dst, (sra GR16:$src, GR8:$cnt))]>;
   def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
                         "# Srl8 PSEUDO",
-                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+                        [(set GR8:$dst, (srl GR8:$src, GR8:$cnt))]>;
   def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
                         "# Srl16 PSEUDO",
-                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
-
+                        [(set GR16:$dst, (srl GR16:$src, GR8:$cnt))]>;
+  def Rrcl8    : Pseudo<(outs GR8:$dst), (ins GR8:$src), "",
+                        [(set GR8:$dst, (MSP430rrcl GR8:$src))]>;
+  def Rrcl16   : Pseudo<(outs GR16:$dst), (ins GR16:$src), "",
+                        [(set GR16:$dst, (MSP430rrcl GR16:$src))]>;
   }
 }
 
-let hasSideEffects = 0 in
-def NOP : Pseudo<(outs), (ins), "nop", []>;
-
 //===----------------------------------------------------------------------===//
 //  Control Flow Instructions...
 //
 
 // FIXME: Provide proper encoding!
 let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
-  def RET  : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                     (outs), (ins), "ret",  [(MSP430retflag)]>;
-  def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>;
+  def RET  : IForm16<0b0100, DstReg, SrcPostInc, 2,
+                     (outs), (ins), "ret",  [(MSP430retflag)]> {
+    let DecoderNamespace = "Delta";
+    let rs = 1;
+    let rd = 0;
+  }
+  def RETI : IIForm16<0b110, SrcReg, 2,
+                      (outs), (ins), "reti", [(MSP430retiflag)]> {
+    let rs = 0;
+  }
 }
 
 let isBranch = 1, isTerminator = 1 in {
@@ -182,64 +247,69 @@ let isBranch = 1, isTerminator = 1 in {
 // Direct branch
 let isBarrier = 1 in {
   // Short branch
-  def JMP : CJForm<0, 0, (outs), (ins jmptarget:$dst),
+  def JMP : CJForm<(outs), (ins jmptarget:$dst),
                    "jmp\t$dst",
-                   [(br bb:$dst)]>;
-  let isIndirectBranch = 1 in {
+                   [(br bb:$dst)]> {
+    let cond = 0b111;
+  }
+  let isIndirectBranch = 1, rd = 0 in {
     // Long branches
-    def Bi  : I16ri<0, (outs), (ins i16imm:$brdst),
-                    "br\t$brdst",
-                    [(brind tblockaddress:$brdst)]>;
-    def Br  : I16rr<0, (outs), (ins GR16:$brdst),
-                    "br\t$brdst",
-                    [(brind GR16:$brdst)]>;
-    def Bm  : I16rm<0, (outs), (ins memsrc:$brdst),
-                    "br\t$brdst",
-                    [(brind (load addr:$brdst))]>;
+    def Bi  : I16ri<0b0100, (outs), (ins i16imm:$imm),
+                    "br\t$imm",
+                    [(brind tblockaddress:$imm)]>;
+    def Br  : I16rr<0b0100, (outs), (ins GR16:$rs),
+                    "br\t$rs",
+                    [(brind GR16:$rs)]>;
+    def Bm  : I16rm<0b0100, (outs), (ins memsrc:$src),
+                    "br\t$src",
+                    [(brind (load addr:$src))]>;
   }
 }
 
 // Conditional branches
 let Uses = [SR] in
-  def JCC : CJForm<0, 0,
-                   (outs), (ins jmptarget:$dst, cc:$cc),
-                   "j$cc\t$dst",
-                   [(MSP430brcc bb:$dst, imm:$cc)]>;
+  def JCC : CJForm<(outs), (ins jmptarget:$dst, cc:$cond),
+                   "j$cond\t$dst",
+                   [(MSP430brcc bb:$dst, imm:$cond)]>;
 } // isBranch, isTerminator
 
 //===----------------------------------------------------------------------===//
 //  Call Instructions...
 //
-let isCall = 1 in
-  // All calls clobber the non-callee saved registers. SPW is marked as
-  // a use to prevent stack-pointer assignments that appear immediately
-  // before calls from potentially appearing dead. Uses for argument
-  // registers are added manually.
-  let Defs = [R11, R12, R13, R14, R15, SR],
-      Uses = [SP] in {
-    def CALLi     : II16i<0x0,
-                          (outs), (ins i16imm:$dst),
-                          "call\t$dst", [(MSP430call imm:$dst)]>;
-    def CALLr     : II16r<0x0,
-                          (outs), (ins GR16:$dst),
-                          "call\t$dst", [(MSP430call GR16:$dst)]>;
-    def CALLm     : II16m<0x0,
-                          (outs), (ins memsrc:$dst),
-                          "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
-  }
-
+// All calls clobber the non-callee saved registers. SPW is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead. Uses for argument
+// registers are added manually.
+let isCall = 1,
+    Defs = [R11, R12, R13, R14, R15, SR],
+    Uses = [SP] in {
+  def CALLi     : II16i<0b101,
+                        (outs), (ins i16imm:$imm),
+                        "call\t$imm", [(MSP430call imm:$imm)]>;
+  def CALLr     : II16r<0b101,
+                        (outs), (ins GR16:$rs),
+                        "call\t$rs", [(MSP430call GR16:$rs)]>;
+  def CALLm     : II16m<0b101,
+                        (outs), (ins memsrc:$src),
+                        "call\t$src", [(MSP430call (load addr:$src))]>;
+}
 
 //===----------------------------------------------------------------------===//
 //  Miscellaneous Instructions...
 //
-let Defs = [SP], Uses = [SP], hasSideEffects=0 in {
+let Defs = [SP], Uses = [SP], hasSideEffects = 0 in {
 let mayLoad = 1 in
-def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                       (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+def POP16r   : IForm16<0b0100, DstReg, SrcPostInc, 2,
+                       (outs GR16:$rd), (ins), "pop\t$rd", []> {
+  let DecoderNamespace = "Delta";
+  let rs = 1;
+}
 
 let mayStore = 1 in
-def PUSH16r  : II16r<0x0,
-                     (outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+def PUSH8r :  II8r<0b100, (outs), (ins GR8:$rs), "push.b\t$rs", []>;
+def PUSH16r : II16r<0b100, (outs), (ins GR16:$rs), "push\t$rs", []>;
+def PUSH16c : II16c<0b100, (outs), (ins cg16imm:$imm), "push\t$imm", []>;
+def PUSH16i : II16i<0b100, (outs), (ins i16imm:$imm), "push\t$imm", []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -247,55 +317,73 @@ def PUSH16r  : II16r<0x0,
 
 // FIXME: Provide proper encoding!
 let hasSideEffects = 0 in {
-def MOV8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src),
-                   "mov.b\t{$src, $dst}",
+def MOV8rr  : I8rr<0b0100,
+                   (outs GR8:$rd), (ins GR8:$rs),
+                   "mov.b\t{$rs, $rd}",
                    []>;
-def MOV16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "mov.w\t{$src, $dst}",
+def MOV16rr : I16rr<0b0100,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "mov\t{$rs, $rd}",
                     []>;
 }
 
 // FIXME: Provide proper encoding!
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-def MOV8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins i8imm:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(set GR8:$dst, imm:$src)]>;
-def MOV16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins i16imm:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(set GR16:$dst, imm:$src)]>;
+def MOV8rc : I8rc<0b0100,
+                   (outs GR8:$rd), (ins cg8imm:$imm),
+                   "mov.b\t$imm, $rd",
+                   [(set GR8:$rd, cg8imm:$imm)]>;
+def MOV16rc : I16rc<0b0100,
+                    (outs GR16:$rd), (ins cg16imm:$imm),
+                    "mov\t$imm, $rd",
+                    [(set GR16:$rd, cg16imm:$imm)]>;
+def MOV8ri  : I8ri<0b0100,
+                   (outs GR8:$rd), (ins i8imm:$imm),
+                   "mov.b\t{$imm, $rd}",
+                   [(set GR8:$rd, imm:$imm)]>;
+def MOV16ri : I16ri<0b0100,
+                    (outs GR16:$rd), (ins i16imm:$imm),
+                    "mov\t{$imm, $rd}",
+                    [(set GR16:$rd, imm:$imm)]>;
 }
 
 let canFoldAsLoad = 1, isReMaterializable = 1 in {
-def MOV8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins memsrc:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(set GR8:$dst, (load addr:$src))]>;
-def MOV16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins memsrc:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(set GR16:$dst, (load addr:$src))]>;
-}
-
-def MOVZX16rr8 : I8rr<0x0,
-                      (outs GR16:$dst), (ins GR8:$src),
-                      "mov.b\t{$src, $dst}",
-                      [(set GR16:$dst, (zext GR8:$src))]>;
-def MOVZX16rm8 : I8rm<0x0,
-                      (outs GR16:$dst), (ins memsrc:$src),
-                      "mov.b\t{$src, $dst}",
-                      [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in {
-def MOV8rm_POST  : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
-                         "mov.b\t{@$base+, $dst}", []>;
-def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
-                           "mov.w\t{@$base+, $dst}", []>;
+def MOV8rm  : I8rm<0b0100,
+                   (outs GR8:$rd), (ins memsrc:$src),
+                   "mov.b\t{$src, $rd}",
+                   [(set GR8:$rd, (load addr:$src))]>;
+def MOV16rm : I16rm<0b0100,
+                    (outs GR16:$rd), (ins memsrc:$src),
+                    "mov\t{$src, $rd}",
+                    [(set GR16:$rd, (load addr:$src))]>;
+def MOV8rn  : I8rn<0b0100,
+                   (outs GR8:$rd), (ins indreg:$rs),
+                   "mov.b\t{$rs, $rd}",
+                   [(set GR8:$rd, (load addr:$rs))]>;
+def MOV16rn : I16rn<0b0100,
+                    (outs GR16:$rd), (ins indreg:$rs),
+                    "mov\t{$rs, $rd}",
+                    [(set GR16:$rd, (load addr:$rs))]>;
+}
+
+let isCodeGenOnly = 1 in {
+def MOVZX16rr8 : I8rr<0b0100,
+                      (outs GR16:$rd), (ins GR8:$rs),
+                      "mov.b\t{$rs, $rd}",
+                      [(set GR16:$rd, (zext GR8:$rs))]>;
+def MOVZX16rm8 : I8rm<0b0100,
+                      (outs GR16:$rd), (ins memsrc:$src),
+                      "mov.b\t{$src, $rd}",
+                      [(set GR16:$rd, (zextloadi16i8 addr:$src))]>;
+}
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$rs = $wb" in {
+def MOV8rp  : I8rp<0b0100,
+                   (outs GR8:$rd, GR16:$wb), (ins postreg:$rs),
+                   "mov.b\t{$rs, $rd}", []>;
+def MOV16rp : I16rp<0b0100,
+                    (outs GR16:$rd, GR16:$wb), (ins postreg:$rs),
+                    "mov\t{$rs, $rd}", []>;
 }
 
 // Any instruction that defines a 8-bit result leaves the high half of the
@@ -313,821 +401,450 @@ def def8 : PatLeaf<(i8 GR8:$src), [{
 def : Pat<(i16 (zext def8:$src)),
           (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
 
-def MOV8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(store (i8 imm:$src), addr:$dst)]>;
-def MOV16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(store (i16 imm:$src), addr:$dst)]>;
-
-def MOV8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(store GR8:$src, addr:$dst)]>;
-def MOV16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "mov.w\t{$src, $dst}",
-                    [(store GR16:$src, addr:$dst)]>;
-
-def MOV8mm  : I8mm<0x0,
+def MOV8mc  : I8mc<0b0100,
+                   (outs), (ins memdst:$dst, cg8imm:$imm),
+                   "mov.b\t{$imm, $dst}",
+                   [(store (i8 cg8imm:$imm), addr:$dst)]>;
+def MOV16mc : I16mc<0b0100,
+                    (outs), (ins memdst:$dst, cg16imm:$imm),
+                    "mov\t{$imm, $dst}",
+                    [(store (i16 cg16imm:$imm), addr:$dst)]>;
+
+def MOV8mi  : I8mi<0b0100,
+                   (outs), (ins memdst:$dst, i8imm:$imm),
+                   "mov.b\t{$imm, $dst}",
+                   [(store (i8 imm:$imm), addr:$dst)]>;
+def MOV16mi : I16mi<0b0100,
+                    (outs), (ins memdst:$dst, i16imm:$imm),
+                    "mov\t{$imm, $dst}",
+                    [(store (i16 imm:$imm), addr:$dst)]>;
+
+def MOV8mr  : I8mr<0b0100,
+                   (outs), (ins memdst:$dst, GR8:$rs),
+                   "mov.b\t{$rs, $dst}",
+                   [(store GR8:$rs, addr:$dst)]>;
+def MOV16mr : I16mr<0b0100,
+                    (outs), (ins memdst:$dst, GR16:$rs),
+                    "mov\t{$rs, $dst}",
+                    [(store GR16:$rs, addr:$dst)]>;
+
+def MOV8mm  : I8mm<0b0100,
                    (outs), (ins memdst:$dst, memsrc:$src),
                    "mov.b\t{$src, $dst}",
                    [(store (i8 (load addr:$src)), addr:$dst)]>;
-def MOV16mm : I16mm<0x0,
+def MOV16mm : I16mm<0b0100,
                     (outs), (ins memdst:$dst, memsrc:$src),
-                    "mov.w\t{$src, $dst}",
+                    "mov\t{$src, $dst}",
                     [(store (i16 (load addr:$src)), addr:$dst)]>;
 
 //===----------------------------------------------------------------------===//
 // Arithmetic Instructions
 
-let Constraints = "$src = $dst" in {
-
-let Defs = [SR] in {
-
-let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
-
-def ADD8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "add.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (add GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def ADD16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-}
-
-def ADD8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "add.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (add GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def ADD16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "add.b\t{@$base+, $dst}", []>;
-def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src, GR16:$base),
-                          "add.w\t{@$base+, $dst}", []>;
-}
-
-
-def ADD8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "add.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (add GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def ADD16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "add.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (add GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-let Constraints = "" in {
-def ADD8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "add.b\t{$src, $dst}",
-                   [(store (add (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def ADD16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "add.w\t{$src, $dst}",
-                    [(store (add (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def ADD8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "add.b\t{$src, $dst}",
-                   [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def ADD16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "add.w\t{$src, $dst}",
-                    [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def ADD8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "add.b\t{$src, $dst}",
-                   [(store (add (load addr:$dst), 
-                                (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def ADD16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "add.w\t{$src, $dst}",
-                    [(store (add (load addr:$dst), 
-                                  (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
-}
-
-let Uses = [SR] in {
-
-let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
-def ADC8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "addc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (adde GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def ADC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-} // isCommutable
-
-def ADC8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "addc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (adde GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def ADC16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def ADC8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "addc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (adde GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def ADC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "addc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (adde GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let Constraints = "" in {
-def ADC8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "addc.b\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def ADC16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "addc.w\t{$src, $dst}",
-                    [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def ADC8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "addc.b\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def ADC16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "addc.w\t{$src, $dst}",
-                    [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def ADC8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "addc.b\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), 
-                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def ADC16mm : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "addc.w\t{$src, $dst}",
-                   [(store (adde (load addr:$dst), 
-                                 (i16 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-}
-
-} // Uses = [SR]
-
-let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
-def AND8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "and.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def AND16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-}
-
-def AND8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "and.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, imm:$src2)),
+multiclass Arith<bits<4> opcode, string asmstring, SDNode node,
+                 bit commutes, list<Register> uses> {
+  let Defs = [SR], Uses = uses in {
+  let Constraints = "$src2 = $rd" in {
+  let isCommutable = commutes in {
+  def 8rr : I8rr<opcode, (outs GR8:$rd), (ins GR8:$src2, GR8:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, GR8:$rs)),
+                  (implicit SR)]>;
+  def 16rr : I16rr<opcode, (outs GR16:$rd), (ins GR16:$src2, GR16:$rs),
+                   !strconcat(asmstring, "\t$rs, $rd"),
+                   [(set GR16:$rd, (node GR16:$src2, GR16:$rs)),
                     (implicit SR)]>;
-def AND16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def AND8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "and.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def AND16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "and.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "and.b\t{@$base+, $dst}", []>;
-def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src, GR16:$base),
-                           "and.w\t{@$base+, $dst}", []>;
-}
-
-let Constraints = "" in {
-def AND8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "and.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def AND16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "and.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def AND8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "and.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def AND16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "and.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def AND8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "and.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), 
-                                (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def AND16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "and.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), 
+  }
+  def 8rm : I8rm<opcode, (outs GR8:$rd), (ins GR8:$src2, memsrc:$src),
+                 !strconcat(asmstring, ".b\t$src, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, (load addr:$src))),
+                  (implicit SR)]>;
+  def 16rm : I16rm<opcode, (outs GR16:$rd), (ins GR16:$src2, memsrc:$src),
+                   !strconcat(asmstring, "\t$src, $rd"),
+                   [(set GR16:$rd, (node GR16:$src2, (load addr:$src))),
+                    (implicit SR)]>;
+  def 8rn : I8rn<opcode, (outs GR8:$rd), (ins GR8:$src2, indreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $rd"), []>;
+  def 16rn : I16rn<opcode, (outs GR16:$rd), (ins GR16:$src2, indreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $rd"), []>;
+  let mayLoad = 1,
+      hasExtraDefRegAllocReq = 1,
+      Constraints = "$rs = $wb, $src2 = $rd" in {
+  def 8rp : I8rp<opcode, (outs GR8:$rd, GR16:$wb), (ins GR8:$src2, postreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $rd"), []>;
+  def 16rp : I16rp<opcode, (outs GR16:$rd, GR16:$wb), (ins GR16:$src2, postreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $rd"), []>;
+  }
+  def 8rc : I8rc<opcode, (outs GR8:$rd), (ins GR8:$src2, cg8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, cg8imm:$imm)),
+                  (implicit SR)]>;
+  def 16rc : I16rc<opcode, (outs GR16:$rd), (ins GR16:$src2, cg16imm:$imm),
+                 !strconcat(asmstring, "\t$imm, $rd"),
+                 [(set GR16:$rd, (node GR16:$src2, cg16imm:$imm)),
+                  (implicit SR)]>;
+  def 8ri : I8ri<opcode, (outs GR8:$rd), (ins GR8:$src2, i8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $rd"),
+                 [(set GR8:$rd, (node GR8:$src2, imm:$imm)),
+                  (implicit SR)]>;
+  def 16ri : I16ri<opcode, (outs GR16:$rd), (ins GR16:$src2, i16imm:$imm),
+                 !strconcat(asmstring, "\t$imm, $rd"),
+                 [(set GR16:$rd, (node GR16:$src2, imm:$imm)),
+                  (implicit SR)]>;
+  }
+  def 8mr : I8mr<opcode, (outs), (ins memdst:$dst, GR8:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $dst"),
+                 [(store (node (load addr:$dst), GR8:$rs), addr:$dst),
+                  (implicit SR)]>;
+  def 16mr : I16mr<opcode, (outs), (ins memdst:$dst, GR16:$rs),
+                   !strconcat(asmstring, "\t$rs, $dst"),
+                   [(store (node (load addr:$dst), GR16:$rs), addr:$dst),
+                    (implicit SR)]>;
+  def 8mc : I8mc<opcode, (outs), (ins memdst:$dst, cg8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $dst"),
+                 [(store (node (load addr:$dst), (i8 cg8imm:$imm)), addr:$dst),
+                  (implicit SR)]>;
+  def 16mc : I16mc<opcode, (outs), (ins memdst:$dst, cg16imm:$imm),
+                   !strconcat(asmstring, "\t$imm, $dst"),
+                   [(store (node (load addr:$dst), (i16 cg16imm:$imm)), addr:$dst),
+                    (implicit SR)]>;
+  def 8mi : I8mi<opcode, (outs), (ins memdst:$dst, i8imm:$imm),
+                 !strconcat(asmstring, ".b\t$imm, $dst"),
+                 [(store (node (load addr:$dst), (i8 imm:$imm)), addr:$dst),
+                  (implicit SR)]>;
+  def 16mi : I16mi<opcode, (outs), (ins memdst:$dst, i16imm:$imm),
+                   !strconcat(asmstring, "\t$imm, $dst"),
+                   [(store (node (load addr:$dst), (i16 imm:$imm)), addr:$dst),
+                    (implicit SR)]>;
+  def 8mm : I8mm<opcode, (outs), (ins memdst:$dst, memsrc:$src),
+                 !strconcat(asmstring, ".b\t$src, $dst"),
+                 [(store (node (load addr:$dst), 
+                               (i8 (load addr:$src))), addr:$dst),
+                  (implicit SR)]>;
+  def 16mm : I16mm<opcode, (outs), (ins memdst:$dst, memsrc:$src),
+                   !strconcat(asmstring, "\t$src, $dst"),
+                   [(store (node (load addr:$dst), 
                                  (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
-}
-
-let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
-def OR8rr  : I8rr<0x0,
-                  (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                  "bis.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (or GR8:$src, GR8:$src2))]>;
-def OR16rr : I16rr<0x0,
-                   (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                   "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src, GR16:$src2))]>;
-}
-
-def OR8ri  : I8ri<0x0,
-                  (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                  "bis.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (or GR8:$src, imm:$src2))]>;
-def OR16ri : I16ri<0x0,
-                   (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                   "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src, imm:$src2))]>;
-
-def OR8rm  : I8rm<0x0,
-                  (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                  "bis.b\t{$src2, $dst}",
-                  [(set GR8:$dst, (or GR8:$src, (load addr:$src2)))]>;
-def OR16rm : I16rm<0x0,
-                   (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                   "bis.w\t{$src2, $dst}",
-                   [(set GR16:$dst, (or GR16:$src, (load addr:$src2)))]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                        (outs GR8:$dst, GR16:$base_wb),
-                        (ins GR8:$src, GR16:$base),
-                        "bis.b\t{@$base+, $dst}", []>;
-def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                          (outs GR16:$dst, GR16:$base_wb),
-                          (ins GR16:$src, GR16:$base),
-                          "bis.w\t{@$base+, $dst}", []>;
-}
-
-let Constraints = "" in {
-def OR8mr  : I8mr<0x0,
-                  (outs), (ins memdst:$dst, GR8:$src),
-                  "bis.b\t{$src, $dst}",
-                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
-def OR16mr : I16mr<0x0,
-                   (outs), (ins memdst:$dst, GR16:$src),
-                   "bis.w\t{$src, $dst}",
-                   [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
-
-def OR8mi  : I8mi<0x0, 
-                  (outs), (ins memdst:$dst, i8imm:$src),
-                  "bis.b\t{$src, $dst}",
-                  [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
-def OR16mi : I16mi<0x0,
-                   (outs), (ins memdst:$dst, i16imm:$src),
-                   "bis.w\t{$src, $dst}",
-                   [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
-
-def OR8mm  : I8mm<0x0,
-                  (outs), (ins memdst:$dst, memsrc:$src),
-                  "bis.b\t{$src, $dst}",
-                  [(store (or (i8 (load addr:$dst)),
-                              (i8 (load addr:$src))), addr:$dst)]>;
-def OR16mm : I16mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "bis.w\t{$src, $dst}",
-                   [(store (or (i16 (load addr:$dst)),
-                               (i16 (load addr:$src))), addr:$dst)]>;
-}
-
-// bic does not modify condition codes
-def BIC8rr :  I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "bic.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (and GR8:$src, (not GR8:$src2)))]>;
-def BIC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "bic.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, (not GR16:$src2)))]>;
-
-def BIC8rm :  I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "bic.b\t{$src2, $dst}",
-                    [(set GR8:$dst, (and GR8:$src, (not (i8 (load addr:$src2)))))]>;
-def BIC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "bic.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (and GR16:$src, (not (i16 (load addr:$src2)))))]>;
-
-let Constraints = "" in {
-def BIC8mr :  I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "bic.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
-def BIC16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "bic.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
-
-def BIC8mm :  I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "bic.b\t{$src, $dst}",
-                   [(store (and (load addr:$dst),
-                                (not (i8 (load addr:$src)))), addr:$dst)]>;
-def BIC16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "bic.w\t{$src, $dst}",
-                    [(store (and (load addr:$dst),
-                                 (not (i16 (load addr:$src)))), addr:$dst)]>;
-}
-
-let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
-def XOR8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "xor.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (xor GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def XOR16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-}
-
-def XOR8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "xor.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (xor GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def XOR16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def XOR8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "xor.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (xor GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def XOR16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "xor.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (xor GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "xor.b\t{@$base+, $dst}", []>;
-def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                           (outs GR16:$dst, GR16:$base_wb),
-                           (ins GR16:$src, GR16:$base),
-                           "xor.w\t{@$base+, $dst}", []>;
-}
-
-let Constraints = "" in {
-def XOR8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "xor.b\t{$src, $dst}",
-                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def XOR16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "xor.w\t{$src, $dst}",
-                    [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def XOR8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "xor.b\t{$src, $dst}",
-                   [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def XOR16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "xor.w\t{$src, $dst}",
-                    [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def XOR8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "xor.b\t{$src, $dst}",
-                   [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
                     (implicit SR)]>;
-def XOR16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "xor.w\t{$src, $dst}",
-                    [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
-}
-
-
-def SUB8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "sub.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sub GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def SUB16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-
-def SUB8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "sub.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sub GR8:$src, imm:$src2)),
-                    (implicit SR)]>;
-def SUB16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def SUB8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "sub.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sub GR8:$src, (load addr:$src2))),
-                    (implicit SR)]>;
-def SUB16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "sub.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sub GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
-Constraints = "$base = $base_wb, $src = $dst" in {
-def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
-                         (outs GR8:$dst, GR16:$base_wb),
-                         (ins GR8:$src, GR16:$base),
-                         "sub.b\t{@$base+, $dst}", []>;
-def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
-                          (outs GR16:$dst, GR16:$base_wb),
-                          (ins GR16:$src, GR16:$base),
-                          "sub.w\t{@$base+, $dst}", []>;
+  def 8mn : I8mn<opcode, (outs), (ins memdst:$dst, indreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $dst"), []>;
+  def 16mn : I16mn<opcode, (outs), (ins memdst:$dst, indreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $dst"), []>;
+  def 8mp : I8mp<opcode, (outs), (ins memdst:$dst, postreg:$rs),
+                 !strconcat(asmstring, ".b\t$rs, $dst"), []>;
+  def 16mp : I16mp<opcode, (outs), (ins memdst:$dst, postreg:$rs),
+                   !strconcat(asmstring, "\t$rs, $dst"), []>;
+  }
 }
 
-let Constraints = "" in {
-def SUB8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "sub.b\t{$src, $dst}",
-                   [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
-                    (implicit SR)]>;
-def SUB16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "sub.w\t{$src, $dst}",
-                    [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
+defm ADD  : Arith<0b0101, "add",  add,  1, []>;
+defm ADDC : Arith<0b0110, "addc", adde, 1, [SR]>;
+defm AND  : Arith<0b1111, "and",  and,  1, []>;
+defm BIS  : Arith<0b1101, "bis",  or,   1, []>;
+defm BIC  : Arith<0b1100, "bic",  bic,  0, []>;
+defm XOR  : Arith<0b1110, "xor",  xor,  1, []>;
+defm SUB  : Arith<0b1000, "sub",  sub,  0, []>;
+defm SUBC : Arith<0b0111, "subc", sube, 0, [SR]>;
+defm DADD : Arith<0b1010, "dadd", MSP430dadd, 1, [SR]>;
+
+def ADC8r   : InstAlias<"adc.b\t$dst",  (ADDC8rc   GR8:$dst,     0)>;
+def ADC16r  : InstAlias<"adc\t$dst",    (ADDC16rc  GR16:$dst,    0)>;
+def ADC8m   : InstAlias<"adc.b\t$dst",  (ADDC8mc   memdst:$dst,  0)>;
+def ADC16m  : InstAlias<"adc\t$dst",    (ADDC16mc  memdst:$dst,  0)>;
+
+def DADC8r  : InstAlias<"dadc.b\t$dst", (DADD8rc   GR8:$dst,     0)>;
+def DADC16r : InstAlias<"dadc\t$dst",   (DADD16rc  GR16:$dst,    0)>;
+def DADC8m  : InstAlias<"dadc.b\t$dst", (DADD8mc   memdst:$dst,  0)>;
+def DADC16m : InstAlias<"dadc\t$dst",   (DADD16mc  memdst:$dst,  0)>;
+
+def DEC8r   : InstAlias<"dec.b\t$dst",  (SUB8rc    GR8:$dst,     1)>;
+def DEC16r  : InstAlias<"dec\t$dst",    (SUB16rc   GR16:$dst,    1)>;
+def DEC8m   : InstAlias<"dec.b\t$dst",  (SUB8mc    memdst:$dst,  1)>;
+def DEC16m  : InstAlias<"dec\t$dst",    (SUB16mc   memdst:$dst,  1)>;
+
+def DECD8r  : InstAlias<"decd.b\t$dst", (SUB8rc    GR8:$dst,     2)>;
+def DECD16r : InstAlias<"decd\t$dst",   (SUB16rc   GR16:$dst,    2)>;
+def DECD8m  : InstAlias<"decd.b\t$dst", (SUB8mc    memdst:$dst,  2)>;
+def DECD16m : InstAlias<"decd\t$dst",   (SUB16mc   memdst:$dst,  2)>;
+
+def INC8r   : InstAlias<"inc.b\t$dst",  (ADD8rc    GR8:$dst,     1)>;
+def INC16r  : InstAlias<"inc\t$dst",    (ADD16rc   GR16:$dst,    1)>;
+def INC8m   : InstAlias<"inc.b\t$dst",  (ADD8mc    memdst:$dst,  1)>;
+def INC16m  : InstAlias<"inc\t$dst",    (ADD16mc   memdst:$dst,  1)>;
+
+def INCD8r  : InstAlias<"incd.b\t$dst", (ADD8rc    GR8:$dst,     2)>;
+def INCD16r : InstAlias<"incd\t$dst",   (ADD16rc   GR16:$dst,    2)>;
+def INCD8m  : InstAlias<"incd.b\t$dst", (ADD8mc    memdst:$dst,  2)>;
+def INCD16m : InstAlias<"incd\t$dst",   (ADD16mc   memdst:$dst,  2)>;
+
+def SBC8r   : InstAlias<"sbc.b\t$dst",  (SUBC8rc   GR8:$dst,     0)>;
+def SBC16r  : InstAlias<"sbc\t$dst",    (SUBC16rc  GR16:$dst,    0)>;
+def SBC8m   : InstAlias<"sbc.b\t$dst",  (SUBC8mc   memdst:$dst,  0)>;
+def SBC16m  : InstAlias<"sbc\t$dst",    (SUBC16mc  memdst:$dst,  0)>;
+
+def INV8r   : InstAlias<"inv.b\t$dst",  (XOR8rc    GR8:$dst,    -1)>;
+def INV16r  : InstAlias<"inv\t$dst",    (XOR16rc   GR16:$dst,   -1)>;
+def INV8m   : InstAlias<"inv.b\t$dst",  (XOR8mc    memdst:$dst, -1)>;
+def INV16m  : InstAlias<"inv\t$dst",    (XOR16mc   memdst:$dst, -1)>;
+
+// printAliasInstr() doesn't check $dst operands are actually equal
+// for RLA and RLC aliases below, so disable printing aliases.
+
+def RLA8r   : InstAlias<"rla.b\t$dst",  (ADD8rr    GR8:$dst,     GR8:$dst),    0>;
+def RLA16r  : InstAlias<"rla\t$dst",    (ADD16rr   GR16:$dst,    GR16:$dst),   0>;
+def RLA8m   : InstAlias<"rla.b\t$dst",  (ADD8mm    memdst:$dst,  memdst:$dst), 0>;
+def RLA16m  : InstAlias<"rla\t$dst",    (ADD16mm   memdst:$dst,  memdst:$dst), 0>;
+
+def RLC8r   : InstAlias<"rlc.b\t$dst",  (ADDC8rr   GR8:$dst,     GR8:$dst),    0>;
+def RLC16r  : InstAlias<"rlc\t$dst",    (ADDC16rr  GR16:$dst,    GR16:$dst),   0>;
+def RLC8m   : InstAlias<"rlc.b\t$dst",  (ADDC8mm   memdst:$dst,  memdst:$dst), 0>;
+def RLC16m  : InstAlias<"rlc\t$dst",    (ADDC16mm  memdst:$dst,  memdst:$dst), 0>;
+
+def DINT : InstAlias<"dint", (BIC16rc SR, 8)>;
+def EINT : InstAlias<"eint", (BIS16rc SR, 8)>;
+
+def NOP  : InstAlias<"nop",  (MOV16rc CG, 0)>;
+
+def CLR8r   : InstAlias<"clr.b\t$dst",  (MOV8rc    GR8:$dst,     0)>;
+def CLR16r  : InstAlias<"clr\t$dst",    (MOV16rc   GR16:$dst,    0)>;
+def CLR8m   : InstAlias<"clr.b\t$dst",  (MOV8mc    memdst:$dst,  0)>;
+def CLR16m  : InstAlias<"clr\t$dst",    (MOV16mc   memdst:$dst,  0)>;
+
+def CLRC : InstAlias<"clrc", (BIC16rc SR, 1)>;
+def CLRN : InstAlias<"clrn", (BIC16rc SR, 4)>;
+def CLRZ : InstAlias<"clrz", (BIC16rc SR, 2)>;
+def SETC : InstAlias<"setc", (BIS16rc SR, 1)>;
+def SETN : InstAlias<"setn", (BIS16rc SR, 4)>;
+def SETZ : InstAlias<"setz", (BIS16rc SR, 2)>;
+
+def : Pat<(MSP430rla GR8:$dst),  (ADD8rr  $dst, $dst)>;
+def : Pat<(MSP430rla GR16:$dst), (ADD16rr $dst, $dst)>;
+
+let Constraints = "$rs = $rd" in {
 
-def SUB8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "sub.b\t{$src, $dst}",
-                   [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def SUB16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "sub.w\t{$src, $dst}",
-                    [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
+let Defs = [SR] in {
 
-def SUB8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "sub.b\t{$src, $dst}",
-                   [(store (sub (load addr:$dst), 
-                                (i8 (load addr:$src))), addr:$dst),
+// FIXME: memory variant!
+def RRA8r :   II8r<0b010,
+                   (outs GR8:$rd), (ins GR8:$rs),
+                   "rra.b\t$rd",
+                   [(set GR8:$rd, (MSP430rra GR8:$rs)),
                     (implicit SR)]>;
-def SUB16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "sub.w\t{$src, $dst}",
-                    [(store (sub (load addr:$dst), 
-                                 (i16 (load addr:$src))), addr:$dst),
+def RRA16r : II16r<0b010,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "rra\t$rd",
+                    [(set GR16:$rd, (MSP430rra GR16:$rs)),
                      (implicit SR)]>;
-}
 
 let Uses = [SR] in {
-def SBC8rr  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, GR8:$src2),
-                   "subc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sube GR8:$src, GR8:$src2)),
-                    (implicit SR)]>;
-def SBC16rr : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, GR16:$src2),
-                    "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src, GR16:$src2)),
-                     (implicit SR)]>;
-
-def SBC8ri  : I8ri<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, i8imm:$src2),
-                   "subc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sube GR8:$src, imm:$src2)),
+def RRC8r :   II8r<0b000,
+                   (outs GR8:$rd), (ins GR8:$rs),
+                   "rrc.b\t$rd",
+                   [(set GR8:$rd, (MSP430rrc GR8:$rs)),
                     (implicit SR)]>;
-def SBC16ri : I16ri<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, i16imm:$src2),
-                    "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src, imm:$src2)),
-                     (implicit SR)]>;
-
-def SBC8rm  : I8rm<0x0,
-                   (outs GR8:$dst), (ins GR8:$src, memsrc:$src2),
-                   "subc.b\t{$src2, $dst}",
-                   [(set GR8:$dst, (sube GR8:$src, (load addr:$src2))),
+def RRC16r : II16r<0b000,
+                   (outs GR16:$rd), (ins GR16:$rs),
+                   "rrc\t$rd",
+                   [(set GR16:$rd, (MSP430rrc GR16:$rs)),
                     (implicit SR)]>;
-def SBC16rm : I16rm<0x0,
-                    (outs GR16:$dst), (ins GR16:$src, memsrc:$src2),
-                    "subc.w\t{$src2, $dst}",
-                    [(set GR16:$dst, (sube GR16:$src, (load addr:$src2))),
-                     (implicit SR)]>;
-
-let Constraints = "" in {
-def SBC8mr  : I8mr<0x0,
-                   (outs), (ins memdst:$dst, GR8:$src),
-                   "subc.b\t{$src, $dst}",
-                  [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
-                   (implicit SR)]>;
-def SBC16mr : I16mr<0x0,
-                    (outs), (ins memdst:$dst, GR16:$src),
-                    "subc.w\t{$src, $dst}",
-                    [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
-                     (implicit SR)]>;
-
-def SBC8mi  : I8mi<0x0,
-                   (outs), (ins memdst:$dst, i8imm:$src),
-                   "subc.b\t{$src, $dst}",
-                   [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
-                    (implicit SR)]>;
-def SBC16mi : I16mi<0x0,
-                    (outs), (ins memdst:$dst, i16imm:$src),
-                    "subc.w\t{$src, $dst}",
-                    [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
-                     (implicit SR)]>;
-
-def SBC8mm  : I8mm<0x0,
-                   (outs), (ins memdst:$dst, memsrc:$src),
-                   "subc.b\t{$src, $dst}",
-                   [(store (sube (load addr:$dst),
-                                 (i8 (load addr:$src))), addr:$dst),
-                    (implicit SR)]>;
-def SBC16mm : I16mm<0x0,
-                    (outs), (ins memdst:$dst, memsrc:$src),
-                    "subc.w\t{$src, $dst}",
-                    [(store (sube (load addr:$dst),
-                            (i16 (load addr:$src))), addr:$dst),
-                     (implicit SR)]>;
 }
 
-} // Uses = [SR]
-
-// FIXME: memory variant!
-def SAR8r1  : II8r<0x0,
-                   (outs GR8:$dst), (ins GR8:$src),
-                   "rra.b\t$dst",
-                   [(set GR8:$dst, (MSP430rra GR8:$src)),
-                    (implicit SR)]>;
-def SAR16r1 : II16r<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "rra.w\t$dst",
-                    [(set GR16:$dst, (MSP430rra GR16:$src)),
-                     (implicit SR)]>;
-
-def SHL8r1  : I8rr<0x0,
-                   (outs GR8:$dst), (ins GR8:$src),
-                   "rla.b\t$dst",
-                   [(set GR8:$dst, (MSP430rla GR8:$src)),
-                    (implicit SR)]>;
-def SHL16r1 : I16rr<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "rla.w\t$dst",
-                    [(set GR16:$dst, (MSP430rla GR16:$src)),
-                     (implicit SR)]>;
-
-def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
-                      "clrc\n\t"
-                      "rrc.b\t$dst",
-                      [(set GR8:$dst, (MSP430rrc GR8:$src)),
-                       (implicit SR)]>;
-def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
-                      "clrc\n\t"
-                      "rrc.w\t$dst",
-                      [(set GR16:$dst, (MSP430rrc GR16:$src)),
-                       (implicit SR)]>;
-
 // FIXME: Memory sext's ?
-def SEXT16r : II16r<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "sxt\t$dst",
-                    [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+def SEXT16r : II16r<0b011,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "sxt\t$rd",
+                    [(set GR16:$rd, (sext_inreg GR16:$rs, i8)),
                      (implicit SR)]>;
 
 } // Defs = [SR]
 
-def ZEXT16r : I8rr<0x0,
-                   (outs GR16:$dst), (ins GR16:$src),
-                   "mov.b\t{$src, $dst}",
-                   [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+let isCodeGenOnly = 1 in
+def ZEXT16r : I8rr<0b0100,
+                   (outs GR16:$rd), (ins GR16:$rs),
+                   "mov.b\t{$rs, $rd}",
+                   [(set GR16:$rd, (zext (trunc GR16:$rs)))]>;
 
 // FIXME: Memory bitswaps?
-def SWPB16r : II16r<0x0,
-                    (outs GR16:$dst), (ins GR16:$src),
-                    "swpb\t$dst",
-                    [(set GR16:$dst, (bswap GR16:$src))]>;
+def SWPB16r : II16r<0b001,
+                    (outs GR16:$rd), (ins GR16:$rs),
+                    "swpb\t$rd",
+                    [(set GR16:$rd, (bswap GR16:$rs))]>;
 
 } // Constraints = "$src = $dst"
 
 // Integer comparisons
 let Defs = [SR] in {
-def CMP8rr  : I8rr<0x0,
-                   (outs), (ins GR8:$src, GR8:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, GR8:$src2), (implicit SR)]>;
-def CMP16rr : I16rr<0x0,
-                    (outs), (ins GR16:$src, GR16:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, GR16:$src2), (implicit SR)]>;
-
-def CMP8ri  : I8ri<0x0,
-                   (outs), (ins GR8:$src, i8imm:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, imm:$src2), (implicit SR)]>;
-def CMP16ri : I16ri<0x0,
-                    (outs), (ins GR16:$src, i16imm:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, imm:$src2), (implicit SR)]>;
-
-def CMP8mi  : I8mi<0x0,
-                   (outs), (ins memsrc:$src, i8imm:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp (load addr:$src),
-                               (i8 imm:$src2)), (implicit SR)]>;
-def CMP16mi : I16mi<0x0,
-                    (outs), (ins memsrc:$src, i16imm:$src2),
-                    "cmp.w\t{$src2, $src}",
-                     [(MSP430cmp (load addr:$src),
-                                 (i16 imm:$src2)), (implicit SR)]>;
-
-def CMP8rm  : I8rm<0x0,
-                   (outs), (ins GR8:$src, memsrc:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp GR8:$src, (load addr:$src2)), 
-                    (implicit SR)]>;
-def CMP16rm : I16rm<0x0,
-                    (outs), (ins GR16:$src, memsrc:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp GR16:$src, (load addr:$src2)),
-                     (implicit SR)]>;
-
-def CMP8mr  : I8mr<0x0,
-                   (outs), (ins memsrc:$src, GR8:$src2),
-                   "cmp.b\t{$src2, $src}",
-                   [(MSP430cmp (load addr:$src), GR8:$src2),
-                    (implicit SR)]>;
-def CMP16mr : I16mr<0x0,
-                    (outs), (ins memsrc:$src, GR16:$src2),
-                    "cmp.w\t{$src2, $src}",
-                    [(MSP430cmp (load addr:$src), GR16:$src2), 
+def CMP8rr  : I8rr<0b1001,
+                   (outs), (ins GR8:$rd, GR8:$rs),
+                   "cmp.b\t$rs, $rd",
+                   [(MSP430cmp GR8:$rd, GR8:$rs), (implicit SR)]>;
+def CMP16rr : I16rr<0b1001,
+                    (outs), (ins GR16:$rd, GR16:$rs),
+                    "cmp\t$rs, $rd",
+                    [(MSP430cmp GR16:$rd, GR16:$rs), (implicit SR)]>;
+
+def CMP8rc  : I8rc<0b1001,
+                   (outs), (ins GR8:$rd, cg8imm:$imm),
+                   "cmp.b\t$imm, $rd",
+                   [(MSP430cmp GR8:$rd, cg8imm:$imm), (implicit SR)]>;
+def CMP16rc : I16rc<0b1001,
+                    (outs), (ins GR16:$rd, cg16imm:$imm),
+                    "cmp\t$imm, $rd",
+                    [(MSP430cmp GR16:$rd, cg16imm:$imm), (implicit SR)]>;
+
+def CMP8ri  : I8ri<0b1001,
+                   (outs), (ins GR8:$rd, i8imm:$imm),
+                   "cmp.b\t$imm, $rd",
+                   [(MSP430cmp GR8:$rd, imm:$imm), (implicit SR)]>;
+def CMP16ri : I16ri<0b1001,
+                    (outs), (ins GR16:$rd, i16imm:$imm),
+                    "cmp\t$imm, $rd",
+                    [(MSP430cmp GR16:$rd, imm:$imm), (implicit SR)]>;
+
+def CMP8mc  : I8mc<0b1001,
+                   (outs), (ins memsrc:$dst, cg8imm:$imm),
+                   "cmp.b\t$imm, $dst",
+                   [(MSP430cmp (load addr:$dst), (i8 cg8imm:$imm)),
+                    (implicit SR)]>;
+def CMP16mc : I16mc<0b1001,
+                    (outs), (ins memsrc:$dst, cg16imm:$imm),
+                    "cmp\t$imm, $dst",
+                    [(MSP430cmp (load addr:$dst), (i16 cg16imm:$imm)),
+                     (implicit SR)]>;
+
+def CMP8mi  : I8mi<0b1001,
+                   (outs), (ins memsrc:$dst, i8imm:$imm),
+                   "cmp.b\t$imm, $dst",
+                   [(MSP430cmp (load addr:$dst),
+                               (i8 imm:$imm)), (implicit SR)]>;
+def CMP16mi : I16mi<0b1001,
+                    (outs), (ins memsrc:$dst, i16imm:$imm),
+                    "cmp\t$imm, $dst",
+                     [(MSP430cmp (load addr:$dst),
+                                 (i16 imm:$imm)), (implicit SR)]>;
+
+def CMP8rm  : I8rm<0b1001,
+                   (outs), (ins GR8:$rd, memsrc:$src),
+                   "cmp.b\t$src, $rd",
+                   [(MSP430cmp GR8:$rd, (load addr:$src)), 
+                    (implicit SR)]>;
+def CMP16rm : I16rm<0b1001,
+                    (outs), (ins GR16:$rd, memsrc:$src),
+                    "cmp\t$src, $rd",
+                    [(MSP430cmp GR16:$rd, (load addr:$src)),
+                     (implicit SR)]>;
+
+def CMP8mr  : I8mr<0b1001,
+                   (outs), (ins memsrc:$dst, GR8:$rs),
+                   "cmp.b\t$rs, $dst",
+                   [(MSP430cmp (load addr:$dst), GR8:$rs),
+                    (implicit SR)]>;
+def CMP16mr : I16mr<0b1001,
+                    (outs), (ins memsrc:$dst, GR16:$rs),
+                    "cmp\t$rs, $dst",
+                    [(MSP430cmp (load addr:$dst), GR16:$rs), 
                      (implicit SR)]>;
 
-
 // BIT TESTS, just sets condition codes
 // Note that the C condition is set differently than when using CMP.
 let isCommutable = 1 in {
-def BIT8rr  : I8rr<0x0,
-                   (outs), (ins GR8:$src, GR8:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su GR8:$src, GR8:$src2), 0),
+def BIT8rr  : I8rr<0b1011,
+                   (outs), (ins GR8:$rd, GR8:$rs),
+                   "bit.b\t$rs, $rd",
+                   [(MSP430cmp (and_su GR8:$rd, GR8:$rs), 0),
                     (implicit SR)]>;
-def BIT16rr : I16rr<0x0,
-                    (outs), (ins GR16:$src, GR16:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su GR16:$src, GR16:$src2), 0),
+def BIT16rr : I16rr<0b1011,
+                    (outs), (ins GR16:$rd, GR16:$rs),
+                    "bit\t$rs, $rd",
+                    [(MSP430cmp (and_su GR16:$rd, GR16:$rs), 0),
                      (implicit SR)]>;
 }
-def BIT8ri  : I8ri<0x0,
-                   (outs), (ins GR8:$src, i8imm:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su GR8:$src, imm:$src2), 0),
+def BIT8rc  : I8rc<0b1011,
+                   (outs), (ins GR8:$rd, cg8imm:$imm),
+                   "bit.b\t$imm, $rd",
+                   [(MSP430cmp (and_su GR8:$rd, cg8imm:$imm), 0),
+                    (implicit SR)]>;
+def BIT16rc : I16rc<0b1011,
+                    (outs), (ins GR16:$rd, cg16imm:$imm),
+                    "bit\t$imm, $rd",
+                    [(MSP430cmp (and_su GR16:$rd, cg16imm:$imm), 0),
+                     (implicit SR)]>;
+
+def BIT8ri  : I8ri<0b1011,
+                   (outs), (ins GR8:$rd, i8imm:$imm),
+                   "bit.b\t$imm, $rd",
+                   [(MSP430cmp (and_su GR8:$rd, imm:$imm), 0),
                     (implicit SR)]>;
-def BIT16ri : I16ri<0x0,
-                    (outs), (ins GR16:$src, i16imm:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su GR16:$src, imm:$src2), 0),
+def BIT16ri : I16ri<0b1011,
+                    (outs), (ins GR16:$rd, i16imm:$imm),
+                    "bit\t$imm, $rd",
+                    [(MSP430cmp (and_su GR16:$rd, imm:$imm), 0),
                      (implicit SR)]>;
 
-def BIT8rm  : I8rm<0x0,
-                   (outs), (ins GR8:$src, memdst:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su GR8:$src,  (load addr:$src2)), 0),
+def BIT8rm  : I8rm<0b1011,
+                   (outs), (ins GR8:$rd, memdst:$src),
+                   "bit.b\t$src, $rd",
+                   [(MSP430cmp (and_su GR8:$rd,  (load addr:$src)), 0),
                     (implicit SR)]>;
-def BIT16rm : I16rm<0x0,
-                    (outs), (ins GR16:$src, memdst:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su GR16:$src,  (load addr:$src2)), 0),
+def BIT16rm : I16rm<0b1011,
+                    (outs), (ins GR16:$rd, memdst:$src),
+                    "bit\t$src, $rd",
+                    [(MSP430cmp (and_su GR16:$rd,  (load addr:$src)), 0),
                      (implicit SR)]>;
 
-def BIT8mr  : I8mr<0x0,
-                  (outs), (ins memsrc:$src, GR8:$src2),
-                  "bit.b\t{$src2, $src}",
-                  [(MSP430cmp (and_su (load addr:$src), GR8:$src2), 0),
+def BIT8mr  : I8mr<0b1011,
+                  (outs), (ins memsrc:$dst, GR8:$rs),
+                  "bit.b\t$rs, $dst",
+                  [(MSP430cmp (and_su (load addr:$dst), GR8:$rs), 0),
                    (implicit SR)]>;
-def BIT16mr : I16mr<0x0,
-                    (outs), (ins memsrc:$src, GR16:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su (load addr:$src), GR16:$src2), 0),
+def BIT16mr : I16mr<0b1011,
+                    (outs), (ins memsrc:$dst, GR16:$rs),
+                    "bit\t$rs, $dst",
+                    [(MSP430cmp (and_su (load addr:$dst), GR16:$rs), 0),
+                     (implicit SR)]>;
+
+def BIT8mc  : I8mc<0b1011,
+                   (outs), (ins memsrc:$dst, cg8imm:$imm),
+                   "bit.b\t$imm, $dst",
+                   [(MSP430cmp (and_su (load addr:$dst), (i8 cg8imm:$imm)), 0),
+                    (implicit SR)]>;
+def BIT16mc : I16mc<0b1011,
+                    (outs), (ins memsrc:$dst, i16imm:$imm),
+                    "bit\t$imm, $dst",
+                    [(MSP430cmp (and_su (load addr:$dst), (i16 cg16imm:$imm)), 0),
                      (implicit SR)]>;
 
-def BIT8mi  : I8mi<0x0,
-                   (outs), (ins memsrc:$src, i8imm:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su (load addr:$src), (i8 imm:$src2)), 0),
+def BIT8mi  : I8mi<0b1011,
+                   (outs), (ins memsrc:$dst, i8imm:$imm),
+                   "bit.b\t$imm, $dst",
+                   [(MSP430cmp (and_su (load addr:$dst), (i8 imm:$imm)), 0),
                     (implicit SR)]>;
-def BIT16mi : I16mi<0x0,
-                    (outs), (ins memsrc:$src, i16imm:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su (load addr:$src), (i16 imm:$src2)), 0),
+def BIT16mi : I16mi<0b1011,
+                    (outs), (ins memsrc:$dst, i16imm:$imm),
+                    "bit\t$imm, $dst",
+                    [(MSP430cmp (and_su (load addr:$dst), (i16 imm:$imm)), 0),
                      (implicit SR)]>;
 
-def BIT8mm  : I8mm<0x0,
-                   (outs), (ins memsrc:$src, memsrc:$src2),
-                   "bit.b\t{$src2, $src}",
-                   [(MSP430cmp (and_su (i8 (load addr:$src)),
-                                       (load addr:$src2)),
+def BIT8mm  : I8mm<0b1011,
+                   (outs), (ins memsrc:$dst, memsrc:$src),
+                   "bit.b\t$src, $dst",
+                   [(MSP430cmp (and_su (i8 (load addr:$dst)),
+                                       (load addr:$src)),
                                  0),
                       (implicit SR)]>;
-def BIT16mm : I16mm<0x0,
-                    (outs), (ins memsrc:$src, memsrc:$src2),
-                    "bit.w\t{$src2, $src}",
-                    [(MSP430cmp (and_su (i16 (load addr:$src)),
-                                        (load addr:$src2)),
+def BIT16mm : I16mm<0b1011,
+                    (outs), (ins memsrc:$dst, memsrc:$src),
+                    "bit\t$src, $dst",
+                    [(MSP430cmp (and_su (i16 (load addr:$dst)),
+                                        (load addr:$src)),
                                  0),
                      (implicit SR)]>;
 } // Defs = [SR]
 
+def TST8r   : InstAlias<"tst.b\t$dst",  (CMP8rc    GR8:$dst,     0)>;
+def TST16r  : InstAlias<"tst\t$dst",    (CMP16rc   GR16:$dst,    0)>;
+def TST8m   : InstAlias<"tst.b\t$dst",  (CMP8mc    memdst:$dst,  0)>;
+def TST16m  : InstAlias<"tst\t$dst",    (CMP16mc   memdst:$dst,  0)>;
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index e7716382b222a0317ed552106a37683c776301b6..860c0006f782f534e011cbb469d9a495d48e11d9 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -110,6 +110,9 @@ LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
   return MCOperand::createExpr(Expr);
 }
 
+#define GET_REGINFO_ENUM
+#include "MSP430GenRegisterInfo.inc"
+
 void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   OutMI.setOpcode(MI->getOpcode());
 
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index b5a6ed0f0a56d2de3a4004897083adbcad209307..1e86bdf34a0b5894c41daf9620e29b138737f75b 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -11,26 +11,31 @@
 //  Declarations that describe the MSP430 register file
 //===----------------------------------------------------------------------===//
 
-class MSP430Reg<bits<4> num, string n> : Register<n> {
+class MSP430Reg<bits<4> num, string n, list<string> alt = []> : Register<n> {
   field bits<4> Num = num;
   let Namespace = "MSP430";
+  let HWEncoding{3-0} = num;
+  let AltNames = alt;
 }
 
-class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs> 
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs,
+                           list<string> alt = []> 
   : RegisterWithSubRegs<n, subregs> {
   field bits<4> Num = num;
   let Namespace = "MSP430";
+  let HWEncoding{3-0} = num;
+  let AltNames = alt;
 }
 
 //===----------------------------------------------------------------------===//
 //  Registers
 //===----------------------------------------------------------------------===//
 
-def PCB  : MSP430Reg<0,  "r0">;
-def SPB  : MSP430Reg<1,  "r1">;
-def SRB  : MSP430Reg<2,  "r2">;
-def CGB  : MSP430Reg<3,  "r3">;
-def FPB  : MSP430Reg<4,  "r4">;
+def PCB  : MSP430Reg<0,  "r0", ["pc"]>;
+def SPB  : MSP430Reg<1,  "r1", ["sp"]>;
+def SRB  : MSP430Reg<2,  "r2", ["sr"]>;
+def CGB  : MSP430Reg<3,  "r3", ["cg"]>;
+def FPB  : MSP430Reg<4,  "r4", ["fp"]>;
 def R5B  : MSP430Reg<5,  "r5">;
 def R6B  : MSP430Reg<6,  "r6">;
 def R7B  : MSP430Reg<7,  "r7">;
@@ -46,11 +51,11 @@ def R15B : MSP430Reg<15, "r15">;
 def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
 
 let SubRegIndices = [subreg_8bit] in {
-def PC  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
-def SP  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
-def SR  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
-def CG  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
-def FP  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def PC  : MSP430RegWithSubregs<0,  "r0",  [PCB], ["pc"]>;
+def SP  : MSP430RegWithSubregs<1,  "r1",  [SPB], ["sp"]>;
+def SR  : MSP430RegWithSubregs<2,  "r2",  [SRB], ["sr"]>;
+def CG  : MSP430RegWithSubregs<3,  "r3",  [CGB], ["cg"]>;
+def FP  : MSP430RegWithSubregs<4,  "r4",  [FPB], ["fp"]>;
 def R5  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
 def R6  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
 def R7  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 2e70d35fc4affd58a711d8c71f0b5aa409eaff4f..79e0c001a636cfdd6ca71cac52e015cbb1b58f87 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -767,13 +767,13 @@ public:
 
   ~MipsOperand() override {
     switch (Kind) {
-    case k_Immediate:
-      break;
     case k_Memory:
       delete Mem.Base;
       break;
     case k_RegList:
       delete RegList.List;
+      break;
+    case k_Immediate:
     case k_RegisterIndex:
     case k_Token:
       break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 4544be9f27f147dbd7c022e5bdf4972c860df199..63f9151da6babaa50cb606a1d1fe1f22352558b4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -569,6 +569,14 @@ bool MipsAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
   }
 }
 
+bool MipsAsmBackend::isMicroMips(const MCSymbol *Sym) const {
+  if (const auto *ElfSym = dyn_cast<const MCSymbolELF>(Sym)) {
+    if (ElfSym->getOther() & ELF::STO_MIPS_MICROMIPS)
+      return true;
+  }
+  return false;
+}
+
 MCAsmBackend *llvm::createMipsAsmBackend(const Target &T,
                                          const MCSubtargetInfo &STI,
                                          const MCRegisterInfo &MRI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index 3d5e16fcf9b415e72177c78968d6cd38eb974d16..30359132e92b2642b05f0fe3ba2c6eb36147c4ac 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -25,6 +25,7 @@ class MCAssembler;
 struct MCFixupKindInfo;
 class MCObjectWriter;
 class MCRegisterInfo;
+class MCSymbolELF;
 class Target;
 
 class MipsAsmBackend : public MCAsmBackend {
@@ -90,6 +91,7 @@ public:
   bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
                              const MCValue &Target) override;
 
+  bool isMicroMips(const MCSymbol *Sym) const override;
 }; // class MipsAsmBackend
 
 } // namespace
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 7b9a02503ce2128891f2b64030bc55a4607f8540..21b01e8509678ffac52fd1626143667b9edc4c21 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbolELF.h"
@@ -53,6 +54,22 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
   createPendingLabelRelocs();
 }
 
+void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+  Frame.Begin = getContext().createTempSymbol();
+  MCELFStreamer::EmitLabel(Frame.Begin);
+}
+
+MCSymbol *MipsELFStreamer::EmitCFILabel() {
+  MCSymbol *Label = getContext().createTempSymbol("cfi", true);
+  MCELFStreamer::EmitLabel(Label);
+  return Label;
+}
+
+void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+  Frame.End = getContext().createTempSymbol();
+  MCELFStreamer::EmitLabel(Frame.End);
+}
+
 void MipsELFStreamer::createPendingLabelRelocs() {
   MipsTargetELFStreamer *ELFTargetStreamer =
       static_cast<MipsTargetELFStreamer *>(getTargetStreamer());
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index d141f5d77c61b7c1d5e7dd2800bbd7957f33bb5f..56a0ff96c7bd6e6e10ba4410a4ae35c727aa5a1e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -26,6 +26,7 @@ class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCSubtargetInfo;
+struct MCDwarfFrameInfo;
 
 class MipsELFStreamer : public MCELFStreamer {
   SmallVector<std::unique_ptr<MipsOptionRecord>, 8> MipsOptionRecords;
@@ -60,6 +61,12 @@ public:
   void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
   void EmitIntValue(uint64_t Value, unsigned Size) override;
 
+  // Overriding these functions allows us to avoid recording of these labels
+  // in EmitLabel and later marking them as microMIPS.
+  void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
+  void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
+  MCSymbol *EmitCFILabel() override;
+
   /// Emits all the option records stored up until the point it's called.
   void EmitMipsOptionRecords();
 
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index b5896060a710e3360ccea4c757518bbb8dd7f21d..814918d25e70d7f5508825aa27f21d1f041ff6b0 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -159,6 +159,7 @@ class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6;
 class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">;
 class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>;
 class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">;
+class SIGRIE_MMR6_ENC : SIGRIE_FM_MM, MMR6Arch<"sigrie">;
 class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>;
 class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>;
 class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>;
@@ -1162,6 +1163,14 @@ class SDBBP_MMR6_DESC : MipsR6Inst {
   InstrItinClass Itinerary = II_SDBBP;
 }
 
+class SIGRIE_MMR6_DESC : MipsR6Inst {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm16:$code_);
+  string AsmString = !strconcat("sigrie", "\t$code_");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SIGRIE;
+}
+
 class LWM16_MMR6_DESC
     : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr),
                       !strconcat("lwm16", "\t$rt, $addr"), [],
@@ -1427,6 +1436,7 @@ def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6;
 def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC,
                   ISA_MICROMIPS32R6;
 def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6;
+def SIGRIE_MMR6 : R6MMR6Rel, SIGRIE_MMR6_DESC, SIGRIE_MMR6_ENC, ISA_MICROMIPS32R6;
 def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6;
 def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6;
 let DecoderMethod = "DecodeMemMMImm16" in {
@@ -1635,6 +1645,7 @@ def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset),
 }
 def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6;
 def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6;
+def : MipsInstAlias<"sigrie", (SIGRIE_MMR6 0), 1>, ISA_MICROMIPS32R6;
 def : MipsInstAlias<"rdhwr $rt, $rs",
                     (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>,
                     ISA_MICROMIPS32R6;
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index a9c53e08b810578365204e21f8fc64a23e503c2d..2a4cc279ef0d730eb6b25a34c5b90009595c7de3 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -933,6 +933,17 @@ class SDBBP_FM_MM : MMArch {
   let Inst{5-0}   = 0x3c;
 }
 
+class SIGRIE_FM_MM : MMArch {
+  bits<16> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x0;
+  let Inst{25-22} = 0x0;
+  let Inst{21-6} = code_;
+  let Inst{5-0} = 0b111111;
+}
+
 class RDHWR_FM_MM : MMArch {
   bits<5> rt;
   bits<5> rd;
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 174a05ea7af6be8f915c37994bf1b912ca99877a..af380a0ec71e0a0b52d16cacb4a609e232247d79 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1271,6 +1271,8 @@ let AddedComplexity = 40 in
 def : MipsPat<(bswap GPR32:$rt), (ROTR_MM (WSBH_MM GPR32:$rt), 16)>,
       ISA_MICROMIPS;
 
+def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
+              (JAL_MM texternalsym:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
 def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
               (TAILCALL_MM tglobaladdr:$dst)>, ISA_MICROMIPS32_NOT_MIPS32R6;
 def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
index e1d08cad88b77917c4a0dec8b5c39d5bc0774ee6..623af570a5e6cc12ab60dbca190bf7311398eb3e 100644
--- a/lib/Target/Mips/Mips32r6InstrFormats.td
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -87,6 +87,7 @@ def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
 def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
 def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
 def OPCODE5_BGEZAL : OPCODE5<0b10001>;
+def OPCODE5_SIGRIE : OPCODE5<0b10111>;
 // The next four constants are unnamed in the spec. These names are taken from
 // the OPGROUP names they are used with.
 def OPCODE5_LDC2   : OPCODE5<0b01110>;
@@ -602,3 +603,12 @@ class SPECIAL3_GINV<bits<2> ginv> : MipsR6Inst {
   let Inst{7-6}   = ginv;
   let Inst{5-0}   = 0b111101;
 }
+
+class SIGRIE_FM : MipsR6Inst {
+  bits<16> code_;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = 0;
+  let Inst{20-16} = OPCODE5_SIGRIE.Value;
+  let Inst{15-0} = code_;
+}
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index d86fc3f658aef2ba7ad6efc87ed7154666412001..2bd0cf2d59a64d8e91635dd77aaa1bcb8698db64 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -200,6 +200,8 @@ class CRC32CW_ENC : SPECIAL3_2R_SZ_CRC<2,1>;
 class GINVI_ENC : SPECIAL3_GINV<0>;
 class GINVT_ENC : SPECIAL3_GINV<2>;
 
+class SIGRIE_ENC : SIGRIE_FM;
+
 //===----------------------------------------------------------------------===//
 //
 // Instruction Multiclasses
@@ -846,6 +848,14 @@ class GINVI_DESC : GINV_DESC_BASE<"ginvi", GPR32Opnd, II_GINVI> {
 }
 class GINVT_DESC : GINV_DESC_BASE<"ginvt", GPR32Opnd, II_GINVT>;
 
+class SIGRIE_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins uimm16:$code_);
+  string AsmString = "sigrie\t$code_";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = II_SIGRIE;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Instruction Definitions
@@ -961,6 +971,7 @@ let AdditionalPredicates = [NotInMicroMips] in {
   def SEL_S : R6MMR6Rel, SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT;
   def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6;
   def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6;
+  def SIGRIE : SIGRIE_ENC, SIGRIE_DESC, ISA_MIPS32R6;
 }
 
 let AdditionalPredicates = [NotInMicroMips] in {
@@ -988,6 +999,7 @@ def : MipsInstAlias<"evp", (EVP ZERO), 0>, ISA_MIPS32R6;
 
 let AdditionalPredicates = [NotInMicroMips] in {
 def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6;
+def : MipsInstAlias<"sigrie", (SIGRIE 0)>, ISA_MIPS32R6;
 def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6, GPR_32;
 }
 
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index b5317bec70c478f8421553fa84eabc7aee887f4a..5729182deafb438772c9d1f3e9e7ccef053ceeaf 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -416,6 +416,13 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
 // long branches.  See the comment in file MipsLongBranch.cpp for detailed
 // explanation.
 
+// Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_LUi2Op_64 : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins brtarget:$tgt), []>, GPR_64;
+// Expands to: addiu $dst, %highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_DADDiu2Op : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins GPR64Opnd:$src, brtarget:$tgt), []>, GPR_64;
+
 // Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
 // where %PART may be %hi or %lo, depending on the relocation kind
 // that $tgt is annotated with.
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 2e0c25de2bc89ffaf26b7c5069bacf43a785fa74..16a2481a00d8996d1ac91c9a22482112a1f0695a 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -561,6 +561,7 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
         O << '$' << MipsInstPrinter::getRegisterName(Reg);
         return false;
       }
+      break;
     }
     case 'w':
       // Print MSA registers for the 'f' constraint
@@ -1240,8 +1241,12 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
 
 bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
   return (Opcode == Mips::LONG_BRANCH_LUi
+          || Opcode == Mips::LONG_BRANCH_LUi2Op
+          || Opcode == Mips::LONG_BRANCH_LUi2Op_64
           || Opcode == Mips::LONG_BRANCH_ADDiu
-          || Opcode == Mips::LONG_BRANCH_DADDiu);
+          || Opcode == Mips::LONG_BRANCH_ADDiu2Op
+          || Opcode == Mips::LONG_BRANCH_DADDiu
+          || Opcode == Mips::LONG_BRANCH_DADDiu2Op);
 }
 
 // Force static initialization.
diff --git a/lib/Target/Mips/MipsBranchExpansion.cpp b/lib/Target/Mips/MipsBranchExpansion.cpp
index f316e308be7b2fa57871fcd8fe66d4de14e32ecf..e59267c4fd9b61357d2516a21b10d0034c42386a 100644
--- a/lib/Target/Mips/MipsBranchExpansion.cpp
+++ b/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -271,7 +271,8 @@ void MipsBranchExpansion::splitMBB(MachineBasicBlock *MBB) {
   // Insert NewMBB and fix control flow.
   MachineBasicBlock *Tgt = getTargetMBB(*FirstBr);
   NewMBB->transferSuccessors(MBB);
-  NewMBB->removeSuccessor(Tgt, true);
+  if (Tgt != getTargetMBB(*LastBr))
+    NewMBB->removeSuccessor(Tgt, true);
   MBB->addSuccessor(NewMBB);
   MBB->addSuccessor(Tgt);
   MFp->insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
@@ -673,32 +674,32 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) {
       // instructions, where we first load the offset into register, and then we
       // do branch register.
       if (ABI.IsN64()) {
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi))
-            .addReg(Mips::AT_64)
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi2Op_64),
+                Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_HIGHEST);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu2Op),
                 Mips::AT_64)
             .addReg(Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_HIGHER);
         BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
             .addReg(Mips::AT_64)
             .addImm(16);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu2Op),
                 Mips::AT_64)
             .addReg(Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_ABS_HI);
         BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
             .addReg(Mips::AT_64)
             .addImm(16);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu2Op),
                 Mips::AT_64)
             .addReg(Mips::AT_64)
             .addMBB(TgtMBB, MipsII::MO_ABS_LO);
       } else {
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi))
-            .addReg(Mips::AT)
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi2Op),
+                Mips::AT)
             .addMBB(TgtMBB, MipsII::MO_ABS_HI);
-        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_ADDiu),
+        BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_ADDiu2Op),
                 Mips::AT)
             .addReg(Mips::AT)
             .addMBB(TgtMBB, MipsII::MO_ABS_LO);
diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index 81a1cced93b7a355eb8a6dc210541d7752a259a1..90cb3f437bd5b7119aa150aaca82d62af91a25c6 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -24,10 +24,10 @@ static bool isF128SoftLibCall(const char *CallSym) {
       "__lttf2",       "__multf3",     "__netf2",       "__powitf2",
       "__subtf3",      "__trunctfdf2", "__trunctfsf2",  "__unordtf2",
       "ceill",         "copysignl",    "cosl",          "exp2l",
-      "expl",          "floorl",       "fmal",          "fmodl",
-      "log10l",        "log2l",        "logl",          "nearbyintl",
-      "powl",          "rintl",        "roundl",        "sinl",
-      "sqrtl",         "truncl"};
+      "expl",          "floorl",       "fmal",          "fmaxl",
+      "fmodl",         "log10l",       "log2l",         "logl",
+      "nearbyintl",    "powl",         "rintl",         "roundl",
+      "sinl",          "sqrtl",        "truncl"};
 
   // Check that LibCalls is sorted alphabetically.
   auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
diff --git a/lib/Target/Mips/MipsCallLowering.cpp b/lib/Target/Mips/MipsCallLowering.cpp
index 8babdbf902a8df9f75aecb7cda49210f1943a8bd..c550fadf66320036af906112bd6ca63e863c36ac 100644
--- a/lib/Target/Mips/MipsCallLowering.cpp
+++ b/lib/Target/Mips/MipsCallLowering.cpp
@@ -45,9 +45,9 @@ bool MipsCallLowering::MipsHandler::assignVRegs(ArrayRef<unsigned> VRegs,
   return true;
 }
 
-void MipsCallLowering::MipsHandler::setMostSignificantFirst(
+void MipsCallLowering::MipsHandler::setLeastSignificantFirst(
     SmallVectorImpl<unsigned> &VRegs) {
-  if (MIRBuilder.getMF().getDataLayout().isLittleEndian())
+  if (!MIRBuilder.getMF().getDataLayout().isLittleEndian())
     std::reverse(VRegs.begin(), VRegs.end());
 }
 
@@ -181,7 +181,7 @@ bool IncomingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
                                        unsigned ArgsReg) {
   if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
     return false;
-  setMostSignificantFirst(VRegs);
+  setLeastSignificantFirst(VRegs);
   MIRBuilder.buildMerge(ArgsReg, VRegs);
   return true;
 }
@@ -283,7 +283,7 @@ bool OutgoingValueHandler::handleSplit(SmallVectorImpl<unsigned> &VRegs,
                                        unsigned ArgLocsStartIndex,
                                        unsigned ArgsReg) {
   MIRBuilder.buildUnmerge(VRegs, ArgsReg);
-  setMostSignificantFirst(VRegs);
+  setLeastSignificantFirst(VRegs);
   if (!assignVRegs(VRegs, ArgLocs, ArgLocsStartIndex))
     return false;
 
@@ -298,8 +298,8 @@ static bool isSupportedType(Type *T) {
   return false;
 }
 
-CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
-                                      const ISD::ArgFlagsTy &Flags) {
+static CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
+                                             const ISD::ArgFlagsTy &Flags) {
   // > does not mean loss of information as type RegisterVT can't hold type VT,
   // it means that type VT is split into multiple registers of type RegisterVT
   if (VT.getSizeInBits() >= RegisterVT.getSizeInBits())
@@ -312,8 +312,8 @@ CCValAssign::LocInfo determineLocInfo(const MVT RegisterVT, const EVT VT,
 }
 
 template <typename T>
-void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
-                const SmallVectorImpl<T> &Arguments) {
+static void setLocInfo(SmallVectorImpl<CCValAssign> &ArgLocs,
+                       const SmallVectorImpl<T> &Arguments) {
   for (unsigned i = 0; i < ArgLocs.size(); ++i) {
     const CCValAssign &VA = ArgLocs[i];
     CCValAssign::LocInfo LocInfo = determineLocInfo(
diff --git a/lib/Target/Mips/MipsCallLowering.h b/lib/Target/Mips/MipsCallLowering.h
index 389db3a3b681dc36d8866f0d534a8529c91d8e00..9916b04ef50ca25d3ed66b1d2d2f11e6e9621e62 100644
--- a/lib/Target/Mips/MipsCallLowering.h
+++ b/lib/Target/Mips/MipsCallLowering.h
@@ -38,7 +38,7 @@ public:
     bool assignVRegs(ArrayRef<unsigned> VRegs, ArrayRef<CCValAssign> ArgLocs,
                      unsigned Index);
 
-    void setMostSignificantFirst(SmallVectorImpl<unsigned> &VRegs);
+    void setLeastSignificantFirst(SmallVectorImpl<unsigned> &VRegs);
 
     MachineIRBuilder &MIRBuilder;
     MachineRegisterInfo &MRI;
@@ -81,7 +81,7 @@ private:
                                       SmallVectorImpl<T> &ISDArgs) const;
 
   /// Split structures and arrays, save original argument indices since
-  /// Mips calling conv needs info about original argument type.
+  /// Mips calling convention needs info about original argument type.
   void splitToValueTypes(const ArgInfo &OrigArg, unsigned OriginalIndex,
                          SmallVectorImpl<ArgInfo> &SplitArgs,
                          SmallVectorImpl<unsigned> &SplitArgsOrigIndices) const;
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 0faa13d4d63f0b16207032f4ea6e71ef7388986a..d9398b7d6024a58361d40583d4a307495ba58f29 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -2002,13 +2002,19 @@ let isPseudo = 1, isCodeGenOnly = 1, hasNoSchedulingInfo = 1 in {
 // branches.  See the comment in file MipsLongBranch.cpp for detailed
 // explanation.
 
-// Expands to: lui $dst, %hi($tgt - $baltgt)
+// Expands to: lui $dst, %highest/%higher/%hi/%lo($tgt - $baltgt)
 def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
   (ins brtarget:$tgt, brtarget:$baltgt), []>;
+// Expands to: lui $dst, highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_LUi2Op : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins brtarget:$tgt), []>;
 
-// Expands to: addiu $dst, $src, %lo($tgt - $baltgt)
+// Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt - $baltgt)
 def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
   (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+// Expands to: addiu $dst, $src, %highest/%higher/%hi/%lo($tgt)
+def LONG_BRANCH_ADDiu2Op : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins GPR32Opnd:$src, brtarget:$tgt), []>;
 
 //===----------------------------------------------------------------------===//
 // Instruction definition
diff --git a/lib/Target/Mips/MipsLegalizerInfo.cpp b/lib/Target/Mips/MipsLegalizerInfo.cpp
index 6a16e7955a16425590c8cdafb6e0e8e7d33af4e3..02701f31e32947e4952280118350e2fd7058cece 100644
--- a/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -40,7 +40,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
       .minScalar(0, s32);
 
   getActionDefinitionsBuilder(G_CONSTANT)
-      .legalFor({s32});
+      .legalFor({s32})
+      .minScalar(0, s32)
+      .customFor({s64});
 
   getActionDefinitionsBuilder(G_GEP)
       .legalFor({{p0, s32}});
@@ -78,15 +80,36 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
     unsigned Carry = MRI.createGenericVirtualRegister(sHalf);
     unsigned TmpResHigh = MRI.createGenericVirtualRegister(sHalf);
 
-    MIRBuilder.buildUnmerge({RHSHigh, RHSLow}, MI.getOperand(2).getReg());
-    MIRBuilder.buildUnmerge({LHSHigh, LHSLow}, MI.getOperand(1).getReg());
+    MIRBuilder.buildUnmerge({RHSLow, RHSHigh}, MI.getOperand(2).getReg());
+    MIRBuilder.buildUnmerge({LHSLow, LHSHigh}, MI.getOperand(1).getReg());
 
     MIRBuilder.buildAdd(TmpResHigh, LHSHigh, RHSHigh);
     MIRBuilder.buildAdd(ResLow, LHSLow, RHSLow);
     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, Carry, ResLow, LHSLow);
     MIRBuilder.buildAdd(ResHigh, TmpResHigh, Carry);
 
-    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResHigh, ResLow});
+    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh});
+
+    MI.eraseFromParent();
+    break;
+  }
+  case G_CONSTANT: {
+
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    const LLT sHalf = LLT::scalar(Size / 2);
+
+    const APInt &CImmValue = MI.getOperand(1).getCImm()->getValue();
+
+    unsigned ResLow = MRI.createGenericVirtualRegister(sHalf);
+    unsigned ResHigh = MRI.createGenericVirtualRegister(sHalf);
+    MIRBuilder.buildConstant(
+        ResLow, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
+                                  CImmValue.trunc(Size / 2)));
+    MIRBuilder.buildConstant(
+        ResHigh, *ConstantInt::get(MI.getMF()->getFunction().getContext(),
+                                   CImmValue.lshr(Size / 2).trunc(Size / 2)));
+
+    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {ResLow, ResHigh});
 
     MI.eraseFromParent();
     break;
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 2b7f64099923104be7c3a1ffe38eb683f665f5dc..46b37ceae391816882e00dcb945b3c4aa2763691 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -298,12 +298,16 @@ bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
   default:
     return false;
   case Mips::LONG_BRANCH_LUi:
+  case Mips::LONG_BRANCH_LUi2Op:
+  case Mips::LONG_BRANCH_LUi2Op_64:
     lowerLongBranchLUi(MI, OutMI);
     return true;
   case Mips::LONG_BRANCH_ADDiu:
+  case Mips::LONG_BRANCH_ADDiu2Op:
     lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu);
     return true;
   case Mips::LONG_BRANCH_DADDiu:
+  case Mips::LONG_BRANCH_DADDiu2Op:
     lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu);
     return true;
   }
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index cf2899dd375e97c0c174796598f130fcaeeec62a..f030f83295dd15554bb42c6cd4298a43977e2872 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -244,7 +244,7 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
           MI.addOperand(MachineOperand::CreateReg(Mips::SP, false, true));
           break;
         }
-      // fallthrough
+        LLVM_FALLTHROUGH;
       case Mips::BuildPairF64:
       case Mips::ExtractElementF64:
         if (Subtarget->isABI_FPXX() && !Subtarget->hasMTHC1())
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index f625a2903bd71a0b6ac4008e116737624136353f..d745ce00149011b7b850a8d523aea8d64f6160c7 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -158,8 +158,8 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
     setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
     setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
-    setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
-    setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+    setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
+    setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
 
     setTargetDAGCombine(ISD::AND);
     setTargetDAGCombine(ISD::OR);
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 64db815a0f4c18b487fc5416ce930d1f0614ac89..410fa655a225a26e3008a70b55a416ac1e06faae 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -154,6 +154,7 @@ def II_DERET            : InstrItinClass;
 def II_ERETNC           : InstrItinClass;
 def II_EHB              : InstrItinClass;
 def II_SDBBP            : InstrItinClass;
+def II_SIGRIE           : InstrItinClass;
 def II_SSNOP            : InstrItinClass;
 def II_SYSCALL          : InstrItinClass;
 def II_PAUSE            : InstrItinClass;
@@ -546,6 +547,7 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
   InstrItinData<II_ERETNC          , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_EHB             , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_SDBBP           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<II_SIGRIE          , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_SSNOP           , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_SYSCALL         , [InstrStage<1,  [ALU]>]>,
   InstrItinData<II_PAUSE           , [InstrStage<1,  [ALU]>]>,
diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td
index 79c55dbb9e0370d153e7d069c663e7ce7fa5bd30..80ffe7ada7c815395d5175043fe422ca8e58460a 100644
--- a/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/lib/Target/Mips/MipsScheduleGeneric.td
@@ -179,7 +179,7 @@ def GenericWriteTrap : SchedWriteRes<[GenericIssueCTISTD]>;
 def : ItinRW<[GenericWriteTrap], [II_BREAK, II_SYSCALL, II_TEQ, II_TEQI,
                                   II_TGE, II_TGEI, II_TGEIU, II_TGEU, II_TNE,
                                   II_TNEI, II_TLT, II_TLTI, II_TLTU, II_TTLTIU,
-                                  II_TRAP, II_SDBBP]>;
+                                  II_TRAP, II_SDBBP, II_SIGRIE]>;
 
 // COP0 Pipeline
 // =============
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index aeb90eca3a0513692a850c6e613cddb2458a6edb..f7b4cf3a0f7243c15aebedbc8e9b893127089c8a 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -25,6 +25,12 @@ NVPTXTargetStreamer::NVPTXTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
 
 NVPTXTargetStreamer::~NVPTXTargetStreamer() = default;
 
+void NVPTXTargetStreamer::outputDwarfFileDirectives() {
+  for (const std::string &S : DwarfFiles)
+    getStreamer().EmitRawText(S.data());
+  DwarfFiles.clear();
+}
+
 void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
   DwarfFiles.emplace_back(Directive);
 }
@@ -82,9 +88,7 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
     OS << "//\t}\n";
   if (isDwarfSection(FI, Section)) {
     // Emit DWARF .file directives in the outermost scope.
-    for (const std::string &S : DwarfFiles)
-      getStreamer().EmitRawText(S.data());
-    DwarfFiles.clear();
+    outputDwarfFileDirectives();
     OS << "//\t.section";
     Section->PrintSwitchToSection(*getStreamer().getContext().getAsmInfo(),
                                   FI->getTargetTriple(), OS, SubSection);
@@ -92,3 +96,30 @@ void NVPTXTargetStreamer::changeSection(const MCSection *CurSection,
     OS << "//\t{\n";
   }
 }
+
+void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
+  const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
+  const char *Directive = MAI->getData8bitsDirective();
+  unsigned NumElements = Data.size();
+  const unsigned MaxLen = 40;
+  unsigned NumChunks = 1 + ((NumElements - 1) / MaxLen);
+  // Split the very long directives into several parts if the limit is
+  // specified.
+  for (unsigned I = 0; I < NumChunks; ++I) {
+    SmallString<128> Str;
+    raw_svector_ostream OS(Str);
+
+    const char *Label = Directive;
+    for (auto It = std::next(Data.bytes_begin(), I * MaxLen),
+              End = (I == NumChunks - 1)
+                        ? Data.bytes_end()
+                        : std::next(Data.bytes_begin(), (I + 1) * MaxLen);
+         It != End; ++It) {
+      OS << Label << (unsigned)*It;
+      if (Label == Directive)
+        Label = ",";
+    }
+    Streamer.EmitRawText(OS.str());
+  }
+}
+
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
index 30831ab8bbebcce78d8940fee98fd24f17f4ccaf..f18e61cdca574c9dc0f4612e9d379e433e57c93a 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.h
@@ -24,6 +24,9 @@ public:
   NVPTXTargetStreamer(MCStreamer &S);
   ~NVPTXTargetStreamer() override;
 
+  /// Outputs the list of the DWARF '.file' directives to the streamer.
+  void outputDwarfFileDirectives();
+
   /// Record DWARF file directives for later output.
   /// According to PTX ISA, CUDA Toolkit documentation, 11.5.3. Debugging
   /// Directives: .file
@@ -39,6 +42,10 @@ public:
   void emitDwarfFileDirective(StringRef Directive) override;
   void changeSection(const MCSection *CurSection, MCSection *Section,
                      const MCExpr *SubSection, raw_ostream &OS) override;
+  /// Emit the bytes in \p Data into the output.
+  ///
+  /// This is used to emit bytes in \p Data as sequence of .byte directives.
+  void emitRawBytes(StringRef Data) override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
index bed52293197d51ad812b6ca33f510a5195f46916..bf922eb8a19554fb7f086f26e6a7f9facf723165 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -41,7 +41,7 @@ public:
 bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
   bool functionModified = false;
   Function::iterator I = function.begin();
-  TerminatorInst *firstTerminatorInst = (I++)->getTerminator();
+  Instruction *firstTerminatorInst = (I++)->getTerminator();
 
   for (Function::iterator E = function.end(); I != E; ++I) {
     for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index a966b99284007fae3efa048260bc1664ba59e9fa..aec0d7db81a8c715052dc58e59837323bfdfca8f 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -16,6 +16,7 @@
 #include "InstPrinter/NVPTXInstPrinter.h"
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
+#include "MCTargetDesc/NVPTXTargetStreamer.h"
 #include "NVPTX.h"
 #include "NVPTXMCExpr.h"
 #include "NVPTXMachineFunctionInfo.h"
@@ -199,7 +200,7 @@ bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
 
 void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
   // Ewwww
-  TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget());
+  LLVMTargetMachine &TM = const_cast<LLVMTargetMachine&>(MF->getTarget());
   NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM);
   const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
   const char *Sym = MFI->getImageHandleSymbol(Index);
@@ -880,8 +881,22 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
   if (NTM.getDrvInterface() == NVPTX::NVCL)
     O << ", texmode_independent";
 
+  bool HasFullDebugInfo = false;
+  for (DICompileUnit *CU : M.debug_compile_units()) {
+    switch(CU->getEmissionKind()) {
+    case DICompileUnit::NoDebug:
+    case DICompileUnit::DebugDirectivesOnly:
+      break;
+    case DICompileUnit::LineTablesOnly:
+    case DICompileUnit::FullDebug:
+      HasFullDebugInfo = true;
+      break;
+    }
+    if (HasFullDebugInfo)
+      break;
+  }
   // FIXME: remove comment once debug info is properly supported.
-  if (MMI && MMI->hasDebugInfo())
+  if (MMI && MMI->hasDebugInfo() && HasFullDebugInfo)
     O << "//, debug";
 
   O << "\n";
@@ -938,6 +953,10 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
   if (HasDebugInfo)
     OutStreamer->EmitRawText("//\t}");
 
+  // Output last DWARF .file directives, if any.
+  static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer())
+      ->outputDwarfFileDirectives();
+
   return ret;
 
   //bool Result = AsmPrinter::doFinalization(M);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2536623fb853e08d087d38b2953b8fe9e22595c9..c352b9b9c9dc66a9f4f76649056e2a1b87d1dcff 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -560,8 +560,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   }
   setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
   setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
-  setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
-  setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
+  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
 
   // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
   // No FPOW or FREM in PTX.
@@ -1170,7 +1170,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
 }
 
 TargetLoweringBase::LegalizeTypeAction
-NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
+NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
     return TypeSplitVector;
   if (VT == MVT::v2f16)
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index ef04a8573d45707b22974bdcb15ff2241d4b904f..3e109f75b66815ade98cd746469fa9b883f42c99 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -511,7 +511,7 @@ public:
   }
 
   TargetLoweringBase::LegalizeTypeAction
-  getPreferredVectorAction(EVT VT) const override;
+  getPreferredVectorAction(MVT VT) const override;
 
   // Get the degree of precision we want from 32-bit floating point division
   // operations.
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 5bb4fc3edd093a4d11a7304ce26924d16852ccae..2ca0ccf2dfa7c43f78a006cbe66614df788c1002 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -60,6 +61,24 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
       for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
         if (!MI.getOperand(i).isFI())
           continue;
+
+        // Frame indices in debug values are encoded in a target independent
+        // way with simply the frame index and offset rather than any
+        // target-specific addressing mode.
+        if (MI.isDebugValue()) {
+          assert(i == 0 && "Frame indices can only appear as the first "
+                           "operand of a DBG_VALUE machine instruction");
+          unsigned Reg;
+          int64_t Offset =
+              TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg);
+          MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false);
+          MI.getOperand(0).setIsDebug();
+          auto *DIExpr = DIExpression::prepend(MI.getDebugExpression(),
+                                               DIExpression::NoDeref, Offset);
+          MI.getOperand(3).setMetadata(DIExpr);
+          continue;
+        }
+
         TRI.eliminateFrameIndex(MI, 0, i, nullptr);
         Modified = true;
       }
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index fe41e1b36a5d522aa46997288aeea933f5433d61..a03e691ef5bb3ecf26178870e0c6bacba6c892f6 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -392,7 +392,7 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL,
     // liveness state at the end of MBB (liveOut of MBB) as the liveIn for
     // NewSuccessor. Otherwise, will cause cyclic dependence.
     LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
-    SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers;
+    SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 2> Clobbers;
     for (MachineInstr &MI : *MBB)
       LPR.stepForward(MI, Clobbers);
     for (auto &LI : LPR)
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index f212894035db9d8a3bc55402e05e7c848824264e..668169839e78cc539ece36e3c38bb84e311b4890 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -903,7 +903,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
     case MVT::i8:
     case MVT::i16:
       NeedsExt = true;
-      // Intentional fall-through.
+      LLVM_FALLTHROUGH;
     case MVT::i32:
       if (!UseImm)
         CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index af17bb5f1659bb835b21858d52ffd470dcd7bc47..8861de6f0d8953dbd30ada9c80cdaa4c56b2195c 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -490,7 +490,7 @@ static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo,
   if (!FuncInfo->BPI) return PPC::BR_NO_HINT;
 
   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
-  const TerminatorInst *BBTerm = BB->getTerminator();
+  const Instruction *BBTerm = BB->getTerminator();
 
   if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT;
 
@@ -1083,9 +1083,14 @@ class BitPermutationSelector {
     // lowest-order bit.
     unsigned Idx;
 
+    // ConstZero means a bit we need to mask off.
+    // Variable is a bit comes from an input variable.
+    // VariableKnownToBeZero is also a bit comes from an input variable,
+    // but it is known to be already zero. So we do not need to mask them.
     enum Kind {
       ConstZero,
-      Variable
+      Variable,
+      VariableKnownToBeZero
     } K;
 
     ValueBit(SDValue V, unsigned I, Kind K = Variable)
@@ -1094,11 +1099,11 @@ class BitPermutationSelector {
       : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
 
     bool isZero() const {
-      return K == ConstZero;
+      return K == ConstZero || K == VariableKnownToBeZero;
     }
 
     bool hasValue() const {
-      return K == Variable;
+      return K == Variable || K == VariableKnownToBeZero;
     }
 
     SDValue getValue() const {
@@ -1248,8 +1253,14 @@ class BitPermutationSelector {
         for (unsigned i = 0; i < NumBits; ++i)
           if (((Mask >> i) & 1) == 1)
             Bits[i] = (*LHSBits)[i];
-          else
-            Bits[i] = ValueBit(ValueBit::ConstZero);
+          else {
+            // AND instruction masks this bit. If the input is already zero,
+            // we have nothing to do here. Otherwise, make the bit ConstZero.
+            if ((*LHSBits)[i].isZero())
+              Bits[i] = (*LHSBits)[i];
+            else
+              Bits[i] = ValueBit(ValueBit::ConstZero);
+          }
 
         return std::make_pair(Interesting, &Bits);
       }
@@ -1259,8 +1270,26 @@ class BitPermutationSelector {
       const auto &RHSBits = *getValueBits(V.getOperand(1), NumBits).second;
 
       bool AllDisjoint = true;
-      for (unsigned i = 0; i < NumBits; ++i)
-        if (LHSBits[i].isZero())
+      SDValue LastVal = SDValue();
+      unsigned LastIdx = 0;
+      for (unsigned i = 0; i < NumBits; ++i) {
+        if (LHSBits[i].isZero() && RHSBits[i].isZero()) {
+          // If both inputs are known to be zero and one is ConstZero and
+          // another is VariableKnownToBeZero, we can select whichever
+          // we like. To minimize the number of bit groups, we select
+          // VariableKnownToBeZero if this bit is the next bit of the same
+          // input variable from the previous bit. Otherwise, we select
+          // ConstZero.
+          if (LHSBits[i].hasValue() && LHSBits[i].getValue() == LastVal &&
+              LHSBits[i].getValueBitIndex() == LastIdx + 1)
+            Bits[i] = LHSBits[i];
+          else if (RHSBits[i].hasValue() && RHSBits[i].getValue() == LastVal &&
+                   RHSBits[i].getValueBitIndex() == LastIdx + 1)
+            Bits[i] = RHSBits[i];
+          else
+            Bits[i] = ValueBit(ValueBit::ConstZero);
+        }
+        else if (LHSBits[i].isZero())
           Bits[i] = RHSBits[i];
         else if (RHSBits[i].isZero())
           Bits[i] = LHSBits[i];
@@ -1268,6 +1297,16 @@ class BitPermutationSelector {
           AllDisjoint = false;
           break;
         }
+        // We remember the value and bit index of this bit.
+        if (Bits[i].hasValue()) {
+          LastVal = Bits[i].getValue();
+          LastIdx = Bits[i].getValueBitIndex();
+        }
+        else {
+          if (LastVal) LastVal = SDValue();
+          LastIdx = 0;
+        }
+      }
 
       if (!AllDisjoint)
         break;
@@ -1293,6 +1332,44 @@ class BitPermutationSelector {
 
       return std::make_pair(Interesting, &Bits);
     }
+    case ISD::AssertZext: {
+      // For AssertZext, we look through the operand and
+      // mark the bits known to be zero.
+      const SmallVector<ValueBit, 64> *LHSBits;
+      std::tie(Interesting, LHSBits) = getValueBits(V.getOperand(0),
+                                                    NumBits);
+
+      EVT FromType = cast<VTSDNode>(V.getOperand(1))->getVT();
+      const unsigned NumValidBits = FromType.getSizeInBits();
+      for (unsigned i = 0; i < NumValidBits; ++i)
+        Bits[i] = (*LHSBits)[i];
+
+      // These bits are known to be zero.
+      for (unsigned i = NumValidBits; i < NumBits; ++i)
+        Bits[i] = ValueBit((*LHSBits)[i].getValue(),
+                           (*LHSBits)[i].getValueBitIndex(),
+                           ValueBit::VariableKnownToBeZero);
+
+      return std::make_pair(Interesting, &Bits);
+    }
+    case ISD::LOAD:
+      LoadSDNode *LD = cast<LoadSDNode>(V);
+      if (ISD::isZEXTLoad(V.getNode()) && V.getResNo() == 0) {
+        EVT VT = LD->getMemoryVT();
+        const unsigned NumValidBits = VT.getSizeInBits();
+
+        for (unsigned i = 0; i < NumValidBits; ++i)
+          Bits[i] = ValueBit(V, i);
+
+        // These bits are known to be zero.
+        for (unsigned i = NumValidBits; i < NumBits; ++i)
+          Bits[i] = ValueBit(V, i, ValueBit::VariableKnownToBeZero);
+
+        // Zero-extending load itself cannot be optimized. So, it is not
+        // interesting by itself though it gives useful information.
+        return std::make_pair(Interesting = false, &Bits);
+      }
+      break;
     }
 
     for (unsigned i = 0; i < NumBits; ++i)
@@ -1304,7 +1381,7 @@ class BitPermutationSelector {
   // For each value (except the constant ones), compute the left-rotate amount
   // to get it from its original to final position.
   void computeRotationAmounts() {
-    HasZeros = false;
+    NeedMask = false;
     RLAmt.resize(Bits.size());
     for (unsigned i = 0; i < Bits.size(); ++i)
       if (Bits[i].hasValue()) {
@@ -1314,7 +1391,7 @@ class BitPermutationSelector {
         else
           RLAmt[i] = Bits.size() - (VBI - i);
       } else if (Bits[i].isZero()) {
-        HasZeros = true;
+        NeedMask = true;
         RLAmt[i] = UINT32_MAX;
       } else {
         llvm_unreachable("Unknown value bit type");
@@ -1330,6 +1407,7 @@ class BitPermutationSelector {
     unsigned LastRLAmt = RLAmt[0];
     SDValue LastValue = Bits[0].hasValue() ? Bits[0].getValue() : SDValue();
     unsigned LastGroupStartIdx = 0;
+    bool IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
     for (unsigned i = 1; i < Bits.size(); ++i) {
       unsigned ThisRLAmt = RLAmt[i];
       SDValue ThisValue = Bits[i].hasValue() ? Bits[i].getValue() : SDValue();
@@ -1342,10 +1420,20 @@ class BitPermutationSelector {
           LastGroupStartIdx = 0;
       }
 
+      // If this bit is known to be zero and the current group is a bit group
+      // of zeros, we do not need to terminate the current bit group even the
+      // Value or RLAmt does not match here. Instead, we terminate this group
+      // when the first non-zero bit appears later.
+      if (IsGroupOfZeros && Bits[i].isZero())
+        continue;
+
       // If this bit has the same underlying value and the same rotate factor as
       // the last one, then they're part of the same group.
       if (ThisRLAmt == LastRLAmt && ThisValue == LastValue)
-        continue;
+        // We cannot continue the current group if this bits is not known to
+        // be zero in a bit group of zeros.
+        if (!(IsGroupOfZeros && ThisValue && !Bits[i].isZero()))
+          continue;
 
       if (LastValue.getNode())
         BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
@@ -1353,6 +1441,7 @@ class BitPermutationSelector {
       LastRLAmt = ThisRLAmt;
       LastValue = ThisValue;
       LastGroupStartIdx = i;
+      IsGroupOfZeros = !Bits[LastGroupStartIdx].hasValue();
     }
     if (LastValue.getNode())
       BitGroups.push_back(BitGroup(LastValue, LastRLAmt, LastGroupStartIdx,
@@ -1698,7 +1787,7 @@ class BitPermutationSelector {
     // If we've not yet selected a 'starting' instruction, and we have no zeros
     // to fill in, select the (Value, RLAmt) with the highest priority (largest
     // number of groups), and start with this rotated value.
-    if ((!HasZeros || LateMask) && !Res) {
+    if ((!NeedMask || LateMask) && !Res) {
       ValueRotInfo &VRI = ValueRotsVec[0];
       if (VRI.RLAmt) {
         if (InstCnt) *InstCnt += 1;
@@ -2077,7 +2166,7 @@ class BitPermutationSelector {
     // If we've not yet selected a 'starting' instruction, and we have no zeros
     // to fill in, select the (Value, RLAmt) with the highest priority (largest
     // number of groups), and start with this rotated value.
-    if ((!HasZeros || LateMask) && !Res) {
+    if ((!NeedMask || LateMask) && !Res) {
       // If we have both Repl32 groups and non-Repl32 groups, the non-Repl32
       // groups will come first, and so the VRI representing the largest number
       // of groups might not be first (it might be the first Repl32 groups).
@@ -2230,7 +2319,7 @@ class BitPermutationSelector {
 
   SmallVector<ValueBit, 64> Bits;
 
-  bool HasZeros;
+  bool NeedMask;
   SmallVector<unsigned, 64> RLAmt;
 
   SmallVector<BitGroup, 16> BitGroups;
@@ -2259,10 +2348,10 @@ public:
                          " selection for:    ");
     LLVM_DEBUG(N->dump(CurDAG));
 
-    // Fill it RLAmt and set HasZeros.
+    // Fill it RLAmt and set NeedMask.
     computeRotationAmounts();
 
-    if (!HasZeros)
+    if (!NeedMask)
       return Select(N, false);
 
     // We currently have two techniques for handling results with zeros: early
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 1fd5018d05c08421f40dea9ccc4a352d031fb1bc..c6f0212ab404ded54197438490319b8cc6e3774c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -792,6 +792,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
       setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
 
+      setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
+      setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
+
       setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
       setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
       setOperationAction(ISD::FABS, MVT::v4f32, Legal);
@@ -1070,6 +1073,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::ANY_EXTEND);
 
+  setTargetDAGCombine(ISD::TRUNCATE);
+
   if (Subtarget.useCRBits()) {
     setTargetDAGCombine(ISD::TRUNCATE);
     setTargetDAGCombine(ISD::SETCC);
@@ -3965,7 +3970,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
 
       assert(ObjectVT.getSimpleVT().SimpleTy == MVT::v4f32 &&
              "Invalid QPX parameter type");
-      /* fall through */
+      LLVM_FALLTHROUGH;
 
     case MVT::v4f64:
     case MVT::v4i1:
@@ -6108,7 +6113,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       assert(Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32 &&
              "Invalid QPX parameter type");
 
-      /* fall through */
+      LLVM_FALLTHROUGH;
     case MVT::v4f64:
     case MVT::v4i1: {
       bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
@@ -7263,10 +7268,75 @@ SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
   return FP;
 }
 
+static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {
+
+  EVT VecVT = Vec.getValueType();
+  assert(VecVT.isVector() && "Expected a vector type.");
+  assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");
+
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);
+
+  unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
+  SmallVector<SDValue, 16> Ops(NumConcat);
+  Ops[0] = Vec;
+  SDValue UndefVec = DAG.getUNDEF(VecVT);
+  for (unsigned i = 1; i < NumConcat; ++i)
+    Ops[i] = UndefVec;
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
+}
+
+SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op,
+                                                SelectionDAG &DAG,
+                                                const SDLoc &dl) const {
+
+  unsigned Opc = Op.getOpcode();
+  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP) &&
+         "Unexpected conversion type");
+  assert(Op.getValueType() == MVT::v2f64 && "Supports v2f64 only.");
+
+  // CPU's prior to P9 don't have a way to sign-extend in vectors.
+  bool SignedConv = Opc == ISD::SINT_TO_FP;
+  if (SignedConv && !Subtarget.hasP9Altivec())
+    return SDValue();
+
+  SDValue Wide = widenVec(DAG, Op.getOperand(0), dl);
+  EVT WideVT = Wide.getValueType();
+  unsigned WideNumElts = WideVT.getVectorNumElements();
+
+  SmallVector<int, 16> ShuffV;
+  for (unsigned i = 0; i < WideNumElts; ++i)
+    ShuffV.push_back(i + WideNumElts);
+
+  if (Subtarget.isLittleEndian()) {
+    ShuffV[0] = 0;
+    ShuffV[WideNumElts / 2] = 1;
+  }
+  else {
+    ShuffV[WideNumElts / 2 - 1] = 0;
+    ShuffV[WideNumElts - 1] = 1;
+  }
+
+  SDValue ShuffleSrc2 = SignedConv ? DAG.getUNDEF(WideVT) :
+                                     DAG.getConstant(0, dl, WideVT);
+  SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
+  unsigned ExtendOp = SignedConv ? (unsigned) PPCISD::SExtVElems :
+                                   (unsigned) ISD::BITCAST;
+  SDValue Extend = DAG.getNode(ExtendOp, dl, MVT::v2i64, Arrange);
+
+  return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
+}
+
 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
+  if (Op.getValueType() == MVT::v2f64 &&
+      Op.getOperand(0).getValueType() == MVT::v2i16)
+    return LowerINT_TO_FPVector(Op, DAG, dl);
+
   // Conversions to f128 are legal.
   if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
     return Op;
@@ -9634,6 +9704,9 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
     return;
+  case ISD::BITCAST:
+    // Don't handle bitcast here.
+    return;
   }
 }
 
@@ -11750,6 +11823,37 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
       ShiftCst);
 }
 
+SDValue PPCTargetLowering::combineSetCC(SDNode *N,
+                                        DAGCombinerInfo &DCI) const {
+  assert(N->getOpcode() == ISD::SETCC &&
+         "Should be called with a SETCC node");
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+    SDValue LHS = N->getOperand(0);
+    SDValue RHS = N->getOperand(1);
+
+    // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
+    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+        LHS.hasOneUse())
+      std::swap(LHS, RHS);
+
+    // x == 0-y --> x+y == 0
+    // x != 0-y --> x+y != 0
+    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+        RHS.hasOneUse()) {
+      SDLoc DL(N);
+      SelectionDAG &DAG = DCI.DAG;
+      EVT VT = N->getValueType(0);
+      EVT OpVT = LHS.getValueType();
+      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
+    }
+  }
+
+  return DAGCombineTruncBoolExt(N, DCI);
+}
+
 // Is this an extending load from an f32 to an f64?
 static bool isFPExtLoad(SDValue Op) {
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
@@ -12479,7 +12583,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ANY_EXTEND:
     return DAGCombineExtBoolTrunc(N, DCI);
   case ISD::TRUNCATE:
+    return combineTRUNCATE(N, DCI);
   case ISD::SETCC:
+    if (SDValue CSCC = combineSetCC(N, DCI))
+      return CSCC;
+    LLVM_FALLTHROUGH;
   case ISD::SELECT_CC:
     return DAGCombineTruncBoolExt(N, DCI);
   case ISD::SINT_TO_FP:
@@ -13254,7 +13362,8 @@ PPCTargetLowering::getConstraintType(StringRef Constraint) const {
   } else if (Constraint == "wc") { // individual CR bits.
     return C_RegisterClass;
   } else if (Constraint == "wa" || Constraint == "wd" ||
-             Constraint == "wf" || Constraint == "ws") {
+             Constraint == "wf" || Constraint == "ws" ||
+             Constraint == "wi") {
     return C_RegisterClass; // VSX registers.
   }
   return TargetLowering::getConstraintType(Constraint);
@@ -13284,6 +13393,8 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
     return CW_Register;
   else if (StringRef(constraint) == "ws" && type->isDoubleTy())
     return CW_Register;
+  else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
+    return CW_Register; // just hold 64-bit integers data.
 
   switch (*constraint) {
   default:
@@ -13366,7 +13477,8 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     // An individual CR bit.
     return std::make_pair(0U, &PPC::CRBITRCRegClass);
   } else if ((Constraint == "wa" || Constraint == "wd" ||
-             Constraint == "wf") && Subtarget.hasVSX()) {
+             Constraint == "wf" || Constraint == "wi") &&
+             Subtarget.hasVSX()) {
     return std::make_pair(0U, &PPC::VSRCRegClass);
   } else if (Constraint == "ws" && Subtarget.hasVSX()) {
     if (VT == MVT::f32 && Subtarget.hasP8Vector())
@@ -14253,6 +14365,58 @@ SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   return SDValue();
 }
 
+// Detect TRUNCATE operations on bitcasts of float128 values.
+// What we are looking for here is the situtation where we extract a subset
+// of bits from a 128 bit float.
+// This can be of two forms:
+// 1) BITCAST of f128 feeding TRUNCATE
+// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
+// The reason this is required is because we do not have a legal i128 type
+// and so we want to prevent having to store the f128 and then reload part
+// of it.
+SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
+                                           DAGCombinerInfo &DCI) const {
+  // If we are using CRBits then try that first.
+  if (Subtarget.useCRBits()) {
+    // Check if CRBits did anything and return that if it did.
+    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
+      return CRTruncValue;
+  }
+
+  SDLoc dl(N);
+  SDValue Op0 = N->getOperand(0);
+
+  // Looking for a truncate of i128 to i64.
+  if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;
+
+  // SRL feeding TRUNCATE.
+  if (Op0.getOpcode() == ISD::SRL) {
+    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+    // The right shift has to be by 64 bits.
+    if (!ConstNode || ConstNode->getZExtValue() != 64)
+      return SDValue();
+
+    // Switch the element number to extract.
+    EltToExtract = EltToExtract ? 0 : 1;
+    // Update Op0 past the SRL.
+    Op0 = Op0.getOperand(0);
+  }
+
+  // BITCAST feeding a TRUNCATE possibly via SRL.
+  if (Op0.getOpcode() == ISD::BITCAST &&
+      Op0.getValueType() == MVT::i128 &&
+      Op0.getOperand(0).getValueType() == MVT::f128) {
+    SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
+    return DCI.DAG.getNode(
+        ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
+        DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
+  }
+  return SDValue();
+}
+
 bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   // Only duplicate to increase tail-calls for the 64bit SysV ABIs.
   if (!Subtarget.isSVR4ABI() || !Subtarget.isPPC64())
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 9709d6bb09ebfb152ee947dc5687a97f49f3c564..1020cab48c8c7bdbf2c29472a04ce81c3803acd1 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -569,7 +569,7 @@ namespace llvm {
     /// of v4i8's and shuffle them. This will turn into a mess of 8 extending
     /// loads, moves back into VSR's (or memory ops if we don't have moves) and
     /// then the VPERM for the shuffle. All in all a very slow sequence.
-    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+    TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
       const override {
       if (VT.getScalarSizeInBits() % 8 == 0)
         return TypeWidenVector;
@@ -927,6 +927,9 @@ namespace llvm {
     SDValue LowerINT_TO_FPDirectMove(SDValue Op, SelectionDAG &DAG,
                                      const SDLoc &dl) const;
 
+    SDValue LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
+                                 const SDLoc &dl) const;
+
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
     SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
 
@@ -1093,6 +1096,8 @@ namespace llvm {
     SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 883f8390b7d5bb295299e5dd24ce05a4f29e7a82..559ed59bec9f1f08f9f1ba02bcf639e935aeb7a9 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2319,7 +2319,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
       Opc == PPC::RLDICL_32 || Opc == PPC::RLDICL_32_64 ||
       Opc == PPC::RLWINM || Opc == PPC::RLWINMo ||
       Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8o;
-    if (!instrHasImmForm(MI, III) && !ConvertibleImmForm)
+    if (!instrHasImmForm(MI, III, true) && !ConvertibleImmForm)
       return nullptr;
 
     // Don't convert or %X, %Y, %Y since that's just a register move.
@@ -2421,7 +2421,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
     *KilledDef = DefMI;
 
   ImmInstrInfo III;
-  bool HasImmForm = instrHasImmForm(MI, III);
+  bool HasImmForm = instrHasImmForm(MI, III, PostRA);
   // If this is a reg+reg instruction that has a reg+imm form,
   // and one of the operands is produced by an add-immediate,
   // try to convert it.
@@ -2644,8 +2644,12 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
   return false;
 }
 
+static bool isVFReg(unsigned Reg) {
+  return PPC::VFRCRegClass.contains(Reg);
+}
+
 bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
-                                   ImmInstrInfo &III) const {
+                                   ImmInstrInfo &III, bool PostRA) const {
   unsigned Opc = MI.getOpcode();
   // The vast majority of the instructions would need their operand 2 replaced
   // with an immediate when switching to the reg+imm form. A marked exception
@@ -2946,13 +2950,20 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
     case PPC::STFDUX: III.ImmOpcode = PPC::STFDU; break;
     }
     break;
-  // Power9 only.
+  // Power9 and up only. For some of these, the X-Form version has access to all
+  // 64 VSR's whereas the D-Form only has access to the VR's. We replace those
+  // with pseudo-ops pre-ra and for post-ra, we check that the register loaded
+  // into or stored from is one of the VR registers.
   case PPC::LXVX:
   case PPC::LXSSPX:
   case PPC::LXSDX:
   case PPC::STXVX:
   case PPC::STXSSPX:
   case PPC::STXSDX:
+  case PPC::XFLOADf32:
+  case PPC::XFLOADf64:
+  case PPC::XFSTOREf32:
+  case PPC::XFSTOREf64:
     if (!Subtarget.hasP9Vector())
       return false;
     III.SignedImm = true;
@@ -2962,6 +2973,7 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
     III.IsSummingOperands = true;
     III.ImmOpNo = 1;
     III.OpNoForForwarding = 2;
+    III.ImmMustBeMultipleOf = 4;
     switch(Opc) {
     default: llvm_unreachable("Unknown opcode");
     case PPC::LXVX:
@@ -2969,24 +2981,56 @@ bool PPCInstrInfo::instrHasImmForm(const MachineInstr &MI,
       III.ImmMustBeMultipleOf = 16;
       break;
     case PPC::LXSSPX:
-      III.ImmOpcode = PPC::LXSSP;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::LXSSP;
+        else
+          III.ImmOpcode = PPC::LFS;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFLOADf32:
+      III.ImmOpcode = PPC::DFLOADf32;
       break;
     case PPC::LXSDX:
-      III.ImmOpcode = PPC::LXSD;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::LXSD;
+        else
+          III.ImmOpcode = PPC::LFD;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFLOADf64:
+      III.ImmOpcode = PPC::DFLOADf64;
       break;
     case PPC::STXVX:
       III.ImmOpcode = PPC::STXV;
       III.ImmMustBeMultipleOf = 16;
       break;
     case PPC::STXSSPX:
-      III.ImmOpcode = PPC::STXSSP;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::STXSSP;
+        else
+          III.ImmOpcode = PPC::STFS;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFSTOREf32:
+      III.ImmOpcode = PPC::DFSTOREf32;
       break;
     case PPC::STXSDX:
-      III.ImmOpcode = PPC::STXSD;
-      III.ImmMustBeMultipleOf = 4;
+      if (PostRA) {
+        if (isVFReg(MI.getOperand(0).getReg()))
+          III.ImmOpcode = PPC::STXSD;
+        else
+          III.ImmOpcode = PPC::STFD;
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case PPC::XFSTOREf64:
+      III.ImmOpcode = PPC::DFSTOREf64;
       break;
     }
     break;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 8a062daab55fc50cd9a7cab2a88598c14d74073c..9c556e32496a4a74d88c4d06292efadd9964963c 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -414,7 +414,8 @@ public:
                               MachineInstr **KilledDef = nullptr) const;
   void replaceInstrWithLI(MachineInstr &MI, const LoadImmediateInfo &LII) const;
 
-  bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III) const;
+  bool instrHasImmForm(const MachineInstr &MI, ImmInstrInfo &III,
+                       bool PostRA) const;
 
   /// getRegNumForOperand - some operands use different numbering schemes
   /// for the same registers. For example, a VSX instruction may have any of
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index b1cfbc7b6645e24393fa8e2f0c3eade3c8ccdaa7..9d462df6fef5f41deb24716b38fdbad8b3ec8b47 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1040,6 +1040,15 @@ def : Pat<(v2f64 (bitconvert v1i128:$A)),
 def : Pat<(v1i128 (bitconvert v2f64:$A)),
           (COPY_TO_REGCLASS $A, VRRC)>;
 
+def : Pat<(v2i64 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert f128:$A)),
+          (COPY_TO_REGCLASS $A, VRRC)>;
+
 def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
           (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
 def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
@@ -1057,10 +1066,6 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
   // Stores.
   def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
             (STXVD2X $rS, xoaddr:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
-            (STXVD2X $rS, xoaddr:$dst)>;
-  def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
-            (STXVW4X $rS, xoaddr:$dst)>;
   def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
 }
 let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
@@ -1981,6 +1986,10 @@ let Predicates = [IsLittleEndian, HasVSX] in
   def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
             (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
 
+def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
+            (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
+            (STXVW4X $rS, xoaddr:$dst)>;
 def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
 def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
 
@@ -3864,10 +3873,11 @@ let AddedComplexity = 400 in {
                         (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
                         (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC), 0),
-                      (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC), 0))>;
+              (XXPERMDI
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), VSRC),
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
               (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
   }
@@ -3879,10 +3889,11 @@ let AddedComplexity = 400 in {
                         (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
                         (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $D), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC), 0),
-                      (XXPERMDI (COPY_TO_REGCLASS (MTVSRWZ $C), VSRC),
-                                   (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 0))>;
+              (XXPERMDI
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), VSRC),
+                (COPY_TO_REGCLASS
+                  (MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), VSRC), 0)>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
               (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
   }
@@ -3935,10 +3946,9 @@ let AddedComplexity = 400 in {
     def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
               (v2i64 (MTVSRDD $rB, $rA))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW
-                (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.A, AnyExts.C), VSRC)),
-                (v4i32
-                  (COPY_TO_REGCLASS (MTVSRDD AnyExts.B, AnyExts.D), VSRC)))>;
+              (MTVSRDD
+                (RLDIMI AnyExts.B, AnyExts.A, 32, 0),
+                (RLDIMI AnyExts.D, AnyExts.C, 32, 0))>;
   }
 
   let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
@@ -3948,10 +3958,9 @@ let AddedComplexity = 400 in {
     def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
               (v2i64 (MTVSRDD $rB, $rA))>;
     def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
-              (VMRGOW
-                (v4i32 (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC)),
-                (v4i32
-                  (COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC)))>;
+              (MTVSRDD
+                (RLDIMI AnyExts.C, AnyExts.D, 32, 0),
+                (RLDIMI AnyExts.A, AnyExts.B, 32, 0))>;
   }
   // P9 Altivec instructions that can be used to build vectors.
   // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b0da9b5a6d709448ddd012c43575dee9c68f3b6a..bc9bcab83a0a890f76c235520e2224aba28cac5f 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -473,7 +473,14 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
+
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 2ee2b3eb8084638ba794461a3b5283fd984ea275..9221a910288adedf442d69f9ca4dc68de96059ae 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -90,7 +90,9 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   /// @}
 };
diff --git a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
index aa21cf0e6b41433350d720bea5f622a465316ceb..979c8f4e2fa7a9c6ef2ccc5739aa8102e4afb91d 100644
--- a/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
+++ b/lib/Target/RISCV/InstPrinter/RISCVInstPrinter.cpp
@@ -93,6 +93,8 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
                                      const MCSubtargetInfo &STI,
                                      raw_ostream &O) {
   unsigned FenceArg = MI->getOperand(OpNo).getImm();
+  assert (((FenceArg >> 4) == 0) && "Invalid immediate in printFenceArg");
+
   if ((FenceArg & RISCVFenceField::I) != 0)
     O << 'i';
   if ((FenceArg & RISCVFenceField::O) != 0)
@@ -101,6 +103,8 @@ void RISCVInstPrinter::printFenceArg(const MCInst *MI, unsigned OpNo,
     O << 'r';
   if ((FenceArg & RISCVFenceField::W) != 0)
     O << 'w';
+  if (FenceArg == 0)
+    O << "unknown";
 }
 
 void RISCVInstPrinter::printFRMArg(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/RISCV/RISCVISelLowering.cpp b/lib/Target/RISCV/RISCVISelLowering.cpp
index 5c347ca4684dd4b5c58dec4776251b40a9abadaa..85758c0cdf8cf4f050ad31edb311131fde361a9e 100644
--- a/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -111,6 +111,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       ISD::SETUGT, ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE,
       ISD::SETGT,  ISD::SETGE,  ISD::SETNE};
 
+  // TODO: add proper support for the various FMA variants
+  // (FMADD.S, FMSUB.S, FNMSUB.S, FNMADD.S).
+  ISD::NodeType FPOpToExtend[] = {
+      ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FMA};
+
   if (Subtarget.hasStdExtF()) {
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
@@ -119,6 +124,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
     setOperationAction(ISD::SELECT, MVT::f32, Custom);
     setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+    for (auto Op : FPOpToExtend)
+      setOperationAction(Op, MVT::f32, Expand);
   }
 
   if (Subtarget.hasStdExtD()) {
@@ -131,6 +138,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
     setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+    for (auto Op : FPOpToExtend)
+      setOperationAction(Op, MVT::f64, Expand);
   }
 
   setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
diff --git a/lib/Target/RISCV/RISCVInstrInfo.td b/lib/Target/RISCV/RISCVInstrInfo.td
index 5ca1cbd165d052770cfce18e1a293c1bf591102a..631a1f7deca04010c69bdb7f14cb2dc05d9a9dda 100644
--- a/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/lib/Target/RISCV/RISCVInstrInfo.td
@@ -205,6 +205,12 @@ def ixlenimm : Operand<XLenVT> {
 // Standalone (codegen-only) immleaf patterns.
 def simm32     : ImmLeaf<XLenVT, [{return isInt<32>(Imm);}]>;
 def simm32hi20 : ImmLeaf<XLenVT, [{return isShiftedInt<20, 12>(Imm);}]>;
+// A mask value that won't affect significant shift bits.
+def immbottomxlenset : ImmLeaf<XLenVT, [{
+  if (Subtarget->is64Bit())
+    return countTrailingOnes<uint64_t>(Imm) >= 6;
+  return countTrailingOnes<uint64_t>(Imm) >= 5;
+}]>;
 
 // Addressing modes.
 // Necessary because a frameindex can't be matched directly in a pattern.
@@ -646,13 +652,23 @@ def : PatGprGpr<and, AND>;
 def : PatGprSimm12<and, ANDI>;
 def : PatGprGpr<xor, XOR>;
 def : PatGprSimm12<xor, XORI>;
-def : PatGprGpr<shl, SLL>;
 def : PatGprUimmLog2XLen<shl, SLLI>;
-def : PatGprGpr<srl, SRL>;
 def : PatGprUimmLog2XLen<srl, SRLI>;
-def : PatGprGpr<sra, SRA>;
 def : PatGprUimmLog2XLen<sra, SRAI>;
 
+// Match both a plain shift and one where the shift amount is masked (this is
+// typically introduced when the legalizer promotes the shift amount and
+// zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base
+// ISA only read the least significant 5 bits (RV32I) or 6 bits (RV64I).
+class shiftop<SDPatternOperator operator>
+    : PatFrags<(ops node:$val, node:$count),
+               [(operator node:$val, node:$count),
+                (operator node:$val, (and node:$count, immbottomxlenset))]>;
+
+def : PatGprGpr<shiftop<shl>, SLL>;
+def : PatGprGpr<shiftop<srl>, SRL>;
+def : PatGprGpr<shiftop<sra>, SRA>;
+
 /// FrameIndex calculations
 
 def : Pat<(add (i32 AddrFI:$Rs), simm12:$imm12),
diff --git a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
index 602898ea341e5bfe46b28b5acfb410e539ee1258..76aa5a4aa9dcacfe10c56c7e85cbff1f3190f659 100644
--- a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
+++ b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = SystemZAsmParser
 parent = SystemZ
-required_libraries = MC MCParser Support SystemZDesc SystemZInfo
+required_libraries = MC MCParser Support SystemZDesc SystemZInfo SystemZAsmPrinter
 add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index bde067d6c1294fd89df3aaed726a043bd9d05fb9..91959b4151b32334300f93f7cc5da0cbcd32ab8f 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "InstPrinter/SystemZInstPrinter.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
@@ -243,6 +244,11 @@ public:
     return Kind == KindImmTLS;
   }
 
+  const ImmTLSOp getImmTLS() const {
+    assert(Kind == KindImmTLS && "Not a TLS immediate");
+    return ImmTLS;
+  }
+
   // Memory operands.
   bool isMem() const override {
     return Kind == KindMem;
@@ -270,6 +276,11 @@ public:
     return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
   }
 
+  const MemOp& getMem() const {
+    assert(Kind == KindMem && "Not a Mem operand");
+    return Mem;
+  }
+
   // Override MCParsedAsmOperand.
   SMLoc getStartLoc() const override { return StartLoc; }
   SMLoc getEndLoc() const override { return EndLoc; }
@@ -623,8 +634,61 @@ static struct InsnMatchEntry InsnMatchTable[] = {
     { MCK_U48Imm, MCK_BDAddr64Disp12, MCK_BDAddr64Disp12, MCK_AnyReg } }
 };
 
+static void printMCExpr(const MCExpr *E, raw_ostream &OS) {
+  if (!E)
+    return;
+  if (auto *CE = dyn_cast<MCConstantExpr>(E))
+    OS << *CE;
+  else if (auto *UE = dyn_cast<MCUnaryExpr>(E))
+    OS << *UE;
+  else if (auto *BE = dyn_cast<MCBinaryExpr>(E))
+    OS << *BE;
+  else if (auto *SRE = dyn_cast<MCSymbolRefExpr>(E))
+    OS << *SRE;
+  else
+    OS << *E;
+}
+
 void SystemZOperand::print(raw_ostream &OS) const {
-  llvm_unreachable("Not implemented");
+  switch (Kind) {
+    break;
+  case KindToken:
+    OS << "Token:" << getToken();
+    break;
+  case KindReg:
+    OS << "Reg:" << SystemZInstPrinter::getRegisterName(getReg());
+    break;
+  case KindImm:
+    OS << "Imm:";
+    printMCExpr(getImm(), OS);
+    break;
+  case KindImmTLS:
+    OS << "ImmTLS:";
+    printMCExpr(getImmTLS().Imm, OS);
+    if (getImmTLS().Sym) {
+      OS << ", ";
+      printMCExpr(getImmTLS().Sym, OS);
+    }
+    break;
+  case KindMem: {
+    const MemOp &Op = getMem();
+    OS << "Mem:" << *cast<MCConstantExpr>(Op.Disp);
+    if (Op.Base) {
+      OS << "(";
+      if (Op.MemKind == BDLMem)
+        OS << *cast<MCConstantExpr>(Op.Length.Imm) << ",";
+      else if (Op.MemKind == BDRMem)
+        OS << SystemZInstPrinter::getRegisterName(Op.Length.Reg) << ",";
+      if (Op.Index)
+        OS << SystemZInstPrinter::getRegisterName(Op.Index) << ",";
+      OS << SystemZInstPrinter::getRegisterName(Op.Base);
+      OS << ")";
+    }
+    break;
+  }
+  case KindInvalid:
+    break;
+  }
 }
 
 // Parse one register of the form %<prefix><number>.
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 5edfdf645e586455d37276f1e79a22f678d7b253..0d2c2389847f6b643f1cfa5ee22a8ab89020e7fa 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -71,19 +71,19 @@ struct SystemZAddressingMode {
   // True if the address can (and must) include ADJDYNALLOC.
   bool isDynAlloc() { return Form == FormBDXDynAlloc; }
 
-  void dump() {
+  void dump(const llvm::SelectionDAG *DAG) {
     errs() << "SystemZAddressingMode " << this << '\n';
 
     errs() << " Base ";
     if (Base.getNode())
-      Base.getNode()->dump();
+      Base.getNode()->dump(DAG);
     else
       errs() << "null\n";
 
     if (hasIndexField()) {
       errs() << " Index ";
       if (Index.getNode())
-        Index.getNode()->dump();
+        Index.getNode()->dump(DAG);
       else
         errs() << "null\n";
     }
@@ -589,7 +589,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
   if (AM.isDynAlloc() && !AM.IncludesDynAlloc)
     return false;
 
-  LLVM_DEBUG(AM.dump());
+  LLVM_DEBUG(AM.dump(CurDAG));
   return true;
 }
 
@@ -1308,7 +1308,7 @@ bool SystemZDAGToDAGISel::tryFoldLoadStoreIntoMemOperand(SDNode *Node) {
     return false;
   case SystemZISD::SSUBO:
     NegateOperand = true;
-    /* fall through */
+    LLVM_FALLTHROUGH;
   case SystemZISD::SADDO:
     if (MemVT == MVT::i32)
       NewOpc = SystemZ::ASI;
@@ -1319,7 +1319,7 @@ bool SystemZDAGToDAGISel::tryFoldLoadStoreIntoMemOperand(SDNode *Node) {
     break;
   case SystemZISD::USUBO:
     NegateOperand = true;
-    /* fall through */
+    LLVM_FALLTHROUGH;
   case SystemZISD::UADDO:
     if (MemVT == MVT::i32)
       NewOpc = SystemZ::ALSI;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 7ab4024d43c3a52cfa302ea94eb6ae32dd7c12ad..d2c33546716c051f7ab450b011ac279c0034d714 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -452,29 +452,29 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f64, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f64, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f64, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f64, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v2f64, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v2f64, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
 
     setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
-    setOperationAction(ISD::FMAXNAN, MVT::f128, Legal);
+    setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
-    setOperationAction(ISD::FMINNAN, MVT::f128, Legal);
+    setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);
   }
 
   // We have fused multiply-addition for f32 and f64 but not f128.
@@ -527,6 +527,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::FP_ROUND);
   setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::SDIV);
+  setTargetDAGCombine(ISD::UDIV);
+  setTargetDAGCombine(ISD::SREM);
+  setTargetDAGCombine(ISD::UREM);
 
   // Handle intrinsics.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -5394,8 +5398,7 @@ SDValue SystemZTargetLowering::combineSTORE(
         BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
 
       SDValue Ops[] = {
-        N->getOperand(0), BSwapOp, N->getOperand(2),
-        DAG.getValueType(Op1.getValueType())
+        N->getOperand(0), BSwapOp, N->getOperand(2)
       };
 
       return
@@ -5492,13 +5495,14 @@ SDValue SystemZTargetLowering::combineBSWAP(
       // Create the byte-swapping load.
       SDValue Ops[] = {
         LD->getChain(),    // Chain
-        LD->getBasePtr(),  // Ptr
-        DAG.getValueType(N->getValueType(0)) // VT
+        LD->getBasePtr()   // Ptr
       };
+      EVT LoadVT = N->getValueType(0);
+      if (LoadVT == MVT::i16)
+        LoadVT = MVT::i32;
       SDValue BSLoad =
         DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
-                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
-                                              MVT::i64 : MVT::i32, MVT::Other),
+                                DAG.getVTList(LoadVT, MVT::Other),
                                 Ops, LD->getMemoryVT(), LD->getMemOperand());
 
       // If this is an i16 load, insert the truncate.
@@ -5664,6 +5668,23 @@ SDValue SystemZTargetLowering::combineGET_CCMASK(
   return Select->getOperand(4);
 }
 
+SDValue SystemZTargetLowering::combineIntDIVREM(
+    SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  // In the case where the divisor is a vector of constants a cheaper
+  // sequence of instructions can replace the divide. BuildSDIV is called to
+  // do this during DAG combining, but it only succeeds when it can build a
+  // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
+  // since it is not Legal but Custom it can only happen before
+  // legalization. Therefore we must scalarize this early before Combine
+  // 1. For widened vectors, this is already the result of type legalization.
+  if (VT.isVector() && isTypeLegal(VT) &&
+      DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
+    return DAG.UnrollVectorOp(N);
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   switch(N->getOpcode()) {
@@ -5681,6 +5702,10 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
   case SystemZISD::GET_CCMASK:  return combineGET_CCMASK(N, DCI);
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:               return combineIntDIVREM(N, DCI);
   }
 
   return SDValue();
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 267e31a852167127a85c75bfabd544461bc02d15..9bf9440794713ea81a3e21c4edfe58a9fc6721e1 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -337,18 +337,8 @@ enum NodeType : unsigned {
   // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap)
   ATOMIC_CMP_SWAP_128,
 
-  // Byte swapping load.
-  //
-  // Operand 0: the address to load from
-  // Operand 1: the type of load (i16, i32, i64)
-  LRV,
-
-  // Byte swapping store.
-  //
-  // Operand 0: the value to store
-  // Operand 1: the address to store to
-  // Operand 2: the type of store (i16, i32, i64)
-  STRV,
+  // Byte swapping load/store.  Same operands as regular load/store.
+  LRV, STRV,
 
   // Prefetch from the second operand using the 4-bit control code in
   // the first operand.  The code is 1 for a load prefetch and 2 for
@@ -389,7 +379,7 @@ public:
     // want to clobber the upper 32 bits of a GPR unnecessarily.
     return MVT::i32;
   }
-  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT)
+  TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
     const override {
     // Widen subvectors to the full width rather than promoting integer
     // elements.  This is better because:
@@ -605,6 +595,7 @@ private:
   SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
 
   // If the last instruction before MBBI in MBB was some form of COMPARE,
   // try to replace it with a COMPARE AND BRANCH just before MBBI.
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index bb5b7aae883bb47f244eb2d0238a3ff735b5fce9..8d3b1011d0a7d67b670382f0a383faedfa3e1169 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -756,16 +756,15 @@ def STMH : StoreMultipleRSY<"stmh", 0xEB26, GRH32>;
 def LRVR  : UnaryRRE<"lrvr",  0xB91F, bswap, GR32, GR32>;
 def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
 
-// Byte-swapping loads.  Unlike normal loads, these instructions are
-// allowed to access storage more than once.
-def LRVH : UnaryRXY<"lrvh", 0xE31F, z_lrvh, GR32, 2>;
-def LRV  : UnaryRXY<"lrv",  0xE31E, z_lrv,  GR32, 4>;
-def LRVG : UnaryRXY<"lrvg", 0xE30F, z_lrvg, GR64, 8>;
-
-// Likewise byte-swapping stores.
-def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
-def STRV  : StoreRXY<"strv",  0xE33E, z_strv,  GR32, 4>;
-def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+// Byte-swapping loads.
+def LRVH : UnaryRXY<"lrvh", 0xE31F, z_loadbswap16, GR32, 2>;
+def LRV  : UnaryRXY<"lrv",  0xE31E, z_loadbswap32, GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, z_loadbswap64, GR64, 8>;
+
+// Byte-swapping stores.
+def STRVH : StoreRXY<"strvh", 0xE33F, z_storebswap16, GR32, 2>;
+def STRV  : StoreRXY<"strv",  0xE33E, z_storebswap32, GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, z_storebswap64, GR64, 8>;
 
 // Byte-swapping memory-to-memory moves.
 let mayLoad = 1, mayStore = 1 in
diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td
index 094d3a7de3d2a7b4c68995261729e12bf4a2e43a..8523af7e57386519cf1f188e64091687d8801ae2 100644
--- a/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/lib/Target/SystemZ/SystemZInstrVector.td
@@ -1031,7 +1031,7 @@ let Predicates = [FeatureVector] in {
   // Maximum.
   multiclass VectorMax<Instruction insn, TypedReg tr> {
     def : FPMinMax<insn, fmaxnum, tr, 4>;
-    def : FPMinMax<insn, fmaxnan, tr, 1>;
+    def : FPMinMax<insn, fmaximum, tr, 1>;
   }
   let Predicates = [FeatureVectorEnhancements1] in {
     def VFMAX   : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
@@ -1055,7 +1055,7 @@ let Predicates = [FeatureVector] in {
   // Minimum.
   multiclass VectorMin<Instruction insn, TypedReg tr> {
     def : FPMinMax<insn, fminnum, tr, 4>;
-    def : FPMinMax<insn, fminnan, tr, 1>;
+    def : FPMinMax<insn, fminimum, tr, 1>;
   }
   let Predicates = [FeatureVectorEnhancements1] in {
     def VFMIN   : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 1f299d5fd764c814dbb740935f0e1df3fa8ec4fb..c55a6273f5e447036540ca72c034fdc60cc0e659 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -127,14 +127,6 @@ def SDT_ZIPM                : SDTypeProfile<1, 1,
 def SDT_ZPrefetch           : SDTypeProfile<0, 2,
                                             [SDTCisVT<0, i32>,
                                              SDTCisPtrTy<1>]>;
-def SDT_ZLoadBSwap          : SDTypeProfile<1, 2,
-                                            [SDTCisInt<0>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisVT<2, OtherVT>]>;
-def SDT_ZStoreBSwap         : SDTypeProfile<0, 3,
-                                            [SDTCisInt<0>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisVT<2, OtherVT>]>;
 def SDT_ZTBegin             : SDTypeProfile<1, 2,
                                             [SDTCisVT<0, i32>,
                                              SDTCisPtrTy<1>,
@@ -283,9 +275,9 @@ def z_subcarry_1        : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>;
 def z_membarrier        : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
                                  [SDNPHasChain, SDNPSideEffect]>;
 
-def z_loadbswap        : SDNode<"SystemZISD::LRV", SDT_ZLoadBSwap,
+def z_loadbswap        : SDNode<"SystemZISD::LRV", SDTLoad,
                                  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def z_storebswap       : SDNode<"SystemZISD::STRV", SDT_ZStoreBSwap,
+def z_storebswap       : SDNode<"SystemZISD::STRV", SDTStore,
                                  [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def z_tdc               : SDNode<"SystemZISD::TDC", SDT_ZTest>;
@@ -429,16 +421,28 @@ def z_vsrl              : SDNode<"ISD::SRL", SDT_ZVecBinary>;
 // Pattern fragments
 //===----------------------------------------------------------------------===//
 
-def z_lrvh  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i16)>;
-def z_lrv   : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i32)>;
-def z_lrvg  : PatFrag<(ops node:$addr), (z_loadbswap node:$addr, i64)>;
+def z_loadbswap16 : PatFrag<(ops node:$addr), (z_loadbswap node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_loadbswap32 : PatFrag<(ops node:$addr), (z_loadbswap node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_loadbswap64 : PatFrag<(ops node:$addr), (z_loadbswap node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
 
-def z_strvh : PatFrag<(ops node:$src, node:$addr),
-                      (z_storebswap node:$src, node:$addr, i16)>;
-def z_strv  : PatFrag<(ops node:$src, node:$addr),
-                      (z_storebswap node:$src, node:$addr, i32)>;
-def z_strvg : PatFrag<(ops node:$src, node:$addr),
-                      (z_storebswap node:$src, node:$addr, i64)>;
+def z_storebswap16 : PatFrag<(ops node:$src, node:$addr),
+                             (z_storebswap node:$src, node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def z_storebswap32 : PatFrag<(ops node:$src, node:$addr),
+                             (z_storebswap node:$src, node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+def z_storebswap64 : PatFrag<(ops node:$src, node:$addr),
+                             (z_storebswap node:$src, node:$addr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
 
 // Fragments including CC as an implicit source.
 def z_br_ccmask
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 6f553d5bed377e4bd94f0d5202c40fd12f4e3365..f296d80dbf523061cb4a731bbc1885841f5d4699 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -362,50 +362,58 @@ int SystemZTTIImpl::getArithmeticInstrCost(
 
   unsigned ScalarBits = Ty->getScalarSizeInBits();
 
-  // Div with a constant which is a power of 2 will be converted by
-  // DAGCombiner to use shifts. With vector shift-element instructions, a
-  // vector sdiv costs about as much as a scalar one.
-  const unsigned SDivCostEstimate = 4;
-  bool SDivPow2 = false;
-  bool UDivPow2 = false;
-  if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
-      Args.size() == 2) {
-    const ConstantInt *CI = nullptr;
+  // There are thre cases of division and remainder: Dividing with a register
+  // needs a divide instruction. A divisor which is a power of two constant
+  // can be implemented with a sequence of shifts. Any other constant needs a
+  // multiply and shifts.
+  const unsigned DivInstrCost = 20;
+  const unsigned DivMulSeqCost = 10;
+  const unsigned SDivPow2Cost = 4;
+
+  bool SignedDivRem =
+      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
+  bool UnsignedDivRem =
+      Opcode == Instruction::UDiv || Opcode == Instruction::URem;
+
+  // Check for a constant divisor.
+  bool DivRemConst = false;
+  bool DivRemConstPow2 = false;
+  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
     if (const Constant *C = dyn_cast<Constant>(Args[1])) {
-      if (C->getType()->isVectorTy())
-        CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+      const ConstantInt *CVal =
+          (C->getType()->isVectorTy()
+               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
+               : dyn_cast<const ConstantInt>(C));
+      if (CVal != nullptr &&
+          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
+        DivRemConstPow2 = true;
       else
-        CI = dyn_cast<const ConstantInt>(C);
-    }
-    if (CI != nullptr &&
-        (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
-      if (Opcode == Instruction::SDiv)
-        SDivPow2 = true;
-      else
-        UDivPow2 = true;
+        DivRemConst = true;
     }
   }
 
   if (Ty->isVectorTy()) {
-    assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+    assert(ST->hasVector() &&
+           "getArithmeticInstrCost() called with vector type.");
     unsigned VF = Ty->getVectorNumElements();
     unsigned NumVectors = getNumVectorRegs(Ty);
 
     // These vector operations are custom handled, but are still supported
     // with one instruction per vector, regardless of element size.
     if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
-        Opcode == Instruction::AShr || UDivPow2) {
+        Opcode == Instruction::AShr) {
       return NumVectors;
     }
 
-    if (SDivPow2)
-      return (NumVectors * SDivCostEstimate);
-
-    // Temporary hack: disable high vectorization factors with integer
-    // division/remainder, which will get scalarized and handled with GR128
-    // registers. The mischeduler is not clever enough to avoid spilling yet.
-    if ((Opcode == Instruction::UDiv || Opcode == Instruction::SDiv ||
-         Opcode == Instruction::URem || Opcode == Instruction::SRem) && VF > 4)
+    if (DivRemConstPow2)
+      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
+    if (DivRemConst)
+      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
+      // Temporary hack: disable high vectorization factors with integer
+      // division/remainder, which will get scalarized and handled with
+      // GR128 registers. The mischeduler is not clever enough to avoid
+      // spilling yet.
       return 1000;
 
     // These FP operations are supported with a single vector instruction for
@@ -421,7 +429,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
           return NumVectors;
         // Return the cost of multiple scalar invocation plus the cost of
         // inserting and extracting the values.
-        unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+        unsigned ScalarCost =
+            getArithmeticInstrCost(Opcode, Ty->getScalarType());
         unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
         // FIXME: VF 2 for these FP operations are currently just as
         // expensive as for VF 4.
@@ -471,19 +480,16 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       return 7; // 2 * ipm sequences ; xor ; shift ; compare
     }
 
-    if (UDivPow2)
-      return 1;
-    if (SDivPow2)
-      return SDivCostEstimate;
-
-    // An extra extension for narrow types is needed.
-    if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+    if (DivRemConstPow2)
+      return (SignedDivRem ? SDivPow2Cost : 1);
+    if (DivRemConst)
+      return DivMulSeqCost;
+    if (SignedDivRem)
       // sext of op(s) for narrow types
-      return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
-
-    if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+      return DivInstrCost + (ScalarBits < 32 ? 3 : (ScalarBits == 32 ? 1 : 0));
+    if (UnsignedDivRem)
       // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
-      return (ScalarBits < 32 ? 4 : 2);
+      return DivInstrCost + (ScalarBits < 32 ? 3 : 1);
   }
 
   // Fallback to the default implementation.
@@ -629,6 +635,25 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
   return nullptr;
 }
 
+// Get the cost of converting a boolean vector to a vector with same width
+// and element size as Dst, plus the cost of zero extending if needed.
+unsigned SystemZTTIImpl::
+getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
+                              const Instruction *I) {
+  assert (Dst->isVectorTy());
+  unsigned VF = Dst->getVectorNumElements();
+  unsigned Cost = 0;
+  // If we know what the widths of the compared operands, get any cost of
+  // converting it to match Dst. Otherwise assume same widths.
+  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+  if (CmpOpTy != nullptr)
+    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
+    // One 'vn' per dst vector with an immediate mask.
+    Cost += getNumVectorRegs(Dst);
+  return Cost;
+}
+
 int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                      const Instruction *I) {
   unsigned DstScalarBits = Dst->getScalarSizeInBits();
@@ -660,19 +685,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
 
         return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
       }
-      else if (SrcScalarBits == 1) {
-        // This should be extension of a compare i1 result.
-        // If we know what the widths of the compared operands, get the
-        // cost of converting it to Dst. Otherwise assume same widths.
-        unsigned Cost = 0;
-        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
-        if (CmpOpTy != nullptr)
-          Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
-        if (Opcode == Instruction::ZExt)
-          // One 'vn' per dst vector with an immediate mask.
-          Cost += NumDstVectors;
-        return Cost;
-      }
+      else if (SrcScalarBits == 1)
+        return getBoolVecToIntConversionCost(Opcode, Dst, I);
     }
 
     if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
@@ -681,8 +695,13 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       // (seems to miss on differentiating on scalar/vector types).
 
       // Only 64 bit vector conversions are natively supported.
-      if (SrcScalarBits == 64 && DstScalarBits == 64)
-        return NumDstVectors;
+      if (DstScalarBits == 64) {
+        if (SrcScalarBits == 64)
+          return NumDstVectors;
+
+        if (SrcScalarBits == 1)
+          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
+      }
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values. Base implementation does not
@@ -729,8 +748,12 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   else { // Scalar
     assert (!Dst->isVectorTy());
 
-    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
-      return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+      if (SrcScalarBits >= 32 ||
+          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+        return 1;
+      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+    }
 
     if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
         Src->isIntegerTy(1)) {
@@ -755,8 +778,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
   return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
 }
 
-int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
-                                       const Instruction *I) {
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                       Type *CondTy, const Instruction *I) {
   if (ValTy->isVectorTy()) {
     assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
     unsigned VF = ValTy->getVectorNumElements();
@@ -817,7 +840,7 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondT
     }
     case Instruction::Select:
       if (ValTy->isFloatingPointTy())
-        return 4; // No load on condition for FP, so this costs a conditional jump.
+        return 4; // No load on condition for FP - costs a conditional jump.
       return 1; // Load On Condition.
     }
   }
@@ -845,54 +868,107 @@ getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
   return BaseT::getVectorInstrCost(Opcode, Val, Index);
 }
 
+// Check if a load may be folded as a memory operand in its user.
+bool SystemZTTIImpl::
+isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
+  if (!Ld->hasOneUse())
+    return false;
+  FoldedValue = Ld;
+  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
+  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
+  unsigned TruncBits = 0;
+  unsigned SExtBits = 0;
+  unsigned ZExtBits = 0;
+  if (UserI->hasOneUse()) {
+    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
+    if (isa<TruncInst>(UserI))
+      TruncBits = UserBits;
+    else if (isa<SExtInst>(UserI))
+      SExtBits = UserBits;
+    else if (isa<ZExtInst>(UserI))
+      ZExtBits = UserBits;
+  }
+  if (TruncBits || SExtBits || ZExtBits) {
+    FoldedValue = UserI;
+    UserI = cast<Instruction>(*UserI->user_begin());
+    // Load (single use) -> trunc/extend (single use) -> UserI
+  }
+  if ((UserI->getOpcode() == Instruction::Sub ||
+       UserI->getOpcode() == Instruction::SDiv ||
+       UserI->getOpcode() == Instruction::UDiv) &&
+      UserI->getOperand(1) != FoldedValue)
+    return false; // Not commutative, only RHS foldable.
+  switch (UserI->getOpcode()) {
+  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
+  case Instruction::Sub:
+    if (LoadedBits == 32 && ZExtBits == 64)
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
+    if (LoadedBits == 16 &&
+        (SExtBits == 32 ||
+         (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::SDiv:// SE: 32->64
+    if (LoadedBits == 32 && SExtBits == 64)
+      return true;
+    LLVM_FALLTHROUGH;
+  case Instruction::UDiv:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    // This also makes sense for float operations, but disabled for now due
+    // to regressions.
+    // case Instruction::FCmp:
+    // case Instruction::FAdd:
+    // case Instruction::FSub:
+    // case Instruction::FMul:
+    // case Instruction::FDiv:
+
+    // All possible extensions of memory checked above.
+    if (SExtBits || ZExtBits)
+      return false;
+
+    unsigned LoadOrTruncBits = (TruncBits ? TruncBits : LoadedBits);
+    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
+    break;
+  }
+  return false;
+}
+
 int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment, unsigned AddressSpace,
                                     const Instruction *I) {
   assert(!Src->isVoidTy() && "Invalid type");
 
-  if (!Src->isVectorTy() && Opcode == Instruction::Load &&
-      I != nullptr && I->hasOneUse()) {
-      const Instruction *UserI = cast<Instruction>(*I->user_begin());
-      unsigned Bits = getScalarSizeInBits(Src);
-      bool FoldsLoad = false;
-      switch (UserI->getOpcode()) {
-      case Instruction::ICmp:
-      case Instruction::Add:
-      case Instruction::Sub:
-      case Instruction::Mul:
-      case Instruction::SDiv:
-      case Instruction::UDiv:
-      case Instruction::And:
-      case Instruction::Or:
-      case Instruction::Xor:
-      // This also makes sense for float operations, but disabled for now due
-      // to regressions.
-      // case Instruction::FCmp:
-      // case Instruction::FAdd:
-      // case Instruction::FSub:
-      // case Instruction::FMul:
-      // case Instruction::FDiv:
-        FoldsLoad = (Bits == 32 || Bits == 64);
-        break;
-      }
-
-      if (FoldsLoad) {
-        assert (UserI->getNumOperands() == 2 &&
-                "Expected to only handle binops.");
-
-        // UserI can't fold two loads, so in that case return 0 cost only
-        // half of the time.
-        for (unsigned i = 0; i < 2; ++i) {
-          if (UserI->getOperand(i) == I)
-            continue;
-          if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
-            if (LI->hasOneUse())
-              return i == 0;
-          }
+  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
+    // Store the load or its truncated or extended value in FoldedValue.
+    const Instruction *FoldedValue = nullptr;
+    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
+      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
+      assert (UserI->getNumOperands() == 2 && "Expected a binop.");
+
+      // UserI can't fold two loads, so in that case return 0 cost only
+      // half of the time.
+      for (unsigned i = 0; i < 2; ++i) {
+        if (UserI->getOperand(i) == FoldedValue)
+          continue;
+
+        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
+          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
+          if (!OtherLoad &&
+              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
+               isa<ZExtInst>(OtherOp)))
+            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
+          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
+            return i == 0; // Both operands foldable.
         }
-
-        return 0;
       }
+
+      return 0; // Only I is foldable in user.
+    }
   }
 
   unsigned NumOps =
@@ -905,30 +981,68 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   return  NumOps;
 }
 
+// The generic implementation of getInterleavedMemoryOpCost() is based on
+// adding costs of the memory operations plus all the extracts and inserts
+// needed for using / defining the vector operands. The SystemZ version does
+// roughly the same but bases the computations on vector permutations
+// instead.
 int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
 
-  int NumWideParts = getNumVectorRegs(VecTy);
-
-  // How many source vectors are handled to produce a vectorized operand?
-  int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
-  int NumSrcParts =
-    ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
-
-  // A Load group may have gaps.
-  unsigned NumOperands =
-    ((Opcode == Instruction::Load) ? Indices.size() : Factor);
-
-  // Each needed permute takes two vectors as input.
-  if (NumSrcParts > 1)
-    NumSrcParts--;
-  int NumPermutes = NumSrcParts * NumOperands;
+  // Return the ceiling of dividing A by B.
+  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
+
+  unsigned NumElts = VecTy->getVectorNumElements();
+  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
+  unsigned VF = NumElts / Factor;
+  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
+  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
+  unsigned NumPermutes = 0;
+
+  if (Opcode == Instruction::Load) {
+    // Loading interleave groups may have gaps, which may mean fewer
+    // loads. Find out how many vectors will be loaded in total, and in how
+    // many of them each value will be in.
+    BitVector UsedInsts(NumVectorMemOps, false);
+    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
+    for (unsigned Index : Indices)
+      for (unsigned Elt = 0; Elt < VF; ++Elt) {
+        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
+        UsedInsts.set(Vec);
+        ValueVecs[Index].set(Vec);
+      }
+    NumVectorMemOps = UsedInsts.count();
+
+    for (unsigned Index : Indices) {
+      // Estimate that each loaded source vector containing this Index
+      // requires one operation, except that vperm can handle two input
+      // registers first time for each dst vector.
+      unsigned NumSrcVecs = ValueVecs[Index].count();
+      unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
+      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
+      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
+    }
+  } else {
+    // Estimate the permutes for each stored vector as the smaller of the
+    // number of elements and the number of source vectors. Subtract one per
+    // dst vector for vperm (S.A.).
+    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
+    unsigned NumDstVecs = NumVectorMemOps;
+    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
+    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
+  }
 
   // Cost of load/store operations and the permutations needed.
-  return NumWideParts + NumPermutes;
+  return NumVectorMemOps + NumPermutes;
 }
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index bfa942357c55979989660516c9b2d194a73f60f6..dd85c4ea541717a4d4e0dc832b45abb1bab76587 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -80,11 +80,14 @@ public:
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+  unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
+                                         const Instruction *I);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        const Instruction *I = nullptr);
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                          const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
 
@@ -92,7 +95,9 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   /// @}
 };
 
diff --git a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 60f82fac5ded53b4e902c04da57dfac763b6e203..efa6793cff27603a5900c69909cb2500e6a3af3a 100644
--- a/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/TargetRegistry.h"
 
@@ -131,14 +132,14 @@ struct WebAssemblyOperand : public MCParsedAsmOperand {
 class WebAssemblyAsmParser final : public MCTargetAsmParser {
   MCAsmParser &Parser;
   MCAsmLexer &Lexer;
-  MCSymbol *LastLabel;
+  MCSymbolWasm *LastSymbol;
 
 public:
-  WebAssemblyAsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
-                       const MCInstrInfo &mii, const MCTargetOptions &Options)
-      : MCTargetAsmParser(Options, sti, mii), Parser(Parser),
-        Lexer(Parser.getLexer()), LastLabel(nullptr) {
-    setAvailableFeatures(ComputeAvailableFeatures(sti.getFeatureBits()));
+  WebAssemblyAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
+                       const MCInstrInfo &MII, const MCTargetOptions &Options)
+      : MCTargetAsmParser(Options, STI, MII), Parser(Parser),
+        Lexer(Parser.getLexer()), LastSymbol(nullptr) {
+    setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
 #define GET_ASSEMBLER_HEADER
@@ -168,24 +169,26 @@ public:
     return false;
   }
 
-  MVT::SimpleValueType ParseRegType(const StringRef &RegType) {
+
+  std::pair<MVT::SimpleValueType, unsigned>
+  ParseRegType(const StringRef &RegType) {
     // Derive type from .param .local decls, or the instruction itself.
-    return StringSwitch<MVT::SimpleValueType>(RegType)
-        .Case("i32", MVT::i32)
-        .Case("i64", MVT::i64)
-        .Case("f32", MVT::f32)
-        .Case("f64", MVT::f64)
-        .Case("i8x16", MVT::v16i8)
-        .Case("i16x8", MVT::v8i16)
-        .Case("i32x4", MVT::v4i32)
-        .Case("i64x2", MVT::v2i64)
-        .Case("f32x4", MVT::v4f32)
-        .Case("f64x2", MVT::v2f64)
+    return StringSwitch<std::pair<MVT::SimpleValueType, unsigned>>(RegType)
+        .Case("i32", {MVT::i32, wasm::WASM_TYPE_I32})
+        .Case("i64", {MVT::i64, wasm::WASM_TYPE_I64})
+        .Case("f32", {MVT::f32, wasm::WASM_TYPE_F32})
+        .Case("f64", {MVT::f64, wasm::WASM_TYPE_F64})
+        .Case("i8x16", {MVT::v16i8, wasm::WASM_TYPE_V128})
+        .Case("i16x8", {MVT::v8i16, wasm::WASM_TYPE_V128})
+        .Case("i32x4", {MVT::v4i32, wasm::WASM_TYPE_V128})
+        .Case("i64x2", {MVT::v2i64, wasm::WASM_TYPE_V128})
+        .Case("f32x4", {MVT::v4f32, wasm::WASM_TYPE_V128})
+        .Case("f64x2", {MVT::v2f64, wasm::WASM_TYPE_V128})
         // arbitrarily chosen vector type to associate with "v128"
         // FIXME: should these be EVTs to avoid this arbitrary hack? Do we want
         // to accept more specific SIMD register types?
-        .Case("v128", MVT::v16i8)
-        .Default(MVT::INVALID_SIMPLE_VALUE_TYPE);
+        .Case("v128", {MVT::v16i8, wasm::WASM_TYPE_V128})
+        .Default({MVT::INVALID_SIMPLE_VALUE_TYPE, wasm::WASM_TYPE_NORESULT});
   }
 
   void ParseSingleInteger(bool IsNegative, OperandVector &Operands) {
@@ -303,7 +306,7 @@ public:
     // assembly, so we add a dummy one explicitly (since we have no control
     // over signature tables here, we assume these will be regenerated when
     // the wasm module is generated).
-    if (BaseName == "block" || BaseName == "loop") {
+    if (BaseName == "block" || BaseName == "loop" || BaseName == "try") {
       Operands.push_back(make_unique<WebAssemblyOperand>(
           WebAssemblyOperand::Integer, NameLoc, NameLoc,
           WebAssemblyOperand::IntOp{-1}));
@@ -311,24 +314,84 @@ public:
     return false;
   }
 
-  void onLabelParsed(MCSymbol *Symbol) override { LastLabel = Symbol; }
+  void onLabelParsed(MCSymbol *Symbol) override {
+    LastSymbol = cast<MCSymbolWasm>(Symbol);
+  }
 
   bool ParseDirective(AsmToken DirectiveID) override {
+    // This function has a really weird return value behavior that is different
+    // from all the other parsing functions:
+    // - return true && no tokens consumed -> don't know this directive / let
+    //   the generic parser handle it.
+    // - return true && tokens consumed -> a parsing error occurred.
+    // - return false -> processed this directive successfully.
     assert(DirectiveID.getKind() == AsmToken::Identifier);
     auto &Out = getStreamer();
     auto &TOut =
         reinterpret_cast<WebAssemblyTargetStreamer &>(*Out.getTargetStreamer());
-    // TODO: we're just parsing the subset of directives we're interested in,
-    // and ignoring ones we don't recognise. We should ideally verify
-    // all directives here.
+    // TODO: any time we return an error, at least one token must have been
+    // consumed, otherwise this will not signal an error to the caller.
     if (DirectiveID.getString() == ".type") {
       // This could be the start of a function, check if followed by
       // "label,@function"
-      if (!(IsNext(AsmToken::Identifier) && IsNext(AsmToken::Comma) &&
-            IsNext(AsmToken::At) && Lexer.is(AsmToken::Identifier)))
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected label after .type directive, got: ",
+                     Lexer.getTok());
+      auto WasmSym = cast<MCSymbolWasm>(
+                       TOut.getStreamer().getContext().getOrCreateSymbol(
+                         Lexer.getTok().getString()));
+      Parser.Lex();
+      if (!(IsNext(AsmToken::Comma) && IsNext(AsmToken::At) &&
+            Lexer.is(AsmToken::Identifier)))
         return Error("Expected label,@type declaration, got: ", Lexer.getTok());
+      auto TypeName = Lexer.getTok().getString();
+      if (TypeName == "function")
+        WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
+      else if (TypeName == "global")
+        WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+      else
+        return Error("Unknown WASM symbol type: ", Lexer.getTok());
+      Parser.Lex();
+      return Expect(AsmToken::EndOfStatement, "EOL");
+    } else if (DirectiveID.getString() == ".size") {
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected label after .size directive, got: ",
+                     Lexer.getTok());
+      auto WasmSym = cast<MCSymbolWasm>(
+                       TOut.getStreamer().getContext().getOrCreateSymbol(
+                         Lexer.getTok().getString()));
       Parser.Lex();
-      // Out.EmitSymbolAttribute(??, MCSA_ELF_TypeFunction);
+      if (!IsNext(AsmToken::Comma))
+        return Error("Expected `,`, got: ", Lexer.getTok());
+      const MCExpr *Exp;
+      if (Parser.parseExpression(Exp))
+        return Error("Cannot parse .size expression: ", Lexer.getTok());
+      WasmSym->setSize(Exp);
+      return Expect(AsmToken::EndOfStatement, "EOL");
+    } else if (DirectiveID.getString() == ".globaltype") {
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected symbol name after .globaltype directive, got: ",
+                     Lexer.getTok());
+      auto Name = Lexer.getTok().getString();
+      Parser.Lex();
+      if (!IsNext(AsmToken::Comma))
+        return Error("Expected `,`, got: ", Lexer.getTok());
+      if (!Lexer.is(AsmToken::Identifier))
+        return Error("Expected type in .globaltype directive, got: ",
+                     Lexer.getTok());
+      auto Type = ParseRegType(Lexer.getTok().getString()).second;
+      if (Type == wasm::WASM_TYPE_NORESULT)
+        return Error("Unknown type in .globaltype directive: ",
+                     Lexer.getTok());
+      Parser.Lex();
+      // Now set this symbol with the correct type.
+      auto WasmSym = cast<MCSymbolWasm>(
+                       TOut.getStreamer().getContext().getOrCreateSymbol(Name));
+      WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+      WasmSym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), true});
+      // And emit the directive again.
+      TOut.emitGlobalType(WasmSym);
+      return Expect(AsmToken::EndOfStatement, "EOL");
     } else if (DirectiveID.getString() == ".param" ||
                DirectiveID.getString() == ".local") {
       // Track the number of locals, needed for correct virtual register
@@ -337,7 +400,7 @@ public:
       std::vector<MVT> Params;
       std::vector<MVT> Locals;
       while (Lexer.is(AsmToken::Identifier)) {
-        auto RegType = ParseRegType(Lexer.getTok().getString());
+        auto RegType = ParseRegType(Lexer.getTok().getString()).first;
         if (RegType == MVT::INVALID_SIMPLE_VALUE_TYPE)
           return true;
         if (DirectiveID.getString() == ".param") {
@@ -349,15 +412,20 @@ public:
         if (!IsNext(AsmToken::Comma))
           break;
       }
-      assert(LastLabel);
-      TOut.emitParam(LastLabel, Params);
+      assert(LastSymbol);
+      // TODO: LastSymbol isn't even used by emitParam, so could be removed.
+      TOut.emitParam(LastSymbol, Params);
       TOut.emitLocal(Locals);
+      return Expect(AsmToken::EndOfStatement, "EOL");
     } else {
-      // For now, ignore anydirective we don't recognize:
+      // TODO: remove.
       while (Lexer.isNot(AsmToken::EndOfStatement))
         Parser.Lex();
+      return Expect(AsmToken::EndOfStatement, "EOL");
     }
-    return Expect(AsmToken::EndOfStatement, "EOL");
+    // TODO: current ELF directive parsing is broken, fix this is a followup.
+    //return true;  // We didn't process this directive.
+    return false;
   }
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned & /*Opcode*/,
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index 2af5b9cb23d4b2c4b65166fa7ad49f5a5ae89454..549229ad572b25e45a0c1880b573d3b3064a1acb 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -9,7 +9,6 @@ tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM WebAssemblyGenStackifier.inc -gen-wasm-stackifier)
 
 add_public_tablegen_target(WebAssemblyCommonTableGen)
 
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 08c1155fed70afc3ee4f39c3239f3ab0b6943d96..e94faa1a21409240048f05b90647801980a48957 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -35,7 +35,7 @@ using namespace llvm;
 WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI,
                                                const MCInstrInfo &MII,
                                                const MCRegisterInfo &MRI)
-    : MCInstPrinter(MAI, MII, MRI), ControlFlowCounter(0) {}
+    : MCInstPrinter(MAI, MII, MRI) {}
 
 void WebAssemblyInstPrinter::printRegName(raw_ostream &OS,
                                           unsigned RegNo) const {
@@ -70,31 +70,63 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   if (CommentStream) {
     // Observe any effects on the control flow stack, for use in annotating
     // control flow label references.
-    switch (MI->getOpcode()) {
+    unsigned Opc = MI->getOpcode();
+    switch (Opc) {
     default:
       break;
+
     case WebAssembly::LOOP:
-    case WebAssembly::LOOP_S: {
+    case WebAssembly::LOOP_S:
       printAnnotation(OS, "label" + utostr(ControlFlowCounter) + ':');
       ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, true));
       break;
-    }
+
     case WebAssembly::BLOCK:
     case WebAssembly::BLOCK_S:
       ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
       break;
+
+    case WebAssembly::TRY:
+    case WebAssembly::TRY_S:
+      ControlFlowStack.push_back(std::make_pair(ControlFlowCounter++, false));
+      EHPadStack.push_back(EHPadStackCounter++);
+      LastSeenEHInst = TRY;
+      break;
+
     case WebAssembly::END_LOOP:
     case WebAssembly::END_LOOP_S:
-      // Have to guard against an empty stack, in case of mismatched pairs
-      // in assembly parsing.
-      if (!ControlFlowStack.empty())
-        ControlFlowStack.pop_back();
+      assert(!ControlFlowStack.empty() && "End marker mismatch!");
+      ControlFlowStack.pop_back();
       break;
+
     case WebAssembly::END_BLOCK:
     case WebAssembly::END_BLOCK_S:
-      if (!ControlFlowStack.empty())
-        printAnnotation(
-            OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      assert(!ControlFlowStack.empty() && "End marker mismatch!");
+      printAnnotation(
+          OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      break;
+
+    case WebAssembly::END_TRY:
+    case WebAssembly::END_TRY_S:
+      assert(!ControlFlowStack.empty() && "End marker mismatch!");
+      printAnnotation(
+          OS, "label" + utostr(ControlFlowStack.pop_back_val().first) + ':');
+      LastSeenEHInst = END_TRY;
+      break;
+
+    case WebAssembly::CATCH_I32:
+    case WebAssembly::CATCH_I32_S:
+    case WebAssembly::CATCH_I64:
+    case WebAssembly::CATCH_I64_S:
+    case WebAssembly::CATCH_ALL:
+    case WebAssembly::CATCH_ALL_S:
+      // There can be multiple catch instructions for one try instruction, so we
+      // print a label only for the first 'catch' label.
+      if (LastSeenEHInst != CATCH) {
+        assert(!EHPadStack.empty() && "try-catch mismatch!");
+        printAnnotation(OS, "catch" + utostr(EHPadStack.pop_back_val()) + ':');
+      }
+      LastSeenEHInst = CATCH;
       break;
     }
 
@@ -110,9 +142,26 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
       uint64_t Depth = MI->getOperand(i).getImm();
       if (!Printed.insert(Depth).second)
         continue;
-      const auto &Pair = ControlFlowStack.rbegin()[Depth];
-      printAnnotation(OS, utostr(Depth) + ": " + (Pair.second ? "up" : "down") +
-                              " to label" + utostr(Pair.first));
+
+      if (Opc == WebAssembly::RETHROW || Opc == WebAssembly::RETHROW_S) {
+        assert(Depth <= EHPadStack.size() && "Invalid depth argument!");
+        if (Depth == EHPadStack.size()) {
+          // This can happen when rethrow instruction breaks out of all nests
+          // and throws up to the current function's caller.
+          printAnnotation(OS, utostr(Depth) + ": " + "to caller");
+        } else {
+          uint64_t CatchNo = EHPadStack.rbegin()[Depth];
+          printAnnotation(OS, utostr(Depth) + ": " + "down to catch" +
+                                  utostr(CatchNo));
+        }
+
+      } else {
+        assert(Depth < ControlFlowStack.size() && "Invalid depth argument!");
+        const auto &Pair = ControlFlowStack.rbegin()[Depth];
+        printAnnotation(OS, utostr(Depth) + ": " +
+                                (Pair.second ? "up" : "down") + " to label" +
+                                utostr(Pair.first));
+      }
     }
   }
 }
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index 18023328b38ade95e483a258975d0e18b63e872b..ded64f9a6e9b83a046f330ec112c1a732963a2c1 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -25,8 +25,13 @@ namespace llvm {
 class MCSubtargetInfo;
 
 class WebAssemblyInstPrinter final : public MCInstPrinter {
-  uint64_t ControlFlowCounter;
-  SmallVector<std::pair<uint64_t, bool>, 0> ControlFlowStack;
+  uint64_t ControlFlowCounter = 0;
+  uint64_t EHPadStackCounter = 0;
+  SmallVector<std::pair<uint64_t, bool>, 4> ControlFlowStack;
+  SmallVector<uint64_t, 4> EHPadStack;
+
+  enum EHInstKind { TRY, CATCH, END_TRY };
+  EHInstKind LastSeenEHInst = END_TRY;
 
 public:
   WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 2158ee3be04fd960289b8e5e0a79f3f7e505258d..4c4ca4e599c67c53b41731f058ac92ed31d933f5 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -99,8 +99,11 @@ void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
   OS << '\n';
 }
 
-void WebAssemblyTargetAsmStreamer::emitGlobalImport(StringRef name) {
-  OS << "\t.import_global\t" << name << '\n';
+void WebAssemblyTargetAsmStreamer::emitGlobalType(MCSymbolWasm *Sym) {
+  OS << "\t.globaltype\t" << Sym->getName() << ", " <<
+        WebAssembly::TypeToString(
+          static_cast<wasm::ValType>(Sym->getGlobalType().Type)) <<
+        '\n';
 }
 
 void WebAssemblyTargetAsmStreamer::emitImportModule(MCSymbolWasm *Sym,
@@ -152,8 +155,8 @@ void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
   Symbol->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
 }
 
-void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
-  llvm_unreachable(".global_import is not needed for direct wasm output");
+void WebAssemblyTargetWasmStreamer::emitGlobalType(MCSymbolWasm *Sym) {
+  // Not needed.
 }
 
 void WebAssemblyTargetWasmStreamer::emitImportModule(MCSymbolWasm *Sym,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 43c422d593a6b8a030dfcb09ba8df55e73c98996..e60158b5defb90369d2113a4a29d2a58a908563b 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -43,8 +43,8 @@ public:
   virtual void emitIndirectFunctionType(MCSymbolWasm *Symbol) = 0;
   /// .indidx
   virtual void emitIndIdx(const MCExpr *Value) = 0;
-  /// .import_global
-  virtual void emitGlobalImport(StringRef name) = 0;
+  /// .globaltype
+  virtual void emitGlobalType(MCSymbolWasm *Sym) = 0;
   /// .import_module
   virtual void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) = 0;
 
@@ -65,7 +65,7 @@ public:
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbolWasm *Symbol) override;
   void emitIndIdx(const MCExpr *Value) override;
-  void emitGlobalImport(StringRef name) override;
+  void emitGlobalType(MCSymbolWasm *Sym) override;
   void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
 };
 
@@ -80,7 +80,7 @@ public:
   void emitEndFunc() override;
   void emitIndirectFunctionType(MCSymbolWasm *Symbol) override;
   void emitIndIdx(const MCExpr *Value) override;
-  void emitGlobalImport(StringRef name) override;
+  void emitGlobalType(MCSymbolWasm *Sym) override;
   void emitImportModule(MCSymbolWasm *Sym, StringRef ModuleName) override;
 };
 
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 2ea3760b923d4e3624c34f8d3ffbfbc114453ce7..1e21ab92b62997ab3488284e5eedc07fa00f2968 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -78,6 +78,14 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
 //===----------------------------------------------------------------------===//
 
 void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  for (auto &It : OutContext.getSymbols()) {
+    // Emit a .globaltype declaration.
+    auto Sym = cast<MCSymbolWasm>(It.getValue());
+    if (Sym->getType() == wasm::WASM_SYMBOL_TYPE_GLOBAL) {
+      getTargetStreamer()->emitGlobalType(Sym);
+    }
+  }
+
   for (const auto &F : M) {
     // Emit function type info for all undefined functions
     if (F.isDeclarationForLinker() && !F.isIntrinsic()) {
@@ -105,6 +113,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
       }
     }
   }
+
   for (const auto &G : M.globals()) {
     if (!G.hasInitializer() && G.hasExternalLinkage()) {
       if (G.getValueType()->isSized()) {
@@ -176,14 +185,14 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
 
   switch (MI->getOpcode()) {
-  case WebAssembly::ARGUMENT_I32:
-  case WebAssembly::ARGUMENT_I32_S:
-  case WebAssembly::ARGUMENT_I64:
-  case WebAssembly::ARGUMENT_I64_S:
-  case WebAssembly::ARGUMENT_F32:
-  case WebAssembly::ARGUMENT_F32_S:
-  case WebAssembly::ARGUMENT_F64:
-  case WebAssembly::ARGUMENT_F64_S:
+  case WebAssembly::ARGUMENT_i32:
+  case WebAssembly::ARGUMENT_i32_S:
+  case WebAssembly::ARGUMENT_i64:
+  case WebAssembly::ARGUMENT_i64_S:
+  case WebAssembly::ARGUMENT_f32:
+  case WebAssembly::ARGUMENT_f32_S:
+  case WebAssembly::ARGUMENT_f64:
+  case WebAssembly::ARGUMENT_f64_S:
   case WebAssembly::ARGUMENT_v16i8:
   case WebAssembly::ARGUMENT_v16i8_S:
   case WebAssembly::ARGUMENT_v8i16:
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 93ca670bbdbf70bc5d1fbcd61bbfecc2ab0ecab0..b1955017c687d5c56900be37994016813b767d39 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -739,7 +739,20 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
       case WebAssembly::CATCH_I32:
       case WebAssembly::CATCH_I64:
       case WebAssembly::CATCH_ALL:
-        EHPadStack.push_back(&MBB);
+        // Currently the only case there are more than one catch for a try is
+        // for catch terminate pad, in the form of
+        //   try
+        //   catch
+        //     call @__clang_call_terminate
+        //     unreachable
+        //   catch_all
+        //     call @std::terminate
+        //     unreachable
+        //   end
+        // So we shouldn't push the current BB for the second catch_all block
+        // here.
+        if (!WebAssembly::isCatchAllTerminatePad(MBB))
+          EHPadStack.push_back(&MBB);
         break;
 
       case WebAssembly::LOOP:
@@ -767,7 +780,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
       case WebAssembly::RETHROW_TO_CALLER: {
         MachineInstr *Rethrow =
             BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(WebAssembly::RETHROW))
-                .addImm(Stack.size());
+                .addImm(EHPadStack.size());
         MI.eraseFromParent();
         I = MachineBasicBlock::reverse_iterator(Rethrow);
         break;
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 8dc535445d6f7839db0eae11efda182e11b26bb8..00e37a4af2963829dbdb7c6471792b635a985386 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -37,7 +37,10 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+
 using namespace llvm;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "wasm-fastisel"
 
@@ -417,9 +420,10 @@ unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V, bool &Not) {
         return getRegForValue(ICmp->getOperand(0));
       }
 
-  if (BinaryOperator::isNot(V) && V->getType()->isIntegerTy(32)) {
+  Value *NotV;
+  if (match(V, m_Not(m_Value(NotV))) && V->getType()->isIntegerTy(32)) {
     Not = true;
-    return getRegForValue(BinaryOperator::getNotArgument(V));
+    return getRegForValue(NotV);
   }
 
   Not = false;
@@ -443,6 +447,7 @@ unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
           (isa<Argument>(V) && cast<Argument>(V)->hasZExtAttr()))
         return copyValue(Reg);
     }
+    break;
   case MVT::i8:
   case MVT::i16:
     break;
@@ -646,19 +651,19 @@ bool WebAssemblyFastISel::fastLowerArguments() {
     case MVT::i8:
     case MVT::i16:
     case MVT::i32:
-      Opc = WebAssembly::ARGUMENT_I32;
+      Opc = WebAssembly::ARGUMENT_i32;
       RC = &WebAssembly::I32RegClass;
       break;
     case MVT::i64:
-      Opc = WebAssembly::ARGUMENT_I64;
+      Opc = WebAssembly::ARGUMENT_i64;
       RC = &WebAssembly::I64RegClass;
       break;
     case MVT::f32:
-      Opc = WebAssembly::ARGUMENT_F32;
+      Opc = WebAssembly::ARGUMENT_f32;
       RC = &WebAssembly::F32RegClass;
       break;
     case MVT::f64:
-      Opc = WebAssembly::ARGUMENT_F64;
+      Opc = WebAssembly::ARGUMENT_f64;
       RC = &WebAssembly::F64RegClass;
       break;
     case MVT::v16i8:
@@ -686,7 +691,7 @@ bool WebAssemblyFastISel::fastLowerArguments() {
       RC = &WebAssembly::V128RegClass;
       break;
     case MVT::ExceptRef:
-      Opc = WebAssembly::ARGUMENT_EXCEPT_REF;
+      Opc = WebAssembly::ARGUMENT_ExceptRef;
       RC = &WebAssembly::EXCEPT_REFRegClass;
       break;
     default:
diff --git a/lib/Target/WebAssembly/WebAssemblyISD.def b/lib/Target/WebAssembly/WebAssemblyISD.def
index f326d37944f2fa6fef1e30032c973d7fd9dab5d6..444a087605ef6fa92c65d30fea03fa19450e995b 100644
--- a/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -22,12 +22,8 @@ HANDLE_NODETYPE(Wrapper)
 HANDLE_NODETYPE(BR_IF)
 HANDLE_NODETYPE(BR_TABLE)
 HANDLE_NODETYPE(SHUFFLE)
-HANDLE_NODETYPE(ANYTRUE)
-HANDLE_NODETYPE(ALLTRUE)
-HANDLE_NODETYPE(BITSELECT)
-HANDLE_NODETYPE(ADD_SAT_S)
-HANDLE_NODETYPE(ADD_SAT_U)
-HANDLE_NODETYPE(SUB_SAT_S)
-HANDLE_NODETYPE(SUB_SAT_U)
+HANDLE_NODETYPE(VEC_SHL)
+HANDLE_NODETYPE(VEC_SHR_S)
+HANDLE_NODETYPE(VEC_SHR_U)
 
 // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here...
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 4ecbf6d748767a1059a8ab14dbdc5aef06d4814d..578d23570f85c03073cba1bec336c1930fc2a4d9 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -48,6 +49,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
 
   // Booleans always contain 0 or 1.
   setBooleanContents(ZeroOrOneBooleanContent);
+  // Except in SIMD vectors
+  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
   // WebAssembly does not produce floating-point exceptions on normal floating
   // point operations.
   setHasFloatingPointExceptions(false);
@@ -103,9 +106,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     for (auto Op :
          {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT})
       setOperationAction(Op, T, Legal);
-    // Support minnan and maxnan, which otherwise default to expand.
-    setOperationAction(ISD::FMINNAN, T, Legal);
-    setOperationAction(ISD::FMAXNAN, T, Legal);
+    // Support minimum and maximum, which otherwise default to expand.
+    setOperationAction(ISD::FMINIMUM, T, Legal);
+    setOperationAction(ISD::FMAXIMUM, T, Legal);
     // WebAssembly currently has no builtin f16 support.
     setOperationAction(ISD::FP16_TO_FP, T, Expand);
     setOperationAction(ISD::FP_TO_FP16, T, Expand);
@@ -113,6 +116,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setTruncStoreAction(T, MVT::f16, Expand);
   }
 
+  // Support saturating add for i8x16 and i16x8
+  if (Subtarget->hasSIMD128())
+    for (auto T : {MVT::v16i8, MVT::v8i16})
+      for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
+        setOperationAction(Op, T, Legal);
+
   for (auto T : {MVT::i32, MVT::i64}) {
     // Expand unavailable integer operations.
     for (auto Op :
@@ -137,6 +146,25 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     }
   }
 
+  // Custom lowering since wasm shifts must have a scalar shift amount
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+      for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+        setOperationAction(Op, T, Custom);
+    if (EnableUnimplementedWasmSIMDInstrs)
+      for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+        setOperationAction(Op, MVT::v2i64, Custom);
+  }
+
+  // There is no select instruction for vectors
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+      setOperationAction(ISD::VSELECT, T, Expand);
+    if (EnableUnimplementedWasmSIMDInstrs)
+      for (auto T : {MVT::v2i64, MVT::v2f64})
+        setOperationAction(ISD::VSELECT, T, Expand);
+  }
+
   // As a special case, these operators use the type to mean the type to
   // sign-extend from.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
@@ -144,6 +172,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     for (auto T : {MVT::i8, MVT::i16, MVT::i32})
       setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
   }
+  for (auto T : MVT::integer_vector_valuetypes())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand);
 
   // Dynamic stack allocation: use the default expansion.
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
@@ -165,11 +195,38 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
   //  - Floating-point extending loads.
   //  - Floating-point truncating stores.
   //  - i1 extending loads.
+  //  - extending/truncating SIMD loads/stores
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   for (auto T : MVT::integer_valuetypes())
     for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
       setLoadExtAction(Ext, T, MVT::i1, Promote);
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32,
+                   MVT::v2f64}) {
+      for (auto MemT : MVT::vector_valuetypes()) {
+        if (MVT(T) != MemT) {
+          setTruncStoreAction(T, MemT, Expand);
+          for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD})
+            setLoadExtAction(Ext, T, MemT, Expand);
+        }
+      }
+    }
+  }
+
+  // Custom lower lane accesses to expand out variable indices
+  if (Subtarget->hasSIMD128()) {
+    for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32}) {
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+    }
+    if (EnableUnimplementedWasmSIMDInstrs) {
+      for (auto T : {MVT::v2i64, MVT::v2f64}) {
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+        setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+      }
+    }
+  }
 
   // Trap lowers to wasm unreachable
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -821,8 +878,15 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
     return LowerCopyToReg(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerAccessVectorElement(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    return LowerShift(Op, DAG);
   }
 }
 
@@ -966,47 +1030,17 @@ WebAssemblyTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   default:
     return {}; // Don't custom lower most intrinsics.
 
-  case Intrinsic::wasm_add_saturate_signed:
-  case Intrinsic::wasm_add_saturate_unsigned:
-  case Intrinsic::wasm_sub_saturate_signed:
-  case Intrinsic::wasm_sub_saturate_unsigned: {
-    unsigned OpCode;
-    switch (IntNo) {
-    case Intrinsic::wasm_add_saturate_signed:
-      OpCode = WebAssemblyISD::ADD_SAT_S;
-      break;
-    case Intrinsic::wasm_add_saturate_unsigned:
-      OpCode = WebAssemblyISD::ADD_SAT_U;
-      break;
-    case Intrinsic::wasm_sub_saturate_signed:
-      OpCode = WebAssemblyISD::SUB_SAT_S;
-      break;
-    case Intrinsic::wasm_sub_saturate_unsigned:
-      OpCode = WebAssemblyISD::SUB_SAT_U;
-      break;
-    default:
-      llvm_unreachable("unexpected intrinsic id");
-      break;
-    }
-    return DAG.getNode(OpCode, DL, Op.getValueType(), Op.getOperand(1),
-                       Op.getOperand(2));
-  }
-
-  case Intrinsic::wasm_bitselect:
-    return DAG.getNode(WebAssemblyISD::BITSELECT, DL, Op.getValueType(),
-                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
-  case Intrinsic::wasm_anytrue:
-  case Intrinsic::wasm_alltrue: {
-    unsigned OpCode = IntNo == Intrinsic::wasm_anytrue
-                          ? WebAssemblyISD::ANYTRUE
-                          : WebAssemblyISD::ALLTRUE;
-    return DAG.getNode(OpCode, DL, Op.getValueType(), Op.getOperand(1));
+  case Intrinsic::wasm_lsda: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    EVT VT = Op.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+    auto &Context = MF.getMMI().getContext();
+    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
+                                            Twine(MF.getFunctionNumber()));
+    return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT,
+                       DAG.getMCSymbol(S, PtrVT));
   }
-
-  case Intrinsic::wasm_lsda:
-    // TODO For now, just return 0 not to crash
-    return DAG.getConstant(0, DL, Op.getValueType());
   }
 }
 
@@ -1028,12 +1062,65 @@ WebAssemblyTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   // Expand mask indices to byte indices and materialize them as operands
   for (size_t I = 0, Lanes = Mask.size(); I < Lanes; ++I) {
     for (size_t J = 0; J < LaneBytes; ++J) {
-      Ops[OpIdx++] =
-          DAG.getConstant((uint64_t)Mask[I] * LaneBytes + J, DL, MVT::i32);
+      // Lower undefs (represented by -1 in mask) to zero
+      uint64_t ByteIndex =
+          Mask[I] == -1 ? 0 : (uint64_t)Mask[I] * LaneBytes + J;
+      Ops[OpIdx++] = DAG.getConstant(ByteIndex, DL, MVT::i32);
     }
   }
 
-  return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, MVT::v16i8, Ops);
+  return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
+}
+
+SDValue
+WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  // Allow constant lane indices, expand variable lane indices
+  SDNode *IdxNode = Op.getOperand(Op.getNumOperands() - 1).getNode();
+  if (isa<ConstantSDNode>(IdxNode) || IdxNode->isUndef())
+    return Op;
+  else
+    // Perform default expansion
+    return SDValue();
+}
+
+SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
+  // Only manually lower vector shifts
+  assert(Op.getSimpleValueType().isVector());
+
+  // Unroll non-splat vector shifts
+  BuildVectorSDNode *ShiftVec;
+  SDValue SplatVal;
+  if (!(ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) ||
+      !(SplatVal = ShiftVec->getSplatValue()))
+    return DAG.UnrollVectorOp(Op.getNode());
+
+  // All splats except i64x2 const splats are handled by patterns
+  ConstantSDNode *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
+  if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64)
+    return Op;
+
+  // i64x2 const splats are custom lowered to avoid unnecessary wraps
+  unsigned Opcode;
+  switch (Op.getOpcode()) {
+  case ISD::SHL:
+    Opcode = WebAssemblyISD::VEC_SHL;
+    break;
+  case ISD::SRA:
+    Opcode = WebAssemblyISD::VEC_SHR_S;
+    break;
+  case ISD::SRL:
+    Opcode = WebAssemblyISD::VEC_SHR_U;
+    break;
+  default:
+    llvm_unreachable("unexpected opcode");
+  }
+  APInt Shift = SplatConst->getAPIntValue().zextOrTrunc(32);
+  return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
+                     DAG.getConstant(Shift, DL, MVT::i32));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 7b22651ff6d395eb2903deecf5abee92c8034bc5..5182a58efc784839ed0216a613b7e44b8d947e29 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -99,6 +99,8 @@ private:
   SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
 };
 
 namespace WebAssembly {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index 9eff2cfde0ad2fce8e82bfedf946e62d2f5f67b8..f9d092e4b8a695cfac6ea5b015140d7f2cacd5ff 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -24,10 +24,8 @@ multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             Requires<[HasAtomics]>;
 }
 
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD_I32 : WebAssemblyLoad<I32, "i32.atomic.load", 0xfe10>;
 defm ATOMIC_LOAD_I64 : WebAssemblyLoad<I64, "i64.atomic.load", 0xfe11>;
-} // Defs = [ARGUMENTS]
 
 // Select loads with no constant offset.
 let Predicates = [HasAtomics] in {
@@ -62,13 +60,11 @@ def : LoadPatExternSymOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
 
 // Extending loads. Note that there are only zero-extending atomic loads, no
 // sign-extending loads.
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load8_u", 0xfe12>;
 defm ATOMIC_LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.atomic.load16_u", 0xfe13>;
 defm ATOMIC_LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load8_u", 0xfe14>;
 defm ATOMIC_LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load16_u", 0xfe15>;
 defm ATOMIC_LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.atomic.load32_u", 0xfe16>;
-} // Defs = [ARGUMENTS]
 
 // Fragments for extending loads. These are different from regular loads because
 // the SDNodes are derived from AtomicSDNode rather than LoadSDNode and
@@ -200,10 +196,8 @@ def : LoadPatExternSymOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
 // Atomic stores
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE_I32 : WebAssemblyStore<I32, "i32.atomic.store", 0xfe17>;
 defm ATOMIC_STORE_I64 : WebAssemblyStore<I64, "i64.atomic.store", 0xfe18>;
-} // Defs = [ARGUMENTS]
 
 // We need an 'atomic' version of store patterns because store and atomic_store
 // nodes have different operand orders:
@@ -263,13 +257,11 @@ def : AStorePatExternSymOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
 } // Predicates = [HasAtomics]
 
 // Truncating stores.
-let Defs = [ARGUMENTS] in {
 defm ATOMIC_STORE8_I32 : WebAssemblyStore<I32, "i32.atomic.store8", 0xfe19>;
 defm ATOMIC_STORE16_I32 : WebAssemblyStore<I32, "i32.atomic.store16", 0xfe1a>;
 defm ATOMIC_STORE8_I64 : WebAssemblyStore<I64, "i64.atomic.store8", 0xfe1b>;
 defm ATOMIC_STORE16_I64 : WebAssemblyStore<I64, "i64.atomic.store16", 0xfe1c>;
 defm ATOMIC_STORE32_I64 : WebAssemblyStore<I64, "i64.atomic.store32", 0xfe1d>;
-} // Defs = [ARGUMENTS]
 
 // Fragments for truncating stores.
 
@@ -341,8 +333,6 @@ def : AStorePatExternSymOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
 // Atomic binary read-modify-writes
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
@@ -430,7 +420,6 @@ defm ATOMIC_RMW16_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw16_u.xchg", 0xfe46>;
 defm ATOMIC_RMW32_U_XCHG_I64 :
   WebAssemblyBinRMW<I64, "i64.atomic.rmw32_u.xchg", 0xfe47>;
-}
 
 // Select binary RMWs with no constant offset.
 class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -674,8 +663,6 @@ defm : BinRMWTruncExtPattern<
 // Consider adding a pass after instruction selection that optimizes this case
 // if it is frequent.
 
-let Defs = [ARGUMENTS] in {
-
 multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string Name, int Opcode> {
   defm "" : I<(outs rc:$dst),
               (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
@@ -699,7 +686,6 @@ defm ATOMIC_RMW16_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw16_u.cmpxchg", 0xfe4d>;
 defm ATOMIC_RMW32_U_CMPXCHG_I64 :
   WebAssemblyTerRMW<I64, "i64.atomic.rmw32_u.cmpxchg", 0xfe4e>;
-}
 
 // Select ternary RMWs with no constant offset.
 class TerRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
@@ -912,7 +898,6 @@ defm : TerRMWTruncExtPattern<
 // Atomic wait / notify
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
 let hasSideEffects = 1 in {
 defm ATOMIC_NOTIFY :
   I<(outs I32:$dst),
@@ -935,7 +920,6 @@ defm ATOMIC_WAIT_I64 :
     "i64.atomic.wait \t${off}, ${p2align}", 0xfe02>;
 } // mayLoad = 1
 } // hasSideEffects = 1
-} // Defs = [ARGUMENTS]
 
 let Predicates = [HasAtomics] in {
 // Select notifys with no constant offset.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 3c9caa3f0ded1d520fa654bc223c504e1903fb6a..07839b7901142e27df529bc6cc3242800d8c5b73 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -15,8 +15,6 @@
 // TODO: addr64: These currently assume the callee address is 32-bit.
 // FIXME: add $type to first call_indirect asmstr (and maybe $flags)
 
-let Defs = [ARGUMENTS] in {
-
 // Call sequence markers. These have an immediate which represents the amount of
 // stack space to allocate or free, which is used for varargs lowering.
 let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
@@ -118,8 +116,6 @@ let Uses = [SP32, SP64], isCall = 1 in {
                               0x11>;
 } // Uses = [SP32,SP64], isCall = 1
 
-} // Defs = [ARGUMENTS]
-
 // Patterns for matching a direct call to a global address.
 def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
           (CALL_I32 tglobaladdr:$callee)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index e27d81937dd370de89632504034c0417b4e46f3b..0af94ef875518d0484e6e2671b09d9dfb53a269b 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -12,8 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
 // The condition operand is a boolean value which WebAssembly represents as i32.
 defm BR_IF : I<(outs), (ins bb_op:$dst, I32:$cond),
@@ -30,15 +28,11 @@ defm BR   : NRI<(outs), (ins bb_op:$dst),
 } // isBarrier = 1
 } // isBranch = 1, isTerminator = 1, hasCtrlDep = 1
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst),
           (BR_IF bb_op:$dst, I32:$cond)>;
 def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst),
           (BR_UNLESS bb_op:$dst, I32:$cond)>;
 
-let Defs = [ARGUMENTS] in {
-
 // TODO: SelectionDAG's lowering insists on using a pointer as the index for
 // jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
 // currently.
@@ -49,27 +43,29 @@ let Defs = [ARGUMENTS] in {
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 let isCodeGenOnly = 1 in
 def BR_TABLE_I32 : NI<(outs), (ins I32:$index, variable_ops),
-                      [(WebAssemblybr_table I32:$index)], 0,
+                      [(WebAssemblybr_table I32:$index)], "false",
                       "br_table \t$index", 0x0e> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
+let BaseName = "BR_TABLE_I32" in
 def BR_TABLE_I32_S : NI<(outs), (ins variable_ops),
-                        [], 1,
-                        "br_table", 0x0e> {
+                        [], "true",
+                        "br_table \t", 0x0e> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
 let isCodeGenOnly = 1 in
 def BR_TABLE_I64 : NI<(outs), (ins I64:$index, variable_ops),
-                      [(WebAssemblybr_table I64:$index)], 0,
+                      [(WebAssemblybr_table I64:$index)], "false",
                       "br_table \t$index"> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
+let BaseName = "BR_TABLE_I64" in
 def BR_TABLE_I64_S : NI<(outs), (ins variable_ops),
-                        [], 1,
-                        "br_table"> {
+                        [], "true",
+                        "br_table \t"> {
   let TSFlags{0} = 1;
   let TSFlags{1} = 1;
 }
@@ -194,5 +190,3 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
                    [(catchret bb:$dst, bb:$from)], "", 0>;
 }
 }
-
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
index c89c1b549816549b839aa7a9b117bda458b1ac39..0d772c743a759a7cea874027990fdd49394661e5 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrConv.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrConv.td
@@ -13,8 +13,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 defm I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                       [(set I32:$dst, (trunc I64:$src))],
                       "i32.wrap/i64\t$dst, $src", "i32.wrap/i64", 0xa7>;
@@ -51,15 +49,11 @@ defm I64_EXTEND32_S_I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins),
                             0xc4>;
 } // Predicates = [HasSignExt]
 
-} // defs = [ARGUMENTS]
-
 // Expand a "don't care" extend into zero-extend (chosen over sign-extend
 // somewhat arbitrarily, although it favors popular hardware architectures
 // and is conceptually a simpler operation).
 def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 // Conversion from floating point to integer instructions which don't trap on
 // overflow or invalid.
 defm I32_TRUNC_S_SAT_F32 : I<(outs I32:$dst), (ins F32:$src), (outs), (ins),
@@ -103,6 +97,24 @@ defm I64_TRUNC_U_SAT_F64 : I<(outs I64:$dst), (ins F64:$src), (outs), (ins),
                              "i64.trunc_u:sat/f64", 0xfc07>,
                              Requires<[HasNontrappingFPToInt]>;
 
+// Lower llvm.wasm.trunc.saturate.* to saturating instructions
+def : Pat<(int_wasm_trunc_saturate_signed F32:$src),
+          (I32_TRUNC_S_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src),
+          (I32_TRUNC_U_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_signed F64:$src),
+          (I32_TRUNC_S_SAT_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src),
+          (I32_TRUNC_U_SAT_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_saturate_signed F32:$src),
+          (I64_TRUNC_S_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F32:$src),
+          (I64_TRUNC_U_SAT_F32 F32:$src)>;
+def : Pat<(int_wasm_trunc_saturate_signed F64:$src),
+          (I64_TRUNC_S_SAT_F64 F64:$src)>;
+def : Pat<(int_wasm_trunc_saturate_unsigned F64:$src),
+          (I64_TRUNC_U_SAT_F64 F64:$src)>;
+
 // Conversion from floating point to integer pseudo-instructions which don't
 // trap on overflow or invalid.
 let usesCustomInserter = 1, isCodeGenOnly = 1 in {
@@ -218,5 +230,3 @@ defm F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), (outs), (ins),
                              [(set F64:$dst, (bitconvert I64:$src))],
                              "f64.reinterpret/i64\t$dst, $src",
                              "f64.reinterpret/i64", 0xbf>;
-
-} // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
index 41b39f69e51c460371d050e54c7f53735db76895..a251d60b89ee283cd541b2f96f41aca16c69666e 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrExceptRef.td
@@ -12,8 +12,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            (ins EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond),
                            (outs), (ins),
@@ -23,8 +21,6 @@ defm SELECT_EXCEPT_REF : I<(outs EXCEPT_REF:$dst),
                            "except_ref.select\t$dst, $lhs, $rhs, $cond",
                            "except_ref.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(select (i32 (setne I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
           (SELECT_EXCEPT_REF EXCEPT_REF:$lhs, EXCEPT_REF:$rhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), EXCEPT_REF:$lhs, EXCEPT_REF:$rhs),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 70e27df27e66cc66ad9b60a640be3e015f7cba23..c5290f00b431b520818df570ed2bc5414b6d4d33 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -45,8 +45,6 @@ multiclass ComparisonFP<CondCode cond, string name, bits<32> f32Inst, bits<32> f
                 !strconcat("f64.", name), f64Inst>;
 }
 
-let Defs = [ARGUMENTS] in {
-
 let isCommutable = 1 in
 defm ADD : BinaryFP<fadd, "add ", 0x92, 0xa0>;
 defm SUB : BinaryFP<fsub, "sub ", 0x93, 0xa1>;
@@ -60,8 +58,8 @@ defm NEG : UnaryFP<fneg, "neg ", 0x8c, 0x9a>;
 defm COPYSIGN : BinaryFP<fcopysign, "copysign", 0x98, 0xa6>;
 
 let isCommutable = 1 in {
-defm MIN : BinaryFP<fminnan, "min ", 0x96, 0xa4>;
-defm MAX : BinaryFP<fmaxnan, "max ", 0x97, 0xa5>;
+defm MIN : BinaryFP<fminimum, "min ", 0x96, 0xa4>;
+defm MAX : BinaryFP<fmaximum, "max ", 0x97, 0xa5>;
 } // isCommutable = 1
 
 defm CEIL : UnaryFP<fceil, "ceil", 0x8d, 0x9b>;
@@ -69,8 +67,6 @@ defm FLOOR : UnaryFP<ffloor, "floor", 0x8e, 0x9c>;
 defm TRUNC : UnaryFP<ftrunc, "trunc", 0x8f, 0x9d>;
 defm NEAREST : UnaryFP<fnearbyint, "nearest", 0x90, 0x9e>;
 
-} // Defs = [ARGUMENTS]
-
 // DAGCombine oddly folds casts into the rhs of copysign. Unfold them.
 def : Pat<(fcopysign F64:$lhs, F32:$rhs),
           (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>;
@@ -81,8 +77,6 @@ def : Pat<(fcopysign F32:$lhs, F64:$rhs),
 def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>;
 def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>;
 
-let Defs = [ARGUMENTS] in {
-
 let isCommutable = 1 in {
 defm EQ : ComparisonFP<SETOEQ, "eq  ", 0x5b, 0x61>;
 defm NE : ComparisonFP<SETUNE, "ne  ", 0x5c, 0x62>;
@@ -92,8 +86,6 @@ defm LE : ComparisonFP<SETOLE, "le  ", 0x5f, 0x65>;
 defm GT : ComparisonFP<SETOGT, "gt  ", 0x5e, 0x64>;
 defm GE : ComparisonFP<SETOGE, "ge  ", 0x60, 0x66>;
 
-} // Defs = [ARGUMENTS]
-
 // Don't care floating-point comparisons, supported via other comparisons.
 def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>;
 def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>;
@@ -108,8 +100,6 @@ def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>;
 def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>;
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))],
@@ -119,8 +109,6 @@ defm SELECT_F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs, I32:$cond),
                     [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))],
                     "f64.select\t$dst, $lhs, $rhs, $cond", "f64.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
@@ -134,3 +122,10 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs),
           (SELECT_F32 F32:$rhs, F32:$lhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs),
           (SELECT_F64 F64:$rhs, F64:$lhs, I32:$cond)>;
+
+// The legalizer inserts an unnecessary `and 1` to make input conform
+// to getBooleanContents, which we can lower away.
+def : Pat<(select (i32 (and I32:$cond, 1)), F32:$lhs, F32:$rhs),
+          (SELECT_F32 F32:$lhs, F32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (and I32:$cond, 1)), F64:$lhs, F64:$rhs),
+          (SELECT_F64 F64:$lhs, F64:$rhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 683fb3d981ff80bf182fae8ece74f425e3e966d9..97583ea0e6ac05cf08b94e83c6ace23a1ef81b38 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -15,21 +15,24 @@
 // WebAssembly Instruction Format.
 // We instantiate 2 of these for every actual instruction (register based
 // and stack based), see below.
-class WebAssemblyInst<bits<32> inst, string asmstr, bit stack> : Instruction {
-  field bits<32> Inst = inst; // Instruction encoding.
-  field bit StackBased = stack;
+class WebAssemblyInst<bits<32> inst, string asmstr, string stack> : StackRel,
+  Instruction {
+  bits<32> Inst = inst; // Instruction encoding.
+  string StackBased = stack;
+  string BaseName = NAME;
   let Namespace   = "WebAssembly";
   let Pattern     = [];
   let AsmString   = asmstr;
 }
 
 // Normal instructions. Default instantiation of a WebAssemblyInst.
-class NI<dag oops, dag iops, list<dag> pattern, bit stack, string asmstr = "",
-         bits<32> inst = -1>
+class NI<dag oops, dag iops, list<dag> pattern, string stack,
+         string asmstr = "", bits<32> inst = -1>
     : WebAssemblyInst<inst, asmstr, stack> {
   dag OutOperandList = oops;
   dag InOperandList  = iops;
   let Pattern        = pattern;
+  let Defs           = [ARGUMENTS];
 }
 
 // Generates both register and stack based versions of one actual instruction.
@@ -49,8 +52,9 @@ multiclass I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
              list<dag> pattern_r, string asmstr_r = "", string asmstr_s = "",
              bits<32> inst = -1> {
   let isCodeGenOnly = 1 in
-  def "" : NI<oops_r, iops_r, pattern_r, 0, asmstr_r, inst>;
-  def _S : NI<oops_s, iops_s, [], 1, asmstr_s, inst>;
+  def "" : NI<oops_r, iops_r, pattern_r, "false", asmstr_r, inst>;
+  let BaseName = NAME in
+  def _S : NI<oops_s, iops_s, [], "true", asmstr_s, inst>;
 }
 
 // For instructions that have no register ops, so both sets are the same.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index a1e0516a53bf211a4f4490b34ce0fde900e06d03..5efff32d616768d1abfc4c2cfdcaadc0d58a89db 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -70,6 +70,8 @@ void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     CopyOpcode = WebAssembly::COPY_F32;
   else if (RC == &WebAssembly::F64RegClass)
     CopyOpcode = WebAssembly::COPY_F64;
+  else if (RC == &WebAssembly::V128RegClass)
+    CopyOpcode = WebAssembly::COPY_V128;
   else
     llvm_unreachable("Unexpected register class");
 
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a2ea14cc28b94e97dfe0cdb5dfd6a6b38abe8e05..8fff924265ff3133c99054732d305c45ee137460 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -153,6 +153,19 @@ def TypeIndex : Operand<i32>;
 
 } // OperandNamespace = "WebAssembly"
 
+//===----------------------------------------------------------------------===//
+// WebAssembly Register to Stack instruction mapping
+//===----------------------------------------------------------------------===//
+
+class StackRel;
+def getStackOpcode : InstrMapping {
+  let FilterClass = "StackRel";
+  let RowFields = ["BaseName"];
+  let ColFields = ["StackBased"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
 //===----------------------------------------------------------------------===//
 // WebAssembly Instruction Format Definitions.
 //===----------------------------------------------------------------------===//
@@ -163,19 +176,18 @@ include "WebAssemblyInstrFormats.td"
 // Additional instructions.
 //===----------------------------------------------------------------------===//
 
-multiclass ARGUMENT<WebAssemblyRegClass vt> {
-  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
-  defm ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
-                        (outs), (ins i32imm:$argno),
-                        [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
+multiclass ARGUMENT<WebAssemblyRegClass reg, ValueType vt> {
+  let hasSideEffects = 1, isCodeGenOnly = 1,
+      Defs = []<Register>, Uses = [ARGUMENTS] in
+  defm ARGUMENT_#vt :
+    I<(outs reg:$res), (ins i32imm:$argno), (outs), (ins i32imm:$argno),
+      [(set (vt reg:$res), (WebAssemblyargument timm:$argno))]>;
 }
-defm "": ARGUMENT<I32>;
-defm "": ARGUMENT<I64>;
-defm "": ARGUMENT<F32>;
-defm "": ARGUMENT<F64>;
-defm "": ARGUMENT<EXCEPT_REF>;
-
-let Defs = [ARGUMENTS] in {
+defm "": ARGUMENT<I32, i32>;
+defm "": ARGUMENT<I64, i64>;
+defm "": ARGUMENT<F32, f32>;
+defm "": ARGUMENT<F64, f64>;
+defm "": ARGUMENT<EXCEPT_REF, ExceptRef>;
 
 // get_local and set_local are not generated by instruction selection; they
 // are implied by virtual register uses and defs.
@@ -266,12 +278,12 @@ defm CONST_F64 : I<(outs F64:$res), (ins f64imm_op:$imm),
                    "f64.const\t$res, $imm", "f64.const\t$imm", 0x44>;
 } // isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$addr)),
           (CONST_I32 tglobaladdr:$addr)>;
 def : Pat<(i32 (WebAssemblywrapper texternalsym:$addr)),
           (CONST_I32 texternalsym:$addr)>;
+def : Pat<(i32 (WebAssemblywrapper mcsym:$sym)), (CONST_I32 mcsym:$sym)>;
+def : Pat<(i64 (WebAssemblywrapper mcsym:$sym)), (CONST_I64 mcsym:$sym)>;
 
 //===----------------------------------------------------------------------===//
 // Additional sets of instructions.
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 44c93de54aa452e8488e8ad02ef65e9fd5830605..d5b63d64369735be6cc4459e983500bc95acd4d8 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -45,9 +45,6 @@ multiclass ComparisonInt<CondCode cond, string name, bits<32> i32Inst, bits<32>
                 !strconcat("i64.", name), i64Inst>;
 }
 
-
-let Defs = [ARGUMENTS] in {
-
 // The spaces after the names are for aesthetic purposes only, to make
 // operands line up vertically after tab expansion.
 let isCommutable = 1 in
@@ -97,16 +94,12 @@ defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins),
                  [(set I32:$dst, (setcc I64:$src, 0, SETEQ))],
                  "i64.eqz \t$dst, $src", "i64.eqz", 0x50>;
 
-} // Defs = [ARGUMENTS]
-
 // Optimize away an explicit mask on a rotate count.
 def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>;
 def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>;
 def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>;
 
-let Defs = [ARGUMENTS] in {
-
 defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond),
                     (outs), (ins),
                     [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))],
@@ -116,8 +109,6 @@ defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond),
                     [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))],
                     "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>;
 
-} // Defs = [ARGUMENTS]
-
 // ISD::SELECT requires its operand to conform to getBooleanContents, but
 // WebAssembly's select interprets any non-zero value as true, so we can fold
 // a setne with 0 into a select.
@@ -131,3 +122,10 @@ def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs),
           (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>;
 def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs),
           (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>;
+
+// The legalizer inserts an unnecessary `and 1` to make input conform
+// to getBooleanContents, which we can lower away.
+def : Pat<(select (i32 (and I32:$cond, 1)), I32:$lhs, I32:$rhs),
+          (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>;
+def : Pat<(select (i32 (and I32:$cond, 1)), I64:$lhs, I64:$rhs),
+          (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index 76ef1461d22a6ddc02d170c44ac5959129890f45..ccc331d1bf03203f5cbde16a1a9042dc52123a9b 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -53,8 +53,6 @@ def regPlusGA : PatFrag<(ops node:$addr, node:$off),
 // We don't need a regPlusES because external symbols never have constant
 // offsets folded into them, so we can just use add.
 
-let Defs = [ARGUMENTS] in {
-
 // Defines atomic and non-atomic loads, regular and extending.
 multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayLoad = 1 in
@@ -73,8 +71,6 @@ defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
 defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
 defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
 
-} // Defs = [ARGUMENTS]
-
 // Select loads with no constant offset.
 class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
   Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
@@ -144,8 +140,6 @@ def : LoadPatExternSymOffOnly<i64, load, LOAD_I64>;
 def : LoadPatExternSymOffOnly<f32, load, LOAD_F32>;
 def : LoadPatExternSymOffOnly<f64, load, LOAD_F64>;
 
-let Defs = [ARGUMENTS] in {
-
 // Extending load.
 defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
 defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
@@ -158,8 +152,6 @@ defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
 defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
 defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
 
-} // Defs = [ARGUMENTS]
-
 // Select extending loads with no constant offset.
 def : LoadPatNoOffset<i32, sextloadi8, LOAD8_S_I32>;
 def : LoadPatNoOffset<i32, zextloadi8, LOAD8_U_I32>;
@@ -303,9 +295,6 @@ def : LoadPatExternSymOffOnly<i64, extloadi8, LOAD8_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi16, LOAD16_U_I64>;
 def : LoadPatExternSymOffOnly<i64, extloadi32, LOAD32_U_I64>;
 
-
-let Defs = [ARGUMENTS] in {
-
 // Defines atomic and non-atomic stores, regular and truncating
 multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
   let mayStore = 1 in
@@ -323,8 +312,6 @@ defm STORE_I64  : WebAssemblyStore<I64, "i64.store", 0x37>;
 defm STORE_F32  : WebAssemblyStore<F32, "f32.store", 0x38>;
 defm STORE_F64  : WebAssemblyStore<F64, "f64.store", 0x39>;
 
-} // Defs = [ARGUMENTS]
-
 // Select stores with no constant offset.
 class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
   Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
@@ -389,9 +376,6 @@ def : StorePatExternSymOffOnly<i64, store, STORE_I64>;
 def : StorePatExternSymOffOnly<f32, store, STORE_F32>;
 def : StorePatExternSymOffOnly<f64, store, STORE_F64>;
 
-
-let Defs = [ARGUMENTS] in {
-
 // Truncating store.
 defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
 defm STORE16_I32 : WebAssemblyStore<I32, "i32.store16", 0x3b>;
@@ -399,8 +383,6 @@ defm STORE8_I64 : WebAssemblyStore<I64, "i64.store8", 0x3c>;
 defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
 defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
 
-} // Defs = [ARGUMENTS]
-
 // Select truncating stores with no constant offset.
 def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
 def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
@@ -448,8 +430,6 @@ def : StorePatExternSymOffOnly<i64, truncstorei8, STORE8_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei16, STORE16_I64>;
 def : StorePatExternSymOffOnly<i64, truncstorei32, STORE32_I64>;
 
-let Defs = [ARGUMENTS] in {
-
 // Current memory size.
 defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
                          (outs), (ins i32imm:$flags),
@@ -493,8 +473,6 @@ defm GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
                          0x40>,
                        Requires<[HasAddr32]>;
 
-} // Defs = [ARGUMENTS]
-
 def : Pat<(int_wasm_current_memory),
           (CURRENT_MEMORY_I32 0)>;
 def : Pat<(int_wasm_grow_memory I32:$delta),
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 28262fbcaf67cfca087c815358a58c9876996a31..caad638e9e387107485f5e7baa249da8c58127ee 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -21,24 +21,18 @@ multiclass SIMD_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
             Requires<[HasSIMD128]>;
 }
 
-multiclass SIMD_ARGUMENT<ValueType vt> {
-  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
-  defm ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
-                             (outs), (ins i32imm:$argno),
-                             [(set (vt V128:$res),
-                                  (WebAssemblyargument timm:$argno))]>;
-}
-
-defm "": SIMD_ARGUMENT<v16i8>;
-defm "": SIMD_ARGUMENT<v8i16>;
-defm "": SIMD_ARGUMENT<v4i32>;
-defm "": SIMD_ARGUMENT<v2i64>;
-defm "": SIMD_ARGUMENT<v4f32>;
-defm "": SIMD_ARGUMENT<v2f64>;
+defm "" : ARGUMENT<V128, v16i8>;
+defm "" : ARGUMENT<V128, v8i16>;
+defm "" : ARGUMENT<V128, v4i32>;
+defm "" : ARGUMENT<V128, v2i64>;
+defm "" : ARGUMENT<V128, v4f32>;
+defm "" : ARGUMENT<V128, v2f64>;
 
 // Constrained immediate argument types
 foreach SIZE = [8, 16] in
-def ImmI#SIZE : ImmLeaf<i32, "return (Imm & ((1UL << "#SIZE#") - 1)) == Imm;">;
+def ImmI#SIZE : ImmLeaf<i32,
+  "return ((uint64_t)Imm & ((1UL << "#SIZE#") - 1)) == (uint64_t)Imm;"
+>;
 foreach SIZE = [2, 4, 8, 16, 32] in
 def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
 
@@ -55,7 +49,6 @@ multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
                                   "v128.const\t"#args, 0>;
 }
 
-let Defs = [ARGUMENTS] in {
 defm "" : ConstVec<v16i8,
                    (ins vec_i8imm_op:$i0, vec_i8imm_op:$i1,
                         vec_i8imm_op:$i2, vec_i8imm_op:$i3,
@@ -100,7 +93,6 @@ defm "" : ConstVec<v2f64,
                   (ins f64imm_op:$i0, f64imm_op:$i1),
                   (build_vector (f64 fpimm:$i0), (f64 fpimm:$i1)),
                   "$i0, $i1">;
-} // Defs = [ARGUMENTS]
 
 // Create vector with identical lanes: splat
 def splat2 : PatFrag<(ops node:$x), (build_vector node:$x, node:$x)>;
@@ -189,6 +181,28 @@ def : Pat<(i32 (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx))),
 def : Pat<(i32 (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx))),
           (EXTRACT_LANE_v8i16_u V128:$vec, (i32 LaneIdx8:$idx))>;
 
+// Lower undef lane indices to zero
+def : Pat<(and (i32 (vector_extract (v16i8 V128:$vec), undef)), (i32 0xff)),
+          (EXTRACT_LANE_v16i8_u V128:$vec, 0)>;
+def : Pat<(and (i32 (vector_extract (v8i16 V128:$vec), undef)), (i32 0xffff)),
+          (EXTRACT_LANE_v8i16_u V128:$vec, 0)>;
+def : Pat<(i32 (vector_extract (v16i8 V128:$vec), undef)),
+          (EXTRACT_LANE_v16i8_u V128:$vec, 0)>;
+def : Pat<(i32 (vector_extract (v8i16 V128:$vec), undef)),
+          (EXTRACT_LANE_v8i16_u V128:$vec, 0)>;
+def : Pat<(sext_inreg (i32 (vector_extract (v16i8 V128:$vec), undef)), i8),
+          (EXTRACT_LANE_v16i8_s V128:$vec, 0)>;
+def : Pat<(sext_inreg (i32 (vector_extract (v8i16 V128:$vec), undef)), i16),
+          (EXTRACT_LANE_v8i16_s V128:$vec, 0)>;
+def : Pat<(vector_extract (v4i32 V128:$vec), undef),
+          (EXTRACT_LANE_v4i32 V128:$vec, 0)>;
+def : Pat<(vector_extract (v2i64 V128:$vec), undef),
+          (EXTRACT_LANE_v2i64 V128:$vec, 0)>;
+def : Pat<(vector_extract (v4f32 V128:$vec), undef),
+          (EXTRACT_LANE_v4f32 V128:$vec, 0)>;
+def : Pat<(vector_extract (v2f64 V128:$vec), undef),
+          (EXTRACT_LANE_v2f64 V128:$vec, 0)>;
+
 // Replace lane value: replace_lane
 multiclass ReplaceLane<ValueType vec_t, string vec, ImmLeaf imm_t,
                        WebAssemblyRegClass reg_t, ValueType lane_t,
@@ -209,6 +223,20 @@ defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 20>;
 defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 21>;
 defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 22>;
 
+// Lower undef lane indices to zero
+def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
+          (REPLACE_LANE_v16i8 V128:$vec, 0, I32:$x)>;
+def : Pat<(vector_insert (v8i16 V128:$vec), I32:$x, undef),
+          (REPLACE_LANE_v8i16 V128:$vec, 0, I32:$x)>;
+def : Pat<(vector_insert (v4i32 V128:$vec), I32:$x, undef),
+          (REPLACE_LANE_v4i32 V128:$vec, 0, I32:$x)>;
+def : Pat<(vector_insert (v2i64 V128:$vec), I64:$x, undef),
+          (REPLACE_LANE_v2i64 V128:$vec, 0, I64:$x)>;
+def : Pat<(vector_insert (v4f32 V128:$vec), F32:$x, undef),
+          (REPLACE_LANE_v4f32 V128:$vec, 0, F32:$x)>;
+def : Pat<(vector_insert (v2f64 V128:$vec), F64:$x, undef),
+          (REPLACE_LANE_v2f64 V128:$vec, 0, F64:$x)>;
+
 // Arbitrary other BUILD_VECTOR patterns
 def : Pat<(v16i8 (build_vector
             (i32 I32:$x0), (i32 I32:$x1), (i32 I32:$x2), (i32 I32:$x3),
@@ -322,7 +350,7 @@ def : Pat<(v2f64 (build_vector (f64 F64:$x0), (f64 F64:$x1))),
             (v2f64 (SPLAT_v2f64 (f64 F64:$x0))), 1, F64:$x1))>;
 
 // Shuffle lanes: shuffle
-defm SHUFFLE_v16i8 :
+defm SHUFFLE :
   SIMD_I<(outs V128:$dst),
          (ins V128:$x, V128:$y,
            vec_i8imm_op:$m0, vec_i8imm_op:$m1,
@@ -356,7 +384,7 @@ defm SHUFFLE_v16i8 :
 def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
 def wasm_shuffle : SDNode<"WebAssemblyISD::SHUFFLE", wasm_shuffle_t>;
 foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-def : Pat<(v16i8 (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
+def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
             (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
             (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
             (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
@@ -365,7 +393,7 @@ def : Pat<(v16i8 (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
             (i32 LaneIdx32:$mA), (i32 LaneIdx32:$mB),
             (i32 LaneIdx32:$mC), (i32 LaneIdx32:$mD),
             (i32 LaneIdx32:$mE), (i32 LaneIdx32:$mF))),
-          (v16i8 (SHUFFLE_v16i8 (vec_t V128:$x), (vec_t V128:$y),
+          (vec_t (SHUFFLE (vec_t V128:$x), (vec_t V128:$y),
             (i32 LaneIdx32:$m0), (i32 LaneIdx32:$m1),
             (i32 LaneIdx32:$m2), (i32 LaneIdx32:$m3),
             (i32 LaneIdx32:$m4), (i32 LaneIdx32:$m5),
@@ -384,7 +412,9 @@ multiclass SIMDBinary<ValueType vec_t, string vec, SDNode node, string name,
                       bits<32> simdop> {
   defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
                         (outs), (ins),
-                        [(set (vec_t V128:$dst), (node V128:$lhs, V128:$rhs))],
+                        [(set (vec_t V128:$dst),
+                          (node (vec_t V128:$lhs), (vec_t V128:$rhs))
+                        )],
                         vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name,
                         simdop>;
 }
@@ -436,23 +466,19 @@ multiclass SIMDBinarySat<SDNode node, string name, bits<32> baseInst> {
   defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 2)>;
 }
 
-def wasm_saturate_t : SDTypeProfile<1, 2,
-  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]
->;
-def wasm_add_sat_s : SDNode<"WebAssemblyISD::ADD_SAT_S", wasm_saturate_t>;
-def wasm_add_sat_u : SDNode<"WebAssemblyISD::ADD_SAT_U", wasm_saturate_t>;
-def wasm_sub_sat_s : SDNode<"WebAssemblyISD::SUB_SAT_S", wasm_saturate_t>;
-def wasm_sub_sat_u : SDNode<"WebAssemblyISD::SUB_SAT_U", wasm_saturate_t>;
-
 // Saturating integer addition: add_saturate_s / add_saturate_u
 let isCommutable = 1 in {
-defm ADD_SAT_S : SIMDBinarySat<wasm_add_sat_s, "add_saturate_s", 40>;
-defm ADD_SAT_U : SIMDBinarySat<wasm_add_sat_u, "add_saturate_u", 41>;
+defm ADD_SAT_S :
+  SIMDBinarySat<saddsat, "add_saturate_s", 40>;
+defm ADD_SAT_U :
+  SIMDBinarySat<uaddsat, "add_saturate_u", 41>;
 } // isCommutable = 1
 
 // Saturating integer subtraction: sub_saturate_s / sub_saturate_u
-defm SUB_SAT_S : SIMDBinarySat<wasm_sub_sat_s, "sub_saturate_s", 44>;
-defm SUB_SAT_U : SIMDBinarySat<wasm_sub_sat_u, "sub_saturate_u", 45>;
+defm SUB_SAT_S :
+  SIMDBinarySat<int_wasm_sub_saturate_signed, "sub_saturate_s", 44>;
+defm SUB_SAT_U :
+  SIMDBinarySat<int_wasm_sub_saturate_unsigned, "sub_saturate_u", 45>;
 
 //===----------------------------------------------------------------------===//
 // Bit shifts
@@ -489,6 +515,19 @@ foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in
 def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))),
           (v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>;
 
+// 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping
+def wasm_shift_t : SDTypeProfile<1, 2,
+  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]
+>;
+def wasm_shl : SDNode<"WebAssemblyISD::VEC_SHL", wasm_shift_t>;
+def wasm_shr_s : SDNode<"WebAssemblyISD::VEC_SHR_S", wasm_shift_t>;
+def wasm_shr_u : SDNode<"WebAssemblyISD::VEC_SHR_U", wasm_shift_t>;
+foreach shifts = [[wasm_shl, SHL_v2i64],
+                  [wasm_shr_s, SHR_S_v2i64],
+                  [wasm_shr_u, SHR_U_v2i64]] in
+def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)),
+          (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>;
+
 //===----------------------------------------------------------------------===//
 // Bitwise operations
 //===----------------------------------------------------------------------===//
@@ -508,35 +547,23 @@ defm XOR : SIMDBitwise<xor, "xor", 62>;
 } // isCommutable = 1
 
 // Bitwise logic: v128.not
-multiclass SIMDNot<ValueType vec_t, PatFrag splat_pat, ValueType lane_t> {
-  defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec),
-                           (outs), (ins),
-                           [(set
-                             (vec_t V128:$dst),
-                             (vec_t (xor
-                               (vec_t V128:$vec),
-                               (vec_t (splat_pat (lane_t -1)))
-                             ))
-                           )],
+multiclass SIMDNot<ValueType vec_t> {
+  defm NOT_#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec), (outs), (ins),
+                           [(set (vec_t V128:$dst), (vec_t (vnot V128:$vec)))],
                            "v128.not\t$dst, $vec", "v128.not", 63>;
 }
 
-defm "" : SIMDNot<v16i8, splat16, i32>;
-defm "" : SIMDNot<v8i16, splat8, i32>;
-defm "" : SIMDNot<v4i32, splat4, i32>;
-defm "" : SIMDNot<v2i64, splat2, i64>;
+defm "" : SIMDNot<v16i8>;
+defm "" : SIMDNot<v8i16>;
+defm "" : SIMDNot<v4i32>;
+defm "" : SIMDNot<v2i64>;
 
 // Bitwise select: v128.bitselect
-def wasm_bitselect_t : SDTypeProfile<1, 3,
-  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]
->;
-def wasm_bitselect : SDNode<"WebAssemblyISD::BITSELECT", wasm_bitselect_t>;
-
 multiclass Bitselect<ValueType vec_t> {
   defm BITSELECT_#vec_t :
     SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins),
            [(set (vec_t V128:$dst),
-             (vec_t (wasm_bitselect
+             (vec_t (int_wasm_bitselect
                (vec_t V128:$c), (vec_t V128:$v1), (vec_t V128:$v2)
              ))
            )],
@@ -571,15 +598,11 @@ multiclass SIMDReduce<string name, SDNode op, bits<32> baseInst> {
   defm "" : SIMDReduceVec<v2i64, "i64x2", name, op, !add(baseInst, 3)>;
 }
 
-def wasm_reduce_t : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>;
-
 // Any lane true: any_true
-def wasm_anytrue : SDNode<"WebAssemblyISD::ANYTRUE", wasm_reduce_t>;
-defm ANYTRUE : SIMDReduce<"any_true", wasm_anytrue, 65>;
+defm ANYTRUE : SIMDReduce<"any_true", int_wasm_anytrue, 65>;
 
 // All lanes true: all_true
-def wasm_alltrue : SDNode<"WebAssemblyISD::ALLTRUE", wasm_reduce_t>;
-defm ALLTRUE : SIMDReduce<"all_true", wasm_alltrue, 69>;
+defm ALLTRUE : SIMDReduce<"all_true", int_wasm_alltrue, 69>;
 
 //===----------------------------------------------------------------------===//
 // Comparisons
@@ -590,7 +613,8 @@ multiclass SIMDCondition<ValueType vec_t, ValueType out_t, string vec,
   defm _#vec_t :
     SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
            [(set (out_t V128:$dst),
-             (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond))],
+             (setcc (vec_t V128:$lhs), (vec_t V128:$rhs), cond)
+           )],
            vec#"."#name#"\t$dst, $lhs, $rhs", vec#"."#name, simdop>;
 }
 
@@ -598,15 +622,15 @@ multiclass SIMDConditionInt<string name, CondCode cond, bits<32> baseInst,
                             int step = 1> {
   defm "" : SIMDCondition<v16i8, v16i8, "i8x16", name, cond, baseInst>;
   defm "" : SIMDCondition<v8i16, v8i16, "i16x8", name, cond,
-                              !add(baseInst, step)>;
+                          !add(baseInst, step)>;
   defm "" : SIMDCondition<v4i32, v4i32, "i32x4", name, cond,
-                              !add(!add(baseInst, step), step)>;
+                          !add(!add(baseInst, step), step)>;
 }
 
 multiclass SIMDConditionFP<string name, CondCode cond, bits<32> baseInst> {
   defm "" : SIMDCondition<v4f32, v4i32, "f32x4", name, cond, baseInst>;
   defm "" : SIMDCondition<v2f64, v2i64, "f64x2", name, cond,
-                              !add(baseInst, 1)>;
+                          !add(baseInst, 1)>;
 }
 
 // Equality: eq
@@ -727,21 +751,21 @@ defm "" : SIMDAbs<v2f64, "f64x2", 128>;
 // Floating-point min and max
 //===----------------------------------------------------------------------===//
 
+multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
+  defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
+  defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 1)>;
+}
+
 // NaN-propagating minimum: min
-// TODO
+defm MIN : SIMDBinaryFP<fminimum, "min", 129>;
 
 // NaN-propagating maximum: max
-// TODO
+defm MAX : SIMDBinaryFP<fmaximum, "max", 131>;
 
 //===----------------------------------------------------------------------===//
 // Floating-point arithmetic
 //===----------------------------------------------------------------------===//
 
-multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
-  defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
-  defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 1)>;
-}
-
 // Addition: add
 let isCommutable = 1 in
 defm ADD : SIMDBinaryFP<fadd, "add", 133>;
@@ -791,6 +815,16 @@ defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_u/f32x4", 148>;
 defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_s/f64x2", 149>;
 defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_u/f64x2", 150>;
 
+// Lower llvm.wasm.trunc.saturate.* to saturating instructions
+def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
+          (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
+def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
+          (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
+def : Pat<(v2i64 (int_wasm_trunc_saturate_signed (v2f64 V128:$src))),
+          (fp_to_sint_v2i64_v2f64 (v2f64 V128:$src))>;
+def : Pat<(v2i64 (int_wasm_trunc_saturate_unsigned (v2f64 V128:$src))),
+          (fp_to_uint_v2i64_v2f64 (v2f64 V128:$src))>;
+
 // Bitcasts are nops
 // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
 foreach t1 = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
diff --git a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index 936b801a9a06fc4ad57b9f1a15881c4b11e74f18..98953f0948244eec5b3f2caa040677901fef4f08 100644
--- a/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -59,7 +59,7 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
 // possible search paths should be the same.
 // Returns nullptr in case it does not find any EH pad in the search, or finds
 // multiple different EH pads.
-MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
+static MachineBasicBlock *GetMatchingEHPad(MachineInstr *MI) {
   MachineFunction *MF = MI->getParent()->getParent();
   SmallVector<MachineBasicBlock *, 2> WL;
   SmallPtrSet<MachineBasicBlock *, 2> Visited;
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index b5a88129c6b701dc22ccb0e7c128dd5430c10292..f0d24075801f31e3364e4a8aabc47e6a86f95ba5 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -1030,7 +1030,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
 
   // Free setjmpTable buffer before each return instruction
   for (BasicBlock &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (isa<ReturnInst>(TI))
       CallInst::CreateFree(SetjmpTable, TI);
   }
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index e9a0cf519055c042c68e29c26a4a6a9656313230..1dad7b8a2890f9644c2369cb9491d84f81683435 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -30,6 +30,11 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+// Defines llvm::WebAssembly::getStackOpcode to convert register instructions to
+// stack instructions
+#define GET_INSTRMAP_INFO 1
+#include "WebAssemblyGenInstrInfo.inc"
+
 // This disables the removal of registers when lowering into MC, as required
 // by some current tests.
 static cl::opt<bool>
@@ -38,7 +43,6 @@ static cl::opt<bool>
                                " instruction output for test purposes only."),
                       cl::init(false));
 
-static unsigned regInstructionToStackInstruction(unsigned OpCode);
 static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI);
 
 MCSymbol *
@@ -226,6 +230,13 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_FUNCTION) != 0,
           (MO.getTargetFlags() & WebAssemblyII::MO_SYMBOL_GLOBAL) != 0);
       break;
+    case MachineOperand::MO_MCSymbol:
+      // This is currently used only for LSDA symbols (GCC_except_table),
+      // because global addresses or other external symbols are handled above.
+      assert(MO.getTargetFlags() == 0 &&
+             "WebAssembly does not use target flags on MCSymbol");
+      MCOp = LowerSymbolOperand(MO.getMCSymbol(), /*Offset=*/0, false, false);
+      break;
     }
 
     OutMI.addOperand(MCOp);
@@ -254,7 +265,8 @@ static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI) {
 
   // Transform to _S instruction.
   auto RegOpcode = OutMI.getOpcode();
-  auto StackOpcode = regInstructionToStackInstruction(RegOpcode);
+  auto StackOpcode = WebAssembly::getStackOpcode(RegOpcode);
+  assert(StackOpcode != -1 && "Failed to stackify instruction");
   OutMI.setOpcode(StackOpcode);
 
   // Remove register operands.
@@ -265,21 +277,3 @@ static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI) {
     }
   }
 }
-
-static unsigned regInstructionToStackInstruction(unsigned OpCode) {
-  // For most opcodes, this function could have been implemented as "return
-  // OpCode + 1", but since table-gen alphabetically sorts them, this cannot be
-  // guaranteed (see e.g. BR and BR_IF). Instead we use a giant switch statement
-  // generated by a custom TableGen backend (WebAssemblyStackifierEmitter.cpp)
-  // that emits switch cases of the form
-  //
-  //   case WebAssembly::RegisterInstr: return WebAssembly::StackInstr;
-  //
-  // for every pair of equivalent register and stack instructions.
-  switch (OpCode) {
-  default:
-    llvm_unreachable(
-        "unknown WebAssembly instruction in WebAssemblyMCInstLower pass");
-#include "WebAssemblyGenStackifier.inc"
-  }
-}
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 4649230d45456df52ff503ce70e201d1c63b20b7..dc2aab875932926e45dab49124fa2aa6022ed3c8 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -118,6 +118,11 @@ static void ConvertImplicitDefToConstZero(MachineInstr *MI,
     ConstantFP *Val = cast<ConstantFP>(Constant::getNullValue(
         Type::getDoubleTy(MF.getFunction().getContext())));
     MI->addOperand(MachineOperand::CreateFPImm(Val));
+  } else if (RegClass == &WebAssembly::V128RegClass) {
+    // TODO: make splat instead of constant
+    MI->setDesc(TII->get(WebAssembly::CONST_V128_v16i8));
+    for (int I = 0; I < 16; ++I)
+      MI->addOperand(MachineOperand::CreateImm(0));
   } else {
     llvm_unreachable("Unexpected reg class");
   }
diff --git a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index aaa0bbcbc5782335d30c1eb75e140b57b95f0196..c95af88c6f4356a124a16d7eb16d0fb0c418d39b 100644
--- a/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -89,6 +89,12 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
       case WebAssembly::LOAD_I64:
       case WebAssembly::LOAD_F32:
       case WebAssembly::LOAD_F64:
+      case WebAssembly::LOAD_v16i8:
+      case WebAssembly::LOAD_v8i16:
+      case WebAssembly::LOAD_v4i32:
+      case WebAssembly::LOAD_v2i64:
+      case WebAssembly::LOAD_v4f32:
+      case WebAssembly::LOAD_v2f64:
       case WebAssembly::LOAD8_S_I32:
       case WebAssembly::LOAD8_U_I32:
       case WebAssembly::LOAD16_S_I32:
@@ -164,6 +170,12 @@ bool WebAssemblySetP2AlignOperands::runOnMachineFunction(MachineFunction &MF) {
       case WebAssembly::STORE_I64:
       case WebAssembly::STORE_F32:
       case WebAssembly::STORE_F64:
+      case WebAssembly::STORE_v16i8:
+      case WebAssembly::STORE_v8i16:
+      case WebAssembly::STORE_v4i32:
+      case WebAssembly::STORE_v2i64:
+      case WebAssembly::STORE_v4f32:
+      case WebAssembly::STORE_v2f64:
       case WebAssembly::STORE8_I32:
       case WebAssembly::STORE16_I32:
       case WebAssembly::STORE8_I64:
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index a25ec7cf4c2af9f3ffc9b71214149ad451f00689..ada6fb9a96d79d43256c7ac989f43b9bf18e1189 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -27,14 +27,14 @@ const char *const WebAssembly::PersonalityWrapperFn =
 
 bool WebAssembly::isArgument(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
-  case WebAssembly::ARGUMENT_I32:
-  case WebAssembly::ARGUMENT_I32_S:
-  case WebAssembly::ARGUMENT_I64:
-  case WebAssembly::ARGUMENT_I64_S:
-  case WebAssembly::ARGUMENT_F32:
-  case WebAssembly::ARGUMENT_F32_S:
-  case WebAssembly::ARGUMENT_F64:
-  case WebAssembly::ARGUMENT_F64_S:
+  case WebAssembly::ARGUMENT_i32:
+  case WebAssembly::ARGUMENT_i32_S:
+  case WebAssembly::ARGUMENT_i64:
+  case WebAssembly::ARGUMENT_i64_S:
+  case WebAssembly::ARGUMENT_f32:
+  case WebAssembly::ARGUMENT_f32_S:
+  case WebAssembly::ARGUMENT_f64:
+  case WebAssembly::ARGUMENT_f64_S:
   case WebAssembly::ARGUMENT_v16i8:
   case WebAssembly::ARGUMENT_v16i8_S:
   case WebAssembly::ARGUMENT_v8i16:
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e67daa5d857e7962966bb9f284bd17235d14e49d..4801078925ccfadec855772a673d26eb61adb2c3 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -3283,7 +3283,6 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
   if (IDVal.startswith(".code"))
     return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
   else if (IDVal.startswith(".att_syntax")) {
-    getParser().setParsingInlineAsm(false);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       if (Parser.getTok().getString() == "prefix")
         Parser.Lex();
@@ -3296,7 +3295,6 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
     return false;
   } else if (IDVal.startswith(".intel_syntax")) {
     getParser().setAssemblerDialect(1);
-    getParser().setParsingInlineAsm(true);
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       if (Parser.getTok().getString() == "noprefix")
         Parser.Lex();
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 4495bc2061892ff108289102d25e3a15b201433d..5ded1f971a035c4a997db32d145f897e9d1e8e80 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -13,6 +13,7 @@ tablegen(LLVM X86GenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM X86GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM X86GenExegesis.inc -gen-exegesis)
 
 if (X86_GEN_FOLD_TABLES)
   tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables)
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index b6320bd061250f93aa237efe38fc08134e01d34e..54d550b606520f2cf4dabc0fd6c40cee6db85fc9 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1393,7 +1393,7 @@ static int readModRM(struct InternalInstruction* insn) {
       break;
     case 0x1:
       insn->displacementSize = 1;
-      /* FALLTHROUGH */
+      LLVM_FALLTHROUGH;
     case 0x2:
       insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
       switch (rm & 7) {
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index aef0df480bbb64c0888b3a5e4844c00ef43f207b..c85ce9bbd5a45c9c4fd7cfc730aadcc0c7374e79 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -266,12 +266,12 @@ namespace X86II {
     RawFrmSrc      = 4,
 
     /// RawFrmDst - This form is for instructions that use the destination index
-    /// register DI/EDI/ESI.
+    /// register DI/EDI/RDI.
     RawFrmDst      = 5,
 
-    /// RawFrmSrc - This form is for instructions that use the source index
-    /// register SI/ESI/ERI with a possible segment override, and also the
-    /// destination index register DI/ESI/RDI.
+    /// RawFrmDstSrc - This form is for instructions that use the source index
+    /// register SI/ESI/RSI with a possible segment override, and also the
+    /// destination index register DI/EDI/RDI.
     RawFrmDstSrc   = 6,
 
     /// RawFrmImm8 - This is used for the ENTER instruction, which has two
diff --git a/lib/Target/X86/ShadowCallStack.cpp b/lib/Target/X86/ShadowCallStack.cpp
index 9a39455f9dd55b1c8232c8deea92ac1dfbd52065..ab2cebcb58ee827acad25dc9e80cafc7bb1a0bdc 100644
--- a/lib/Target/X86/ShadowCallStack.cpp
+++ b/lib/Target/X86/ShadowCallStack.cpp
@@ -31,10 +31,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-void initializeShadowCallStackPass(PassRegistry &);
-}
-
 namespace {
 
 class ShadowCallStack : public MachineFunctionPass {
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index fe567f4cece82fb3521fe291c8258a76938023cc..bed940d0d0e9ab759c5ae2109ad3a3dd2b7f5b0e 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -304,12 +304,12 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
   }
 }
 
-void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   for (int i = 0, e = RawMask.size(); i < e; ++i) {
     uint64_t M = RawMask[i];
-    if (M == (uint64_t)SM_SentinelUndef) {
-      ShuffleMask.push_back(M);
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
       continue;
     }
     // For 256/512-bit vectors the base of the shuffle is the 128-bit
@@ -336,7 +336,7 @@ void DecodeBLENDMask(unsigned NumElts, unsigned Imm,
   }
 }
 
-void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   assert(RawMask.size() == 16 && "Illegal VPPERM shuffle mask size");
 
@@ -354,12 +354,12 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
   // 6 - Most significant bit of source byte replicated in all bit positions.
   // 7 - Invert most significant bit of source byte and replicate in all bit positions.
   for (int i = 0, e = RawMask.size(); i < e; ++i) {
-    uint64_t M = RawMask[i];
-    if (M == (uint64_t)SM_SentinelUndef) {
-      ShuffleMask.push_back(M);
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
       continue;
     }
 
+    uint64_t M = RawMask[i];
     uint64_t PermuteOp = (M >> 5) & 0x7;
     if (PermuteOp == 4) {
       ShuffleMask.push_back(SM_SentinelZero);
@@ -490,7 +490,7 @@ void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
 }
 
 void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
-                        ArrayRef<uint64_t> RawMask,
+                        ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                         SmallVectorImpl<int> &ShuffleMask) {
   unsigned VecSize = NumElts * ScalarBits;
   unsigned NumLanes = VecSize / 128;
@@ -500,6 +500,10 @@ void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
   assert((ScalarBits == 32 || ScalarBits == 64) && "Unexpected element size");
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
     uint64_t M = RawMask[i];
     M = (ScalarBits == 64 ? ((M >> 1) & 0x1) : (M & 0x3));
     unsigned LaneOffset = i & ~(NumEltsPerLane - 1);
@@ -508,7 +512,7 @@ void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
 }
 
 void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
-                         ArrayRef<uint64_t> RawMask,
+                         ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                          SmallVectorImpl<int> &ShuffleMask) {
   unsigned VecSize = NumElts * ScalarBits;
   unsigned NumLanes = VecSize / 128;
@@ -518,6 +522,11 @@ void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
   assert((NumElts == RawMask.size()) && "Unexpected mask size");
 
   for (unsigned i = 0, e = RawMask.size(); i < e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+
     // VPERMIL2 Operation.
     // Bits[3] - Match Bit.
     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
@@ -548,19 +557,29 @@ void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
   }
 }
 
-void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   uint64_t EltMaskSize = RawMask.size() - 1;
-  for (auto M : RawMask) {
+  for (int i = 0, e = RawMask.size(); i != e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+    uint64_t M = RawMask[i];
     M &= EltMaskSize;
     ShuffleMask.push_back((int)M);
   }
 }
 
-void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask) {
   uint64_t EltMaskSize = (RawMask.size() * 2) - 1;
-  for (auto M : RawMask) {
+  for (int i = 0, e = RawMask.size(); i != e; ++i) {
+    if (UndefElts[i]) {
+      ShuffleMask.push_back(SM_SentinelUndef);
+      continue;
+    }
+    uint64_t M = RawMask[i];
     M &= EltMaskSize;
     ShuffleMask.push_back((int)M);
   }
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 6d13bd58a1271ee0f6279fbc6c3103d89cbeffcf..85cde14a32410a84e663f76e556cf582a7dadcc5 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
 #define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
 
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 
 //===----------------------------------------------------------------------===//
@@ -108,7 +109,7 @@ void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
 
 /// Decode a PSHUFB mask from a raw array of constants such as from
 /// BUILD_VECTOR.
-void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask,
+void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a BLEND immediate mask into a shuffle mask.
@@ -131,7 +132,7 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
 /// BUILD_VECTOR.
 /// This can only basic masks (permutes + zeros), not any of the other
 /// operations that VPPERM can perform.
-void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a zero extension instruction as a shuffle mask.
@@ -156,20 +157,20 @@ void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
 
 /// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
 void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits,
-                        ArrayRef<uint64_t> RawMask,
+                        ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
 void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z,
-                         ArrayRef<uint64_t> RawMask,
+                         ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
-void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
-void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask,
+void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
                       SmallVectorImpl<int> &ShuffleMask);
 } // llvm namespace
 
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index d5405703fdffa793c3d08792f52cf9f9a3394846..19f8e35ade04bf70cc1e3982b9164ec307791872 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -115,8 +115,6 @@ FunctionPass *createX86FixupBWInsts();
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
 
-void initializeFixupBWInstPassPass(PassRegistry &);
-
 /// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX
 /// encoding when possible in order to reduce code size.
 FunctionPass *createX86EvexToVexInsts();
@@ -128,10 +126,21 @@ InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
 
-void initializeEvexToVexInstPassPass(PassRegistry &);
-
 FunctionPass *createX86SpeculativeLoadHardeningPass();
 
+void initializeEvexToVexInstPassPass(PassRegistry &);
+void initializeFixupBWInstPassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
+void initializeShadowCallStackPass(PassRegistry &);
+void initializeWinEHStatePassPass(PassRegistry &);
+void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86CallFrameOptimizationPass(PassRegistry &);
+void initializeX86CmovConverterPassPass(PassRegistry &);
+void initializeX86DomainReassignmentPass(PassRegistry &);
+void initializeX86ExecutionDomainFixPass(PassRegistry &);
+void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 5d627f34c55a4c6335676aef132adb25fe43142b..74135656528d5a73a7130f972335b70a6f5002a3 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -98,6 +98,9 @@ def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
                                        "SHLD instruction is slow">;
 def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
                                         "PMULLD instruction is slow">;
+def FeatureSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
+                                          "true",
+                                          "PMADDWD is slower than PMULLD">;
 // FIXME: This should not apply to CPUs that do not have SSE.
 def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16",
                                 "IsUAMem16Slow", "true",
@@ -404,6 +407,15 @@ def FeatureFastBEXTR : SubtargetFeature<"fast-bextr", "HasFastBEXTR", "true",
           "Indicates that the BEXTR instruction is implemented as a single uop "
           "with good throughput.">;
 
+// Combine vector math operations with shuffles into horizontal math
+// instructions if a CPU implements horizontal operations (introduced with
+// SSE3) with better latency/throughput than the alternative sequence.
+def FeatureFastHorizontalOps
+    : SubtargetFeature<
+        "fast-hops", "HasFastHorizontalOps", "true",
+        "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
+        "normal vector instructions with shuffles", [FeatureSSE3]>;
+
 // Merge branches using three-way conditional code.
 def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
                                         "ThreewayBranchProfitable", "true",
@@ -437,6 +449,7 @@ include "X86SchedHaswell.td"
 include "X86SchedBroadwell.td"
 include "X86ScheduleSLM.td"
 include "X86ScheduleZnver1.td"
+include "X86ScheduleBdVer2.td"
 include "X86ScheduleBtVer2.td"
 include "X86SchedSkylakeClient.td"
 include "X86SchedSkylakeServer.td"
@@ -451,22 +464,6 @@ def ProcIntelGLP  : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP",
                     "Intel Goldmont Plus processors">;
 def ProcIntelTRM  : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM",
                     "Intel Tremont processors">;
-def ProcIntelHSW  : SubtargetFeature<"haswell", "X86ProcFamily",
-                    "IntelHaswell", "Intel Haswell processors">;
-def ProcIntelBDW  : SubtargetFeature<"broadwell", "X86ProcFamily",
-                    "IntelBroadwell", "Intel Broadwell processors">;
-def ProcIntelSKL  : SubtargetFeature<"skylake", "X86ProcFamily",
-                    "IntelSkylake", "Intel Skylake processors">;
-def ProcIntelKNL  : SubtargetFeature<"knl", "X86ProcFamily",
-                    "IntelKNL", "Intel Knights Landing processors">;
-def ProcIntelSKX  : SubtargetFeature<"skx", "X86ProcFamily",
-                    "IntelSKX", "Intel Skylake Server processors">;
-def ProcIntelCNL  : SubtargetFeature<"cannonlake", "X86ProcFamily",
-                    "IntelCannonlake", "Intel Cannonlake processors">;
-def ProcIntelICL  : SubtargetFeature<"icelake-client", "X86ProcFamily",
-                    "IntelIcelakeClient", "Intel Icelake processors">;
-def ProcIntelICX  : SubtargetFeature<"icelake-server", "X86ProcFamily",
-                    "IntelIcelakeServer", "Intel Icelake Server processors">;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
@@ -778,7 +775,6 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
 
 class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
                                            HSWFeatures.Value, [
-  ProcIntelHSW,
   FeaturePOPCNTFalseDeps,
   FeatureLZCNTFalseDeps
 ]>;
@@ -792,7 +788,6 @@ def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
 ]>;
 class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
                                              BDWFeatures.Value, [
-  ProcIntelBDW,
   FeaturePOPCNTFalseDeps,
   FeatureLZCNTFalseDeps
 ]>;
@@ -809,14 +804,32 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
 
 class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
                                                  SKLFeatures.Value, [
-  ProcIntelSKL,
   FeatureHasFastGather,
   FeaturePOPCNTFalseDeps,
   FeatureSGX
 ]>;
 def : SkylakeClientProc<"skylake">;
 
-def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
+def KNLFeatures : ProcessorFeatures<[], [
+  FeatureX87,
+  FeatureCMOV,
+  FeatureMMX,
+  FeatureFXSR,
+  FeatureNOPL,
+  Feature64Bit,
+  FeatureCMPXCHG16B,
+  FeaturePOPCNT,
+  FeatureSlowDivide64,
+  FeaturePCLMUL,
+  FeatureXSAVE,
+  FeatureXSAVEOPT,
+  FeatureLAHFSAHF,
+  FeatureSlow3OpsLEA,
+  FeatureSlowIncDec,
+  FeatureAES,
+  FeatureRDRAND,
+  FeatureF16C,
+  FeatureFSGSBase,
   FeatureAVX512,
   FeatureERI,
   FeatureCDI,
@@ -835,19 +848,19 @@ def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
 // FIXME: define KNL model
 class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
                                                   KNLFeatures.Value, [
-  ProcIntelKNL,
   FeatureSlowTwoMemOps,
   FeatureFastPartialYMMorZMMWrite,
-  FeatureHasFastGather
+  FeatureHasFastGather,
+  FeatureSlowPMADDWD
 ]>;
 def : KnightsLandingProc<"knl">;
 
 class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel,
                                                KNLFeatures.Value, [
-  ProcIntelKNL,
   FeatureSlowTwoMemOps,
   FeatureFastPartialYMMorZMMWrite,
   FeatureHasFastGather,
+  FeatureSlowPMADDWD,
   FeatureVPOPCNTDQ
 ]>;
 def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features
@@ -864,7 +877,6 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
 
 class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                                  SKXFeatures.Value, [
-  ProcIntelSKX,
   FeatureHasFastGather,
   FeaturePOPCNTFalseDeps
 ]>;
@@ -886,7 +898,6 @@ def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [
 
 class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                               CNLFeatures.Value, [
-  ProcIntelCNL,
   FeatureHasFastGather
 ]>;
 def : CannonlakeProc<"cannonlake">;
@@ -905,14 +916,12 @@ def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
 
 class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                                  ICLFeatures.Value, [
-  ProcIntelICL,
   FeatureHasFastGather
 ]>;
 def : IcelakeClientProc<"icelake-client">;
 
 class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
                                                  ICLFeatures.Value, [
-  ProcIntelICX,
   FeaturePCONFIG,
   FeatureWBNOINVD,
   FeatureHasFastGather
@@ -998,11 +1007,12 @@ def : ProcessorModel<"btver2", BtVer2Model, [
   FeatureLAHFSAHF,
   FeatureFast15ByteNOP,
   FeatureFastBEXTR,
-  FeatureFastPartialYMMorZMMWrite
+  FeatureFastPartialYMMorZMMWrite,
+  FeatureFastHorizontalOps
 ]>;
 
 // Bulldozer
-def : Proc<"bdver1", [
+def : ProcessorModel<"bdver1", BdVer2Model, [
   FeatureX87,
   FeatureCMOV,
   FeatureXOP,
@@ -1027,7 +1037,7 @@ def : Proc<"bdver1", [
   FeatureMacroFusion
 ]>;
 // Piledriver
-def : Proc<"bdver2", [
+def : ProcessorModel<"bdver2", BdVer2Model, [
   FeatureX87,
   FeatureCMOV,
   FeatureXOP,
diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index ab2cbfc33e17e6f8cd2dd9bab31f3c0067d5eb90..eb9c4b3e5977e4a0b45e4c9b45659bc6bb9d6370 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -52,10 +52,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "x86-avoid-SFB"
 
-namespace llvm {
-void initializeX86AvoidSFBPassPass(PassRegistry &);
-} // end namespace llvm
-
 static cl::opt<bool> DisableX86AvoidStoreForwardBlocks(
     "x86-disable-avoid-SFB", cl::Hidden,
     cl::desc("X86: Disable Store Forwarding Blocks fixup."), cl::init(false));
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index c73fd6eb144a423e9714f6b76fc655d653293d3d..24d7a219e7510132b10d6d3ed6c23709711c25f1 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -56,10 +56,6 @@ static cl::opt<bool>
                cl::desc("Avoid optimizing x86 call frames for size"),
                cl::init(false), cl::Hidden);
 
-namespace llvm {
-void initializeX86CallFrameOptimizationPass(PassRegistry &);
-}
-
 namespace {
 
 class X86CallFrameOptimization : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index 1c5f110d8c60e504d88c5314b353df6519e7ee7a..c3e76fd2a856ce52e1b61a4c9d104eed22bf127b 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -81,12 +81,6 @@ STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
 STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
 STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
 
-namespace llvm {
-
-void initializeX86CmovConverterPassPass(PassRegistry &);
-
-} // end namespace llvm
-
 // This internal switch can be used to turn off the cmov/branch optimization.
 static cl::opt<bool>
     EnableCmovConverter("x86-cmov-converter",
diff --git a/lib/Target/X86/X86CondBrFolding.cpp b/lib/Target/X86/X86CondBrFolding.cpp
index 8b9ef20d9169fd54fd2b7507173f3fbbb4e1d347..1d221930c2a46b3a4bd5eac60794181b0c95a2e9 100644
--- a/lib/Target/X86/X86CondBrFolding.cpp
+++ b/lib/Target/X86/X86CondBrFolding.cpp
@@ -84,6 +84,7 @@ FunctionPass *llvm::createX86CondBrFolding() {
   return new X86CondBrFoldingPass();
 }
 
+namespace {
 // A class the stores the auxiliary information for each MBB.
 struct TargetMBBInfo {
   MachineBasicBlock *TBB;
@@ -129,6 +130,7 @@ private:
     return MBBInfos[MBB->getNumber()].get();
   }
 };
+} // namespace
 
 // Find a valid path that we can reuse the CondCode.
 // The resulted path (if return true) is stored in BranchPath.
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
index 62588e9509d3db88ca65704afb8dd1c132e26ac4..7e1f1e7876c037ee03ce8c560ccdf0e66b3b8ec0 100644
--- a/lib/Target/X86/X86DomainReassignment.cpp
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -31,10 +31,6 @@
 
 using namespace llvm;
 
-namespace llvm {
-void initializeX86DomainReassignmentPass(PassRegistry &);
-}
-
 #define DEBUG_TYPE "x86-domain-reassignment"
 
 STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 888c43afd8a14d78568eff8c89733d4aa60446a1..a49ad8bd59dfc840c7a56d8c0fed2132db8bd72d 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -3734,9 +3734,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
   switch (VT.SimpleTy) {
   default: llvm_unreachable("Unexpected value type");
   case MVT::i1:
-    // TODO: Support this properly.
-    if (Subtarget->hasAVX512())
-      return 0;
     VT = MVT::i8;
     LLVM_FALLTHROUGH;
   case MVT::i8:  Opc = X86::MOV8ri;  break;
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 33a8baac594b59172d2a4d48149ec5bd1b718f85..ad42cb878046c45420e2723d162f0c0afba0613b 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -25,10 +25,6 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-namespace llvm {
-void initializeFixupLEAPassPass(PassRegistry &);
-}
-
 #define FIXUPLEA_DESC "X86 LEA Fixup"
 #define FIXUPLEA_NAME "x86-fixup-LEAs"
 
@@ -43,8 +39,8 @@ class FixupLEAPass : public MachineFunctionPass {
   /// Loop over all of the instructions in the basic block
   /// replacing applicable instructions with LEA instructions,
   /// where appropriate.
-  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
-
+  bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI,
+                         bool IsSlowLEA, bool IsSlow3OpsLEA);
 
   /// Given a machine register, look for the instruction
   /// which writes it in the current basic block. If found,
@@ -62,10 +58,9 @@ class FixupLEAPass : public MachineFunctionPass {
                           MachineFunction::iterator MFI);
 
   /// Given a LEA instruction which is unprofitable
-  /// on Silvermont try to replace it with an equivalent ADD instruction
-  void processInstructionForSLM(MachineBasicBlock::iterator &I,
-                                MachineFunction::iterator MFI);
-
+  /// on SlowLEA targets try to replace it with an equivalent ADD instruction.
+  void processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
+                                    MachineFunction::iterator MFI);
 
   /// Given a LEA instruction which is unprofitable
   /// on SNB+ try to replace it with other instructions.
@@ -197,8 +192,11 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
 
   MF = &Func;
   const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
+  bool IsSlowLEA = ST.slowLEA();
+  bool IsSlow3OpsLEA = ST.slow3OpsLEA();
+
   OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize();
-  OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
+  OptLEA = ST.LEAusesAG() || IsSlowLEA || IsSlow3OpsLEA;
 
   if (!OptLEA && !OptIncDec)
     return false;
@@ -209,7 +207,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
   LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   // Process all basic blocks.
   for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
-    processBasicBlock(Func, I);
+    processBasicBlock(Func, I, IsSlowLEA, IsSlow3OpsLEA);
   LLVM_DEBUG(dbgs() << "End X86FixupLEAs\n";);
 
   return true;
@@ -285,8 +283,9 @@ static inline bool isInefficientLEAReg(unsigned int Reg) {
 static inline bool isRegOperand(const MachineOperand &Op) {
   return Op.isReg() && Op.getReg() != X86::NoRegister;
 }
-/// hasIneffecientLEARegs - LEA that uses base and index registers
-/// where the base is EBP, RBP, or R13
+
+/// Returns true if this LEA uses base an index registers, and the base register
+/// is known to be inefficient for the subtarget.
 // TODO: use a variant scheduling class to model the latency profile
 // of LEA instructions, and implement this logic as a scheduling predicate.
 static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
@@ -415,8 +414,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
   }
 }
 
-void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
-                                            MachineFunction::iterator MFI) {
+void FixupLEAPass::processInstructionForSlowLEA(MachineBasicBlock::iterator &I,
+                                                MachineFunction::iterator MFI) {
   MachineInstr &MI = *I;
   const int Opcode = MI.getOpcode();
   if (!isLEA(Opcode))
@@ -571,26 +570,28 @@ FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
 }
 
 bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
-                                     MachineFunction::iterator MFI) {
-
+                                     MachineFunction::iterator MFI,
+                                     bool IsSlowLEA, bool IsSlow3OpsLEA) {
   for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
     if (OptIncDec)
       if (fixupIncDec(I, MFI))
         continue;
 
     if (OptLEA) {
-      if (MF.getSubtarget<X86Subtarget>().slowLEA())
-        processInstructionForSLM(I, MFI);
-
-      else {
-        if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
-          if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
-            MFI->erase(I);
-            I = NewMI;
-          }
-        } else
-          processInstruction(I, MFI);
+      if (IsSlowLEA) {
+        processInstructionForSlowLEA(I, MFI);
+        continue;
       }
+      
+      if (IsSlow3OpsLEA) {
+        if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+          MFI->erase(I);
+          I = NewMI;
+        }
+        continue;
+      }
+
+      processInstruction(I, MFI);
     }
   }
   return false;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 1eb9fa0bc1ecfbbe8d0c7c536616fd7a93ec1799..e40b0f81e3306b8a690272c7429f320753192385 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -1103,15 +1103,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
       NumBytes = alignTo(NumBytes, MaxAlign);
 
-    // Get the offset of the stack slot for the EBP register, which is
-    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
-    // Update the frame offset adjustment.
-    if (!IsFunclet)
-      MFI.setOffsetAdjustment(-NumBytes);
-    else
-      assert(MFI.getOffsetAdjustment() == -(int)NumBytes &&
-             "should calculate same local variable offset for funclets");
-
     // Save EBP/RBP into the appropriate stack slot.
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
       .addReg(MachineFramePtr, RegState::Kill)
@@ -1167,6 +1158,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
     NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
   }
 
+  // Update the offset adjustment, which is mainly used by codeview to translate
+  // from ESP to VFRAME relative local variable offsets.
+  if (!IsFunclet) {
+    if (HasFP && TRI->needsStackRealignment(MF))
+      MFI.setOffsetAdjustment(-NumBytes);
+    else
+      MFI.setOffsetAdjustment(-StackSize);
+  }
+
   // For EH funclets, only allocate enough space for outgoing calls. Save the
   // NumBytes value that we would've used for the parent frame.
   unsigned ParentFrameNumBytes = NumBytes;
@@ -2471,8 +2471,8 @@ void X86FrameLowering::adjustForSegmentedStacks(
 
   allocMBB->addSuccessor(&PrologueMBB);
 
-  checkMBB->addSuccessor(allocMBB);
-  checkMBB->addSuccessor(&PrologueMBB);
+  checkMBB->addSuccessor(allocMBB, BranchProbability::getZero());
+  checkMBB->addSuccessor(&PrologueMBB, BranchProbability::getOne());
 
 #ifdef EXPENSIVE_CHECKS
   MF.verify();
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index be079659da4e25a140b7a751992961da1c35431c..16819f4451c2d2b23d311042281c8591b51e14ee 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -165,6 +165,9 @@ namespace {
     /// If true, selector should try to optimize for minimum code size.
     bool OptForMinSize;
 
+    /// Disable direct TLS access through segment registers.
+    bool IndirectTlsSegRefs;
+
   public:
     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
         : SelectionDAGISel(tm, OptLevel), OptForSize(false),
@@ -177,6 +180,8 @@ namespace {
     bool runOnMachineFunction(MachineFunction &MF) override {
       // Reset the subtarget each time through.
       Subtarget = &MF.getSubtarget<X86Subtarget>();
+      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
+                             "indirect-tls-seg-refs");
       SelectionDAGISel::runOnMachineFunction(MF);
       return true;
     }
@@ -239,12 +244,6 @@ namespace {
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
-    // Try to fold a vector load. This makes sure the load isn't non-temporal.
-    bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                        SDValue &Base, SDValue &Scale,
-                        SDValue &Index, SDValue &Disp,
-                        SDValue &Segment);
-
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -447,6 +446,9 @@ namespace {
 
       switch (StoreSize) {
       default: llvm_unreachable("Unsupported store size");
+      case 4:
+      case 8:
+        return false;
       case 16:
         return Subtarget->hasSSE41();
       case 32:
@@ -457,7 +459,8 @@ namespace {
     }
 
     bool foldLoadStoreIntoMemOperand(SDNode *Node);
-    bool matchBEXTRFromAnd(SDNode *Node);
+    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
+    bool matchBitExtract(SDNode *Node);
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
@@ -467,6 +470,8 @@ namespace {
     MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                 const SDLoc &dl, MVT VT, SDNode *Node,
                                 SDValue &InFlag);
+
+    bool tryOptimizeRem8Extend(SDNode *N);
   };
 }
 
@@ -517,6 +522,10 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
   if (N.getOpcode() != ISD::LOAD)
     return true;
 
+  // Don't fold non-temporal loads if we have an instruction for them.
+  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
+    return false;
+
   // If N is a load, do additional profitability checks.
   if (U == Root) {
     switch (U->getOpcode()) {
@@ -834,23 +843,63 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
   }
 }
 
+// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
+bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
+  unsigned Opc = N->getMachineOpcode();
+  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
+      Opc != X86::MOVSX64rr8)
+    return false;
+
+  SDValue N0 = N->getOperand(0);
+
+  // We need to be extracting the lower bit of an extend.
+  if (!N0.isMachineOpcode() ||
+      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
+      N0.getConstantOperandVal(1) != X86::sub_8bit)
+    return false;
+
+  // We're looking for either a movsx or movzx to match the original opcode.
+  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
+                                                : X86::MOVSX32rr8_NOREX;
+  SDValue N00 = N0.getOperand(0);
+  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
+    return false;
+
+  if (Opc == X86::MOVSX64rr8) {
+    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
+    // to 64.
+    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
+                                                   MVT::i64, N00);
+    ReplaceUses(N, Extend);
+  } else {
+    // Ok we can drop this extend and just use the original extend.
+    ReplaceUses(N, N00.getNode());
+  }
+
+  return true;
+}
 
 void X86DAGToDAGISel::PostprocessISelDAG() {
   // Skip peepholes at -O0.
   if (TM.getOptLevel() == CodeGenOpt::None)
     return;
 
-  // Attempt to remove vectors moves that were inserted to zero upper bits.
-
-  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
-  ++Position;
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
+  bool MadeChange = false;
   while (Position != CurDAG->allnodes_begin()) {
     SDNode *N = &*--Position;
     // Skip dead nodes and any non-machine opcodes.
     if (N->use_empty() || !N->isMachineOpcode())
       continue;
 
+    if (tryOptimizeRem8Extend(N)) {
+      MadeChange = true;
+      continue;
+    }
+
+    // Attempt to remove vectors moves that were inserted to zero upper bits.
+
     if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
       continue;
 
@@ -899,11 +948,11 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
     // Producing instruction is another vector instruction. We can drop the
     // move.
     CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
-
-    // If the move is now dead, delete it.
-    if (Move.getNode()->use_empty())
-      CurDAG->RemoveDeadNode(Move.getNode());
+    MadeChange = true;
   }
+
+  if (MadeChange)
+    CurDAG->RemoveDeadNodes();
 }
 
 
@@ -979,6 +1028,7 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
   // For more information see http://people.redhat.com/drepper/tls.pdf
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
     if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
+        !IndirectTlsSegRefs &&
         (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
          Subtarget->isTargetFuchsia()))
       switch (N->getPointerInfo().getAddrSpace()) {
@@ -1342,6 +1392,64 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
   return false;
 }
 
+// Transform "(X >> SHIFT) & (MASK << C1)" to
+// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
+// matched to a BEXTR later. Returns false if the simplification is performed.
+static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
+                                   uint64_t Mask,
+                                   SDValue Shift, SDValue X,
+                                   X86ISelAddressMode &AM,
+                                   const X86Subtarget &Subtarget) {
+  if (Shift.getOpcode() != ISD::SRL ||
+      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
+      !Shift.hasOneUse() || !N.hasOneUse())
+    return true;
+
+  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
+  if (!Subtarget.hasTBM() &&
+      !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
+    return true;
+
+  // We need to ensure that mask is a continuous run of bits.
+  if (!isShiftedMask_64(Mask)) return true;
+
+  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
+
+  // The amount of shift we're trying to fit into the addressing mode is taken
+  // from the trailing zeros of the mask.
+  unsigned AMShiftAmt = countTrailingZeros(Mask);
+
+  // There is nothing we can do here unless the mask is removing some bits.
+  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
+  if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
+  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
+  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
+  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
+  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
+  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
+
+  // Insert the new nodes into the topological ordering. We must do this in
+  // a valid topological ordering as nothing is going to go back and re-sort
+  // these nodes. We continually insert before 'N' in sequence as this is
+  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
+  // hierarchy left to express.
+  insertDAGNode(DAG, N, NewSRLAmt);
+  insertDAGNode(DAG, N, NewSRL);
+  insertDAGNode(DAG, N, NewMask);
+  insertDAGNode(DAG, N, NewAnd);
+  insertDAGNode(DAG, N, NewSHLAmt);
+  insertDAGNode(DAG, N, NewSHL);
+  DAG.ReplaceAllUsesWith(N, NewSHL);
+
+  AM.Scale = 1 << AMShiftAmt;
+  AM.IndexReg = NewAnd;
+  return false;
+}
+
 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                               unsigned Depth) {
   SDLoc dl(N);
@@ -1622,6 +1730,11 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     // a scale on the outside of the mask.
     if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM))
       return false;
+
+    // Try to fold the mask and shift into BEXTR and scale.
+    if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
+      return false;
+
     break;
   }
   }
@@ -2054,20 +2167,6 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
-bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
-                                     SDValue &Base, SDValue &Scale,
-                                     SDValue &Index, SDValue &Disp,
-                                     SDValue &Segment) {
-  if (!ISD::isNON_EXTLoad(N.getNode()) ||
-      useNonTemporalLoad(cast<LoadSDNode>(N)) ||
-      !IsProfitableToFold(N, P, Root) ||
-      !IsLegalToFold(N, P, Root, OptLevel))
-    return false;
-
-  return selectAddr(N.getNode(),
-                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
-}
-
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -2582,8 +2681,198 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
   return true;
 }
 
+// See if this is an  X & Mask  that we can match to BEXTR/BZHI.
+// Where Mask is one of the following patterns:
+//   a) x &  (1 << nbits) - 1
+//   b) x & ~(-1 << nbits)
+//   c) x &  (-1 >> (32 - y))
+//   d) x << (32 - y) >> (32 - y)
+bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
+  assert(
+      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
+      "Should be either an and-mask, or right-shift after clearing high bits.");
+
+  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
+  if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
+    return false;
+
+  MVT NVT = Node->getSimpleValueType(0);
+
+  // Only supported for 32 and 64 bits.
+  if (NVT != MVT::i32 && NVT != MVT::i64)
+    return false;
+
+  unsigned Size = NVT.getSizeInBits();
+
+  SDValue NBits;
+
+  // If we have BMI2's BZHI, we are ok with muti-use patterns.
+  // Else, if we only have BMI1's BEXTR, we require one-use.
+  const bool CanHaveExtraUses = Subtarget->hasBMI2();
+  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
+    return CanHaveExtraUses ||
+           Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
+  };
+  auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
+  auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
+
+  // a) x & ((1 << nbits) + (-1))
+  auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool {
+    // Match `add`. Must only have one use!
+    if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
+      return false;
+    // We should be adding all-ones constant (i.e. subtracting one.)
+    if (!isAllOnesConstant(Mask->getOperand(1)))
+      return false;
+    // Match `1 << nbits`. Must only have one use!
+    SDValue M0 = Mask->getOperand(0);
+    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
+      return false;
+    if (!isOneConstant(M0->getOperand(0)))
+      return false;
+    NBits = M0->getOperand(1);
+    return true;
+  };
+
+  // b) x & ~(-1 << nbits)
+  auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool {
+    // Match `~()`. Must only have one use!
+    if (!isBitwiseNot(Mask) || !checkOneUse(Mask))
+      return false;
+    // Match `-1 << nbits`. Must only have one use!
+    SDValue M0 = Mask->getOperand(0);
+    if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
+      return false;
+    if (!isAllOnesConstant(M0->getOperand(0)))
+      return false;
+    NBits = M0->getOperand(1);
+    return true;
+  };
+
+  // Match potentially-truncated (bitwidth - y)
+  auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) {
+    // Skip over a truncate of the shift amount.
+    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
+      ShiftAmt = ShiftAmt.getOperand(0);
+      // The trunc should have been the only user of the real shift amount.
+      if (!checkOneUse(ShiftAmt))
+        return false;
+    }
+    // Match the shift amount as: (bitwidth - y). It should go away, too.
+    if (ShiftAmt.getOpcode() != ISD::SUB)
+      return false;
+    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
+    if (!V0 || V0->getZExtValue() != Size)
+      return false;
+    NBits = ShiftAmt.getOperand(1);
+    return true;
+  };
+
+  // c) x &  (-1 >> (32 - y))
+  auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool {
+    // Match `l>>`. Must only have one use!
+    if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
+      return false;
+    // We should be shifting all-ones constant.
+    if (!isAllOnesConstant(Mask.getOperand(0)))
+      return false;
+    SDValue M1 = Mask.getOperand(1);
+    // The shift amount should not be used externally.
+    if (!checkOneUse(M1))
+      return false;
+    return matchShiftAmt(M1);
+  };
+
+  SDValue X;
+
+  // d) x << (32 - y) >> (32 - y)
+  auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt,
+                        &X](SDNode *Node) -> bool {
+    if (Node->getOpcode() != ISD::SRL)
+      return false;
+    SDValue N0 = Node->getOperand(0);
+    if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
+      return false;
+    SDValue N1 = Node->getOperand(1);
+    SDValue N01 = N0->getOperand(1);
+    // Both of the shifts must be by the exact same value.
+    // There should not be any uses of the shift amount outside of the pattern.
+    if (N1 != N01 || !checkTwoUse(N1))
+      return false;
+    if (!matchShiftAmt(N1))
+      return false;
+    X = N0->getOperand(0);
+    return true;
+  };
+
+  auto matchLowBitMask = [&matchPatternA, &matchPatternB,
+                          &matchPatternC](SDValue Mask) -> bool {
+    // FIXME: pattern c.
+    return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
+  };
+
+  if (Node->getOpcode() == ISD::AND) {
+    X = Node->getOperand(0);
+    SDValue Mask = Node->getOperand(1);
+
+    if (matchLowBitMask(Mask)) {
+      // Great.
+    } else {
+      std::swap(X, Mask);
+      if (!matchLowBitMask(Mask))
+        return false;
+    }
+  } else if (!matchPatternD(Node))
+    return false;
+
+  SDLoc DL(Node);
+
+  SDValue OrigNBits = NBits;
+  if (NBits.getValueType() != NVT) {
+    // Truncate the shift amount.
+    NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
+    insertDAGNode(*CurDAG, OrigNBits, NBits);
+
+    // Insert 8-bit NBits into lowest 8 bits of NVT-sized (32 or 64-bit)
+    // register. All the other bits are undefined, we do not care about them.
+    SDValue ImplDef =
+        SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, NVT), 0);
+    insertDAGNode(*CurDAG, OrigNBits, ImplDef);
+    NBits =
+        CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits);
+    insertDAGNode(*CurDAG, OrigNBits, NBits);
+  }
+
+  if (Subtarget->hasBMI2()) {
+    // Great, just emit the the BZHI..
+    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
+    ReplaceNode(Node, Extract.getNode());
+    SelectCode(Extract.getNode());
+    return true;
+  }
+
+  // Else, emitting BEXTR requires one more step.
+  // The 'control' of BEXTR has the pattern of:
+  // [15...8 bit][ 7...0 bit] location
+  // [ bit count][     shift] name
+  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
+
+  // Shift NBits left by 8 bits, thus producing 'control'.
+  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
+  SDValue Control = CurDAG->getNode(ISD::SHL, DL, NVT, NBits, C8);
+  insertDAGNode(*CurDAG, OrigNBits, Control);
+  // NOTE: could also try to extract  start  from  (x >> start)
+
+  // And finally, form the BEXTR itself.
+  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, NVT, X, Control);
+  ReplaceNode(Node, Extract.getNode());
+  SelectCode(Extract.getNode());
+
+  return true;
+}
+
 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
-bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
+MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   SDLoc dl(Node);
 
@@ -2598,30 +2887,30 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
   // BEXTR?
   if (!Subtarget->hasTBM() &&
       !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
-    return false;
+    return nullptr;
 
   // Must have a shift right.
   if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
-    return false;
+    return nullptr;
 
   // Shift can't have additional users.
   if (!N0->hasOneUse())
-    return false;
+    return nullptr;
 
   // Only supported for 32 and 64 bits.
   if (NVT != MVT::i32 && NVT != MVT::i64)
-    return false;
+    return nullptr;
 
   // Shift amount and RHS of and must be constant.
   ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
   ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
   if (!MaskCst || !ShiftCst)
-    return false;
+    return nullptr;
 
   // And RHS must be a mask.
   uint64_t Mask = MaskCst->getZExtValue();
   if (!isMask_64(Mask))
-    return false;
+    return nullptr;
 
   uint64_t Shift = ShiftCst->getZExtValue();
   uint64_t MaskSize = countPopulation(Mask);
@@ -2629,20 +2918,41 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
   // Don't interfere with something that can be handled by extracting AH.
   // TODO: If we are able to fold a load, BEXTR might still be better than AH.
   if (Shift == 8 && MaskSize == 8)
-    return false;
+    return nullptr;
 
   // Make sure we are only using bits that were in the original value, not
   // shifted in.
   if (Shift + MaskSize > NVT.getSizeInBits())
-    return false;
+    return nullptr;
 
-  // Create a BEXTR node and run it through selection.
-  SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
-  SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
-                                N0->getOperand(0), C);
-  ReplaceNode(Node, New.getNode());
-  SelectCode(New.getNode());
-  return true;
+  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+  unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+  unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+
+  // BMI requires the immediate to placed in a register.
+  if (!Subtarget->hasTBM()) {
+    ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+    MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+    unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
+    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
+  }
+
+  MachineSDNode *NewNode;
+  SDValue Input = N0->getOperand(0);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+    SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+    // Update the chain.
+    ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+    // Record the mem-refs
+    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
+  } else {
+    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+  }
+
+  return NewNode;
 }
 
 // Emit a PCMISTR(I/M) instruction.
@@ -2655,21 +2965,17 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
-      tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
-    SDValue Load = N1.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0) };
+                      N1.getOperand(0) };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
     return CNode;
   }
 
@@ -2692,22 +2998,18 @@ MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
   const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
   Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
 
-  // If there is a load, it will be behind a bitcast. We don't need to check
-  // alignment on this load.
+  // Try to fold a load. No need to check alignment.
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
-  if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
-      tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
-                     Tmp3, Tmp4)) {
-    SDValue Load = N2.getOperand(0);
+  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
     SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
-                      Load.getOperand(0), InFlag };
+                      N2.getOperand(0), InFlag };
     SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
     MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
     InFlag = SDValue(CNode, 3);
     // Update the chain.
-    ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
     // Record the mem-refs
-    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
     return CNode;
   }
 
@@ -2736,17 +3038,8 @@ bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
   if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
     ShiftAmt = ShiftAmt->getOperand(0);
 
-  // Special case to avoid messing up a BZHI pattern.
-  // Look for (srl (shl X, (size - y)), (size - y)
-  if (Subtarget->hasBMI2() && (VT == MVT::i32 || VT == MVT::i64) &&
-      N->getOpcode() == ISD::SRL && N->getOperand(0).getOpcode() == ISD::SHL &&
-      // Shift amounts the same?
-      N->getOperand(1) == N->getOperand(0).getOperand(1) &&
-      // Shift amounts size - y?
-      ShiftAmt.getOpcode() == ISD::SUB &&
-      isa<ConstantSDNode>(ShiftAmt.getOperand(0)) &&
-      cast<ConstantSDNode>(ShiftAmt.getOperand(0))->getZExtValue() == Size)
-    return false;
+  // This function is called after X86DAGToDAGISel::matchBitExtract(),
+  // so we are not afraid that we might mess up BZHI/BEXTR pattern.
 
   SDValue NewShiftAmt;
   if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
@@ -2945,6 +3238,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SRL:
+    if (matchBitExtract(Node))
+      return;
+    LLVM_FALLTHROUGH;
   case ISD::SRA:
   case ISD::SHL:
     if (tryShiftAmountMod(Node))
@@ -2952,7 +3248,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     break;
 
   case ISD::AND:
-    if (matchBEXTRFromAnd(Node))
+    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
+      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+      CurDAG->RemoveDeadNode(Node);
+      return;
+    }
+    if (matchBitExtract(Node))
       return;
     if (AndImmShrink && shrinkAndImmediate(Node))
       return;
@@ -3208,15 +3509,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
   }
 
   case ISD::SDIVREM:
-  case ISD::UDIVREM:
-  case X86ISD::SDIVREM8_SEXT_HREG:
-  case X86ISD::UDIVREM8_ZEXT_HREG: {
+  case ISD::UDIVREM: {
     SDValue N0 = Node->getOperand(0);
     SDValue N1 = Node->getOperand(1);
 
     unsigned Opc, MOpc;
-    bool isSigned = (Opcode == ISD::SDIVREM ||
-                     Opcode == X86ISD::SDIVREM8_SEXT_HREG);
+    bool isSigned = Opcode == ISD::SDIVREM;
     if (!isSigned) {
       switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
@@ -3355,13 +3653,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       SDValue Result(RNode, 0);
       InFlag = SDValue(RNode, 1);
 
-      if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
-          Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
-        assert(Node->getValueType(1) == MVT::i32 && "Unexpected result type!");
-      } else {
-        Result =
-            CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
-      }
+      Result =
+          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
+
       ReplaceUses(SDValue(Node, 1), Result);
       LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                  dbgs() << '\n');
@@ -3395,6 +3689,22 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     // Save the original VT of the compare.
     MVT CmpVT = N0.getSimpleValueType();
 
+    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
+    // by a test instruction. The test should be removed later by
+    // analyzeCompare if we are using only the zero flag.
+    // TODO: Should we check the users and use the BEXTR flags directly?
+    if (isNullConstant(N1) && N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
+        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
+                                             : X86::TEST32rr;
+        SDValue BEXTR = SDValue(NewNode, 0);
+        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
+        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+        CurDAG->RemoveDeadNode(Node);
+        return;
+      }
+    }
+
     // We can peek through truncates, but we need to be careful below.
     if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
       N0 = N0.getOperand(0);
@@ -3405,7 +3715,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     if (N0.getOpcode() == ISD::AND &&
         N0.getNode()->hasOneUse() &&
         N0.getValueType() != MVT::i8 &&
-        X86::isZeroNode(N1)) {
+        isNullConstant(N1)) {
       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
       if (!C) break;
       uint64_t Mask = C->getZExtValue();
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4c18c5a84c2b6ae24c746c6f64b6b698e594a7be..38d3a30cb19e8f6be885a339ae8957a63cfdfb33 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -791,6 +791,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UDIV, MVT::v2i32, Custom);
     setOperationAction(ISD::UREM, MVT::v2i32, Custom);
 
+    setOperationAction(ISD::MUL,                MVT::v2i8,  Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i16, Custom);
+    setOperationAction(ISD::MUL,                MVT::v2i32, Custom);
+
     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
@@ -826,7 +830,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SETCC,              VT, Custom);
       setOperationAction(ISD::CTPOP,              VT, Custom);
-      setOperationAction(ISD::CTTZ,               VT, Custom);
 
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -870,14 +873,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     }
 
-    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
-    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
-      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
-      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
-      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
-    }
-
     // Custom lower v2i64 and v2f64 selects.
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
@@ -887,6 +882,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i16, Custom);
+    // Custom legalize these to avoid over promotion.
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i8,  Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i16, Custom);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i8,  Custom);
 
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
@@ -902,6 +902,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 
+    // We want to legalize this to an f64 load rather than an i64 load on
+    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
+    // store.
+    setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
+
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
@@ -1079,9 +1085,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
       setOperationAction(ISD::SETCC,           VT, Custom);
       setOperationAction(ISD::CTPOP,           VT, Custom);
-      setOperationAction(ISD::CTTZ,            VT, Custom);
       setOperationAction(ISD::CTLZ,            VT, Custom);
 
+      // TODO - remove this once 256-bit X86ISD::ANDNP correctly split.
+      setOperationAction(ISD::CTTZ,  VT, HasInt256 ? Expand : Custom);
+
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
       setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -1174,14 +1182,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (HasInt256)
       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
 
-    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
-    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
-      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
-      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
-      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
-    }
-
     if (HasInt256) {
       // Custom legalize 2x32 to get a little better code.
       setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
@@ -1367,7 +1367,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SHL,              VT, Custom);
       setOperationAction(ISD::SRA,              VT, Custom);
       setOperationAction(ISD::CTPOP,            VT, Custom);
-      setOperationAction(ISD::CTTZ,             VT, Custom);
       setOperationAction(ISD::ROTL,             VT, Custom);
       setOperationAction(ISD::ROTR,             VT, Custom);
       setOperationAction(ISD::SETCC,            VT, Custom);
@@ -1378,13 +1377,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setCondCodeAction(ISD::SETLE, VT, Custom);
     }
 
-    // Need to promote to 64-bit even though we have 32-bit masked instructions
-    // because the IR optimizers rearrange bitcasts around logic ops leaving
-    // too many variations to handle if we don't promote them.
-    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
-    setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
-    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
-
     if (Subtarget.hasDQI()) {
       setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
       setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
@@ -1398,7 +1390,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
       for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
         setOperationAction(ISD::CTLZ,            VT, Legal);
-        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
       }
     } // Subtarget.hasCDI()
 
@@ -1427,10 +1418,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MGATHER,             VT, Custom);
       setOperationAction(ISD::MSCATTER,            VT, Custom);
     }
-    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
-      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
-    }
-
     // Need to custom split v32i16/v64i8 bitcasts.
     if (!Subtarget.hasBWI()) {
       setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
@@ -1487,7 +1474,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     if (Subtarget.hasCDI()) {
       for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
         setOperationAction(ISD::CTLZ,            VT, Legal);
-        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
       }
     } // Subtarget.hasCDI()
 
@@ -1582,7 +1568,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::MLOAD,        VT, Legal);
       setOperationAction(ISD::MSTORE,       VT, Legal);
       setOperationAction(ISD::CTPOP,        VT, Custom);
-      setOperationAction(ISD::CTTZ,         VT, Custom);
       setOperationAction(ISD::CTLZ,         VT, Custom);
       setOperationAction(ISD::SMAX,         VT, Legal);
       setOperationAction(ISD::UMAX,         VT, Legal);
@@ -1590,10 +1575,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMIN,         VT, Legal);
       setOperationAction(ISD::SETCC,        VT, Custom);
 
-      setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
-      setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
-      setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
-
       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
       setCondCodeAction(ISD::SETLT, VT, Custom);
@@ -1811,13 +1792,13 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
 }
 
 TargetLoweringBase::LegalizeTypeAction
-X86TargetLowering::getPreferredVectorAction(EVT VT) const {
+X86TargetLowering::getPreferredVectorAction(MVT VT) const {
   if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
     return TypeSplitVector;
 
   if (ExperimentalVectorWideningLegalization &&
       VT.getVectorNumElements() != 1 &&
-      VT.getVectorElementType().getSimpleVT() != MVT::i1)
+      VT.getVectorElementType() != MVT::i1)
     return TypeWidenVector;
 
   return TargetLoweringBase::getPreferredVectorAction(VT);
@@ -5481,8 +5462,9 @@ static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
   assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
 
   if (VT.is128BitVector() && InVT.is128BitVector())
-    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
-                                : DAG.getZeroExtendVectorInReg(In, DL, VT);
+    return DAG.getNode(X86ISD::VSEXT == Opc ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                            : ISD::ZERO_EXTEND_VECTOR_INREG,
+                       DL, VT, In);
 
   // For 256-bit vectors, we only need the lower (128-bit) input half.
   // For 512-bit vectors, we only need the lower input half or quarter.
@@ -5550,10 +5532,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
     Ptr = Ptr->getOperand(0);
 
   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
-  if (!CNode || CNode->isMachineConstantPoolEntry())
+  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
     return nullptr;
 
-  return dyn_cast<Constant>(CNode->getConstVal());
+  return CNode->getConstVal();
 }
 
 // Extract raw constant bits from constant pools.
@@ -5708,11 +5690,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   // Extract constant bits from constant pool vector.
   if (auto *Cst = getTargetConstantFromNode(Op)) {
     Type *CstTy = Cst->getType();
-    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
+    unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
+    if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
       return false;
 
     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
-    unsigned NumSrcElts = CstTy->getVectorNumElements();
+    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
 
     APInt UndefSrcElts(NumSrcElts, 0);
     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
@@ -5826,16 +5809,38 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   return false;
 }
 
-static bool getTargetShuffleMaskIndices(SDValue MaskNode,
-                                        unsigned MaskEltSizeInBits,
-                                        SmallVectorImpl<uint64_t> &RawMask) {
+static bool isConstantSplat(SDValue Op, APInt &SplatVal) {
   APInt UndefElts;
-  SmallVector<APInt, 64> EltBits;
+  SmallVector<APInt, 16> EltBits;
+  if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
+                                    UndefElts, EltBits, true, false)) {
+    int SplatIndex = -1;
+    for (int i = 0, e = EltBits.size(); i != e; ++i) {
+      if (UndefElts[i])
+        continue;
+      if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
+        SplatIndex = -1;
+        break;
+      }
+      SplatIndex = i;
+    }
+    if (0 <= SplatIndex) {
+      SplatVal = EltBits[SplatIndex];
+      return true;
+    }
+  }
 
+  return false;
+}
+
+static bool getTargetShuffleMaskIndices(SDValue MaskNode,
+                                        unsigned MaskEltSizeInBits,
+                                        SmallVectorImpl<uint64_t> &RawMask,
+                                        APInt &UndefElts) {
   // Extract the raw target constant bits.
-  // FIXME: We currently don't support UNDEF bits or mask entries.
+  SmallVector<APInt, 64> EltBits;
   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
-                                     EltBits, /* AllowWholeUndefs */ false,
+                                     EltBits, /* AllowWholeUndefs */ true,
                                      /* AllowPartialUndefs */ false))
     return false;
 
@@ -5875,6 +5880,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                  SmallVectorImpl<SDValue> &Ops,
                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
   unsigned NumElems = VT.getVectorNumElements();
+  unsigned MaskEltSize = VT.getScalarSizeInBits();
+  SmallVector<uint64_t, 32> RawMask;
+  APInt RawUndefs;
   SDValue ImmN;
 
   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
@@ -5882,26 +5890,26 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
 
   IsUnary = false;
   bool IsFakeUnary = false;
-  switch(N->getOpcode()) {
+  switch (N->getOpcode()) {
   case X86ISD::BLENDI:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::SHUFP:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
-    DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    DecodeSHUFPMask(NumElems, MaskEltSize,
                     cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::INSERTPS:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
@@ -5911,8 +5919,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         isa<ConstantSDNode>(N->getOperand(2))) {
       int BitLen = N->getConstantOperandVal(1);
       int BitIdx = N->getConstantOperandVal(2);
-      DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
-                       Mask);
+      DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
       IsUnary = true;
     }
     break;
@@ -5923,21 +5930,20 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
         isa<ConstantSDNode>(N->getOperand(3))) {
       int BitLen = N->getConstantOperandVal(2);
       int BitIdx = N->getConstantOperandVal(3);
-      DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
-                         Mask);
+      DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     }
     break;
   case X86ISD::UNPCKH:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
+    DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::UNPCKL:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
+    DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVHLPS:
@@ -5956,7 +5962,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -5982,21 +5988,21 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::PSHUFD:
   case X86ISD::VPERMILPI:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
-    DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    DecodePSHUFMask(NumElems, MaskEltSize,
                     cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
     break;
   case X86ISD::PSHUFHW:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       Mask);
     IsUnary = true;
     break;
   case X86ISD::PSHUFLW:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       Mask);
     IsUnary = true;
@@ -6029,14 +6035,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-      DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMILPMask(C, MaskEltSize, Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                    RawUndefs)) {
+      DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6047,20 +6048,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
     IsUnary = true;
     SDValue MaskNode = N->getOperand(1);
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
-      DecodePSHUFBMask(RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodePSHUFBMask(C, Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
+      DecodePSHUFBMask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
   }
   case X86ISD::VPERMI:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = true;
     break;
@@ -6073,7 +6069,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::VPERM2X128:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
     DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                          Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -6081,10 +6077,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::SHUF128:
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
-    ImmN = N->getOperand(N->getNumOperands()-1);
-    decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
-                              cast<ConstantSDNode>(ImmN)->getZExtValue(),
-                              Mask);
+    ImmN = N->getOperand(N->getNumOperands() - 1);
+    decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
+                              cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::MOVSLDUP:
@@ -6106,19 +6101,14 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
     SDValue MaskNode = N->getOperand(2);
     SDValue CtrlNode = N->getOperand(3);
     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
       unsigned CtrlImm = CtrlOp->getZExtValue();
-      SmallVector<uint64_t, 32> RawMask;
-      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-        DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
-                            RawMask, Mask);
-        break;
-      }
-      if (auto *C = getTargetConstantFromNode(MaskNode)) {
-        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
+      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                      RawUndefs)) {
+        DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
+                            Mask);
         break;
       }
     }
@@ -6129,13 +6119,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     SDValue MaskNode = N->getOperand(2);
-    SmallVector<uint64_t, 32> RawMask;
-    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
-      DecodeVPPERMMask(RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPPERMMask(C, Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
+      DecodeVPPERMMask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6146,14 +6131,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
     Ops.push_back(N->getOperand(1));
     SDValue MaskNode = N->getOperand(0);
-    SmallVector<uint64_t, 32> RawMask;
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
-    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
-      DecodeVPERMVMask(RawMask, Mask);
-      break;
-    }
-    if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMVMask(C, MaskEltSize, Mask);
+    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
+                                    RawUndefs)) {
+      DecodeVPERMVMask(RawMask, RawUndefs, Mask);
       break;
     }
     return false;
@@ -6166,9 +6146,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     Ops.push_back(N->getOperand(0));
     Ops.push_back(N->getOperand(2));
     SDValue MaskNode = N->getOperand(1);
-    unsigned MaskEltSize = VT.getScalarSizeInBits();
     if (auto *C = getTargetConstantFromNode(MaskNode)) {
-      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
+      DecodeVPERMV3Mask(C, MaskEltSize, VT.getSizeInBits(), Mask);
       break;
     }
     return false;
@@ -6356,9 +6335,6 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) ||
         !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG))
       return false;
-    // TODO - Add support for more than 2 inputs.
-    if ((SrcInputs0.size() + SrcInputs1.size()) > 2)
-      return false;
     int MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
     SmallVector<int, 64> Mask0, Mask1;
     scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
@@ -6410,16 +6386,14 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       Mask.push_back(i);
     for (int i = 0; i != (int)NumSubElts; ++i) {
       int M = SubMask[i];
-      if (M < 0) {
-        Mask[i + InsertIdx] = M;
-      } else {
+      if (0 <= M) {
         int InputIdx = M / NumSubElts;
         int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1);
-        Mask[i + InsertIdx] = (NumElts * (1 + InputIdx)) + ExtractIdx + M;
+        M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts);
       }
+      Mask[i + InsertIdx] = M;
     }
-    // TODO - Add support for more than 1 subinput.
-    return Ops.size() <= 2;
+    return true;
   }
   case ISD::SCALAR_TO_VECTOR: {
     // Match against a scalar_to_vector of an extract from a vector,
@@ -6561,8 +6535,8 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     MVT SrcVT = Src.getSimpleValueType();
     if (NumSizeInBits != SrcVT.getSizeInBits())
       break;
-    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
-                         VT.getVectorNumElements(), Mask);
+    DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+                         Mask);
     Ops.push_back(Src);
     return true;
   }
@@ -6612,7 +6586,8 @@ static bool resolveTargetShuffleInputs(SDValue Op,
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
-  return true;
+  // TODO - Add support for more than 2 inputs.
+  return Inputs.size() <= 2;
 }
 
 /// Returns the scalar element that will make up the ith
@@ -9917,11 +9892,7 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
 
   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
-  // We have to cast V2 around.
-  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
-                                      DAG.getBitcast(MaskVT, V1Mask),
-                                      DAG.getBitcast(MaskVT, V2)));
+  V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
 }
 
@@ -10099,6 +10070,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     // type.
     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 
+    // x86 allows load folding with blendvb from the 2nd source operand. But
+    // we are still using LLVM select here (see comment below), so that's V1.
+    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
+    // allow that load-folding possibility.
+    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
+      ShuffleVectorSDNode::commuteMask(Mask);
+      std::swap(V1, V2);
+    }
+
     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
     // mix of LLVM's code generator and the x86 backend. We tell the code
     // generator that boolean values in the elements of an x86 vector register
@@ -11265,7 +11245,8 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
       continue;
     }
     case ISD::CONCAT_VECTORS: {
-      int OperandSize = Mask.size() / V.getNumOperands();
+      int OperandSize =
+          V.getOperand(0).getSimpleValueType().getVectorNumElements();
       V = V.getOperand(BroadcastIdx / OperandSize);
       BroadcastIdx %= OperandSize;
       continue;
@@ -13426,6 +13407,60 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
 }
 
+/// Lower a vector shuffle crossing multiple 128-bit lanes as
+/// a lane permutation followed by a per-lane permutation.
+///
+/// This is mainly for cases where we can have non-repeating permutes
+/// in each lane.
+///
+/// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes,
+/// we should investigate merging them.
+static SDValue lowerVectorShuffleAsLanePermuteAndPermute(
+    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
+    SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  int NumLanes = VT.getSizeInBits() / 128;
+  int NumEltsPerLane = NumElts / NumLanes;
+
+  SmallVector<int, 4> SrcLaneMask(NumLanes, SM_SentinelUndef);
+  SmallVector<int, 16> LaneMask(NumElts, SM_SentinelUndef);
+  SmallVector<int, 16> PermMask(NumElts, SM_SentinelUndef);
+
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+    if (M < 0)
+      continue;
+
+    // Ensure that each lane comes from a single source lane.
+    int SrcLane = M / NumEltsPerLane;
+    int DstLane = i / NumEltsPerLane;
+    if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane))
+      return SDValue();
+    SrcLaneMask[DstLane] = SrcLane;
+
+    LaneMask[i] = (SrcLane * NumEltsPerLane) + (i % NumEltsPerLane);
+    PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane);
+  }
+
+  // If we're only shuffling a single lowest lane and the rest are identity
+  // then don't bother.
+  // TODO - isShuffleMaskInputInPlace could be extended to something like this.
+  int NumIdentityLanes = 0;
+  bool OnlyShuffleLowestLane = true;
+  for (int i = 0; i != NumLanes; ++i) {
+    if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane,
+                                   i * NumEltsPerLane))
+      NumIdentityLanes++;
+    else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes)
+      OnlyShuffleLowestLane = false;
+  }
+  if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
+    return SDValue();
+
+  SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask);
+  return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask);
+}
+
 /// Lower a vector shuffle crossing multiple 128-bit lanes as
 /// a permutation and blend of those lanes.
 ///
@@ -14162,6 +14197,11 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return V;
 
+    // Try to permute the lanes and then use a per-lane permute.
+    if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+            DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget))
+      return V;
+
     // Otherwise, fall back.
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                    DAG, Subtarget);
@@ -14196,6 +14236,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return Result;
+
   // If we have VLX support, we can use VEXPAND.
   if (Subtarget.hasVLX())
     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
@@ -14577,9 +14618,14 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (V2.isUndef()) {
     // There are no generalized cross-lane shuffle operations available on i16
     // element types.
-    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
+      if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+              DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+        return V;
+
       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
                                                      Mask, DAG, Subtarget);
+    }
 
     SmallVector<int, 8> RepeatedMask;
     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
@@ -14605,6 +14651,11 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
     return Result;
 
+  // Try to permute the lanes and then use a per-lane permute.
+  if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+          DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
 }
@@ -14667,9 +14718,14 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // There are no generalized cross-lane shuffle operations available on i8
   // element types.
-  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
+  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+    if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+      return V;
+
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
                                                    DAG, Subtarget);
+  }
 
   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
           DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
@@ -14685,6 +14741,11 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
     return Result;
 
+  // Try to permute the lanes and then use a per-lane permute.
+  if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute(
+          DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
+    return V;
+
   // Otherwise fall back on generic lowering.
   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
 }
@@ -15626,11 +15687,15 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
 }
 
 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+
   // A vselect where all conditions and data are constants can be optimized into
   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
-  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
-      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
-      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
+  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
+      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
     return SDValue();
 
   // Try to lower this to a blend-style vector shuffle. This can handle all
@@ -15640,7 +15705,9 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   // If this VSELECT has a vector if i1 as a mask, it will be directly matched
   // with patterns on the mask registers on AVX-512.
-  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+  MVT CondVT = Cond.getSimpleValueType();
+  unsigned CondEltSize = Cond.getScalarValueSizeInBits();
+  if (CondEltSize == 1)
     return Op;
 
   // Variable blends are only legal from SSE4.1 onward.
@@ -15649,24 +15716,32 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
 
   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
   // into an i1 condition so that we can use the mask-based 512-bit blend
   // instructions.
   if (VT.getSizeInBits() == 512) {
-    SDValue Cond = Op.getOperand(0);
-    // The vNi1 condition case should be handled above as it can be trivially
-    // lowered.
-    assert(Cond.getValueType().getScalarSizeInBits() ==
-               VT.getScalarSizeInBits() &&
-           "Should have a size-matched integer condition!");
     // Build a mask by testing the condition against zero.
-    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
     SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
-                                getZeroVector(VT, Subtarget, DAG, dl),
+                                getZeroVector(CondVT, Subtarget, DAG, dl),
                                 ISD::SETNE);
     // Now return a new VSELECT using the mask.
-    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+    return DAG.getSelect(dl, VT, Mask, LHS, RHS);
+  }
+
+  // SEXT/TRUNC cases where the mask doesn't match the destination size.
+  if (CondEltSize != EltSize) {
+    // If we don't have a sign splat, rely on the expansion.
+    if (CondEltSize != DAG.ComputeNumSignBits(Cond))
+      return SDValue();
+
+    MVT NewCondSVT = MVT::getIntegerVT(EltSize);
+    MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
+    Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
+    return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
   }
 
   // Only some types will be legal on some subtargets. If we can emit a legal
@@ -15687,10 +15762,10 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
   case MVT::v8i16:
   case MVT::v16i16: {
     // Bitcast everything to the vXi8 type and use a vXi8 vselect.
-    MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
-    SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
-    SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
-    SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
+    MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
+    Cond = DAG.getBitcast(CastVT, Cond);
+    LHS = DAG.getBitcast(CastVT, LHS);
+    RHS = DAG.getBitcast(CastVT, RHS);
     SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
     return DAG.getBitcast(VT, Select);
   }
@@ -17381,27 +17456,26 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   // Optimize vectors in AVX mode:
   //
   //   v8i16 -> v8i32
-  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
+  //   Use vpmovzwd for 4 lower elements  v8i16 -> v4i32.
   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
   //   Concat upper and lower parts.
   //
   //   v4i32 -> v4i64
-  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
+  //   Use vpmovzdq for 4 lower elements  v4i32 -> v2i64.
   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
   //   Concat upper and lower parts.
   //
 
-  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+
+  SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In);
+
+  SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
   SDValue Undef = DAG.getUNDEF(InVT);
   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
-  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
-
-  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
-                             VT.getVectorNumElements()/2);
-
-  OpLo = DAG.getBitcast(HVT, OpLo);
-  OpHi = DAG.getBitcast(HVT, OpHi);
+  OpHi = DAG.getBitcast(HalfVT, OpHi);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
@@ -17889,43 +17963,36 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
 
   bool IsF128 = (VT == MVT::f128);
+  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
+          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
+         "Unexpected type in LowerFABSorFNEG");
 
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   // decide if we should generate a 16-byte constant mask when we only need 4 or
   // 8 bytes for the scalar case.
 
-  MVT LogicVT;
-  MVT EltVT;
-
-  if (VT.isVector()) {
-    LogicVT = VT;
-    EltVT = VT.getVectorElementType();
-  } else if (IsF128) {
-    // SSE instructions are used for optimized f128 logical operations.
-    LogicVT = MVT::f128;
-    EltVT = VT;
-  } else {
-    // There are no scalar bitwise logical SSE/AVX instructions, so we
-    // generate a 16-byte vector constant and logic op even for the scalar case.
-    // Using a 16-byte mask allows folding the load of the mask with
-    // the logic op, so it can save (~4 bytes) on code size.
+  // There are no scalar bitwise logical SSE/AVX instructions, so we
+  // generate a 16-byte vector constant and logic op even for the scalar case.
+  // Using a 16-byte mask allows folding the load of the mask with
+  // the logic op, so it can save (~4 bytes) on code size.
+  bool IsFakeVector = !VT.isVector() && !IsF128;
+  MVT LogicVT = VT;
+  if (IsFakeVector)
     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
-    EltVT = VT;
-  }
 
-  unsigned EltBits = EltVT.getSizeInBits();
+  unsigned EltBits = VT.getScalarSizeInBits();
   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
-  APInt MaskElt =
-    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
-  const fltSemantics &Sem =
-      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
-          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+  APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
+                           APInt::getSignMask(EltBits);
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
 
   SDValue Op0 = Op.getOperand(0);
   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
-  unsigned LogicOp =
-    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
+  unsigned LogicOp = IsFABS  ? X86ISD::FAND :
+                     IsFNABS ? X86ISD::FOR  :
+                               X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
 
   if (VT.isVector() || IsF128)
@@ -17961,10 +18028,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
          "Unexpected type in LowerFCOPYSIGN");
 
-  MVT EltVT = VT.getScalarType();
-  const fltSemantics &Sem =
-      EltVT == MVT::f64 ? APFloat::IEEEdouble()
-                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
 
   // Perform all scalar logic operations as 16-byte vectors because there are no
   // scalar FP logic instructions in SSE.
@@ -17981,7 +18045,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   SDValue SignMask = DAG.getConstantFP(
       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
   SDValue MagMask = DAG.getConstantFP(
-      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
+      APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
 
   // First, clear all bits but the sign bit from the second operand (sign).
   if (IsFakeVector)
@@ -17992,7 +18056,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
   // TODO: If we had general constant folding for FP logic ops, this check
   // wouldn't be necessary.
   SDValue MagBits;
-  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
+  if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
     APFloat APF = Op0CN->getValueAPF();
     APF.clearSign();
     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
@@ -19638,7 +19702,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
   if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
     // If v16i32 is to be avoided, we'll need to split and concatenate.
     if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
-      return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
+      return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
 
     ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
   }
@@ -19657,7 +19721,7 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
   MVT WideEltVT = WideVT.getVectorElementType();
   if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
       (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
-    V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
+    V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
   } else {
     SDValue NegOne = getOnesVector(WideVT, DAG, dl);
     SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
@@ -19766,7 +19830,7 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   // As SRAI is only available on i16/i32 types, we expand only up to i32
   // and handle i64 separately.
   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
-    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
+    Curr = getUnpackl(DAG, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
     Curr = DAG.getBitcast(CurrVT, Curr);
@@ -19823,30 +19887,22 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   //              v4i32 to v4i64
   //
   // Divide input vector into two parts
-  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
+  // for v4i32 the high shuffle mask will be {2, 3, -1, -1}
   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
   // concat the vectors to original VT
 
-  unsigned NumElems = InVT.getVectorNumElements();
-  SDValue Undef = DAG.getUNDEF(InVT);
-
-  SmallVector<int,8> ShufMask1(NumElems, -1);
-  for (unsigned i = 0; i != NumElems/2; ++i)
-    ShufMask1[i] = i;
+  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
 
-  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
+  SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
 
-  SmallVector<int,8> ShufMask2(NumElems, -1);
+  unsigned NumElems = InVT.getVectorNumElements();
+  SmallVector<int,8> ShufMask(NumElems, -1);
   for (unsigned i = 0; i != NumElems/2; ++i)
-    ShufMask2[i] = i + NumElems/2;
-
-  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
-
-  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
-                                VT.getVectorNumElements() / 2);
+    ShufMask[i] = i + NumElems/2;
 
-  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
-  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
+  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
+  OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
@@ -19858,18 +19914,36 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
   SDValue StoredVal = St->getValue();
 
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
-  assert(StoredVal.getValueType().isVector() &&
-         StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
-         StoredVal.getValueType().getVectorNumElements() <= 8 &&
-         "Unexpected VT");
-  assert(!St->isTruncatingStore() && "Expected non-truncating store");
-  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
-         "Expected AVX512F without AVX512DQI");
-
-  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
-                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+  if (StoredVal.getValueType().isVector() &&
+      StoredVal.getValueType().getVectorElementType() == MVT::i1) {
+    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
+           "Unexpected VT");
+    assert(!St->isTruncatingStore() && "Expected non-truncating store");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                            DAG.getUNDEF(MVT::v8i1), StoredVal,
+                            DAG.getIntPtrConstant(0, dl));
+    StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
+  if (St->isTruncatingStore())
+    return SDValue();
+
+  assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT");
+
+  // Widen the vector, cast to a v2x64 type, extract the single 64-bit
+  // element and store it.
+  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal,
+                          DAG.getUNDEF(MVT::v2f32));
+  StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal);
+  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal,
                           DAG.getIntPtrConstant(0, dl));
-  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
 
   return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                       St->getPointerInfo(), St->getAlignment(),
@@ -20074,7 +20148,8 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
 
-    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
+    SDValue Shuff = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, RegVT,
+                                SlicedVec);
     return DAG.getMergeValues({Shuff, TF}, dl);
   }
 
@@ -20759,7 +20834,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
     MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? MVT::v16i8 : MVT::v8i16;
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt);
     if (Subtarget.hasSSE41())
-      ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+      ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
+                          MVT::v2i64, ShAmt);
     else {
       SDValue ByteShift = DAG.getConstant(
           (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
@@ -20772,7 +20848,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
   } else if (Subtarget.hasSSE41() &&
              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
-    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
+    ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
+                        MVT::v2i64, ShAmt);
   } else {
     SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT),
                         DAG.getUNDEF(SVT)};
@@ -22904,38 +22981,23 @@ static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
   return Op;
 }
 
-static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
+                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   unsigned NumBits = VT.getScalarSizeInBits();
+  SDValue N0 = Op.getOperand(0);
   SDLoc dl(Op);
 
-  if (VT.isVector()) {
-    SDValue N0 = Op.getOperand(0);
-    SDValue Zero = DAG.getConstant(0, dl, VT);
-
-    // lsb(x) = (x & -x)
-    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
-                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
-
-    // cttz_undef(x) = (width - 1) - ctlz(lsb)
-    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
-      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
-      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
-                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
-    }
-
-    // cttz(x) = ctpop(lsb - 1)
-    SDValue One = DAG.getConstant(1, dl, VT);
-    return DAG.getNode(ISD::CTPOP, dl, VT,
-                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
-  }
+  // Decompose 256-bit ops into smaller 128-bit ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return Lower256IntUnary(Op, DAG);
 
-  assert(Op.getOpcode() == ISD::CTTZ &&
+  assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
          "Only scalar CTTZ requires custom lowering");
 
   // Issue a bsf (scan bits forward) which also sets EFLAGS.
   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
 
   // If src is zero (i.e. bsf sets ZF), returns NumBits.
   SDValue Ops[] = {
@@ -23532,7 +23594,6 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
-  unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
 
   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
@@ -23576,24 +23637,11 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   };
 
   // Optimize shl/srl/sra with constant shift amount.
-  APInt UndefElts;
-  SmallVector<APInt, 8> EltBits;
-  if (!getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits,
-                                     true, false))
+  APInt APIntShiftAmt;
+  if (!isConstantSplat(Amt, APIntShiftAmt))
     return SDValue();
+  uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
 
-  int SplatIndex = -1;
-  for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
-    if (UndefElts[i])
-      continue;
-    if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex])
-      return SDValue();
-    SplatIndex = i;
-  }
-  if (SplatIndex < 0)
-    return SDValue();
-
-  uint64_t ShiftAmt = EltBits[SplatIndex].getZExtValue();
   if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
     return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
 
@@ -25019,7 +25067,9 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                         SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
-  unsigned VecSize = VT.getSizeInBits();
+  int NumElts = VT.getVectorNumElements();
+  (void)EltVT;
+  assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
 
   // Implement a lookup table in register by using an algorithm based on:
   // http://wm.ite.pl/articles/sse-popcount.html
@@ -25031,109 +25081,30 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   // masked out higher ones) for each byte. PSHUFB is used separately with both
   // to index the in-register table. Next, both are added and the result is a
   // i8 vector where each element contains the pop count for input byte.
-  //
-  // To obtain the pop count for elements != i8, we follow up with the same
-  // approach and use additional tricks as described below.
-  //
   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
 
-  int NumByteElts = VecSize / 8;
-  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
-  SDValue In = DAG.getBitcast(ByteVecVT, Op);
   SmallVector<SDValue, 64> LUTVec;
-  for (int i = 0; i < NumByteElts; ++i)
+  for (int i = 0; i < NumElts; ++i)
     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
-  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
-  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
+  SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
+  SDValue M0F = DAG.getConstant(0x0F, DL, VT);
 
   // High nibbles
-  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
-  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
+  SDValue FourV = DAG.getConstant(4, DL, VT);
+  SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
 
   // Low nibbles
-  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
+  SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
 
   // The input vector is used as the shuffle mask that index elements into the
   // LUT. After counting low and high nibbles, add the vector to obtain the
   // final pop count per i8 element.
-  SDValue HighPopCnt =
-      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
-  SDValue LowPopCnt =
-      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
-  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
-
-  if (EltVT == MVT::i8)
-    return PopCnt;
-
-  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
-}
-
-static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
-                                       const X86Subtarget &Subtarget,
-                                       SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  assert(VT.is128BitVector() &&
-         "Only 128-bit vector bitmath lowering supported.");
-
-  int VecSize = VT.getSizeInBits();
-  MVT EltVT = VT.getVectorElementType();
-  int Len = EltVT.getSizeInBits();
-
-  // This is the vectorized version of the "best" algorithm from
-  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-  // with a minor tweak to use a series of adds + shifts instead of vector
-  // multiplications. Implemented for all integer vector types. We only use
-  // this when we don't have SSSE3 which allows a LUT-based lowering that is
-  // much faster, even faster than using native popcnt instructions.
-
-  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
-    MVT VT = V.getSimpleValueType();
-    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
-    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
-  };
-  auto GetMask = [&](SDValue V, APInt Mask) {
-    MVT VT = V.getSimpleValueType();
-    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
-    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
-  };
-
-  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
-  // x86, so set the SRL type to have elements at least i16 wide. This is
-  // correct because all of our SRLs are followed immediately by a mask anyways
-  // that handles any bits that sneak into the high bits of the byte elements.
-  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
-
-  SDValue V = Op;
-
-  // v = v - ((v >> 1) & 0x55555555...)
-  SDValue Srl =
-      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
-  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
-  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
-
-  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
-  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
-  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
-  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
-  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
-
-  // v = (v + (v >> 4)) & 0x0F0F0F0F...
-  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
-  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
-  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
-
-  // At this point, V contains the byte-wise population count, and we are
-  // merely doing a horizontal sum if necessary to get the wider element
-  // counts.
-  if (EltVT == MVT::i8)
-    return V;
-
-  return LowerHorizontalByteSum(
-      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
-      DAG);
+  SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
+  SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
+  return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
 }
 
 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
@@ -25159,12 +25130,6 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
-  if (!Subtarget.hasSSSE3()) {
-    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
-    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
-    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
-  }
-
   // Decompose 256-bit ops into smaller 128-bit ops.
   if (VT.is256BitVector() && !Subtarget.hasInt256())
     return Lower256IntUnary(Op, DAG);
@@ -25173,6 +25138,18 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
   if (VT.is512BitVector() && !Subtarget.hasBWI())
     return Lower512IntUnary(Op, DAG);
 
+  // For element types greater than i8, do vXi8 pop counts and a bytesum.
+  if (VT.getScalarType() != MVT::i8) {
+    MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
+    SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
+    SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
+    return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
+  }
+
+  // We can't use the fast LUT approach, so fall back on LegalizeDAG.
+  if (!Subtarget.hasSSSE3())
+    return SDValue();
+
   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
 }
 
@@ -25874,7 +25851,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
   case ISD::CTTZ:
-  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
+  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, Subtarget, DAG);
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   case ISD::MULHS:
   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
@@ -25943,6 +25920,24 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::MUL: {
+    EVT VT = N->getValueType(0);
+    assert(VT.isVector() && VT.getVectorNumElements() == 2 && "Unexpected VT");
+    if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) {
+      // Promote to a pattern that will be turned into PMULUDQ.
+      SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
+                               N->getOperand(0));
+      N0 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N0,
+                       DAG.getConstant(0xffffffff, dl, MVT::v2i64));
+      SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
+                               N->getOperand(1));
+      N1 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N1,
+                       DAG.getConstant(0xffffffff, dl, MVT::v2i64));
+      SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v2i64, N0, N1);
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
+    }
+    return;
+  }
   case X86ISD::ADDUS:
   case X86ISD::SUBUS:
   case X86ISD::AVG: {
@@ -26035,6 +26030,24 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     SDValue Src = N->getOperand(0);
     EVT SrcVT = Src.getValueType();
 
+    // Promote these manually to avoid over promotion to v2i64. Type
+    // legalization will revisit the v2i32 operation for more cleanup.
+    if ((VT == MVT::v2i8 || VT == MVT::v2i16) &&
+        getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
+      // AVX512DQ provides instructions that produce a v2i64 result.
+      if (Subtarget.hasDQI())
+        return;
+
+      SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src);
+      Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
+                                                          : ISD::AssertSext,
+                        dl, MVT::v2i32, Res,
+                        DAG.getValueType(VT.getVectorElementType()));
+      Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+      Results.push_back(Res);
+      return;
+    }
+
     if (VT == MVT::v2i32) {
       assert((IsSigned || Subtarget.hasAVX512()) &&
              "Can only handle signed conversion without AVX512");
@@ -26061,7 +26074,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
         return;
       }
       if (SrcVT == MVT::v2f32 &&
-          getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) {
+          getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) {
         SDValue Idx = DAG.getIntPtrConstant(0, dl);
         SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                   DAG.getUNDEF(MVT::v2f32));
@@ -26313,29 +26326,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
 
     if (SrcVT != MVT::f64 ||
-        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) ||
+        getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector)
       return;
 
     unsigned NumElts = DstVT.getVectorNumElements();
     EVT SVT = DstVT.getVectorElementType();
     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
-    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
-                                   MVT::v2f64, N->getOperand(0));
-    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
-
-    if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
-      // If we are legalizing vectors by widening, we already have the desired
-      // legal vector type, just return it.
-      Results.push_back(ToVecInt);
-      return;
-    }
-
-    SmallVector<SDValue, 8> Elts;
-    for (unsigned i = 0, e = NumElts; i != e; ++i)
-      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
-                                   ToVecInt, DAG.getIntPtrConstant(i, dl)));
-
-    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
+    SDValue Res;
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0));
+    Res = DAG.getBitcast(WiderVT, Res);
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res,
+                      DAG.getIntPtrConstant(0, dl));
+    Results.push_back(Res);
     return;
   }
   case ISD::MGATHER: {
@@ -26420,6 +26423,25 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     break;
   }
+  case ISD::LOAD: {
+    // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids
+    // scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp cast
+    // since type legalization will try to use an i64 load.
+    assert(N->getValueType(0) == MVT::v2f32 && "Unexpected VT");
+    if (!ISD::isNON_EXTLoad(N))
+      return;
+    auto *Ld = cast<LoadSDNode>(N);
+    SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(),
+                              Ld->getPointerInfo(),
+                              Ld->getAlignment(),
+                              Ld->getMemOperand()->getFlags());
+    SDValue Chain = Res.getValue(1);
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res);
+    Res = DAG.getBitcast(MVT::v4f32, Res);
+    Results.push_back(Res);
+    Results.push_back(Chain);
+    return;
+  }
   }
 }
 
@@ -26564,14 +26586,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UMUL:               return "X86ISD::UMUL";
   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
-  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
-  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
   case X86ISD::INC:                return "X86ISD::INC";
   case X86ISD::DEC:                return "X86ISD::DEC";
   case X86ISD::OR:                 return "X86ISD::OR";
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
+  case X86ISD::BZHI:               return "X86ISD::BZHI";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
@@ -26858,6 +26879,10 @@ bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
   return isInt<32>(Imm);
 }
 
+bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
+  return isInt<32>(Imm);
+}
+
 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   if (!VT1.isInteger() || !VT2.isInteger())
     return false;
@@ -29509,13 +29534,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.Zero &= Known2.Zero;
     break;
   }
-  case X86ISD::UDIVREM8_ZEXT_HREG:
-    // TODO: Support more than just the zero extended bits?
-    if (Op.getResNo() != 1)
-      break;
-    // The remainder is zero extended.
-    Known.Zero.setBitsFrom(8);
-    break;
   }
 
   // Handle target shuffles.
@@ -29646,12 +29664,6 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
     return std::min(Tmp0, Tmp1);
   }
-  case X86ISD::SDIVREM8_SEXT_HREG:
-    // TODO: Support more than just the sign extended bits?
-    if (Op.getResNo() != 1)
-      break;
-    // The remainder is sign extended.
-    return VTBits - 7;
   }
 
   // Fallback case.
@@ -31847,6 +31859,68 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   return false;
 }
 
+bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits, KnownBits &Known,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  switch(Opc) {
+  case X86ISD::PMULDQ:
+  case X86ISD::PMULUDQ: {
+    // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
+    KnownBits KnownOp;
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+    // FIXME: Can we bound this better?
+    APInt DemandedMask = APInt::getLowBitsSet(64, 32);
+    if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+      return true;
+    break;
+  }
+  case X86ISD::VSHLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      KnownBits KnownOp;
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
+                               Depth + 1))
+        return true;
+    }
+    break;
+  }
+  case X86ISD::VSRAI:
+  case X86ISD::VSRLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(BitWidth))
+        break;
+
+      KnownBits KnownOp;
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      APInt DemandedMask = OriginalDemandedBits << ShAmt;
+
+      // If any of the demanded bits are produced by the sign extension, we also
+      // demand the input sign bit.
+      if (Opc == X86ISD::VSRAI &&
+          OriginalDemandedBits.countLeadingZeros() < ShAmt)
+        DemandedMask.setSignBit();
+
+      if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, KnownOp, TLO,
+                               Depth + 1))
+        return true;
+    }
+    break;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, Known, TLO, Depth);
+}
+
 /// Check if a vector extract from a target-specific shuffle of a load can be
 /// folded into a single element load.
 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
@@ -34313,7 +34387,7 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
   if (!Subtarget.hasSSE2())
     return SDValue();
 
-  if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
+  if (Subtarget.isPMADDWDSlow())
     return SDValue();
 
   EVT VT = N->getValueType(0);
@@ -34393,6 +34467,26 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
 
+  // Look for multiply of 2 identical shuffles with a zero vector. Shuffle the
+  // result and insert the zero there instead. This can occur due to
+  // type legalization of v2i32 multiply to a PMULUDQ pattern.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (!DCI.isBeforeLegalize() && isa<ShuffleVectorSDNode>(LHS) &&
+      isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+      LHS.getOperand(1) == RHS.getOperand(1) &&
+      ISD::isBuildVectorAllZeros(LHS.getOperand(1).getNode())) {
+    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+    if (SVN0->getMask().equals(SVN1->getMask())) {
+      SDLoc dl(N);
+      SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, LHS.getOperand(0),
+                                RHS.getOperand(0));
+      return DAG.getVectorShuffle(VT, dl, Mul, DAG.getConstant(0, dl, VT),
+                                  SVN0->getMask());
+    }
+  }
+
   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
     return V;
 
@@ -34869,6 +34963,11 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
   }
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+                               APInt::getAllOnesValue(NumBitsPerElt), DCI))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
@@ -35002,13 +35101,13 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == ISD::AND);
 
-  EVT VT = N->getValueType(0);
-  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
     return SDValue();
 
   SDValue X, Y;
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
+  SDValue N0 = peekThroughBitcasts(N->getOperand(0));
+  SDValue N1 = peekThroughBitcasts(N->getOperand(1));
   if (N0.getOpcode() == ISD::XOR &&
       ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
     X = N0.getOperand(0);
@@ -35020,6 +35119,8 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
   } else
     return SDValue();
 
+  X = DAG.getBitcast(VT, X);
+  Y = DAG.getBitcast(VT, Y);
   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
 }
 
@@ -35145,6 +35246,10 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
       !SplatVal.isMask())
     return SDValue();
 
+  // Don't prevent creation of ANDN.
+  if (isBitwiseNot(Op0))
+    return SDValue();
+
   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
     return SDValue();
 
@@ -35566,7 +35671,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
   if (!Subtarget.hasSSE41())
     return SDValue();
 
-  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
 
   X = DAG.getBitcast(BlendVT, X);
   Y = DAG.getBitcast(BlendVT, Y);
@@ -36861,7 +36966,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
     // pair instead.
     if (Subtarget.is64Bit() || F64IsLegal) {
-      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+      MVT LdVT = (Subtarget.is64Bit() &&
+                  (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                   Ld->getMemOperand());
 
@@ -37026,9 +37132,10 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
         continue;
 
       // The  low half of the 128-bit result must choose from A.
-      // The high half of the 128-bit result must choose from B.
+      // The high half of the 128-bit result must choose from B,
+      // unless B is undef. In that case, we are always choosing from A.
       unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = i >= NumEltsPer64BitChunk;
+      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
 
       // Check that successive elements are being operated on. If not, this is
       // not a horizontal operation.
@@ -37044,6 +37151,16 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   return true;
 }
 
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldCombineToHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
 /// Do target-specific dag combines on floating-point adds/subs.
 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
@@ -37056,7 +37173,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, IsFadd)) {
+      isHorizontalBinOp(LHS, RHS, IsFadd) &&
+      shouldCombineToHorizontalOp(LHS == RHS, DAG, Subtarget)) {
     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
   }
@@ -37660,25 +37778,27 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
   MVT VT = N->getSimpleValueType(0);
   // If we have integer vector types available, use the integer opcodes.
-  if ((VT.isVector() || VT == MVT::f128) && Subtarget.hasSSE2()) {
-    SDLoc dl(N);
+  if (!VT.isVector() || !Subtarget.hasSSE2())
+    return SDValue();
 
-    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+  SDLoc dl(N);
 
-    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
-    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
-    unsigned IntOpcode;
-    switch (N->getOpcode()) {
-    default: llvm_unreachable("Unexpected FP logic op");
-    case X86ISD::FOR: IntOpcode = ISD::OR; break;
-    case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
-    case X86ISD::FAND: IntOpcode = ISD::AND; break;
-    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
-    }
-    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
-    return DAG.getBitcast(VT, IntOp);
+  unsigned IntBits = VT.getScalarSizeInBits();
+  MVT IntSVT = MVT::getIntegerVT(IntBits);
+  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
+
+  SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
+  SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
+  unsigned IntOpcode;
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("Unexpected FP logic op");
+  case X86ISD::FOR:   IntOpcode = ISD::OR; break;
+  case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
+  case X86ISD::FAND:  IntOpcode = ISD::AND; break;
+  case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
   }
-  return SDValue();
+  SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
+  return DAG.getBitcast(VT, IntOp);
 }
 
 
@@ -37943,15 +38063,22 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+
   // ANDNP(0, x) -> x
   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
     return N->getOperand(1);
 
   // ANDNP(x, 0) -> 0
   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
-    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
+    return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
 
-  EVT VT = N->getValueType(0);
+  // Turn ANDNP back to AND if input is inverted.
+  if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
+      ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
+    return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                       N->getOperand(0).getOperand(0), N->getOperand(1));
+  }
 
   // Attempt to recursively combine a bitmask ANDNP with shuffles.
   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -38121,36 +38248,6 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
 }
 
-/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
-/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
-/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
-/// extends from AH (which we otherwise need to do contortions to access).
-static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
-  SDValue N0 = N->getOperand(0);
-  auto OpcodeN = N->getOpcode();
-  auto OpcodeN0 = N0.getOpcode();
-  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
-        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-  EVT InVT = N0.getValueType();
-  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
-      !(VT == MVT::i32 || VT == MVT::i64))
-    return SDValue();
-
-  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
-  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
-                                               : X86ISD::UDIVREM8_ZEXT_HREG;
-  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
-                          N0.getOperand(1));
-  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
-  // If this was a 64-bit extend, complete it.
-  if (VT == MVT::i64)
-    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
-  return R.getValue(1);
-}
-
 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
 // operands and the result of CMOV is not used anywhere else - promote CMOV
 // itself instead of promoting its result. This could be beneficial, because:
@@ -38326,9 +38423,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
     return SDValue();
 
-  // On AVX2+ targets, if the input/output types are both legal then we will be
-  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
-  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+  // If the input/output types are both legal then we have at least AVX1 and
+  // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly.
+  if (DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
     return SDValue();
 
@@ -38363,9 +38460,9 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
       (VT.is256BitVector() && Subtarget.hasAVX()) ||
       (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
-    return Opcode == ISD::SIGN_EXTEND
-               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
-               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
+    Opcode = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                        : ISD::ZERO_EXTEND_VECTOR_INREG;
+    return DAG.getNode(Opcode, DL, VT, ExOp);
   }
 
   auto SplitAndExtendInReg = [&](unsigned SplitSize) {
@@ -38374,14 +38471,15 @@ static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
 
+    unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG
+                                                : ISD::ZERO_EXTEND_VECTOR_INREG;
+
     SmallVector<SDValue, 8> Opnds;
     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                    DAG.getIntPtrConstant(Offset, DL));
       SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
-      SrcVec = Opcode == ISD::SIGN_EXTEND
-                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
-                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
+      SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec);
       Opnds.push_back(SrcVec);
     }
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
@@ -38451,9 +38549,6 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   EVT InVT = N0.getValueType();
   SDLoc DL(N);
 
-  if (SDValue DivRem8 = getDivRem8(N, DAG))
-    return DivRem8;
-
   if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
     return NewCMov;
 
@@ -38654,9 +38749,6 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
     if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
       return R;
 
-  if (SDValue DivRem8 = getDivRem8(N, DAG))
-    return DivRem8;
-
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
     return NewAdd;
 
@@ -39518,10 +39610,8 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
     return SDValue();
 
-  SDNode *N1 = N->getOperand(1).getNode();
   APInt SplatVal;
-  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
-      !SplatVal.isOneValue())
+  if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue())
     return SDValue();
 
   SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
@@ -39780,7 +39870,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   // Try to synthesize horizontal adds from adds of shuffles.
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
+      shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
     auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -39911,7 +40002,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
+      shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
     auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
@@ -40279,6 +40371,18 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                                  : ISD::SIGN_EXTEND_VECTOR_INREG;
       return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0));
     }
+    if (InOpcode == ISD::BITCAST) {
+      // TODO - do this for target shuffles in general.
+      SDValue InVecBC = peekThroughOneUseBitcasts(InVec);
+      if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) {
+        SDLoc DL(N);
+        SDValue SubPSHUFB =
+            DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
+                        extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL),
+                        extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL));
+        return DAG.getBitcast(OpVT, SubPSHUFB);
+      }
+    }
   }
 
   return SDValue();
@@ -40316,18 +40420,10 @@ static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(RHS.getNode()))
     return RHS;
 
+  // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  APInt DemandedMask(APInt::getLowBitsSet(64, 32));
-
-  // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
-  if (TLI.SimplifyDemandedBits(LHS, DemandedMask, DCI)) {
-    DCI.AddToWorklist(N);
-    return SDValue(N, 0);
-  }
-  if (TLI.SimplifyDemandedBits(RHS, DemandedMask, DCI)) {
-    DCI.AddToWorklist(N);
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
     return SDValue(N, 0);
-  }
 
   return SDValue();
 }
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b5e9eb3b86f8046545645d34a5d6b485b0336827..7cda0259bf27bdd837cdf1d0314deb816776354c 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -355,16 +355,15 @@ namespace llvm {
       // Bit field extract.
       BEXTR,
 
+      // Zero High Bits Starting with Specified Bit Position.
+      BZHI,
+
       // LOW, HI, FLAGS = umul LHS, RHS.
       UMUL,
 
       // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS.
       SMUL8, UMUL8,
 
-      // 8-bit divrem that zero-extend the high result (AH).
-      UDIVREM8_ZEXT_HREG,
-      SDIVREM8_SEXT_HREG,
-
       // X86-specific multiply by immediate.
       MUL_IMM,
 
@@ -875,6 +874,12 @@ namespace llvm {
                                                  TargetLoweringOpt &TLO,
                                                  unsigned Depth) const override;
 
+    bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+                                           const APInt &DemandedBits,
+                                           KnownBits &Known,
+                                           TargetLoweringOpt &TLO,
+                                           unsigned Depth) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
@@ -941,6 +946,8 @@ namespace llvm {
     /// the immediate into a register.
     bool isLegalAddImmediate(int64_t Imm) const override;
 
+    bool isLegalStoreImmediate(int64_t Imm) const override;
+
     /// Return the cost of the scaling factor used in the addressing
     /// mode represented by AM for this target, for a load/store
     /// of the specified type.
@@ -1106,7 +1113,7 @@ namespace llvm {
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
 
     /// Customize the preferred legalization strategy for certain types.
-    LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
+    LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
 
     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                       EVT VT) const override;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b1cb1545ec4c29312d09bb188df64979ba12adc9..f8ade37f8dfce5351e772587071d72d8b7957426 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -66,21 +66,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
                            !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
 
   // Load patterns
-  // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
-  //       due to load promotion during legalization
-  PatFrag LdFrag = !cast<PatFrag>("load" #
-                                  !if (!eq (TypeVariantName, "i"),
-                                       !if (!eq (Size, 128), "v2i64",
-                                       !if (!eq (Size, 256), "v4i64",
-                                       !if (!eq (Size, 512), "v8i64",
-                                            VTName))), VTName));
-
-  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" #
-                                         !if (!eq (TypeVariantName, "i"),
-                                               !if (!eq (Size, 128), "v2i64",
-                                               !if (!eq (Size, 256), "v4i64",
-                                               !if (!eq (Size, 512), "v8i64",
-                                                   VTName))), VTName));
+  PatFrag LdFrag = !cast<PatFrag>("load" # VTName);
+
+  PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # VTName);
 
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
 
@@ -107,10 +95,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
 
   RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X);
 
-  // A vector tye of the same width with element type i64. This is used to
-  // create patterns for logic ops.
-  ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64");
-
   // A vector type of the same width with element type i32.  This is used to
   // create the canonical constant zero node ImmAllZerosV.
   ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32");
@@ -518,10 +502,10 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
-                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (From.VT (From.LdFrag addr:$src2)),
                                (iPTR imm)),
                    (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
-                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (From.VT (From.LdFrag addr:$src2)),
                                (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
                    EVEX_CD8<From.EltSize, From.CD8TupleForm>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -547,7 +531,7 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
 
     def : Pat<(vinsert_insert:$ins
                   (To.VT To.RC:$src1),
-                  (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                  (From.VT (From.LdFrag addr:$src2)),
                   (iPTR imm)),
               (To.VT (!cast<Instruction>(InstrStr#"rm")
                   To.RC:$src1, addr:$src2,
@@ -680,9 +664,7 @@ let Predicates = p in {
              (vselect Cast.KRCWM:$mask,
                       (bitconvert
                        (vinsert_insert:$ins (To.VT To.RC:$src1),
-                                            (From.VT
-                                             (bitconvert
-                                              (From.LdFrag addr:$src2))),
+                                            (From.VT (From.LdFrag addr:$src2)),
                                             (iPTR imm))),
                       Cast.ImmAllZerosV)),
             (!cast<Instruction>(InstrStr#"rmkz")
@@ -1374,7 +1356,7 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.VT (_Src.LdFrag addr:$src))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1389,7 +1371,7 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
                            (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
                            (null_frag),
                            (_Dst.VT (X86SubVBroadcast
-                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.VT (_Src.LdFrag addr:$src))))>,
                            Sched<[SchedWriteShuffle.YMM.Folded]>,
                            AVX5128IBase, EVEX;
 }
@@ -1442,11 +1424,11 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
 let Predicates = [HasAVX512] in {
 def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
           (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+def : Pat<(v16i32 (X86SubVBroadcast (loadv8i32 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+def : Pat<(v32i16 (X86SubVBroadcast (loadv16i16 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+def : Pat<(v64i8 (X86SubVBroadcast (loadv32i8 addr:$src))),
           (VBROADCASTI64X4rm addr:$src)>;
 
 // Provide fallback in case the load node that is used in the patterns above
@@ -1474,9 +1456,9 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4rm addr:$src)>;
 def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v32i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1506,11 +1488,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
                    VR512:$src0),
           (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1527,9 +1509,9 @@ def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
           (VBROADCASTF32X4Z256rm addr:$src)>;
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 
 // Patterns for selects of bitcasted operations.
@@ -1591,11 +1573,11 @@ def : Pat<(vselect VK4WM:$mask,
                    VR256X:$src0),
           (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    (bc_v4i64 (v8i32 immAllZerosV))),
           (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
 def : Pat<(vselect VK4WM:$mask,
-                   (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    VR256X:$src0),
           (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
 }
@@ -1641,11 +1623,11 @@ def : Pat<(vselect VK8WM:$mask,
                    VR512:$src0),
           (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    (bc_v8i64 (v16i32 immAllZerosV))),
           (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
 def : Pat<(vselect VK8WM:$mask,
-                   (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))),
+                   (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
                    VR512:$src0),
           (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
 }
@@ -1741,7 +1723,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
             (ins _.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,
-                   (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
+                   (_.VT (_.LdFrag addr:$src3)))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -1859,7 +1841,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
             (ins IdxVT.RC:$src2, _.MemOp:$src3),
             OpcodeStr, "$src3, $src2", "$src2, $src3",
             (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
-                   (bitconvert (_.LdFrag addr:$src3)))), 1>,
+                   (_.LdFrag addr:$src3))), 1>,
             EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
@@ -2149,7 +2131,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
              (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
+                                       (_.VT (_.LdFrag addr:$src2))))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = IsCommutable in
   def rrk : AVX512BI<opc, MRMSrcReg,
@@ -2165,8 +2147,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode,
                           "$dst {${mask}}, $src1, $src2}"),
               [(set _.KRC:$dst, (and _.KRCWM:$mask,
                                    (OpNode (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert
-                                              (_.LdFrag addr:$src2))))))]>,
+                                       (_.VT (_.LdFrag addr:$src2)))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -2291,7 +2272,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
              [(set _.KRC:$dst, (_.KVT
                                 (Frag:$cc
                                  (_.VT _.RC:$src1),
-                                 (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                                 (_.VT (_.LdFrag addr:$src2)),
                                  cond)))]>,
              EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   let isCommutable = 1 in
@@ -2316,8 +2297,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                                      (_.KVT
                                       (Frag:$cc
                                        (_.VT _.RC:$src1),
-                                       (_.VT (bitconvert
-                                              (_.LdFrag addr:$src2))),
+                                       (_.VT (_.LdFrag addr:$src2)),
                                        cond))))]>,
               EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2352,13 +2332,13 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag,
                NotMemoryFoldable;
   }
 
-  def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+  def : Pat<(_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
                                  (_.VT _.RC:$src1), cond)),
             (!cast<Instruction>(Name#_.ZSuffix#"rmi")
              _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>;
 
   def : Pat<(and _.KRCWM:$mask,
-                 (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)),
+                 (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
                                       (_.VT _.RC:$src1), cond))),
             (!cast<Instruction>(Name#_.ZSuffix#"rmik")
              _.KRCWM:$mask, _.RC:$src1, addr:$src2,
@@ -2544,7 +2524,7 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
                 "vcmp${cc}"#_.Suffix,
                 "$src2, $src1", "$src1, $src2",
                 (X86cmpm (_.VT _.RC:$src1),
-                        (_.VT (bitconvert (_.LdFrag addr:$src2))),
+                        (_.VT (_.LdFrag addr:$src2)),
                         imm:$cc)>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -2732,7 +2712,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set _.KRC:$dst,(OpNode
-                                     (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                     (_.VT (_.LdFrag addr:$src1)),
                                      (i32 imm:$src2)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -2740,7 +2720,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                     OpcodeStr##_.Suffix##mem#
                     "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
                     [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode
-                                  (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                                  (_.VT (_.LdFrag addr:$src1)),
                                   (i32 imm:$src2))))]>,
                     EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
@@ -3353,7 +3333,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     !if(NoRMPattern, [],
                         [(set _.RC:$dst,
-                          (_.VT (bitconvert (ld_frag addr:$src))))]),
+                          (_.VT (ld_frag addr:$src)))]),
                     _.ExeDomain>, EVEX, Sched<[Sched.RM]>,
                     EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
 
@@ -3372,7 +3352,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                       "${dst} {${mask}}, $src1}"),
                      [(set _.RC:$dst, (_.VT
                          (vselect _.KRCWM:$mask,
-                          (_.VT (bitconvert (ld_frag addr:$src1))),
+                          (_.VT (ld_frag addr:$src1)),
                            (_.VT _.RC:$src0))))], _.ExeDomain>,
                      EVEX, EVEX_K, Sched<[Sched.RM]>;
   }
@@ -3381,7 +3361,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
                   OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
                                 "${dst} {${mask}} {z}, $src}",
                   [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
-                    (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
+                    (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
                   _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
   }
   def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
@@ -3681,6 +3661,20 @@ let Predicates = [HasBWI, NoVLX] in {
 }
 
 let Predicates = [HasAVX512] in {
+  // 512-bit load.
+  def : Pat<(alignedloadv16i32 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(alignedloadv32i16 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(alignedloadv64i8 addr:$src),
+            (VMOVDQA64Zrm addr:$src)>;
+  def : Pat<(loadv16i32 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+  def : Pat<(loadv32i16 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+  def : Pat<(loadv64i8 addr:$src),
+            (VMOVDQU64Zrm addr:$src)>;
+
   // 512-bit store.
   def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst),
             (VMOVDQA64Zmr addr:$dst, VR512:$src)>;
@@ -3697,6 +3691,20 @@ let Predicates = [HasAVX512] in {
 }
 
 let Predicates = [HasVLX] in {
+  // 128-bit load.
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (VMOVDQA64Z128rm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (VMOVDQU64Z128rm addr:$src)>;
+
   // 128-bit store.
   def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
@@ -3711,6 +3719,20 @@ let Predicates = [HasVLX] in {
   def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
             (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
 
+  // 256-bit load.
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(alignedloadv16i16 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(alignedloadv32i8 addr:$src),
+            (VMOVDQA64Z256rm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+  def : Pat<(loadv16i16 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+  def : Pat<(loadv32i8 addr:$src),
+            (VMOVDQU64Z256rm addr:$src)>;
+
   // 256-bit store.
   def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst),
             (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
@@ -4421,8 +4443,6 @@ let Predicates = [HasAVX512] in {
             (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
             (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
 
@@ -4497,7 +4517,7 @@ let Predicates = [HasAVX512] in {
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIZrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (VMOVDI2PDIZrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIZrm addr:$src)>;
@@ -4593,6 +4613,12 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
             (VMOVNTDQAZrm addr:$src)>;
   def : Pat<(v8i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v16i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v32i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
+  def : Pat<(v64i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZrm addr:$src)>;
 }
 
 let Predicates = [HasVLX], AddedComplexity = 400 in {
@@ -4609,6 +4635,12 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ256rm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
+  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ256rm addr:$src)>;
 
   def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
             (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
@@ -4623,6 +4655,12 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
             (VMOVNTDQAZ128rm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAZ128rm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4641,8 +4679,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                   "$src2, $src1", "$src1, $src2",
-                  (_.VT (OpNode _.RC:$src1,
-                                (bitconvert (_.LdFrag addr:$src2))))>,
+                  (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>,
                   AVX512BIBase, EVEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -4773,7 +4810,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                                      (_Src.LdFrag addr:$src2)))>,
                         AVX512BIBase, EVEX_4V,
                         Sched<[sched.Folded, sched.ReadAfterFold]>;
 
@@ -4878,7 +4915,7 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
                         (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
                         "$src2, $src1", "$src1, $src2",
                         (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
-                                      (bitconvert (_Src.LdFrag addr:$src2))))>,
+                                      (_Src.LdFrag addr:$src2)))>,
                          EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
                          Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
@@ -5046,95 +5083,356 @@ let Predicates = [HasAVX512, NoVLX] in {
 // AVX-512  Logical Instructions
 //===----------------------------------------------------------------------===//
 
-// OpNodeMsk is the OpNode to use when element size is important. OpNode will
-// be set to null_frag for 32-bit elements.
-multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
-                           SDPatternOperator OpNode,
-                           SDNode OpNodeMsk, X86FoldableSchedWrite sched,
-                           X86VectorVTInfo _, bit IsCommutable = 0> {
-  let hasSideEffects = 0 in
-  defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
-                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
-                    "$src2, $src1", "$src1, $src2",
-                    (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
-                                     (bitconvert (_.VT _.RC:$src2)))),
-                    (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                                          _.RC:$src2)))),
-                    IsCommutable>, AVX512BIBase, EVEX_4V,
-                    Sched<[sched]>;
+defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and,
+                                   SchedWriteVecLogic, HasAVX512, 1>;
+defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or,
+                                  SchedWriteVecLogic, HasAVX512, 1>;
+defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
+                                   SchedWriteVecLogic, HasAVX512, 1>;
+defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
+                                    SchedWriteVecLogic, HasAVX512>;
 
-  let hasSideEffects = 0, mayLoad = 1 in
-  defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                  (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
-                  "$src2, $src1", "$src1, $src2",
-                  (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
-                                   (bitconvert (_.LdFrag addr:$src2)))),
-                  (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (bitconvert (_.LdFrag addr:$src2))))))>,
-                  AVX512BIBase, EVEX_4V,
-                  Sched<[sched.Folded, sched.ReadAfterFold]>;
+let Predicates = [HasVLX] in {
+  def : Pat<(v16i8 (and VR128X:$src1, VR128X:$src2)),
+            (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (and VR128X:$src1, VR128X:$src2)),
+            (VPANDQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v16i8 (or VR128X:$src1, VR128X:$src2)),
+            (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (or VR128X:$src1, VR128X:$src2)),
+            (VPORQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v16i8 (xor VR128X:$src1, VR128X:$src2)),
+            (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (xor VR128X:$src1, VR128X:$src2)),
+            (VPXORQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(v16i8 (X86andnp VR128X:$src1, VR128X:$src2)),
+            (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v8i16 (X86andnp VR128X:$src1, VR128X:$src2)),
+            (VPANDNQZ128rr VR128X:$src1, VR128X:$src2)>;
+
+  def : Pat<(and VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPANDQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(and VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPANDQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(or VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPORQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(or VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPORQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPXORQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(xor VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPXORQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR128X:$src1, (loadv16i8 addr:$src2)),
+            (VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128X:$src1, (loadv8i16 addr:$src2)),
+            (VPANDNQZ128rm VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR128X:$src1,
+                 (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDDZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(or VR128X:$src1,
+                (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPORDZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(xor VR128X:$src1,
+                 (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPXORDZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128X:$src1,
+                      (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDNDZ128rmb VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR128X:$src1,
+                 (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDQZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(or VR128X:$src1,
+                (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPORQZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(xor VR128X:$src1,
+                 (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPXORQZ128rmb VR128X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128X:$src1,
+                      (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDNQZ128rmb VR128X:$src1, addr:$src2)>;
+
+  def : Pat<(v32i8 (and VR256X:$src1, VR256X:$src2)),
+            (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (and VR256X:$src1, VR256X:$src2)),
+            (VPANDQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(v32i8 (or VR256X:$src1, VR256X:$src2)),
+            (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (or VR256X:$src1, VR256X:$src2)),
+            (VPORQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(v32i8 (xor VR256X:$src1, VR256X:$src2)),
+            (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (xor VR256X:$src1, VR256X:$src2)),
+            (VPXORQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(v32i8 (X86andnp VR256X:$src1, VR256X:$src2)),
+            (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+  def : Pat<(v16i16 (X86andnp VR256X:$src1, VR256X:$src2)),
+            (VPANDNQZ256rr VR256X:$src1, VR256X:$src2)>;
+
+  def : Pat<(and VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPANDQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(and VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPANDQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(or VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPORQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(or VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPORQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPXORQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(xor VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPXORQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR256X:$src1, (loadv32i8 addr:$src2)),
+            (VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256X:$src1, (loadv16i16 addr:$src2)),
+            (VPANDNQZ256rm VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR256X:$src1,
+                 (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDDZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(or VR256X:$src1,
+                (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPORDZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(xor VR256X:$src1,
+                 (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPXORDZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256X:$src1,
+                      (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDNDZ256rmb VR256X:$src1, addr:$src2)>;
+
+  def : Pat<(and VR256X:$src1,
+                 (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDQZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(or VR256X:$src1,
+                (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPORQZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(xor VR256X:$src1,
+                 (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPXORQZ256rmb VR256X:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256X:$src1,
+                      (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDNQZ256rmb VR256X:$src1, addr:$src2)>;
 }
 
-// OpNodeMsk is the OpNode to use where element size is important. So use
-// for all of the broadcast patterns.
-multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
-                            SDPatternOperator OpNode,
-                            SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                            bit IsCommutable = 0> :
-           avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, sched, _,
-                           IsCommutable> {
-  defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                  (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
-                  "${src2}"##_.BroadcastStr##", $src1",
-                  "$src1, ${src2}"##_.BroadcastStr,
-                  (_.i64VT (OpNodeMsk _.RC:$src1,
-                                   (bitconvert
-                                    (_.VT (X86VBroadcast
-                                            (_.ScalarLdFrag addr:$src2)))))),
-                  (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
-                                     (bitconvert
-                                      (_.VT (X86VBroadcast
-                                             (_.ScalarLdFrag addr:$src2))))))))>,
-                  AVX512BIBase, EVEX_4V, EVEX_B,
-                  Sched<[sched.Folded, sched.ReadAfterFold]>;
+let Predicates = [HasAVX512] in {
+  def : Pat<(v64i8 (and VR512:$src1, VR512:$src2)),
+            (VPANDQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (and VR512:$src1, VR512:$src2)),
+            (VPANDQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(v64i8 (or VR512:$src1, VR512:$src2)),
+            (VPORQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (or VR512:$src1, VR512:$src2)),
+            (VPORQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(v64i8 (xor VR512:$src1, VR512:$src2)),
+            (VPXORQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (xor VR512:$src1, VR512:$src2)),
+            (VPXORQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(v64i8 (X86andnp VR512:$src1, VR512:$src2)),
+            (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+  def : Pat<(v32i16 (X86andnp VR512:$src1, VR512:$src2)),
+            (VPANDNQZrr VR512:$src1, VR512:$src2)>;
+
+  def : Pat<(and VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPANDQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(and VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPANDQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(or VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPORQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(or VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPORQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPXORQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(xor VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPXORQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR512:$src1, (loadv64i8 addr:$src2)),
+            (VPANDNQZrm VR512:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR512:$src1, (loadv32i16 addr:$src2)),
+            (VPANDNQZrm VR512:$src1, addr:$src2)>;
+
+  def : Pat<(and VR512:$src1,
+                 (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDDZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(or VR512:$src1,
+                (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPORDZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(xor VR512:$src1,
+                 (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPXORDZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR512:$src1,
+                      (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src2))))),
+            (VPANDNDZrmb VR512:$src1, addr:$src2)>;
+
+  def : Pat<(and VR512:$src1,
+                 (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDQZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(or VR512:$src1,
+                (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPORQZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(xor VR512:$src1,
+                 (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPXORQZrmb VR512:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR512:$src1,
+                      (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src2))))),
+            (VPANDNQZrmb VR512:$src1, addr:$src2)>;
+}
+
+// Patterns to catch vselect with different type than logic op.
+multiclass avx512_logical_lowering<string InstrStr, SDNode OpNode,
+                                    X86VectorVTInfo _,
+                                    X86VectorVTInfo IntInfo> {
+  // Masked register-register logical operations.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, _.RC:$src2)>;
+
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
+             _.RC:$src2)>;
+
+  // Masked register-memory logical operations.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
+                                            (load addr:$src2)))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
+                                            (load addr:$src2)))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
+             addr:$src2)>;
 }
 
-multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
-                               SDPatternOperator OpNode,
-                               SDNode OpNodeMsk, X86SchedWriteWidths sched,
-                               AVX512VLVectorVTInfo VTInfo,
-                               bit IsCommutable = 0> {
-  let Predicates = [HasAVX512] in
-    defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.ZMM,
-                              VTInfo.info512, IsCommutable>, EVEX_V512;
+multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
+                                         X86VectorVTInfo _,
+                                         X86VectorVTInfo IntInfo> {
+  // Register-broadcast logical operations.
+  def : Pat<(IntInfo.VT (OpNode _.RC:$src1,
+                         (bitconvert (_.VT (X86VBroadcast
+                                            (_.ScalarLdFrag addr:$src2)))))),
+            (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert
+                    (IntInfo.VT (OpNode _.RC:$src1,
+                                 (bitconvert (_.VT
+                                              (X86VBroadcast
+                                               (_.ScalarLdFrag addr:$src2))))))),
+                   _.RC:$src0)),
+            (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (bitconvert
+                    (IntInfo.VT (OpNode _.RC:$src1,
+                                 (bitconvert (_.VT
+                                              (X86VBroadcast
+                                               (_.ScalarLdFrag addr:$src2))))))),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr#rmbkz)  _.KRCWM:$mask,
+             _.RC:$src1, addr:$src2)>;
+}
 
-  let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.YMM,
-                                 VTInfo.info256, IsCommutable>, EVEX_V256;
-    defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.XMM,
-                                 VTInfo.info128, IsCommutable>, EVEX_V128;
-  }
+multiclass avx512_logical_lowering_sizes<string InstrStr, SDNode OpNode,
+                                         AVX512VLVectorVTInfo SelectInfo,
+                                         AVX512VLVectorVTInfo IntInfo> {
+let Predicates = [HasVLX] in {
+  defm : avx512_logical_lowering<InstrStr#"Z128", OpNode, SelectInfo.info128,
+                                 IntInfo.info128>;
+  defm : avx512_logical_lowering<InstrStr#"Z256", OpNode, SelectInfo.info256,
+                                 IntInfo.info256>;
+}
+let Predicates = [HasAVX512] in {
+  defm : avx512_logical_lowering<InstrStr#"Z", OpNode, SelectInfo.info512,
+                                 IntInfo.info512>;
+}
 }
 
-multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
-                                 SDNode OpNode, X86SchedWriteWidths sched,
-                                 bit IsCommutable = 0> {
-  defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, sched,
-                               avx512vl_i64_info, IsCommutable>,
-                               VEX_W, EVEX_CD8<64, CD8VF>;
-  defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, sched,
-                               avx512vl_i32_info, IsCommutable>,
-                               EVEX_CD8<32, CD8VF>;
-}
-
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
-                                   SchedWriteVecLogic, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
-                                  SchedWriteVecLogic, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
-                                   SchedWriteVecLogic, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
-                                    SchedWriteVecLogic>;
+multiclass avx512_logical_lowering_sizes_bcast<string InstrStr, SDNode OpNode,
+                                               AVX512VLVectorVTInfo SelectInfo,
+                                               AVX512VLVectorVTInfo IntInfo> {
+let Predicates = [HasVLX] in {
+  defm : avx512_logical_lowering_bcast<InstrStr#"Z128", OpNode,
+                                       SelectInfo.info128, IntInfo.info128>;
+  defm : avx512_logical_lowering_bcast<InstrStr#"Z256", OpNode,
+                                       SelectInfo.info256, IntInfo.info256>;
+}
+let Predicates = [HasAVX512] in {
+  defm : avx512_logical_lowering_bcast<InstrStr#"Z", OpNode,
+                                       SelectInfo.info512, IntInfo.info512>;
+}
+}
+
+multiclass avx512_logical_lowering_types<string InstrStr, SDNode OpNode> {
+  // i64 vselect with i32/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+                                       avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_i64_info,
+                                       avx512vl_i8_info>;
+
+  // i32 vselect with i64/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+                                       avx512vl_i64_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_i32_info,
+                                       avx512vl_i8_info>;
+
+  // f32 vselect with i64/i32/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i64_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"D", OpNode, avx512vl_f32_info,
+                                       avx512vl_i8_info>;
+
+  // f64 vselect with i64/i32/i16/i8 logic op
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i64_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i16_info>;
+  defm : avx512_logical_lowering_sizes<InstrStr#"Q", OpNode, avx512vl_f64_info,
+                                       avx512vl_i8_info>;
+
+  defm : avx512_logical_lowering_sizes_bcast<InstrStr#"D", OpNode,
+                                             avx512vl_f32_info,
+                                             avx512vl_i32_info>;
+  defm : avx512_logical_lowering_sizes_bcast<InstrStr#"Q", OpNode,
+                                             avx512vl_f64_info,
+                                             avx512vl_i64_info>;
+}
+
+defm : avx512_logical_lowering_types<"VPAND", and>;
+defm : avx512_logical_lowering_types<"VPOR",  or>;
+defm : avx512_logical_lowering_types<"VPXOR", xor>;
+defm : avx512_logical_lowering_types<"VPANDN", X86andnp>;
 
 //===----------------------------------------------------------------------===//
 // AVX-512  FP arithmetic
@@ -5439,73 +5737,6 @@ defm VOR   : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
 defm VXOR  : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
                                SchedWriteFLogicSizes, 1>;
 
-// Patterns catch floating point selects with bitcasted integer logic ops.
-multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode,
-                                      X86VectorVTInfo _, Predicate prd> {
-let Predicates = [prd] in {
-  // Masked register-register logical operations.
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
-                   _.RC:$src0)),
-            (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
-             _.RC:$src1, _.RC:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))),
-                   _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
-             _.RC:$src2)>;
-  // Masked register-memory logical operations.
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1,
-                                         (load addr:$src2)))),
-                   _.RC:$src0)),
-            (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))),
-                   _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1,
-             addr:$src2)>;
-  // Register-broadcast logical operations.
-  def : Pat<(_.i64VT (OpNode _.RC:$src1,
-                      (bitconvert (_.VT (X86VBroadcast
-                                         (_.ScalarLdFrag addr:$src2)))))),
-            (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert
-                    (_.i64VT (OpNode _.RC:$src1,
-                              (bitconvert (_.VT
-                                           (X86VBroadcast
-                                            (_.ScalarLdFrag addr:$src2))))))),
-                   _.RC:$src0)),
-            (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2)>;
-  def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                   (bitconvert
-                    (_.i64VT (OpNode _.RC:$src1,
-                              (bitconvert (_.VT
-                                           (X86VBroadcast
-                                            (_.ScalarLdFrag addr:$src2))))))),
-                   _.ImmAllZerosV)),
-            (!cast<Instruction>(InstrStr#rmbkz)  _.KRCWM:$mask,
-             _.RC:$src1, addr:$src2)>;
-}
-}
-
-multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> {
-  defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>;
-  defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>;
-  defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>;
-}
-
-defm : avx512_fp_logical_lowering_sizes<"VPAND", and>;
-defm : avx512_fp_logical_lowering_sizes<"VPOR", or>;
-defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>;
-defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>;
-
 let Predicates = [HasVLX,HasDQI] in {
   // Use packed logical operations for scalar ops.
   def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
@@ -5635,15 +5866,12 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
   defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                       "$src2, $src1", "$src1, $src2",
-                   (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
-                           _.ImmAllZerosV)>,
+                   (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
                    EVEX_4V, Sched<[sched]>;
   defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (OpNode (bitconvert
-                            (_.i64VT (and _.RC:$src1,
-                                          (bitconvert (_.LdFrag addr:$src2))))),
+                   (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)),
                            _.ImmAllZerosV)>,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5677,7 +5905,7 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
 // Use 512bit version to implement 128/256 bit in case NoVLX.
 multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
                                   X86VectorVTInfo _, string Name> {
-  def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+  def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
                            _.ImmAllZerosV)),
             (_.KVT (COPY_TO_REGCLASS
                      (!cast<Instruction>(Name # "Zrr")
@@ -5688,7 +5916,7 @@ multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo,
                    _.KRC))>;
 
   def : Pat<(_.KVT (and _.KRC:$mask,
-                        (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))),
+                        (OpNode (and _.RC:$src1, _.RC:$src2),
                                 _.ImmAllZerosV))),
             (COPY_TO_REGCLASS
              (!cast<Instruction>(Name # "Zrrk")
@@ -5765,7 +5993,7 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
                             v16i8x_info, NAME#"B">, EVEX_V128;
   }
 
-  let Predicates = [HasAVX512, NoVLX] in {
+  let Predicates = [HasBWI, NoVLX] in {
   defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">;
   defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">;
   defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">;
@@ -5791,6 +6019,125 @@ defm VPTESTM   : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
 defm VPTESTNM  : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
                                          SchedWriteVecLogic>, T8XS;
 
+
+multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode,
+                                       X86VectorVTInfo _,
+                                       X86VectorVTInfo AndInfo> {
+  def : Pat<(_.KVT (OpNode (bitconvert
+                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                           _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
+
+  def : Pat<(_.KVT (and _.KRC:$mask,
+                    (OpNode (bitconvert
+                             (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                            _.ImmAllZerosV))),
+            (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
+                                                  _.RC:$src2)>;
+
+  def : Pat<(_.KVT (OpNode (bitconvert
+                            (AndInfo.VT (and _.RC:$src1,
+                                             (AndInfo.LdFrag addr:$src2)))),
+                           _.ImmAllZerosV)),
+            (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
+
+  def : Pat<(_.KVT (and _.KRC:$mask,
+                    (OpNode (bitconvert
+                             (AndInfo.VT (and _.RC:$src1,
+                                              (AndInfo.LdFrag addr:$src2)))),
+                            _.ImmAllZerosV))),
+            (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
+                                                  addr:$src2)>;
+}
+
+// Patterns to use 512-bit instructions when 128/256 are not available.
+multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode,
+                                            X86VectorVTInfo _,
+                                            X86VectorVTInfo AndInfo,
+                                            X86VectorVTInfo ExtendInfo> {
+  def : Pat<(_.KVT (OpNode (bitconvert
+                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                           _.ImmAllZerosV)),
+            (_.KVT (COPY_TO_REGCLASS
+                     (!cast<Instruction>(InstrStr#"rr")
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                      _.RC:$src1, _.SubRegIdx),
+                       (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                                      _.RC:$src2, _.SubRegIdx)),
+                   _.KRC))>;
+
+  def : Pat<(_.KVT (and _.KRC:$mask,
+                    (OpNode (bitconvert
+                             (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                            _.ImmAllZerosV))),
+            (COPY_TO_REGCLASS
+             (!cast<Instruction>(InstrStr#"rrk")
+              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
+              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                             _.RC:$src1, _.SubRegIdx),
+              (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
+                             _.RC:$src2, _.SubRegIdx)),
+             _.KRC)>;
+}
+
+multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode,
+                                        Predicate prd,
+                                        AVX512VLVectorVTInfo CmpInfo,
+                                        AVX512VLVectorVTInfo AndInfo> {
+let Predicates = [prd, HasVLX] in {
+  defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode,
+                                     CmpInfo.info128, AndInfo.info128>;
+  defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode,
+                                     CmpInfo.info256, AndInfo.info256>;
+}
+let Predicates = [prd] in {
+  defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode,
+                                     CmpInfo.info512, AndInfo.info512>;
+}
+
+let Predicates = [prd, NoVLX] in {
+  defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
+                                          CmpInfo.info128, AndInfo.info128,
+                                          CmpInfo.info512>;
+  defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode,
+                                          CmpInfo.info256, AndInfo.info256,
+                                          CmpInfo.info512>;
+}
+}
+
+multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode> {
+  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
+                                      avx512vl_i8_info, avx512vl_i16_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
+                                      avx512vl_i8_info, avx512vl_i32_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, HasBWI,
+                                      avx512vl_i8_info, avx512vl_i64_info>;
+
+  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
+                                      avx512vl_i16_info, avx512vl_i8_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
+                                      avx512vl_i16_info, avx512vl_i32_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, HasBWI,
+                                      avx512vl_i16_info, avx512vl_i64_info>;
+
+  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
+                                      avx512vl_i32_info, avx512vl_i8_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
+                                      avx512vl_i32_info, avx512vl_i16_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, HasAVX512,
+                                      avx512vl_i32_info, avx512vl_i64_info>;
+
+  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
+                                      avx512vl_i64_info, avx512vl_i8_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
+                                      avx512vl_i64_info, avx512vl_i16_info>;
+  defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, HasAVX512,
+                                      avx512vl_i64_info, avx512vl_i32_info>;
+}
+
+defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>;
+defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>;
+
 //===----------------------------------------------------------------------===//
 // AVX-512  Shift instructions
 //===----------------------------------------------------------------------===//
@@ -5807,7 +6154,7 @@ multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
   defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
                    (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                   (_.VT (OpNode (_.VT (_.LdFrag addr:$src1)),
                           (i8 imm:$src2)))>,
                    Sched<[sched.Folded]>;
   }
@@ -5826,7 +6173,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
 
 multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86FoldableSchedWrite sched, ValueType SrcVT,
-                            PatFrag bc_frag, X86VectorVTInfo _> {
+                            X86VectorVTInfo _> {
    // src2 is always 128-bit
   let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5837,7 +6184,7 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                    (ins _.RC:$src1, i128mem:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
-                   (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2))))>,
+                   (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>,
                    AVX512BIBase,
                    EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -5845,18 +6192,18 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86SchedWriteWidths sched, ValueType SrcVT,
-                              PatFrag bc_frag, AVX512VLVectorVTInfo VTInfo,
+                              AVX512VLVectorVTInfo VTInfo,
                               Predicate prd> {
   let Predicates = [prd] in
   defm Z    : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.ZMM, SrcVT,
-                               bc_frag, VTInfo.info512>, EVEX_V512,
+                               VTInfo.info512>, EVEX_V512,
                                EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
   let Predicates = [prd, HasVLX] in {
   defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.YMM, SrcVT,
-                               bc_frag, VTInfo.info256>, EVEX_V256,
+                               VTInfo.info256>, EVEX_V256,
                                EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
   defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.XMM, SrcVT,
-                               bc_frag, VTInfo.info128>, EVEX_V128,
+                               VTInfo.info128>, EVEX_V128,
                                EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
   }
 }
@@ -5866,12 +6213,12 @@ multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
                               X86SchedWriteWidths sched,
                               bit NotEVEX2VEXConvertibleQ = 0> {
   defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,
-                              bc_v4i32, avx512vl_i32_info, HasAVX512>;
+                              avx512vl_i32_info, HasAVX512>;
   let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in
   defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,
-                              bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W;
+                              avx512vl_i64_info, HasAVX512>, VEX_W;
   defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16,
-                              bc_v2i64, avx512vl_i16_info, HasBWI>;
+                              avx512vl_i16_info, HasBWI>;
 }
 
 multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
@@ -5991,7 +6338,7 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
                        "$src2, $src1", "$src1, $src2",
                    (_.VT (OpNode _.RC:$src1,
-                   (_.VT (bitconvert (_.LdFrag addr:$src2)))))>,
+                   (_.VT (_.LdFrag addr:$src2))))>,
                    AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -6091,7 +6438,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
     def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1,
                _.RC:$src2)>;
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
+    def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
@@ -6099,7 +6446,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
@@ -6108,7 +6455,7 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
                _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))),
+                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
@@ -6333,7 +6680,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
                   "$src2, $src1", "$src1, $src2",
                   (_.VT (OpNode
                            _.RC:$src1,
-                           (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
+                           (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>,
                   T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
@@ -7619,7 +7966,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
   defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
                          (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
                          (_.VT (OpNode (_Src.VT
-                             (bitconvert (_Src.LdFrag addr:$src)))))>,
+                             (_Src.LdFrag addr:$src))))>,
                          EVEX, Sched<[sched.Folded]>;
 
   defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -8326,8 +8673,7 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
   defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
                             (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
                             (X86cvtph2ps (_src.VT
-                                          (bitconvert
-                                           (ld_frag addr:$src))))>,
+                                          (ld_frag addr:$src)))>,
                             T8PD, Sched<[sched.Folded]>;
 }
 
@@ -8342,17 +8688,17 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
 }
 
 let Predicates = [HasAVX512] in
-  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
+  defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, load,
                                     WriteCvtPH2PSZ>,
                     avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
 
 let Predicates = [HasVLX] in {
   defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
-                       loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256,
+                       load, WriteCvtPH2PSY>, EVEX, EVEX_V256,
                        EVEX_CD8<32, CD8VH>;
   defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
-                       loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128,
+                       load, WriteCvtPH2PS>, EVEX, EVEX_V128,
                        EVEX_CD8<32, CD8VH>;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
@@ -9296,7 +9642,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   }
   let Predicates = [HasVLX] in {
@@ -9306,7 +9652,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -9315,7 +9661,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
 
   def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9326,7 +9672,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
   def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9335,7 +9681,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
 
   def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -9346,12 +9692,12 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
   // 256-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>;
@@ -9365,7 +9711,7 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -9374,10 +9720,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>;
@@ -9390,10 +9736,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>;
@@ -9402,25 +9748,25 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
   }
   // 512-bit patterns
   let Predicates = [HasBWI] in {
-  def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))),
+  def : Pat<(v32i16 (ExtOp (loadv32i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWZrm) addr:$src)>;
   }
   let Predicates = [HasAVX512] in {
-  def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDZrm) addr:$src)>;
 
   def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
-  def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
 
-  def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))),
+  def : Pat<(v16i32 (ExtOp (loadv16i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQZrm) addr:$src)>;
 
-  def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i64 (ExtOp (loadv8i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQZrm) addr:$src)>;
   }
 }
@@ -9612,6 +9958,10 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
                   !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
                   [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
                   EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
+
+// Also need a pattern for anyextend.
+def : Pat<(Vec.VT (anyext Vec.KRC:$src)),
+          (!cast<Instruction>(NAME#"rr") Vec.KRC:$src)>;
 }
 
 multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
@@ -9685,11 +10035,19 @@ let Predicates = [HasDQI, NoBWI] in {
             (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
   def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
             (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+
+  def : Pat<(v16i8 (anyext (v16i1 VK16:$src))),
+            (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
+  def : Pat<(v16i16 (anyext (v16i1 VK16:$src))),
+            (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
 }
 
 let Predicates = [HasDQI, NoBWI, HasVLX] in {
   def : Pat<(v8i16 (sext (v8i1 VK8:$src))),
             (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
+
+  def : Pat<(v8i16 (anyext (v8i1 VK8:$src))),
+            (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -10325,7 +10683,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
                 (_.VT
                  (bitconvert
                   (CastInfo.VT (X86Shuf128 _.RC:$src1,
-                                           (bitconvert (_.LdFrag addr:$src2)),
+                                           (CastInfo.LdFrag addr:$src2),
                                            (i8 imm:$src3)))))>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>,
                 EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
@@ -10491,7 +10849,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                      (bitconvert (To.LdFrag addr:$src2)),
+                                              (From.LdFrag addr:$src2),
                                       imm:$src3))),
                             To.RC:$src0)),
             (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
@@ -10501,7 +10859,7 @@ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
   def : Pat<(To.VT (vselect To.KRCWM:$mask,
                             (bitconvert
                              (From.VT (OpNode From.RC:$src1,
-                                      (bitconvert (To.LdFrag addr:$src2)),
+                                              (From.LdFrag addr:$src2),
                                       imm:$src3))),
                             To.ImmAllZerosV)),
             (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
@@ -11308,19 +11666,68 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
 // TODO: We should maybe have a more generalized algorithm for folding to
 // vpternlog.
 let Predicates = [HasAVX512] in {
-  def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))),
+  def : Pat<(xor VR512:$src, (bc_v64i8 (v16i32 immAllOnesV))),
+            (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+  def : Pat<(xor VR512:$src, (bc_v32i16 (v16i32 immAllOnesV))),
+            (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+  def : Pat<(xor VR512:$src, (bc_v16i32 (v16i32 immAllOnesV))),
+            (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
+  def : Pat<(xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV))),
             (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
 }
 
 let Predicates = [HasAVX512, NoVLX] in {
-  def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+  def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (i8 15)), sub_xmm)>;
+  def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (i8 15)), sub_xmm)>;
+  def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+              (i8 15)), sub_xmm)>;
+  def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
             (EXTRACT_SUBREG
              (VPTERNLOGQZrri
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
               (i8 15)), sub_xmm)>;
-  def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+
+  def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (i8 15)), sub_ymm)>;
+  def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (i8 15)), sub_ymm)>;
+  def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+            (EXTRACT_SUBREG
+             (VPTERNLOGQZrri
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+              (i8 15)), sub_ymm)>;
+  def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
             (EXTRACT_SUBREG
              (VPTERNLOGQZrri
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
@@ -11330,9 +11737,22 @@ let Predicates = [HasAVX512, NoVLX] in {
 }
 
 let Predicates = [HasVLX] in {
-  def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))),
+  def : Pat<(xor VR128X:$src, (bc_v16i8 (v4i32 immAllOnesV))),
+            (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+  def : Pat<(xor VR128X:$src, (bc_v8i16 (v4i32 immAllOnesV))),
             (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
-  def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))),
+  def : Pat<(xor VR128X:$src, (bc_v4i32 (v4i32 immAllOnesV))),
+            (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+  def : Pat<(xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV))),
+            (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>;
+
+  def : Pat<(xor VR256X:$src, (bc_v32i8 (v8i32 immAllOnesV))),
+            (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+  def : Pat<(xor VR256X:$src, (bc_v16i16 (v8i32 immAllOnesV))),
+            (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+  def : Pat<(xor VR256X:$src, (bc_v8i32 (v8i32 immAllOnesV))),
+            (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
+  def : Pat<(xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV))),
             (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>;
 }
 
@@ -11645,7 +12065,7 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
                 (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                 "$src3, $src2", "$src2, $src3",
                 (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                        (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>,
+                        (VTI.VT (VTI.LdFrag addr:$src3))))>,
                 AVX512FMA3Base,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
@@ -11748,8 +12168,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
                                    (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
                                    "$src3, $src2", "$src2, $src3",
                                    (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
-                                            (VTI.VT (bitconvert
-                                                     (VTI.LdFrag addr:$src3)))))>,
+                                            (VTI.VT (VTI.LdFrag addr:$src3))))>,
                                    EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
                                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   defm mb :   AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
@@ -11805,7 +12224,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> {
                                 "vpshufbitqmb",
                                 "$src2, $src1", "$src1, $src2",
                                 (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
-                                (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>,
+                                (VTI.VT (VTI.LdFrag addr:$src2)))>,
                                 EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
                                 Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index de45b4697acc8b2d3c0b85285f29287d2618840e..71b43a38dc2cd994070cbad16728338cd3995ee6 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -37,11 +37,6 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
   def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
                       "", []>;
 
-// 64-bit large code model PIC base construction.
-let hasSideEffects = 0, mayLoad = 1, isNotDuplicable = 1, SchedRW = [WriteJump] in
-  def MOVGOT64r : PseudoI<(outs GR64:$reg),
-                          (ins GR64:$scratch, i64i32imm_pcrel:$got), []>;
-
 // ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
 // a stack adjustment and the codegen must know that they may modify the stack
 // pointer before prolog-epilog rewriting occurs.
@@ -2135,17 +2130,3 @@ def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
 let Predicates = [HasMOVBE] in {
  def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
 }
-
-// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that
-// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch
-// of manual code for folding loads.
-let Predicates = [HasBMI, NoTBM] in {
-  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
-            (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>;
-  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
-            (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>;
-  def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2),
-            (BEXTR64rr GR64:$src1, (MOV32ri64 mov64imm32:$src2))>;
-  def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2),
-            (BEXTR64rm addr:$src1, (MOV32ri64 mov64imm32:$src2))>;
-} // HasBMI, NoTBM
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3aa825ee84e4384f1a542b298638337b4cfbfebd..7bc8d0aa5309ad385a4e199ba76d12c5a8cd2c48 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -647,28 +647,29 @@ def sdmem : Operand<v2f64> {
 // SSE pattern fragments
 //===----------------------------------------------------------------------===//
 
-// Vector load wrappers to prevent folding of non-temporal aligned loads on
-// supporting targets.
-def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !useNonTemporalLoad(cast<LoadSDNode>(N));
-}]>;
-
 // 128-bit load pattern fragments
-// NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>;
-def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>;
-def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>;
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv8i16    : PatFrag<(ops node:$ptr), (v8i16 (load node:$ptr))>;
+def loadv16i8    : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>;
 
 // 256-bit load pattern fragments
-// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>;
-def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>;
-def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>;
+def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32  (load node:$ptr))>;
+def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64  (load node:$ptr))>;
+def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64  (load node:$ptr))>;
+def loadv8i32    : PatFrag<(ops node:$ptr), (v8i32  (load node:$ptr))>;
+def loadv16i16   : PatFrag<(ops node:$ptr), (v16i16 (load node:$ptr))>;
+def loadv32i8    : PatFrag<(ops node:$ptr), (v32i8  (load node:$ptr))>;
 
 // 512-bit load pattern fragments
-def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>;
-def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>;
-def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>;
+def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64  (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64  (load node:$ptr))>;
+def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv32i16   : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>;
+def loadv64i8    : PatFrag<(ops node:$ptr), (v64i8  (load node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -682,46 +683,63 @@ def alignedstore : PatFrag<(ops node:$val, node:$ptr),
   return St->getAlignment() >= St->getMemoryVT().getStoreSize();
 }]>;
 
-// Like 'load', but always requires 128-bit vector alignment.
-def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+// Like 'load', but always requires vector size alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
-  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() &&
-         !useNonTemporalLoad(cast<LoadSDNode>(N));
+  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
 }]>;
 
 // 128-bit aligned load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedvecload node:$ptr))>;
+                               (v4f32 (alignedload node:$ptr))>;
 def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedvecload node:$ptr))>;
+                               (v2f64 (alignedload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedvecload node:$ptr))>;
+                               (v2i64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr),
+                               (v4i32 (alignedload node:$ptr))>;
+def alignedloadv8i16 : PatFrag<(ops node:$ptr),
+                               (v8i16 (alignedload node:$ptr))>;
+def alignedloadv16i8 : PatFrag<(ops node:$ptr),
+                               (v16i8 (alignedload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedvecload node:$ptr))>;
-def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedvecload node:$ptr))>;
-def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedvecload node:$ptr))>;
+def alignedloadv8f32  : PatFrag<(ops node:$ptr),
+                                (v8f32  (alignedload node:$ptr))>;
+def alignedloadv4f64  : PatFrag<(ops node:$ptr),
+                                (v4f64  (alignedload node:$ptr))>;
+def alignedloadv4i64  : PatFrag<(ops node:$ptr),
+                                (v4i64  (alignedload node:$ptr))>;
+def alignedloadv8i32  : PatFrag<(ops node:$ptr),
+                                (v8i32  (alignedload node:$ptr))>;
+def alignedloadv16i16 : PatFrag<(ops node:$ptr),
+                                (v16i16 (alignedload node:$ptr))>;
+def alignedloadv32i8  : PatFrag<(ops node:$ptr),
+                                (v32i8  (alignedload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
-                                (v16f32 (alignedvecload node:$ptr))>;
+                                (v16f32 (alignedload node:$ptr))>;
 def alignedloadv8f64  : PatFrag<(ops node:$ptr),
-                                (v8f64  (alignedvecload node:$ptr))>;
+                                (v8f64  (alignedload node:$ptr))>;
 def alignedloadv8i64  : PatFrag<(ops node:$ptr),
-                                (v8i64  (alignedvecload node:$ptr))>;
-
-// Like 'vecload', but uses special alignment checks suitable for use in
+                                (v8i64  (alignedload node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+                                (v16i32 (alignedload node:$ptr))>;
+def alignedloadv32i16 : PatFrag<(ops node:$ptr),
+                                (v32i16 (alignedload node:$ptr))>;
+def alignedloadv64i8  : PatFrag<(ops node:$ptr),
+                                (v64i8  (alignedload node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others.  If the subtarget
 // allows unaligned accesses, match any load, though this may require
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   auto *Ld = cast<LoadSDNode>(N);
   return Subtarget->hasSSEUnalignedMem() ||
          Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
@@ -732,6 +750,9 @@ def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
 def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
 def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
 def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
 
 def X86masked_gather : SDNode<"X86ISD::MGATHER",
                               SDTypeProfile<2, 3, [SDTCisVec<0>,
@@ -834,6 +855,7 @@ def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>;
 
 // 512-bit bitconvert pattern fragments
 def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>;
+def bc_v32i16 : PatFrag<(ops node:$in), (v32i16 (bitconvert node:$in))>;
 def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
 def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
 def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 36ef7dca1f34dacc9f6ab7f97a611e4fc4414711..fe26389050c2e4a2b3399dae11ca8fe9983e3641 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2550,7 +2550,7 @@ void X86InstrInfo::replaceBranchWithTailCall(
   // call. This way they still appear live across the call.
   LivePhysRegs LiveRegs(getRegisterInfo());
   LiveRegs.addLiveOuts(MBB);
-  SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers;
+  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
   LiveRegs.stepForward(*MIB, Clobbers);
   for (const auto &C : Clobbers) {
     MIB.addReg(C.first, RegState::Implicit);
@@ -2640,6 +2640,11 @@ bool X86InstrInfo::AnalyzeBranchImpl(
     if (BranchCode == X86::COND_INVALID)
       return true;  // Can't handle indirect branch.
 
+    // In practice we should never have an undef eflags operand, if we do
+    // abort here as we are not prepared to preserve the flag.
+    if (I->getOperand(1).isUndef())
+      return true;
+
     // Working from the bottom, handle the first conditional branch.
     if (Cond.empty()) {
       MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
@@ -7468,12 +7473,28 @@ namespace {
               .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
               .addReg(0);
         } else if (TM->getCodeModel() == CodeModel::Large) {
-          // Loading the GOT in the large code model requires math with labels,
-          // so we use a pseudo instruction and expand it during MC emission.
-          unsigned Scratch = RegInfo.createVirtualRegister(&X86::GR64RegClass);
-          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVGOT64r), PC)
-              .addReg(Scratch, RegState::Undef | RegState::Define)
-              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+          // In the large code model, we are aiming for this code, though the
+          // register allocation may vary:
+          //   leaq .LN$pb(%rip), %rax
+          //   movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
+          //   addq %rcx, %rax
+          // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
+          unsigned PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
+          unsigned GOTReg =
+              RegInfo.createVirtualRegister(&X86::GR64RegClass);
+          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
+              .addReg(X86::RIP)
+              .addImm(0)
+              .addReg(0)
+              .addSym(MF.getPICBaseSymbol())
+              .addReg(0);
+          std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
+          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
+              .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                 X86II::MO_PIC_BASE_OFFSET);
+          BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
+              .addReg(PBReg, RegState::Kill)
+              .addReg(GOTReg, RegState::Kill);
         } else {
           llvm_unreachable("unexpected code model");
         }
@@ -7808,3 +7829,6 @@ X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
 
   return It;
 }
+
+#define GET_TII_HELPERS
+#include "X86GenInstrInfo.inc"
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 85afcf8904a112f8940404e89d8927a354542cd3..f3965db4fe7c5c114b110650d0a4fbdd95f49c46 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -558,6 +558,9 @@ public:
                      MachineBasicBlock::iterator &It, MachineFunction &MF,
                      const outliner::Candidate &C) const override;
 
+#define GET_TII_HELPER_DECLS
+#include "X86GenInstrInfo.inc"
+
 protected:
   /// Commutes the operands in the given instruction by changing the operands
   /// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 15ed435244e21d370e8568236bec1e2c2de2a503..992e9543b33dac2ccea6bf90d6b48afb7f98726f 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -291,6 +291,8 @@ def X86lock_dec  : SDNode<"X86ISD::LDEC",  SDTLockUnaryArithWithFlags,
 
 def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
 
+def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntBinOp>;
+
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
 def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -2454,9 +2456,9 @@ multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
 
 let Predicates = [HasBMI2], Defs = [EFLAGS] in {
   defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem,
-                         int_x86_bmi_bzhi_32, loadi32, WriteBZHI>;
+                         X86bzhi, loadi32, WriteBZHI>;
   defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem,
-                         int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W;
+                         X86bzhi, loadi64, WriteBZHI>, VEX_W;
 }
 
 def CountTrailingOnes : SDNodeXForm<imm, [{
@@ -2497,84 +2499,6 @@ let Predicates = [HasBMI2, NoTBM] in {
                              (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
 }
 
-let Predicates = [HasBMI2] in {
-  multiclass _bmi_bzhi_pattern<dag regpattern, dag mempattern, RegisterClass RC,
-                               ValueType VT, Instruction DstInst,
-                               Instruction DstMemInst> {
-    def : Pat<regpattern,
-              (DstInst RC:$src,
-                (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
-    def : Pat<mempattern,
-              (DstMemInst addr:$src,
-                (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
-  }
-
-  multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT,
-                               Instruction DstInst, X86MemOperand x86memop,
-                               Instruction DstMemInst> {
-    // x & ((1 << y) - 1)
-    defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)),
-                             (and (x86memop addr:$src),
-                                  (add (shl 1, GR8:$lz), -1)),
-                             RC, VT, DstInst, DstMemInst>;
-
-    // x & ~(-1 << y)
-    defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)),
-                             (and (x86memop addr:$src),
-                                  (xor (shl -1, GR8:$lz), -1)),
-                             RC, VT, DstInst, DstMemInst>;
-
-    // x & (-1 >> (bitwidth - y))
-    defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))),
-                             (and (x86memop addr:$src),
-                                  (srl -1, (sub bitwidth, GR8:$lz))),
-                             RC, VT, DstInst, DstMemInst>;
-
-    // x << (bitwidth - y) >> (bitwidth - y)
-    defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)),
-                                  (sub bitwidth, GR8:$lz)),
-                             (srl (shl (x86memop addr:$src),
-                                        (sub bitwidth, GR8:$lz)),
-                                  (sub bitwidth, GR8:$lz)),
-                             RC, VT, DstInst, DstMemInst>;
-  }
-
-  defm : bmi_bzhi_patterns<GR32, 32, i32, BZHI32rr, loadi32, BZHI32rm>;
-  defm : bmi_bzhi_patterns<GR64, 64, i64, BZHI64rr, loadi64, BZHI64rm>;
-
-  // x & (-1 >> (32 - y))
-  def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
-            (BZHI32rr GR32:$src, GR32:$lz)>;
-  def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
-            (BZHI32rm addr:$src, GR32:$lz)>;
-
-  // x & (-1 >> (64 - y))
-  def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
-            (BZHI64rr GR64:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-  def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
-            (BZHI64rm addr:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-
-  // x << (32 - y) >> (32 - y)
-  def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
-                 (i8 (trunc (sub 32, GR32:$lz)))),
-            (BZHI32rr GR32:$src, GR32:$lz)>;
-  def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
-                 (i8 (trunc (sub 32, GR32:$lz)))),
-            (BZHI32rm addr:$src, GR32:$lz)>;
-
-  // x << (64 - y) >> (64 - y)
-  def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
-                 (i8 (trunc (sub 64, GR32:$lz)))),
-            (BZHI64rr GR64:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-  def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
-                 (i8 (trunc (sub 64, GR32:$lz)))),
-            (BZHI64rm addr:$src,
-              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
-} // HasBMI2
-
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
                          X86MemOperand x86memop, Intrinsic Int,
                          PatFrag ld_frag> {
@@ -2976,6 +2900,8 @@ def : MnemonicAlias<"popf",  "popfl", "att">, Requires<[In32BitMode]>;
 def : MnemonicAlias<"popf",  "popfq", "att">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"popf",  "popfq", "intel">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"popfd", "popfl", "att">;
+def : MnemonicAlias<"popfw", "popf",  "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"popfw", "popf",  "intel">, Requires<[In64BitMode]>;
 
 // FIXME: This is wrong for "push reg".  "push %bx" should turn into pushw in
 // all modes.  However: "push (addr)" and "push $42" should default to
@@ -2988,6 +2914,8 @@ def : MnemonicAlias<"pushf",  "pushfl", "att">, Requires<[In32BitMode]>;
 def : MnemonicAlias<"pushf",  "pushfq", "att">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"pushf",  "pushfq", "intel">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"pushfd", "pushfl", "att">;
+def : MnemonicAlias<"pushfw", "pushf",  "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushfw", "pushf",  "intel">, Requires<[In64BitMode]>;
 
 def : MnemonicAlias<"popad",  "popal",  "intel">, Requires<[Not64BitMode]>;
 def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index b3c639f4f0c27d3f2beb20ef1a815491535b1df6..85e4fd3856339c3eb9e89d083353872e2fa96478 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -265,8 +265,6 @@ let Predicates = [UseAVX] in {
             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
 
@@ -349,8 +347,6 @@ let Predicates = [UseSSE2] in {
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
-  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
-            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
   def : Pat<(v2f64 (X86vzload addr:$src)),
             (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
 }
@@ -593,8 +589,21 @@ let Predicates = [HasAVX, NoVLX] in {
   // available and changing the domain is beneficial.
   def : Pat<(alignedloadv4i64 addr:$src),
             (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv8i32 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv16i16 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
+  def : Pat<(alignedloadv32i8 addr:$src),
+            (VMOVAPSYrm addr:$src)>;
   def : Pat<(loadv4i64 addr:$src),
             (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv8i32 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv16i16 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+  def : Pat<(loadv32i8 addr:$src),
+            (VMOVUPSYrm addr:$src)>;
+
   def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
@@ -619,8 +628,20 @@ let Predicates = [HasAVX, NoVLX] in {
 let Predicates = [UseSSE1] in {
   def : Pat<(alignedloadv2i64 addr:$src),
             (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (MOVAPSrm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (MOVAPSrm addr:$src)>;
   def : Pat<(loadv2i64 addr:$src),
             (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (MOVUPSrm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (MOVUPSrm addr:$src)>;
 
   def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
             (MOVAPSmr addr:$dst, VR128:$src)>;
@@ -845,7 +866,7 @@ let hasSideEffects = 0 in {
   let mayLoad = 1 in
   def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
              [(set RC:$dst, (DstTy (sint_to_fp
-                                    (SrcTy (bitconvert (ld_frag addr:$src))))))], d>,
+                                    (SrcTy (ld_frag addr:$src)))))], d>,
              Sched<[sched.Folded]>;
 }
 }
@@ -1108,16 +1129,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, REX_W;
 
-defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
+defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PS>,
                                PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
-defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
+defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PSY>,
                                PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
 
-defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
+defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, Requires<[UseSSE2]>;
@@ -1676,7 +1697,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                          (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
 def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -1686,7 +1707,7 @@ def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                          "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
-                           (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                           (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                          VEX_WIG;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
@@ -1700,7 +1721,7 @@ let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
+                         (v2f64 (X86VSintToFP (loadv4i32 addr:$src))))]>,
                        Sched<[WriteCvtI2PDLd]>;
 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2155,54 +2176,54 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
 
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 }// Predicates = [HasAVX, NoVLX]
 
 let Constraints = "$src1 = $dst" in {
-  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
         VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
         VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
-  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
         VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
-  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
         VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                        SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX1Only] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
             (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
             (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
@@ -2288,8 +2309,7 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -2300,16 +2320,16 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          Predicate prd> {
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                             VR128, loadv2i64, i128mem, sched.XMM,
+                             VR128, load, i128mem, sched.XMM,
                              IsCommutable, 0>, VEX_4V, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
-                           memopv2i64, i128mem, sched.XMM, IsCommutable, 1>;
+                           memop, i128mem, sched.XMM, IsCommutable, 1>;
 
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
-                               OpVT256, VR256, loadv4i64, i256mem, sched.YMM,
+                               OpVT256, VR256, load, i256mem, sched.YMM,
                                IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
@@ -2369,24 +2389,136 @@ defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
 let isCommutable = 0 in
   defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
 
+let Predicates = [HasAVX2, NoVLX] in {
+  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
+            (VPANDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
+            (VPANDYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
+            (VPANDYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
+            (VPORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
+            (VPORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
+            (VPORYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
+            (VPXORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
+            (VPXORYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
+            (VPXORYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
+            (VPANDNYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
+            (VPANDNYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
+            (VPANDNYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPANDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPANDYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPANDYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPORYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPXORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPXORYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPXORYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
+            (VPANDNYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
+            (VPANDNYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
+            (VPANDNYrm VR256:$src1, addr:$src2)>;
+}
+
 // If only AVX1 is supported, we need to handle integer operations with
 // floating point instructions since the integer versions aren't available.
 let Predicates = [HasAVX1Only] in {
+  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
+            (VANDPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
             (VANDPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
+            (VORPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
             (VORPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
+            (VXORPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
             (VXORPSYrr VR256:$src1, VR256:$src2)>;
+
+  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
+  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
+            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
   def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
             (VANDNPSYrr VR256:$src1, VR256:$src2)>;
 
+  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
             (VANDPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
             (VORPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
             (VXORPSYrm VR256:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
             (VANDNPSYrm VR256:$src1, addr:$src2)>;
 }
@@ -2484,6 +2616,122 @@ let Predicates = [UseSSE2] in {
              FR64)>;
 }
 
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
+            (VPANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
+            (VPANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
+            (VPANDrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
+            (VPORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
+            (VPORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
+            (VPORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
+            (VPXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
+            (VPXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
+            (VPXORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
+            (VPANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
+            (VPANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
+            (VPANDNrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPANDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPXORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
+            (VPANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
+            (VPANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
+            (VPANDNrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
+            (PANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
+            (PANDrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
+            (PANDrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
+            (PORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
+            (PORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
+            (PORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
+            (PXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
+            (PXORrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
+            (PXORrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
+            (PANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
+            (PANDNrr VR128:$src1, VR128:$src2)>;
+  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
+            (PANDNrr VR128:$src1, VR128:$src2)>;
+
+  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
+            (PANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
+            (PANDrm VR128:$src1, addr:$src2)>;
+  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
+            (PANDrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
+            (PORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
+            (PORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
+            (PORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
+            (PXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
+            (PXORrm VR128:$src1, addr:$src2)>;
+  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
+            (PXORrm VR128:$src1, addr:$src2)>;
+
+  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
+            (PANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
+            (PANDNrm VR128:$src1, addr:$src2)>;
+  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
+            (PANDNrm VR128:$src1, addr:$src2)>;
+}
+
 // Patterns for packed operations when we don't have integer type available.
 def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
           (ANDPSrr VR128:$src1, VR128:$src2)>;
@@ -3310,6 +3558,19 @@ def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
 
 let Predicates = [HasAVX, NoVLX] in {
   // Additional patterns for other integer sizes.
+  def : Pat<(alignedloadv4i32 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv8i16 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(alignedloadv16i8 addr:$src),
+            (VMOVDQArm addr:$src)>;
+  def : Pat<(loadv4i32 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv8i16 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+  def : Pat<(loadv16i8 addr:$src),
+            (VMOVDQUrm addr:$src)>;
+
   def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
             (VMOVDQAmr addr:$dst, VR128:$src)>;
   def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
@@ -3349,7 +3610,7 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
-                                     (bitconvert (memop_frag addr:$src2)))))]>,
+                                     (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 } // ExeDomain = SSEPackedInt
@@ -3409,28 +3670,28 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                              loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
                               VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
-                               VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM,
+                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                                0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                             memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
+                             memop, i128mem, SchedWriteVecIMul.XMM>;
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
-                             loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>,
+                             load, i128mem, SchedWritePSADBW.XMM, 0>,
                              VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
-                             loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>,
+                             load, i256mem, SchedWritePSADBW.YMM, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
-                            memopv2i64, i128mem, SchedWritePSADBW.XMM>;
+                            memop, i128mem, SchedWritePSADBW.XMM>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Logical Instructions
@@ -3457,7 +3718,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst, (DstVT (OpNode RC:$src1,
-                       (SrcVT (bitconvert (ld_frag addr:$src2))))))]>,
+                       (SrcVT (ld_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
        (ins RC:$src1, u8imm:$src2),
@@ -3477,16 +3738,16 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                               OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
-                              DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
-                                DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L,
+                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
                                 VEX_WIG;
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                             VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
-                            memopv2i64>;
+                            memop>;
 }
 
 multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
@@ -3586,7 +3847,7 @@ let Predicates = [HasAVX, prd] in {
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
-                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                       (vt128 (OpNode (load addr:$src1),
                         (i8 imm:$src2))))]>, VEX,
                   Sched<[sched.XMM.Folded]>, VEX_WIG;
 }
@@ -3604,7 +3865,7 @@ let Predicates = [HasAVX2, prd] in {
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
-                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                        (vt256 (OpNode (load addr:$src1),
                          (i8 imm:$src2))))]>, VEX, VEX_L,
                    Sched<[sched.YMM.Folded]>, VEX_WIG;
 }
@@ -3622,7 +3883,7 @@ let Predicates = [UseSSE2] in {
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
-                 (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+                 (vt128 (OpNode (memop addr:$src1),
                         (i8 imm:$src2))))]>,
                Sched<[sched.XMM.Folded]>;
 }
@@ -3662,7 +3923,7 @@ multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst,
                      (OutVT (OpNode (ArgVT RC:$src1),
-                                    (bitconvert (ld_frag addr:$src2)))))]>,
+                                    (ld_frag addr:$src2))))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -3687,53 +3948,53 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                        (OutVT (OpNode (ArgVT RC:$src1),
-                                      (bitconvert (ld_frag addr:$src2)))))]>,
+                                      (ld_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
 
   defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V, VEX_WIG;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
-                             i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                              VEX_4V;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
 
   defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
-                              i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                               VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
   defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
-                            i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                            i128mem, SchedWriteShuffle.XMM, memop>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -3758,89 +4019,88 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
       !if(Is2Addr,
           !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-      [(set RC:$dst, (vt (OpNode RC:$src1,
-                                  (bitconvert (ld_frag addr:$src2)))))]>,
+      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
-                                 i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>,
+                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
-                                  i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>,
+                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
 
   defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
   defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
-                                i128mem, SchedWriteShuffle.XMM, memopv2i64>;
+                                i128mem, SchedWriteShuffle.XMM, memop>;
 }
 } // ExeDomain = SSEPackedInt
 
@@ -4159,7 +4419,7 @@ let Predicates = [UseAVX] in {
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (VMOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (VMOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (VMOVDI2PDIrm addr:$src)>;
@@ -4184,7 +4444,7 @@ let Predicates = [UseSSE2] in {
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
             (MOVDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
             (MOVDI2PDIrm addr:$src)>;
   def : Pat<(v4i32 (X86vzload addr:$src)),
             (MOVDI2PDIrm addr:$src)>;
@@ -4339,30 +4599,30 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
 let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (VMOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
             (VMOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (VMOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
             (VMOVSLDUPrm addr:$src)>;
   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
             (VMOVSHDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
             (VMOVSHDUPYrm addr:$src)>;
   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
             (VMOVSLDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
             (VMOVSLDUPYrm addr:$src)>;
 }
 
 let Predicates = [UseSSE3] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (MOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
             (MOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (MOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
             (MOVSLDUPrm addr:$src)>;
 }
 
@@ -4584,7 +4844,7 @@ multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
-                   (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>,
+                   (vt (OpNode (ld_frag addr:$src))))]>,
                  Sched<[sched.XMM.Folded]>;
 }
 
@@ -4601,19 +4861,19 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                   (ins i256mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
-                    (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
+                    (vt (OpNode (load addr:$src))))]>,
                   Sched<[sched.YMM.Folded]>;
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
   defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX] in {
   defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
-                              loadv2i64>, VEX, VEX_WIG;
+                              load>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
@@ -4627,11 +4887,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 }
 
 defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
-                          memopv2i64>;
+                          memop>;
 
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Binary Operator Instructions
@@ -4656,8 +4916,7 @@ multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (DstVT (OpNode (OpVT RC:$src1),
-          (bitconvert (memop_frag addr:$src2)))))]>,
+         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4679,8 +4938,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
-         (IntId128 VR128:$src1,
-          (bitconvert (ld_frag addr:$src2))))]>,
+         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -4697,83 +4955,83 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        (ins VR256:$src1, i256mem:$src2),
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
-         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
+         (IntId256 VR256:$src1, (load addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
-                                  VR128, loadv2i64, i128mem,
+                                  VR128, load, i128mem,
                                   SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
-                                  v16i8, VR128, loadv2i64, i128mem,
+                                  v16i8, VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
-                                  VR128, loadv2i64, i128mem,
+                                  VR128, load, i128mem,
                                   SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
-                                  loadv2i64, i128mem,
+                                  load, i128mem,
                                   SchedWritePHAdd.XMM, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                       int_x86_ssse3_psign_w_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                       int_x86_ssse3_psign_d_128,
-                                      SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
-                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                       int_x86_ssse3_phsub_sw_128,
-                                      SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG;
+                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
 }
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
-                                   v32i8, VR256, loadv4i64, i256mem,
+                                   v32i8, VR256, load, i256mem,
                                    SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
-                                  loadv4i64, i256mem,
+                                  load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
-                                  VR256, loadv4i64, i256mem,
+                                  VR256, load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
-                                  loadv4i64, i256mem,
+                                  load, i256mem,
                                   SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
   defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                        SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
@@ -4794,33 +5052,33 @@ let isCommutable = 0 in {
 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 let isCommutable = 0 in {
   defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWritePHAdd.XMM>;
+                                 memop, i128mem, SchedWritePHAdd.XMM>;
   defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
-                                     SchedWriteVecALU.XMM, memopv2i64>;
+                                     SchedWriteVecALU.XMM, memop>;
   defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVarShuffle.XMM>;
+                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
   defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                      int_x86_ssse3_phadd_sw_128,
-                                     SchedWritePHAdd.XMM, memopv2i64>;
+                                     SchedWritePHAdd.XMM, memop>;
   defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                      int_x86_ssse3_phsub_sw_128,
-                                     SchedWritePHAdd.XMM, memopv2i64>;
+                                     SchedWritePHAdd.XMM, memop>;
   defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
-                                 v16i8, VR128, memopv2i64, i128mem,
+                                 v16i8, VR128, memop, i128mem,
                                  SchedWriteVecIMul.XMM>;
 }
 defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
-                                 VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>;
+                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -4847,20 +5105,20 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set RC:$dst, (VT (X86PAlignr RC:$src1,
-                                     (bitconvert (memop_frag addr:$src2)),
+                                     (memop_frag addr:$src2),
                                      (i8 imm:$src3))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
-  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem,
+  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
-  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem,
+  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                  SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
-  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem,
+  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                                SchedWriteShuffle.XMM>;
 
 //===---------------------------------------------------------------------===//
@@ -4984,7 +5242,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
 
   // AVX2 Register-Memory patterns
   let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
   def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
@@ -4998,7 +5256,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
-  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
 
   def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5007,10 +5265,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
 
-  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
   def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
@@ -5023,10 +5281,10 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtO
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
-  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
 
-  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
   def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
@@ -5086,7 +5344,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
   }
   let Predicates = [HasAVX, NoVLX] in {
@@ -5096,7 +5354,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@@ -5105,7 +5363,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
             (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
 
   def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5116,7 +5374,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
   def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
@@ -5125,7 +5383,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
             (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
 
   def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
@@ -5136,7 +5394,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
             (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
   }
 }
@@ -5954,7 +6212,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                   (ins i128mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst,
-                    (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
+                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
                  Sched<[Sched.Folded]>;
 }
 
@@ -5962,10 +6220,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
 // model, although the naming is misleading.
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
-                                         X86phminpos, loadv2i64,
+                                         X86phminpos, load,
                                          WritePHMINPOS>, VEX, VEX_WIG;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
-                                         X86phminpos, memopv2i64,
+                                         X86phminpos, memop,
                                          WritePHMINPOS>;
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
@@ -5987,118 +6245,118 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set RC:$dst,
-         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
+         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>,
+                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
-                                  loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>,
+                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
   defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
-                                 memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>;
+                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
 }
 
 let Predicates = [HasAVX, NoVLX] in
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
-                                 loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>,
+                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 let Predicates = [HasAVX] in
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
-                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX] in
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
-                                  loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>,
+                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 let Predicates = [HasAVX2] in
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
-                                memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>;
+                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
-                                memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>;
+                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
 }
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
@@ -6124,8 +6382,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (IntId RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+          (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6152,8 +6409,7 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6175,28 +6431,28 @@ def BlendCommuteImm8 : SDNodeXForm<imm, [{
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                        VR128, loadv2i64, i128mem, 0,
+                                        VR128, load, i128mem, 0,
                                         SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                   VR128, loadv4f32, f128mem, 0,
+                                   VR128, load, f128mem, 0,
                                    SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                   VR128, loadv2f64, f128mem, 0,
+                                   VR128, load, f128mem, 0,
                                    SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                    VR256, loadv8f32, i256mem, 0,
+                                    VR256, load, i256mem, 0,
                                     SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                  VR256, loadv4i64, i256mem, 0,
+                                  VR256, load, i256mem, 0,
                                   SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
   }
 }
@@ -6204,17 +6460,17 @@ let Predicates = [HasAVX2] in {
 let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memopv2i64, i128mem, 1,
+                                     VR128, memop, i128mem, 1,
                                      SchedWriteMPSAD.XMM>;
   }
 
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
-                                  VR128, memopv4f32, f128mem, 1,
+                                  VR128, memop, f128mem, 1,
                                   SchedWriteDPPS.XMM>;
   let ExeDomain = SSEPackedDouble in
   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
-                                  VR128, memopv2f64, f128mem, 1,
+                                  VR128, memop, f128mem, 1,
                                   SchedWriteDPPD.XMM>;
 }
 
@@ -6242,56 +6498,54 @@ let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-                 (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
-                          RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 let Predicates = [HasAVX] in {
   defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
-                                  VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
+                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
-                                   VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
+                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                    SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
-                                  VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
+                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                   VEX_4V, VEX_WIG;
   defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
-                                   VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
+                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                    SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                    VEX_4V, VEX_L, VEX_WIG;
   defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
-                                  VR128, loadv2i64, i128mem, 0, SSEPackedInt,
+                                  VR128, load, i128mem, 0, SSEPackedInt,
                                   SchedWriteBlend.XMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
-                                   VR256, loadv4i64, i256mem, 0, SSEPackedInt,
+                                   VR256, load, i256mem, 0, SSEPackedInt,
                                    SchedWriteBlend.YMM, BlendCommuteImm8>,
                                    VEX_4V, VEX_L, VEX_WIG;
 }
 
 defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
-                               VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
+                               VR128, memop, f128mem, 1, SSEPackedSingle,
                                SchedWriteFBlend.XMM, BlendCommuteImm4>;
 defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
-                               VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
+                               VR128, memop, f128mem, 1, SSEPackedDouble,
                                SchedWriteFBlend.XMM, BlendCommuteImm2>;
 defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
-                               VR128, memopv2i64, i128mem, 1, SSEPackedInt,
+                               VR128, memop, i128mem, 1, SSEPackedInt,
                                SchedWriteBlend.XMM, BlendCommuteImm8>;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -6325,7 +6579,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
-                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
+                        (IntId RC:$src1, (mem_frag addr:$src2),
                                RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
                 Sched<[sched.Folded, sched.ReadAfterFold,
                        // x86memop:$src2
@@ -6338,7 +6592,7 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           loadv2f64, int_x86_sse41_blendvpd,
+                                           load, int_x86_sse41_blendvpd,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                   loadv4f64, int_x86_avx_blendv_pd_256,
@@ -6346,20 +6600,20 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           loadv4f32, int_x86_sse41_blendvps,
+                                           load, int_x86_sse41_blendvps,
                                            SchedWriteFVarBlend.XMM>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                   loadv8f32, int_x86_avx_blendv_ps_256,
                                   SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           loadv2i64, int_x86_sse41_pblendvb,
+                                           load, int_x86_sse41_pblendvb,
                                            SchedWriteVarBlend.XMM>;
 }
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                      loadv4i64, int_x86_avx2_pblendvb,
+                                      load, int_x86_avx2_pblendvb,
                                       SchedWriteVarBlend.YMM>, VEX_L;
 }
 
@@ -6490,18 +6744,18 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
                      "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
-                       (bitconvert (mem_frag addr:$src2)), XMM0))]>,
+                       (mem_frag addr:$src2), XMM0))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
                                   int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
 let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
                                   int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
                                   int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
 
 // Aliases with the implicit xmm0 argument
@@ -6557,6 +6811,12 @@ let Predicates = [HasAVX2, NoVLX] in {
             (VMOVNTDQAYrm addr:$src)>;
   def : Pat<(v4i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
+  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQAYrm addr:$src)>;
 }
 
 let Predicates = [HasAVX, NoVLX] in {
@@ -6566,6 +6826,12 @@ let Predicates = [HasAVX, NoVLX] in {
             (VMOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (VMOVNTDQArm addr:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
@@ -6575,6 +6841,12 @@ let Predicates = [UseSSE41] in {
             (MOVNTDQArm addr:$src)>;
   def : Pat<(v2i64 (alignednontemporalload addr:$src)),
             (MOVNTDQArm addr:$src)>;
+  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
+  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
+            (MOVNTDQArm addr:$src)>;
 }
 
 } // AddedComplexity
@@ -6607,17 +6879,17 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>,
+                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>,
+                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                   VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
-                                memopv2i64, i128mem, SchedWriteVecALU.XMM>;
+                                memop, i128mem, SchedWriteVecALU.XMM>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - String/text Processing Instructions
@@ -6768,9 +7040,9 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
              [!if(UsesXMM0,
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                    (memop addr:$src2), XMM0)),
                   (set VR128:$dst, (IntId VR128:$src1,
-                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8,
+                    (memop addr:$src2))))]>, T8,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -6787,7 +7059,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
                          "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                          [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
-                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (memop addr:$src2),
                             (i8 imm:$src3)))]>, TA,
                          Sched<[SchedWriteVecIMul.XMM.Folded,
                                 SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6840,39 +7112,39 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
 // Perform One Round of an AES Encryption/Decryption Flow
 let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
   defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
   defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
+                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
 }
 
 let Predicates = [NoVLX, HasVAES] in {
   defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesenc_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesenclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesdec_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
   defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
+                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
   defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
-                         int_x86_aesni_aesenc, memopv2i64, 1>;
+                         int_x86_aesni_aesenc, memop, 1>;
   defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
-                         int_x86_aesni_aesenclast, memopv2i64, 1>;
+                         int_x86_aesni_aesenclast, memop, 1>;
   defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
-                         int_x86_aesni_aesdec, memopv2i64, 1>;
+                         int_x86_aesni_aesdec, memop, 1>;
   defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
-                         int_x86_aesni_aesdeclast, memopv2i64, 1>;
+                         int_x86_aesni_aesdeclast, memop, 1>;
 }
 
 // Perform the AES InvMixColumn Transformation
@@ -6886,7 +7158,7 @@ let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
-      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
+      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
       Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -6897,7 +7169,7 @@ def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
 def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1),
   "aesimc\t{$src1, $dst|$dst, $src1}",
-  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
   Sched<[WriteAESIMC.Folded]>;
 
 // AES Round Key Generation Assist
@@ -6912,7 +7184,7 @@ let Predicates = [HasAVX, HasAES] in {
       (ins i128mem:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
-        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
+        (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
       Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -6925,7 +7197,7 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
   (ins i128mem:$src1, u8imm:$src2),
   "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
   [(set VR128:$dst,
-    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+    (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
   Sched<[WriteAESKeyGen.Folded]>;
 
 //===----------------------------------------------------------------------===//
@@ -6953,12 +7225,12 @@ let Predicates = [NoAVX, HasPCLMUL] in {
               (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
               "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
-                 (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
+                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                   imm:$src3))]>,
               Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
   } // Constraints = "$src1 = $dst"
 
-  def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
+  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                 (i8 imm:$src3)),
             (PCLMULQDQrm VR128:$src1, addr:$src2,
                           (PCLMULCommuteImm imm:$src3))>;
@@ -7001,11 +7273,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
 }
 
 let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
-defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                              int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
 
 let Predicates = [NoVLX, HasVPCLMULQDQ] in
-defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                               int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
 
 multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
@@ -7161,11 +7433,11 @@ def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
 let Predicates = [HasAVX2, NoVLX] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTI128 addr:$src)>;
 }
 
@@ -7179,11 +7451,11 @@ def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
 let Predicates = [HasAVX1Only] in {
 def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
+def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
+def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
-def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
+def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
           (VBROADCASTF128 addr:$src)>;
 }
 
@@ -7216,7 +7488,7 @@ multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
             (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
   def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
-                                    (From (bitconvert (memop_frag addr:$src2))),
+                                    (From (memop_frag addr:$src2)),
                                     (iPTR imm)),
             (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                        (INSERT_get_vinsert128_imm VR256:$ins))>;
@@ -7229,9 +7501,9 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [HasAVX1Only] in {
   defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
-  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
+  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
+  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7320,7 +7592,7 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
 
 multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                       RegisterClass RC, X86MemOperand x86memop_f,
-                      X86MemOperand x86memop_i, PatFrag i_frag,
+                      X86MemOperand x86memop_i,
                       ValueType f_vt, ValueType i_vt,
                       X86FoldableSchedWrite sched,
                       X86FoldableSchedWrite varsched> {
@@ -7334,7 +7606,7 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                (ins RC:$src1, x86memop_i:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
-                              (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
+                              (i_vt (load addr:$src2)))))]>, VEX_4V,
                Sched<[varsched.Folded, sched.ReadAfterFold]>;
 
     def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
@@ -7353,18 +7625,18 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                               loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM,
+                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                               loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM,
+                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                               loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM,
+                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                                SchedWriteFVarShuffle.XMM>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                               loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM,
+                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                                SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
@@ -7445,8 +7717,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
   let hasSideEffects = 0, mayLoad = 1 in
   def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              "vcvtph2ps\t{$src, $dst|$dst, $src}",
-             [(set RC:$dst, (X86cvtph2ps (bc_v8i16
-                                          (loadv2i64 addr:$src))))]>,
+             [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
              T8PD, VEX, Sched<[sched.Folded]>;
 }
 
@@ -7520,7 +7791,7 @@ let Predicates = [HasF16C, NoVLX] in {
 /// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
 multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, X86FoldableSchedWrite sched,
-                          RegisterClass RC, PatFrag memop_frag,
+                          RegisterClass RC,
                           X86MemOperand x86memop, SDNodeXForm commuteXForm> {
   let isCommutable = 1 in
   def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
@@ -7534,22 +7805,20 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
         !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
         [(set RC:$dst,
-          (OpVT (OpNode RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
+          (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
         Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
 
   // Pattern to commute if load is in first source.
-  def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
-                          RC:$src1, imm:$src3)),
+  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
             (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                             (commuteXForm imm:$src3))>;
 }
 
 defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
-                               SchedWriteBlend.XMM, VR128, loadv2i64, i128mem,
+                               SchedWriteBlend.XMM, VR128, i128mem,
                                BlendCommuteImm4>;
 defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
-                                SchedWriteBlend.YMM, VR256, loadv4i64, i256mem,
+                                SchedWriteBlend.YMM, VR256, i256mem,
                                 BlendCommuteImm8>, VEX_L;
 
 // For insertion into the zero index (low half) of a 256-bit vector, it is
@@ -7783,7 +8052,7 @@ let Predicates = [HasAVX1Only] in {
 // VPERM - Permute instructions
 //
 
-multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
+multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                      ValueType OpVT, X86FoldableSchedWrite Sched,
                      X86MemOperand memOp> {
   let Predicates = [HasAVX2, NoVLX] in {
@@ -7800,16 +8069,14 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1,
-                              (bitconvert (mem_frag addr:$src2)))))]>,
+                              (load addr:$src2))))]>,
                      Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
   }
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256,
-                        i256mem>;
+defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256,
-                        f256mem>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT, X86FoldableSchedWrite Sched,
@@ -7879,9 +8146,9 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
-  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv2i64>;
+  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
+  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
+  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8040,7 +8307,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt128 (OpNode VR128:$src1,
-                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+                       (vt128 (load addr:$src2)))))]>,
              VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                             SchedWriteVarVecShift.XMM.ReadAfterFold]>;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
@@ -8054,7 +8321,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
-                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
+                       (vt256 (load addr:$src2)))))]>,
              VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                    SchedWriteVarVecShift.YMM.ReadAfterFold]>;
 }
@@ -8068,13 +8335,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 
   def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
             (VPSRAVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (X86vsrav VR128:$src1,
-                    (bitconvert (loadv2i64 addr:$src2)))),
+  def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
             (VPSRAVDrm VR128:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
             (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86vsrav VR256:$src1,
-                    (bitconvert (loadv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
             (VPSRAVDYrm VR256:$src1, addr:$src2)>;
 }
 
@@ -8156,7 +8421,7 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
 
     def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                  [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
-                                 (bitconvert (MemOpFrag addr:$src2)))))]>,
+                                 (MemOpFrag addr:$src2))))]>,
              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
   }
 }
@@ -8174,7 +8439,7 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
   def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
               [(set RC:$dst, (OpVT (OpNode RC:$src1,
-                                    (bitconvert (MemOpFrag addr:$src2)),
+                                    (MemOpFrag addr:$src2),
                               imm:$src3)))], SSEPackedInt>,
               Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
   }
@@ -8184,24 +8449,24 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
   let Constraints = "$src1 = $dst",
       Predicates  = [HasGFNI, UseSSE2] in
   defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
-                                      VR128, loadv2i64, i128mem, 1>;
+                                      VR128, load, i128mem, 1>;
   let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
     defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
-                                      loadv2i64, i128mem>, VEX_4V, VEX_W;
+                                      load, i128mem>, VEX_4V, VEX_W;
     defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
-                                      loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
+                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
   }
 }
 
 // GF2P8MULB
 let Constraints = "$src1 = $dst",
     Predicates  = [HasGFNI, UseSSE2] in
-defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
+defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                     i128mem, 1>;
 let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
-  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
+  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                    i128mem>, VEX_4V;
-  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
+  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                    i256mem>, VEX_4V, VEX_L;
 }
 // GF2P8AFFINEINVQB, GF2P8AFFINEQB
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
index 0d226a3367a4640c05e860794114fde4484cccf8..c417dc99b84dd2bfbdb4414b7d838556e0026fc6 100644
--- a/lib/Target/X86/X86InstrVecCompiler.td
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -16,19 +16,39 @@
 //  Non-instruction patterns
 //===----------------------------------------------------------------------===//
 
-// A vector extract of the first f32/f64 position is a subregister copy
-def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
-          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
-def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
-          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+let Predicates = [NoAVX512] in {
+  // A vector extract of the first f32/f64 position is a subregister copy
+  def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+  def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+}
+
+let Predicates = [HasAVX512] in {
+  // A vector extract of the first f32/f64 position is a subregister copy
+  def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>;
+  def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))),
+            (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X)>;
+}
 
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
-          (COPY_TO_REGCLASS FR32:$src, VR128)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
-          (COPY_TO_REGCLASS FR64:$src, VR128)>;
+let Predicates = [NoVLX] in {
+  // Implicitly promote a 32-bit scalar to a vector.
+  def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+            (COPY_TO_REGCLASS FR32:$src, VR128)>;
+  // Implicitly promote a 64-bit scalar to a vector.
+  def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+            (COPY_TO_REGCLASS FR64:$src, VR128)>;
+}
 
+let Predicates = [HasVLX] in {
+  // Implicitly promote a 32-bit scalar to a vector.
+  def : Pat<(v4f32 (scalar_to_vector FR32X:$src)),
+            (COPY_TO_REGCLASS FR32X:$src, VR128X)>;
+  // Implicitly promote a 64-bit scalar to a vector.
+  def : Pat<(v2f64 (scalar_to_vector FR64X:$src)),
+            (COPY_TO_REGCLASS FR64X:$src, VR128X)>;
+}
 
 //===----------------------------------------------------------------------===//
 // Subvector tricks
@@ -446,8 +466,6 @@ def : Pat<(loadf128 addr:$src),
           (VMOVUPSZ128rm addr:$src)>;
 }
 
-// With SSE2 the DAG combiner converts fp logic ops to integer logic ops to
-// reduce patterns.
 let Predicates = [UseSSE1] in {
 // andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
 def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))),
@@ -469,4 +487,23 @@ def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
           (XORPSrr VR128:$src1, VR128:$src2)>;
 }
 
+let Predicates = [HasAVX] in {
+// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
+def : Pat<(f128 (X86fand VR128:$src1, (loadf128 addr:$src2))),
+          (VANDPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)),
+          (VANDPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, (loadf128 addr:$src2))),
+          (VORPSrm VR128:$src1, f128mem:$src2)>;
+
+def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)),
+          (VORPSrr VR128:$src1, VR128:$src2)>;
+
+def : Pat<(f128 (X86fxor VR128:$src1, (loadf128 addr:$src2))),
+          (VXORPSrm VR128:$src1, f128mem:$src2)>;
 
+def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)),
+          (VXORPSrr VR128:$src1, VR128:$src2)>;
+}
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index a8013e38e636433d67ba5146413c9ca51f565762..9d810a675e3b3eb76873821fc683702eabad5933 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -11,32 +11,32 @@
 //
 //===----------------------------------------------------------------------===//
 
-multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
+multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
            Sched<[SchedWritePHAdd.XMM.Folded, SchedWritePHAdd.XMM.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedInt in {
-  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>;
-  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>;
-  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>;
-  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>;
-  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>;
-  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>;
-  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>;
-  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>;
-  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>;
-  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>;
-  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>;
-  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>;
-  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>;
-  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>;
-  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>;
+  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd>;
+  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq>;
+  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw>;
+  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq>;
+  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd>;
+  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq>;
+  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd>;
+  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq>;
+  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw>;
+  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq>;
+  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd>;
+  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq>;
+  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw>;
+  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq>;
+  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd>;
 }
 
 // Scalar load 2 addr operand instructions
@@ -48,47 +48,47 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
+           [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     PatFrag memop, X86FoldableSchedWrite sched> {
+                     X86FoldableSchedWrite sched> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
   def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+           [(set VR128:$dst, (Int (load addr:$src)))]>, XOP,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
-                     PatFrag memop, X86FoldableSchedWrite sched> {
+                     X86FoldableSchedWrite sched> {
   def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
            [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>;
   def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
            !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-           [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
+           [(set VR256:$dst, (Int (load addr:$src)))]>, XOP, VEX_L,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
 let ExeDomain = SSEPackedSingle in {
   defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
                            ssmem, sse_load_f32, SchedWriteFRnd.Scl>;
-  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32,
+  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32,
+  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
                            SchedWriteFRnd.YMM>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
                            sdmem, sse_load_f64, SchedWriteFRnd.Scl>;
-  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64,
+  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd,
                            SchedWriteFRnd.XMM>;
-  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64,
+  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
                            SchedWriteFRnd.YMM>;
 }
 
@@ -105,13 +105,13 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1),
-                             (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
+                             (vt128 (load addr:$src2)))))]>,
            XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold]>;
   def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
            (ins i128mem:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
+              (vt128 (OpNode (vt128 (load addr:$src1)),
                              (vt128 VR128:$src2))))]>,
              XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
   // For disassembler
@@ -150,7 +150,7 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
            (ins i128mem:$src1, u8imm:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst,
-              (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
+              (vt128 (OpNode (vt128 (load addr:$src1)), imm:$src2)))]>,
            XOP, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int,
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set VR128:$dst,
-              (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+              (Int VR128:$src1, (load addr:$src2),
               VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }
 
@@ -260,7 +260,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                 (vt128 (OpNode (vt128 VR128:$src1),
-                               (vt128 (bitconvert (loadv2i64 addr:$src2))),
+                               (vt128 (load addr:$src2)),
                                 imm:$cc)))]>,
              XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
     let isAsmParserOnly = 1, hasSideEffects = 0 in {
@@ -279,7 +279,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128,
     }
   }
 
-  def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)),
+  def : Pat<(OpNode (load addr:$src2),
                     (vt128 VR128:$src1), imm:$cc),
             (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2,
                                            (CommuteVPCOMCC imm:$cc))>;
@@ -310,14 +310,14 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
               (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
-                             (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
+                             (vt128 (load addr:$src3)))))]>,
             XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2, VR128:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set VR128:$dst,
-              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
+              (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)),
                              (vt128 VR128:$src3))))]>,
             XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold,
                            // 128mem:$src2
@@ -350,6 +350,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
             [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
                                    (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
             Sched<[sched]>;
+  // FIXME: This pattern can't match.
   def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, x86memop:$src3),
             !strconcat(OpcodeStr,
@@ -385,6 +386,48 @@ let ExeDomain = SSEPackedInt in {
                             SchedWriteShuffle.YMM>, VEX_L;
 }
 
+let Predicates = [HasXOP] in {
+  def : Pat<(v16i8 (or (and VR128:$src3, VR128:$src1),
+                   (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+  def : Pat<(v8i16 (or (and VR128:$src3, VR128:$src1),
+                   (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+  def : Pat<(v4i32 (or (and VR128:$src3, VR128:$src1),
+                   (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+  def : Pat<(or (and VR128:$src3, VR128:$src1),
+                (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))),
+            (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+  def : Pat<(or (and VR128:$src3, VR128:$src1),
+                (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))),
+            (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+  def : Pat<(or (and VR128:$src3, VR128:$src1),
+                (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))),
+            (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+  def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1),
+                   (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+  def : Pat<(v16i16 (or (and VR256:$src3, VR256:$src1),
+                    (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+  def : Pat<(v8i32 (or (and VR256:$src3, VR256:$src1),
+                   (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
+
+  def : Pat<(or (and VR256:$src3, VR256:$src1),
+                (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))),
+            (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+  def : Pat<(or (and VR256:$src3, VR256:$src1),
+                (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))),
+            (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+  def : Pat<(or (and VR256:$src3, VR256:$src1),
+                (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))),
+            (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
+}
+
 multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
                         X86MemOperand intmemop, X86MemOperand fpmemop,
                         ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag,
@@ -401,8 +444,7 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         [(set RC:$dst,
-          (VT (X86vpermil2 RC:$src1, RC:$src2,
-                           (bitconvert (IntLdFrag addr:$src3)),
+          (VT (X86vpermil2 RC:$src1, RC:$src2, (IntLdFrag addr:$src3),
                            (i8 imm:$src4))))]>, VEX_W,
         Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
   def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
@@ -437,10 +479,10 @@ let ExeDomain = SSEPackedDouble in {
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
-                                 v4f32, loadv4f32, loadv2i64,
+                                 v4f32, loadv4f32, loadv4i32,
                                  SchedWriteFVarShuffle.XMM>;
   defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
-                                  v8f32, loadv8f32, loadv4i64,
+                                  v8f32, loadv8f32, loadv8i32,
                                   SchedWriteFVarShuffle.YMM>, VEX_L;
 }
 
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index 6c7fb9c339ac522c9f3c2129e6e41d8b6bfe502c..28940754a2032bcdd28b0ebb618711c24ba0303f 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -463,7 +463,7 @@ static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
 //  {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
 //  Imm variable sets the offset amount. The result of the
 //  function is stored inside ShuffleMask vector and it built as described in
-//  the begin of the description. AlignDirection is a boolean that indecat the
+//  the begin of the description. AlignDirection is a boolean that indicates the
 //  direction of the alignment. (false - align to the "right" side while true -
 //  align to the "left" side)
 static void DecodePALIGNRMask(MVT VT, unsigned Imm,
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 84c7878de615a81a4ad1092920bcb58d84ac5470..252d64808f0d8a8c30cc7ffb4a645662252c270a 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1120,6 +1120,8 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0),
   X86_INTRINSIC_DATA(bmi_bextr_32,         INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bextr_64,         INTR_TYPE_2OP, X86ISD::BEXTR, 0),
+  X86_INTRINSIC_DATA(bmi_bzhi_32,          INTR_TYPE_2OP, X86ISD::BZHI, 0),
+  X86_INTRINSIC_DATA(bmi_bzhi_64,          INTR_TYPE_2OP, X86ISD::BZHI, 0),
   X86_INTRINSIC_DATA(sse_cmp_ps,        INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index acb2bc2085847b7b12a954725b50ed334aa8d0c3..2816f8c62bfb1a4ef5fd58af276da1cb87b82393 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -527,7 +527,7 @@ ReSimplify:
   }
 
   case X86::CLEANUPRET: {
-    // Replace CATCHRET with the appropriate RET.
+    // Replace CLEANUPRET with the appropriate RET.
     OutMI = MCInst();
     OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
     break;
@@ -1379,7 +1379,7 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
 
 static const Constant *getConstantFromPool(const MachineInstr &MI,
                                            const MachineOperand &Op) {
-  if (!Op.isCPI())
+  if (!Op.isCPI() || Op.getOffset() != 0)
     return nullptr;
 
   ArrayRef<MachineConstantPoolEntry> Constants =
@@ -1391,7 +1391,7 @@ static const Constant *getConstantFromPool(const MachineInstr &MI,
   if (ConstantEntry.isMachineConstantPoolEntry())
     return nullptr;
 
-  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
+  const Constant *C = ConstantEntry.Val.ConstVal;
   assert((!C || ConstantEntry.getType() == C->getType()) &&
          "Expected a constant of the same type!");
   return C;
@@ -1499,7 +1499,8 @@ static void printConstant(const APInt &Val, raw_ostream &CS) {
 
 static void printConstant(const APFloat &Flt, raw_ostream &CS) {
   SmallString<32> Str;
-  Flt.toString(Str);
+  // Force scientific notation to distinquish from integers.
+  Flt.toString(Str, 0, 0);
   CS << Str;
 }
 
@@ -1594,6 +1595,18 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
   }
 }
 
+static unsigned getRegisterWidth(const MCOperandInfo &Info) {
+  if (Info.RegClass == X86::VR128RegClassID ||
+      Info.RegClass == X86::VR128XRegClassID)
+    return 128;
+  if (Info.RegClass == X86::VR256RegClassID ||
+      Info.RegClass == X86::VR256XRegClassID)
+    return 256;
+  if (Info.RegClass == X86::VR512RegClassID)
+    return 512;
+  llvm_unreachable("Unknown register class!");
+}
+
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   X86MCInstLower MCInstLowering(*MF, *this);
   const X86RegisterInfo *RI =
@@ -1697,41 +1710,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
-  case X86::MOVGOT64r: {
-    // Materializes the GOT for the 64-bit large code model.
-    MCSymbol *DotSym = OutContext.createTempSymbol();
-    OutStreamer->EmitLabel(DotSym);
-
-    unsigned DstReg = MI->getOperand(0).getReg();
-    unsigned ScratchReg = MI->getOperand(1).getReg();
-    MCSymbol *GOTSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
-
-    // .LtmpN: leaq .LtmpN(%rip), %dst
-    const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
-    EmitAndCountInstruction(MCInstBuilder(X86::LEA64r)
-                                .addReg(DstReg)   // dest
-                                .addReg(X86::RIP) // base
-                                .addImm(1)        // scale
-                                .addReg(0)        // index
-                                .addExpr(DotExpr) // disp
-                                .addReg(0));      // seg
-
-    // movq $_GLOBAL_OFFSET_TABLE_ - .LtmpN, %scratch
-    const MCExpr *GOTSymExpr = MCSymbolRefExpr::create(GOTSym, OutContext);
-    const MCExpr *GOTDiffExpr =
-        MCBinaryExpr::createSub(GOTSymExpr, DotExpr, OutContext);
-    EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
-                                .addReg(ScratchReg)     // dest
-                                .addExpr(GOTDiffExpr)); // disp
-
-    // addq %scratch, %dst
-    EmitAndCountInstruction(MCInstBuilder(X86::ADD64rr)
-                                .addReg(DstReg)       // dest
-                                .addReg(DstReg)       // dest
-                                .addReg(ScratchReg)); // src
-    return;
-  }
-
   case X86::ADD32ri: {
     // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
     if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
@@ -1879,8 +1857,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 64> Mask;
-      DecodePSHUFBMask(C, Mask);
+      DecodePSHUFBMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1951,8 +1930,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMILPMask(C, ElSize, Mask);
+      DecodeVPERMILPMask(C, ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
                                 !EnablePrintSchedInfo);
@@ -1982,8 +1962,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
+      DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -1999,8 +1980,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     const MachineOperand &MaskOp = MI->getOperand(6);
     if (auto *C = getConstantFromPool(*MI, MaskOp)) {
+      unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
       SmallVector<int, 16> Mask;
-      DecodeVPPERMMask(C, Mask);
+      DecodeVPPERMMask(C, Width, Mask);
       if (!Mask.empty())
         OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
                                 !EnablePrintSchedInfo);
@@ -2133,6 +2115,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       }
     }
     break;
+  case X86::MOVDDUPrm:
+  case X86::VMOVDDUPrm:
+  case X86::VMOVDDUPZ128rm:
   case X86::VBROADCASTSSrm:
   case X86::VBROADCASTSSYrm:
   case X86::VBROADCASTSSZ128m:
@@ -2169,6 +2154,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       int NumElts;
       switch (MI->getOpcode()) {
       default: llvm_unreachable("Invalid opcode");
+      case X86::MOVDDUPrm:         NumElts = 2;  break;
+      case X86::VMOVDDUPrm:        NumElts = 2;  break;
+      case X86::VMOVDDUPZ128rm:    NumElts = 2;  break;
       case X86::VBROADCASTSSrm:    NumElts = 4;  break;
       case X86::VBROADCASTSSYrm:   NumElts = 8;  break;
       case X86::VBROADCASTSSZ128m: NumElts = 4;  break;
diff --git a/lib/Target/X86/X86PfmCounters.td b/lib/Target/X86/X86PfmCounters.td
index 684cadd496205dbc73d8d27cd9f907903d2fc328..c57798e621e53a56b1467b938c61fb61b22e7ee7 100644
--- a/lib/Target/X86/X86PfmCounters.td
+++ b/lib/Target/X86/X86PfmCounters.td
@@ -11,73 +11,104 @@
 //
 //===----------------------------------------------------------------------===//
 
-let SchedModel = SandyBridgeModel in {
-def SBCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def SBPort0Counter : PfmIssueCounter<SBPort0, ["uops_dispatched_port:port_0"]>;
-def SBPort1Counter : PfmIssueCounter<SBPort1, ["uops_dispatched_port:port_1"]>;
-def SBPort23Counter : PfmIssueCounter<SBPort23,
-                                      ["uops_dispatched_port:port_2",
-                                       "uops_dispatched_port:port_3"]>;
-def SBPort4Counter : PfmIssueCounter<SBPort4, ["uops_dispatched_port:port_4"]>;
-def SBPort5Counter : PfmIssueCounter<SBPort5, ["uops_dispatched_port:port_5"]>;
-def SBUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def UnhaltedCoreCyclesPfmCounter : PfmCounter<"unhalted_core_cycles">;
+def UopsIssuedPfmCounter : PfmCounter<"uops_issued:any">;
+
+def SandyBridgePfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"SBPort0",  "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"SBPort1",  "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"SBPort23", "uops_dispatched_port:port_2 + uops_dispatched_port:port_3">,
+    PfmIssueCounter<"SBPort4",  "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"SBPort5",  "uops_dispatched_port:port_5">
+  ];
+}
+def : PfmCountersBinding<"sandybridge", SandyBridgePfmCounters>;
+
+def HaswellPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"HWPort0", "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"HWPort1", "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"HWPort2", "uops_dispatched_port:port_2">,
+    PfmIssueCounter<"HWPort3", "uops_dispatched_port:port_3">,
+    PfmIssueCounter<"HWPort4", "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"HWPort5", "uops_dispatched_port:port_5">,
+    PfmIssueCounter<"HWPort6", "uops_dispatched_port:port_6">,
+    PfmIssueCounter<"HWPort7", "uops_dispatched_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"haswell", HaswellPfmCounters>;
 
-let SchedModel = HaswellModel in {
-def HWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def HWPort0Counter : PfmIssueCounter<HWPort0, ["uops_dispatched_port:port_0"]>;
-def HWPort1Counter : PfmIssueCounter<HWPort1, ["uops_dispatched_port:port_1"]>;
-def HWPort2Counter : PfmIssueCounter<HWPort2, ["uops_dispatched_port:port_2"]>;
-def HWPort3Counter : PfmIssueCounter<HWPort3, ["uops_dispatched_port:port_3"]>;
-def HWPort4Counter : PfmIssueCounter<HWPort4, ["uops_dispatched_port:port_4"]>;
-def HWPort5Counter : PfmIssueCounter<HWPort5, ["uops_dispatched_port:port_5"]>;
-def HWPort6Counter : PfmIssueCounter<HWPort6, ["uops_dispatched_port:port_6"]>;
-def HWPort7Counter : PfmIssueCounter<HWPort7, ["uops_dispatched_port:port_7"]>;
-def HWUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def BroadwellPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"BWPort0", "uops_executed_port:port_0">,
+    PfmIssueCounter<"BWPort1", "uops_executed_port:port_1">,
+    PfmIssueCounter<"BWPort2", "uops_executed_port:port_2">,
+    PfmIssueCounter<"BWPort3", "uops_executed_port:port_3">,
+    PfmIssueCounter<"BWPort4", "uops_executed_port:port_4">,
+    PfmIssueCounter<"BWPort5", "uops_executed_port:port_5">,
+    PfmIssueCounter<"BWPort6", "uops_executed_port:port_6">,
+    PfmIssueCounter<"BWPort7", "uops_executed_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"broadwell", BroadwellPfmCounters>;
 
-let SchedModel = BroadwellModel in {
-def BWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def BWPort0Counter : PfmIssueCounter<BWPort0, ["uops_executed_port:port_0"]>;
-def BWPort1Counter : PfmIssueCounter<BWPort1, ["uops_executed_port:port_1"]>;
-def BWPort2Counter : PfmIssueCounter<BWPort2, ["uops_executed_port:port_2"]>;
-def BWPort3Counter : PfmIssueCounter<BWPort3, ["uops_executed_port:port_3"]>;
-def BWPort4Counter : PfmIssueCounter<BWPort4, ["uops_executed_port:port_4"]>;
-def BWPort5Counter : PfmIssueCounter<BWPort5, ["uops_executed_port:port_5"]>;
-def BWPort6Counter : PfmIssueCounter<BWPort6, ["uops_executed_port:port_6"]>;
-def BWPort7Counter : PfmIssueCounter<BWPort7, ["uops_executed_port:port_7"]>;
-def BWUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def SkylakeClientPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"SKLPort0", "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"SKLPort1", "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"SKLPort2", "uops_dispatched_port:port_2">,
+    PfmIssueCounter<"SKLPort3", "uops_dispatched_port:port_3">,
+    PfmIssueCounter<"SKLPort4", "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"SKLPort5", "uops_dispatched_port:port_5">,
+    PfmIssueCounter<"SKLPort6", "uops_dispatched_port:port_6">,
+    PfmIssueCounter<"SKLPort7", "uops_dispatched_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"skylake", SkylakeClientPfmCounters>;
 
-let SchedModel = SkylakeClientModel in {
-def SKLCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def SKLPort0Counter : PfmIssueCounter<SKLPort0, ["uops_dispatched_port:port_0"]>;
-def SKLPort1Counter : PfmIssueCounter<SKLPort1, ["uops_dispatched_port:port_1"]>;
-def SKLPort2Counter : PfmIssueCounter<SKLPort2, ["uops_dispatched_port:port_2"]>;
-def SKLPort3Counter : PfmIssueCounter<SKLPort3, ["uops_dispatched_port:port_3"]>;
-def SKLPort4Counter : PfmIssueCounter<SKLPort4, ["uops_dispatched_port:port_4"]>;
-def SKLPort5Counter : PfmIssueCounter<SKLPort5, ["uops_dispatched_port:port_5"]>;
-def SKLPort6Counter : PfmIssueCounter<SKLPort6, ["uops_dispatched_port:port_6"]>;
-def SKLPort7Counter : PfmIssueCounter<SKLPort7, ["uops_dispatched_port:port_7"]>;
-def SKLUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def SkylakeServerPfmCounters : ProcPfmCounters {
+  let CycleCounter = UnhaltedCoreCyclesPfmCounter;
+  let UopsCounter = UopsIssuedPfmCounter;
+  let IssueCounters = [
+    PfmIssueCounter<"SKXPort0", "uops_dispatched_port:port_0">,
+    PfmIssueCounter<"SKXPort1", "uops_dispatched_port:port_1">,
+    PfmIssueCounter<"SKXPort2", "uops_dispatched_port:port_2">,
+    PfmIssueCounter<"SKXPort3", "uops_dispatched_port:port_3">,
+    PfmIssueCounter<"SKXPort4", "uops_dispatched_port:port_4">,
+    PfmIssueCounter<"SKXPort5", "uops_dispatched_port:port_5">,
+    PfmIssueCounter<"SKXPort6", "uops_dispatched_port:port_6">,
+    PfmIssueCounter<"SKXPort7", "uops_dispatched_port:port_7">
+  ];
 }
+def : PfmCountersBinding<"skylake-avx512", SkylakeServerPfmCounters>;
 
-let SchedModel = SkylakeServerModel in {
-def SKXCycleCounter : PfmCycleCounter<"unhalted_core_cycles">;
-def SKXPort0Counter : PfmIssueCounter<SKXPort0, ["uops_dispatched_port:port_0"]>;
-def SKXPort1Counter : PfmIssueCounter<SKXPort1, ["uops_dispatched_port:port_1"]>;
-def SKXPort2Counter : PfmIssueCounter<SKXPort2, ["uops_dispatched_port:port_2"]>;
-def SKXPort3Counter : PfmIssueCounter<SKXPort3, ["uops_dispatched_port:port_3"]>;
-def SKXPort4Counter : PfmIssueCounter<SKXPort4, ["uops_dispatched_port:port_4"]>;
-def SKXPort5Counter : PfmIssueCounter<SKXPort5, ["uops_dispatched_port:port_5"]>;
-def SKXPort6Counter : PfmIssueCounter<SKXPort6, ["uops_dispatched_port:port_6"]>;
-def SKXPort7Counter : PfmIssueCounter<SKXPort7, ["uops_dispatched_port:port_7"]>;
-def SKXUopsCounter  : PfmUopsCounter<"uops_issued:any">;
+def BdVer2PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"PdFPU0", "dispatched_fpu_ops:ops_pipe0 + dispatched_fpu_ops:ops_dual_pipe0">,
+    PfmIssueCounter<"PdFPU1", "dispatched_fpu_ops:ops_pipe1 + dispatched_fpu_ops:ops_dual_pipe1">,
+    PfmIssueCounter<"PdFPU2", "dispatched_fpu_ops:ops_pipe2 + dispatched_fpu_ops:ops_dual_pipe2">,
+    PfmIssueCounter<"PdFPU3", "dispatched_fpu_ops:ops_pipe3 + dispatched_fpu_ops:ops_dual_pipe3">
+  ];
 }
+def : PfmCountersBinding<"bdver2", BdVer2PfmCounters>;
 
-let SchedModel = BtVer2Model in {
-def JCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">;
-def JUopsCounter  : PfmUopsCounter<"retired_uops">;
-def JFPU0Counter  : PfmIssueCounter<JFPU0, ["dispatched_fpu:pipe0"]>;
-def JFPU1Counter  : PfmIssueCounter<JFPU1, ["dispatched_fpu:pipe1"]>;
+def BtVer2PfmCounters : ProcPfmCounters {
+  let CycleCounter = PfmCounter<"cpu_clk_unhalted">;
+  let UopsCounter = PfmCounter<"retired_uops">;
+  let IssueCounters = [
+    PfmIssueCounter<"JFPU0", "dispatched_fpu:pipe0">,
+    PfmIssueCounter<"JFPU1", "dispatched_fpu:pipe1">
+  ];
 }
+def : PfmCountersBinding<"btver2", BtVer2PfmCounters>;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 31b939641fdc706afed2a7bc91099578fca762ee..0c1b05fd3abe7015d7fe23af93be7cbc486de7d5 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -436,11 +436,12 @@ def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>;
 def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>;
 def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>;
 def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>;
-def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>;
+def GR32_TC   : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESP)>;
 def GR64_TC   : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI,
-                                                     R8, R9, R11, RIP)>;
+                                                     R8, R9, R11, RIP, RSP)>;
 def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX,
-                                                      R8, R9, R10, R11, RIP)>;
+                                                      R8, R9, R10, R11,
+                                                      RIP, RSP)>;
 
 // GR8_NOREX - GR8 registers which do not require a REX prefix.
 def GR8_NOREX : RegisterClass<"X86", [i8], 8,
diff --git a/lib/Target/X86/X86RetpolineThunks.cpp b/lib/Target/X86/X86RetpolineThunks.cpp
index f62e89eb1ba4b6cc39ffa786a8fc2af2f2aa0871..08994cccb21e69f78cd62c3a3f0490ad4bc51958 100644
--- a/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/lib/Target/X86/X86RetpolineThunks.cpp
@@ -74,7 +74,7 @@ private:
 
   void createThunkFunction(Module &M, StringRef Name);
   void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
-  void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
+  void populateThunk(MachineFunction &MF, unsigned Reg);
 };
 
 } // end anonymous namespace
@@ -236,25 +236,33 @@ void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
 }
 
 void X86RetpolineThunks::populateThunk(MachineFunction &MF,
-                                       Optional<unsigned> Reg) {
+                                       unsigned Reg) {
   // Set MF properties. We never use vregs...
   MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
 
+  // Grab the entry MBB and erase any other blocks. O0 codegen appears to
+  // generate two bbs for the entry block.
   MachineBasicBlock *Entry = &MF.front();
   Entry->clear();
+  while (MF.size() > 1)
+    MF.erase(std::next(MF.begin()));
 
   MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
   MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+  MCSymbol *TargetSym = MF.getContext().createTempSymbol();
   MF.push_back(CaptureSpec);
   MF.push_back(CallTarget);
 
   const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
   const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
 
-  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget);
-  Entry->addSuccessor(CallTarget);
+  Entry->addLiveIn(Reg);
+  BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
+
+  // The MIR verifier thinks that the CALL in the entry block will fall through
+  // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is
+  // the successor, but the MIR verifier doesn't know how to cope with that.
   Entry->addSuccessor(CaptureSpec);
-  CallTarget->setHasAddressTaken();
 
   // In the capture loop for speculation, we want to stop the processor from
   // speculating as fast as possible. On Intel processors, the PAUSE instruction
@@ -270,7 +278,10 @@ void X86RetpolineThunks::populateThunk(MachineFunction &MF,
   CaptureSpec->setHasAddressTaken();
   CaptureSpec->addSuccessor(CaptureSpec);
 
+  CallTarget->addLiveIn(Reg);
+  CallTarget->setHasAddressTaken();
   CallTarget->setAlignment(4);
-  insertRegReturnAddrClobber(*CallTarget, *Reg);
+  insertRegReturnAddrClobber(*CallTarget, Reg);
+  CallTarget->back().setPreInstrSymbol(MF, TargetSym);
   BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
 }
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index b5d842a52b567cf0d3e112419fbcc98beb12ab3c..d4a3eb07b982771643611326db277f43b25c7c49 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -1133,7 +1133,8 @@ def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> {
 def: InstRW<[SKLWriteResGroup91], (instrs VINSERTF128rm,
                                           VINSERTI128rm,
                                           VPBLENDDrmi)>;
-def: InstRW<[SKLWriteResGroup91], (instregex "(V?)PADD(B|D|Q|W)rm",
+def: InstRW<[SKLWriteResGroup91, ReadAfterVecXLd],
+                                  (instregex "(V?)PADD(B|D|Q|W)rm",
                                              "(V?)PSUB(B|D|Q|W)rm")>;
 
 def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> {
@@ -1230,7 +1231,8 @@ def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> {
   let ResourceCycles = [1,1];
 }
 def: InstRW<[SKLWriteResGroup110], (instrs VPBLENDDYrmi)>;
-def: InstRW<[SKLWriteResGroup110], (instregex "VPADD(B|D|Q|W)Yrm",
+def: InstRW<[SKLWriteResGroup110, ReadAfterVecYLd],
+                                   (instregex "VPADD(B|D|Q|W)Yrm",
                                               "VPSUB(B|D|Q|W)Yrm")>;
 
 def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index d3fa912be111e0e7a3d448c0b8786defaebb641b..cbcb6a6e58bb2b0aadb2bc276f414035b39312ee 100644
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -1339,7 +1339,8 @@ def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
 }
 def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
                                           VPBLENDDrmi)>;
-def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)",
+def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
+                                  (instregex "VBLENDMPDZ128rm(b?)",
                                              "VBLENDMPSZ128rm(b?)",
                                              "VBROADCASTI32X2Z128m(b?)",
                                              "VBROADCASTSSZ128m(b?)",
@@ -1534,7 +1535,8 @@ def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
 }
 def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
                                            VPBLENDDYrmi)>;
-def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPD(Z|Z256)rm(b?)",
+def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
+                                   (instregex "VBLENDMPD(Z|Z256)rm(b?)",
                                               "VBLENDMPS(Z|Z256)rm(b?)",
                                               "VBROADCASTF32X2Z256m(b?)",
                                               "VBROADCASTF32X2Zm(b?)",
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
new file mode 100644
index 0000000000000000000000000000000000000000..bc5d112c2f4f23895d007f7a10869eeba1ab5a4f
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -0,0 +1,1278 @@
+//=- X86ScheduleBdVer2.td - X86 BdVer2 (Piledriver) Scheduling * tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD bdver2 (Piledriver) to support
+// instruction scheduling and other instruction cost heuristics.
+// Based on:
+//  * AMD Software Optimization Guide for AMD Family 15h Processors.
+//    https://support.amd.com/TechDocs/47414_15h_sw_opt_guide.pdf
+//  * The microarchitecture of Intel, AMD and VIA CPUs, By Agner Fog
+//    http://www.agner.org/optimize/microarchitecture.pdf
+//  * https://www.realworldtech.com/bulldozer/
+//    Yes, that is for Bulldozer aka bdver1, not Piledriver aka bdver2.
+//
+//===----------------------------------------------------------------------===//
+
+def BdVer2Model : SchedMachineModel {
+  let IssueWidth = 4; // Up to 4 IPC can be decoded, issued, retired.
+  let MicroOpBufferSize = 128; // RCU reorder buffer size, which is unconfirmed.
+  let LoopMicroOpBufferSize = -1; // There does not seem to be a loop buffer.
+  let LoadLatency = 4; // L1 data cache has a 4-cycle load-to-use latency.
+  let HighLatency = 25; // FIXME: any better choice?
+  let MispredictPenalty = 20; // Minimum branch misdirection penalty.
+
+  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
+
+  // FIXME: Incomplete. This flag is set to allow the scheduler to assign
+  //        a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+} // SchedMachineModel
+
+let SchedModel = BdVer2Model in {
+
+
+//===----------------------------------------------------------------------===//
+// Pipes
+//===----------------------------------------------------------------------===//
+
+// There are total of eight pipes.
+
+//===----------------------------------------------------------------------===//
+// Integer execution pipes
+//
+
+// Two EX (ALU) pipes.
+def PdEX0  : ProcResource<1>; // ALU, Integer Pipe0
+def PdEX1  : ProcResource<1>; // ALU, Integer Pipe1
+def PdEX01 : ProcResGroup<[PdEX0, PdEX1]>;
+
+// Two AGLU pipes, identical.
+def PdAGLU01 : ProcResource<2>; // AGU, Integer Pipe[23]
+
+//===----------------------------------------------------------------------===//
+// Floating point execution pipes
+//
+
+// Four FPU pipes.
+
+def PdFPU0 : ProcResource<1>; // Vector/FPU Pipe0
+def PdFPU1 : ProcResource<1>; // Vector/FPU Pipe1
+def PdFPU2 : ProcResource<1>; // Vector/FPU Pipe2
+def PdFPU3 : ProcResource<1>; // Vector/FPU Pipe3
+
+// FPU grouping
+def PdFPU01 : ProcResGroup<[PdFPU0, PdFPU1]>;
+def PdFPU23 : ProcResGroup<[PdFPU2, PdFPU3]>;
+
+
+//===----------------------------------------------------------------------===//
+// RCU
+//===----------------------------------------------------------------------===//
+
+// The Retire Control Unit on Piledriver can retire up to 4 macro-ops per cycle.
+// On the other hand, the RCU reorder buffer size for Piledriver does not
+// seem be specified in any trustworthy source.
+// But as per https://www.realworldtech.com/bulldozer/6/ the Bulldozer had
+// RCU reorder buffer size of 128. So that is a good guess for now.
+def PdRCU : RetireControlUnit<128, 4>;
+
+
+//===----------------------------------------------------------------------===//
+// Pipelines
+//===----------------------------------------------------------------------===//
+
+// There are total of two pipelines, each one with it's own scheduler.
+
+//===----------------------------------------------------------------------===//
+// Integer Pipeline Scheduling
+//
+
+// There is one Integer Scheduler per core.
+
+// Integer physical register file has 96 registers of 64-bit.
+def PdIntegerPRF : RegisterFile<96, [GR64, CCR]>;
+
+// Unified Integer, Memory Scheduler has 40 entries.
+def PdEX : ProcResGroup<[PdEX0, PdEX1, PdAGLU01]> {
+  // Up to 4 IPC can be decoded, issued, retired.
+  let BufferSize = 40;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FPU Pipeline Scheduling
+//
+
+// The FPU unit is shared between the two cores.
+
+// FP physical register file has 160 registers of 128-bit.
+// Operations on 256-bit data types are cracked into two COPs.
+def PdFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// Unified FP Scheduler has 64 entries,
+def PdFPU : ProcResGroup<[PdFPU0, PdFPU1, PdFPU2, PdFPU3]> {
+  // Up to 4 IPC can be decoded, issued, retired.
+  let BufferSize = 64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Functional units
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Load-Store Units
+//
+
+// FIXME: does this even make sense?
+
+def PdLoad  : ProcResGroup<[PdAGLU01]> {
+  // For Piledriver, the load queue is 40 entries deep.
+  let BufferSize = 40;
+}
+
+def PdStore : ProcResGroup<[PdAGLU01]> {
+  // For Piledriver, the store queue is 24 entries deep.
+  let BufferSize = 24;
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Execution Units
+//
+
+def PdDiv    : ProcResource<1>; // PdEX0; unpipelined integer division
+def PdCount  : ProcResource<1>; // PdEX0; POPCNT, LZCOUNT
+
+def PdMul    : ProcResource<1>; // PdEX1; integer multiplication
+def PdBranch : ProcResource<1>; // PdEX1; JMP, fused branches
+
+//===----------------------------------------------------------------------===//
+// Floating-Point Units
+//
+
+// Two FMAC/FPFMA units.
+def PdFPFMA  : ProcResource<2>; // PdFPU0, PdFPU1
+
+// One 128-bit integer multiply-accumulate unit.
+def PdFPMMA  : ProcResource<1>; // PdFPU0
+
+// One fp conversion unit.
+def PdFPCVT  : ProcResource<1>; // PdFPU0
+
+// One unit for shuffles, packs, permutes, shifts.
+def PdFPXBR  : ProcResource<1>; // PdFPU1
+
+// Two 128-bit packed integer units.
+def PdFPMAL  : ProcResource<2>; // PdFPU2, PdFPU3
+
+// One FP store unit.
+def PdFPSTO  : ProcResource<1>; // PdFPU3
+
+
+//===----------------------------------------------------------------------===//
+// Basic helper classes.
+//===----------------------------------------------------------------------===//
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass PdWriteRes<SchedWrite SchedRW,
+                      list<ProcResourceKind> ExePorts, int Lat = 1,
+                      list<int> Res = [], int UOps = 1> {
+  def : WriteRes<SchedRW, ExePorts> {
+    let Latency = Lat;
+    let ResourceCycles = Res;
+    let NumMicroOps = UOps;
+  }
+}
+
+multiclass __pdWriteResPair<X86FoldableSchedWrite SchedRW,
+                            list<ProcResourceKind> ExePorts, int Lat,
+                            list<int> Res, int UOps,
+                            int LoadLat, int LoadRes, int LoadUOps> {
+  defm : PdWriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
+
+  defm : PdWriteRes<SchedRW.Folded,
+                    !listconcat([PdLoad], ExePorts),
+                    !add(Lat, LoadLat),
+                    !if(!and(!empty(Res), !eq(LoadRes, 1)),
+                      [],
+                      !listconcat([LoadRes], Res)),
+                    !add(UOps, LoadUOps)>;
+}
+
+multiclass PdWriteResExPair<X86FoldableSchedWrite SchedRW,
+                            list<ProcResourceKind> ExePorts, int Lat = 1,
+                            list<int> Res = [], int UOps = 1,
+                            int LoadUOps = 0> {
+  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                          /*LoadLat*/4, /*LoadRes*/1, LoadUOps>;
+}
+
+multiclass PdWriteResXMMPair<X86FoldableSchedWrite SchedRW,
+                             list<ProcResourceKind> ExePorts, int Lat = 1,
+                             list<int> Res = [], int UOps = 1,
+                             int LoadUOps = 0> {
+  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           /*LoadLat*/5, /*LoadRes*/1, LoadUOps>;
+}
+
+multiclass PdWriteResYMMPair<X86FoldableSchedWrite SchedRW,
+                             list<ProcResourceKind> ExePorts, int Lat,
+                             list<int> Res, int UOps = 2,
+                             int LoadUOps = 0> {
+  defm : __pdWriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
+                           /*LoadLat*/5, /*LoadRes*/2, LoadUOps>;
+}
+
+//===----------------------------------------------------------------------===//
+// Here be dragons.
+//===----------------------------------------------------------------------===//
+
+// L1 data cache has a 4-cycle load-to-use latency, so ReadAfterLd registers
+// needn't be available until 4 cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available
+// until 5 cycles after the memory operand.
+def : ReadAdvance<ReadAfterVecLd, 5>;
+def : ReadAdvance<ReadAfterVecXLd, 5>;
+def : ReadAdvance<ReadAfterVecYLd, 5>;
+
+// A folded store needs a cycle on the PdStore for the store data.
+def : WriteRes<WriteRMW, [PdStore]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Loads, stores, and moves, not folded with other operations.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteLoad,    [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteStore,   [PdStore]>;
+def : WriteRes<WriteStoreNT, [PdStore]>;
+def : WriteRes<WriteMove,    [PdEX01]>;
+
+// Load/store MXCSR.
+// FIXME: These are copy and pasted from WriteLoad/Store.
+def : WriteRes<WriteLDMXCSR, [PdLoad]> { let Latency = 5; }
+def : WriteRes<WriteSTMXCSR, [PdStore]> { let NumMicroOps = 2; }
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteZero, [/*No ExePorts*/]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteJump,  [PdEX1, PdBranch]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Special case scheduling classes.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteSystem,     [PdEX01]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [PdEX01]> { let Latency = 100; }
+def : WriteRes<WriteFence,      [PdStore]>;
+
+def PdWriteXLAT : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+}
+def : InstRW<[PdWriteXLAT], (instrs XLAT)>;
+
+def PdWriteLARrr : SchedWriteRes<[PdEX01]> {
+  let Latency = 184;
+  let NumMicroOps = 45;
+}
+def : InstRW<[PdWriteLARrr], (instregex "LAR(16|32|64)rr",
+                                        "LSL(16|32|64)rr")>;
+
+// Nops don't have dependencies, so there's no actual latency, but we set this
+// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
+def : WriteRes<WriteNop, [PdEX01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteALU,     [PdEX01]>;
+
+def PdWriteLXADD : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteLXADD], (instrs LXADD8, LXADD16, LXADD32, LXADD64)>;
+
+def PdWriteBMI1 : SchedWriteRes<[PdEX01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBMI1],
+             (instrs BLCFILL32rr, BLCFILL64rr, BLCI32rr, BLCI64rr,
+                     BLCIC32rr, BLCIC64rr, BLCMSK32rr, BLCMSK64rr,
+                     BLCS32rr, BLCS64rr, BLSFILL32rr, BLSFILL64rr,
+                     BLSIC32rr, BLSIC64rr, T1MSKC32rr, T1MSKC64rr,
+                     TZMSK32rr, TZMSK64rr)>;
+
+def PdWriteBMI1m : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteBMI1m],
+             (instrs BLCFILL32rm, BLCFILL64rm, BLCI32rm, BLCI64rm,
+                     BLCIC32rm, BLCIC64rm, BLCMSK32rm, BLCMSK64rm,
+                     BLCS32rm, BLCS64rm, BLSFILL32rm, BLSFILL64rm,
+                     BLSIC32rm, BLSIC64rm, T1MSKC32rm, T1MSKC64rm,
+                     TZMSK32rm, TZMSK64rm)>;
+
+defm : PdWriteResExPair<WriteADC,    [PdEX01],                  1,  [2]>;
+
+defm : PdWriteRes<WriteBSWAP32,      [PdEX1]>;
+defm : PdWriteRes<WriteBSWAP64,      [PdEX1]>;
+defm : PdWriteRes<WriteCMPXCHG,      [PdEX1],                   3,  [],       5>;
+defm : PdWriteRes<WriteCMPXCHGRMW,   [PdEX1, PdStore, PdLoad],  3,  [], 2>;
+defm : PdWriteRes<WriteXCHG,         [PdEX1],                   1,  [],       2>;
+
+def PdWriteCMPXCHG8rr : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def : InstRW<[PdWriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
+
+def PdWriteCMPXCHG8rm : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 5;
+}
+def : InstRW<[PdWriteCMPXCHG8rm], (instrs CMPXCHG8rm)>;
+
+def PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteCMPXCHG16rm_CMPXCHG32rm_CMPXCHG64rm],
+             (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>;
+
+def PdWriteCMPXCHG8B : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 18;
+}
+def : InstRW<[PdWriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def PdWriteCMPXCHG16B : SchedWriteRes<[PdEX1]> {
+  let Latency = 3;
+  let NumMicroOps = 22;
+}
+def : InstRW<[PdWriteCMPXCHG16B], (instrs CMPXCHG16B)>;
+
+def PdWriteXCHG16rr : SchedWriteRes<[PdEX1]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteXCHG16rr], (instrs XCHG16rr)>;
+
+def PdWriteXADD : SchedWriteRes<[PdEX1]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteXADD], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr)>;
+
+def PdWriteXADDm : SchedWriteRes<[PdEX1]> {
+let Latency = 6;
+let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteXADDm], (instrs XADD8rm, XADD16rm, XADD32rm, XADD64rm)>;
+
+defm : PdWriteResExPair<WriteIMul8,     [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul16,    [PdEX1, PdMul],          4,  [],    2>;
+defm : PdWriteResExPair<WriteIMul16Imm, [PdEX1, PdMul],          5,  [],    2>;
+defm : PdWriteResExPair<WriteIMul16Reg, [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul32,    [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul32Imm, [PdEX1, PdMul],          4,  [],    1, 1>;
+defm : PdWriteResExPair<WriteIMul32Reg, [PdEX1, PdMul],          4>;
+defm : PdWriteResExPair<WriteIMul64,    [PdEX1, PdMul],          6,  [1, 4]>;
+defm : PdWriteResExPair<WriteIMul64Imm, [PdEX1, PdMul],          6,  [1, 4],1, 1>;
+defm : PdWriteResExPair<WriteIMul64Reg, [PdEX1, PdMul],          6,  [1, 4]>;
+defm : X86WriteResUnsupported<WriteIMulH>; // BMI2 MULX
+
+defm : PdWriteResExPair<WriteDiv8,    [PdEX1, PdDiv],           12,  [1, 12]>;
+defm : PdWriteResExPair<WriteDiv16,   [PdEX1, PdDiv],           15,  [1, 15],   2>;
+defm : PdWriteResExPair<WriteDiv32,   [PdEX1, PdDiv],           14,  [1, 14],   2>;
+defm : PdWriteResExPair<WriteDiv64,   [PdEX1, PdDiv],           14,  [1, 14],   2>;
+
+defm : PdWriteResExPair<WriteIDiv8,   [PdEX1, PdDiv],           12,  [1, 12]>;
+defm : PdWriteResExPair<WriteIDiv16,  [PdEX1, PdDiv],           15,  [1, 17],   2>;
+defm : PdWriteResExPair<WriteIDiv32,  [PdEX1, PdDiv],           14,  [1, 25],   2>;
+defm : PdWriteResExPair<WriteIDiv64,  [PdEX1, PdDiv],           14,  [1, 14],   2>;
+
+defm : PdWriteResExPair<WriteCRC32,   [PdEX01],                  3,  [4],       3>;
+
+def PdWriteCRC32r32r16 : SchedWriteRes<[PdEX01]> {
+  let Latency = 5;
+  let ResourceCycles = [4];
+  let NumMicroOps = 5;
+}
+def : InstRW<[PdWriteCRC32r32r16], (instrs CRC32r32r16)>;
+
+def PdWriteCRC32r32r32 : SchedWriteRes<[PdEX01]> {
+  let Latency = 6;
+  let ResourceCycles = [4];
+  let NumMicroOps = 7;
+}
+def : InstRW<[PdWriteCRC32r32r32], (instrs CRC32r32r32)>;
+
+def PdWriteCRC32r64r64 : SchedWriteRes<[PdEX01]> {
+  let Latency = 10;
+  let ResourceCycles = [4];
+  let NumMicroOps = 11;
+}
+def : InstRW<[PdWriteCRC32r64r64], (instrs CRC32r64r64)>;
+
+defm : PdWriteResExPair<WriteCMOV,    [PdEX01]>; // Conditional move.
+defm : PdWriteResExPair<WriteCMOV2,   [PdEX01], 1, [], 1, 1>; // Conditional (CF + ZF flag) move.
+
+def : InstRW<[WriteCMOV2.Folded], (instrs CMOVG16rm, CMOVG32rm, CMOVG64rm,
+                                          CMOVGE16rm, CMOVGE32rm, CMOVGE64rm,
+                                          CMOVL16rm, CMOVL32rm, CMOVL64rm,
+                                          CMOVLE16rm, CMOVLE32rm, CMOVLE64rm)>;
+
+defm : PdWriteRes<WriteFCMOV,        [PdFPU0, PdFPFMA]>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC,           [PdEX01]>; // Setcc.
+def : WriteRes<WriteSETCCStore,      [PdEX01, PdStore]>;
+
+def PdWriteSETGEmSETGmSETLEmSETLm : SchedWriteRes<[PdEX01]> {
+  let ResourceCycles = [2];
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteSETGEmSETGmSETLEmSETLm], (instrs SETGEm, SETGm,
+                                                      SETLEm, SETLm)>;
+
+defm : PdWriteRes<WriteLAHFSAHF,      [PdEX01],          2,  [],     2>;
+
+def WriteLAHF : SchedWriteRes<[PdEX01]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+def : InstRW<[WriteLAHF], (instrs LAHF)>;
+
+def WriteSAHF : SchedWriteRes<[PdEX01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteSAHF], (instrs SAHF)>;
+
+defm : PdWriteRes<WriteBitTest,          [PdEX01],         1, [1],     1>;
+defm : PdWriteRes<WriteBitTestImmLd,     [PdEX01, PdLoad], 5, [1, 1],  1>;
+defm : PdWriteRes<WriteBitTestRegLd,     [PdEX01, PdLoad], 5, [1, 1],  7>;
+defm : PdWriteRes<WriteBitTestSet,       [PdEX01],         2, [1],     2>;
+defm : PdWriteRes<WriteBitTestSetImmLd,  [PdEX01, PdLoad], 6, [1, 1],  4>;
+defm : PdWriteRes<WriteBitTestSetImmRMW, [PdEX01, PdLoad], 6, [1, 1],  4>;
+defm : PdWriteRes<WriteBitTestSetRegLd,  [PdEX01, PdLoad], 6, [1, 1], 10>;
+defm : PdWriteRes<WriteBitTestSetRegRMW, [PdEX01, PdLoad], 6, [1, 1], 10>;
+
+// This is for simple LEAs with one or two input operands.
+// FIXME: SAGU 3-operand LEA
+def : WriteRes<WriteLEA,              [PdEX01]> { let NumMicroOps = 2; }
+
+// Bit counts.
+defm : PdWriteResExPair<WriteBSF,     [PdEX01],          3,  [4],     6, 2>;
+defm : PdWriteResExPair<WriteBSR,     [PdEX01],          4,  [4],     7, 2>;
+defm : PdWriteResExPair<WritePOPCNT,  [PdEX01],          4>;
+defm : PdWriteResExPair<WriteLZCNT,   [PdEX01],          2,  [],      2>;
+defm : PdWriteResExPair<WriteTZCNT,   [PdEX01],          2,  [2],     2>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : PdWriteResExPair<WriteBEXTR,   [PdEX01],          2,  [],     2>;
+defm : PdWriteResExPair<WriteBLS,     [PdEX01],          2,  [],     2>;
+defm : PdWriteResExPair<WriteBZHI,    [PdEX01]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Integer shifts and rotates.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResExPair<WriteShift,    [PdEX01]>;
+defm : PdWriteResExPair<WriteShiftCL,  [PdEX01]>;
+defm : PdWriteResExPair<WriteRotate,   [PdEX01]>;
+defm : PdWriteResExPair<WriteRotateCL, [PdEX01]>;
+
+def PdWriteRCL8rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 12;
+  let NumMicroOps = 26;
+}
+def : InstRW<[PdWriteRCL8rCL], (instrs RCL8rCL)>;
+
+def PdWriteRCR8ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 12;
+  let NumMicroOps = 23;
+}
+def : InstRW<[PdWriteRCR8ri], (instrs RCR8ri)>;
+
+def PdWriteRCR8rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 11;
+  let NumMicroOps = 24;
+}
+def : InstRW<[PdWriteRCR8rCL], (instrs RCR8rCL)>;
+
+def PdWriteRCL16rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 10;
+  let NumMicroOps = 22;
+}
+def : InstRW<[PdWriteRCL16rCL], (instrs RCL16rCL)>;
+
+def PdWriteRCR16ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 10;
+  let NumMicroOps = 19;
+}
+def : InstRW<[PdWriteRCR16ri], (instrs RCR16ri)>;
+
+def PdWriteRCL32rCLRCL64rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 17;
+}
+def : InstRW<[PdWriteRCL32rCLRCL64rCL], (instrs RCL32rCL, RCL64rCL)>;
+
+def PdWriteRCR64rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCR64rCL], (instrs RCR64rCL)>;
+
+def PdWriteRCR32rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCR32rCL ], (instrs RCR32rCL)>;
+
+def PdWriteRCR32riRCR64ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 7;
+  let NumMicroOps = 15;
+}
+def : InstRW<[PdWriteRCR32riRCR64ri], (instrs RCR32ri, RCR64ri)>;
+
+
+def PdWriteRCR16rCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 9;
+  let NumMicroOps = 20;
+}
+def : InstRW<[PdWriteRCR16rCL], (instrs RCR16rCL)>;
+
+def PdWriteRCL16ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 11;
+  let NumMicroOps = 21;
+}
+def : InstRW<[PdWriteRCL16ri], (instrs RCL16ri)>;
+
+def PdWriteRCL3264ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 8;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteRCL3264ri], (instrs RCL32ri, RCL64ri)>;
+
+def PdWriteRCL8ri : SchedWriteRes<[PdEX01]> {
+  let Latency = 13;
+  let NumMicroOps = 25;
+}
+def : InstRW<[PdWriteRCL8ri], (instrs RCL8ri)>;
+
+// SHLD/SHRD.
+defm : PdWriteRes<WriteSHDrri,       [PdEX01],         4, [6], 6>;
+defm : PdWriteRes<WriteSHDrrcl,      [PdEX01],         4, [8], 7>;
+
+def PdWriteSHLD32rri8SHRD16rri8 : SchedWriteRes<[PdEX01]> {
+  let Latency = 3;
+  let ResourceCycles = [6];
+  let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteSHLD32rri8SHRD16rri8 ], (instrs SHLD32rri8, SHRD16rri8)>;
+
+def PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL : SchedWriteRes<[PdEX01]> {
+  let Latency = 4;
+  let ResourceCycles = [8];
+  let NumMicroOps = 7;
+}
+def : InstRW<[PdWriteSHLD16rrCLSHLD32rrCLSHRD32rrCL], (instrs SHLD16rrCL,
+                                                              SHLD32rrCL,
+                                                              SHRD32rrCL)>;
+
+defm : PdWriteRes<WriteSHDmri,       [PdLoad, PdEX01], 4, [1, 22], 8>;
+defm : PdWriteRes<WriteSHDmrcl,      [PdLoad, PdEX01], 4, [1, 22], 8>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Floating point. This covers both scalar and vector operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteFLD0,               [PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFLD1,               [PdFPU1, PdFPSTO], 3>;
+defm : PdWriteRes<WriteFLDC,               [PdFPU1, PdFPSTO], 3>;
+
+defm : PdWriteRes<WriteFLoad,              [PdLoad, PdFPU01, PdFPFMA], 5>;
+defm : PdWriteRes<WriteFLoadX,             [PdLoad, PdFPU01, PdFPFMA], 5>;
+defm : PdWriteRes<WriteFLoadY,             [PdLoad, PdFPU01, PdFPFMA], 5, [], 2>;
+
+defm : PdWriteRes<WriteFMaskedLoad,        [PdLoad, PdFPU01, PdFPFMA], 6, [1, 1, 2]>;
+defm : PdWriteRes<WriteFMaskedLoadY,       [PdLoad, PdFPU01, PdFPFMA], 6, [2, 2, 4], 2>;
+
+defm : PdWriteRes<WriteFStore,             [PdStore, PdFPU1,  PdFPSTO], 2>;
+defm : PdWriteRes<WriteFStoreX,            [PdStore, PdFPU1,  PdFPSTO]>;
+defm : PdWriteRes<WriteFStoreY,            [PdStore, PdFPU1,  PdFPSTO], 1, [], 4>;
+
+def PdWriteMOVHPm : SchedWriteRes<[PdStore, PdFPU1,  PdFPSTO]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMOVHPm], (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
+
+def PdWriteVMOVUPDYmrVMOVUPSYmr : SchedWriteRes<[PdStore, PdFPU1,  PdFPSTO]> {
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVMOVUPDYmrVMOVUPSYmr], (instrs VMOVUPDYmr, VMOVUPSYmr)>;
+
+defm : PdWriteRes<WriteFStoreNT,           [PdStore, PdFPU1,  PdFPSTO], 3>;
+defm : PdWriteRes<WriteFStoreNTX,          [PdStore, PdFPU1,  PdFPSTO], 3>;
+defm : PdWriteRes<WriteFStoreNTY,          [PdStore, PdFPU1,  PdFPSTO], 3, [2, 2, 2], 4>;
+
+defm : PdWriteRes<WriteFMaskedStore,       [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 4], 18>;
+defm : PdWriteRes<WriteFMaskedStoreY,      [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 4], 34>;
+
+defm : PdWriteRes<WriteFMove,              [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveX,             [PdFPU01, PdFPFMA]>;
+defm : PdWriteRes<WriteFMoveY,             [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+
+defm : PdWriteRes<WriteEMMS,               [PdFPU01, PdFPFMA], 2>;
+
+defm : PdWriteResXMMPair<WriteFAdd,         [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFAddX,        [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFAddY,        [PdFPU0, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+
+defm : PdWriteResXMMPair<WriteFAdd64,       [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFAdd64X,      [PdFPU0, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFAdd64Y,      [PdFPU0, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+
+defm : PdWriteResXMMPair<WriteFCmp,         [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResXMMPair<WriteFCmpX,        [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFCmpY,        [PdFPU0, PdFPFMA],  2, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+
+defm : PdWriteResXMMPair<WriteFCmp64,       [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResXMMPair<WriteFCmp64X,      [PdFPU0, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFCmp64Y,      [PdFPU0, PdFPFMA],  2, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+
+defm : PdWriteResXMMPair<WriteFCom,         [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+
+def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
+  let Latency = 6;
+}
+def : InstRW<[PdWriteFCOMPm], (instrs FCOM32m, FCOM64m, FCOMP32m, FCOMP64m)>;
+
+def PdWriteTST_F_UCOM_FPPr : SchedWriteRes<[PdFPU1, PdFPFMA]>;
+def : InstRW<[PdWriteTST_F_UCOM_FPPr], (instrs TST_F, UCOM_FPPr)>;
+
+defm : PdWriteResXMMPair<WriteFMul,         [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFMulX,        [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFMulY,        [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+
+defm : PdWriteResXMMPair<WriteFMul64,       [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFMul64X,      [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFMul64Y,      [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+
+defm : PdWriteResXMMPair<WriteFMA,          [PdFPU, PdFPFMA], 5>;
+defm : PdWriteResXMMPair<WriteFMAX,         [PdFPU, PdFPFMA], 5>;
+defm : PdWriteResYMMPair<WriteFMAY,         [PdFPU, PdFPFMA], 5,   [1, 1]>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+
+
+defm : PdWriteResXMMPair<WriteDPPD,         [PdFPU1, PdFPFMA], 15, [1, 3],  15, 2>;
+
+defm : PdWriteResXMMPair<WriteDPPS,         [PdFPU1, PdFPFMA], 25, [1, 3],  16, 2>;
+defm : PdWriteResYMMPair<WriteDPPSY,        [PdFPU1, PdFPFMA], 27, [2, 6], /*or 29*/ 25, 4>;
+defm : X86WriteResPairUnsupported<WriteDPPSZ>;
+
+def PdWriteVDPPSrri : SchedWriteRes<[PdFPU1, PdFPFMA]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 3];
+  let NumMicroOps = 17;
+}
+def : InstRW<[PdWriteVDPPSrri], (instrs VDPPSrri)>;
+
+defm : PdWriteResXMMPair<WriteFRcp,         [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFRcpX,        [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFRcpY,        [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+
+defm : PdWriteResXMMPair<WriteFRsqrt,       [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResXMMPair<WriteFRsqrtX,      [PdFPU1, PdFPFMA],  5>;
+defm : PdWriteResYMMPair<WriteFRsqrtY,      [PdFPU1, PdFPFMA],  5, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+
+defm : PdWriteResXMMPair<WriteFDiv,         [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResXMMPair<WriteFDivX,        [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResYMMPair<WriteFDivY,        [PdFPU1, PdFPFMA], 9, [2, 38]>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+
+defm : PdWriteResXMMPair<WriteFDiv64,       [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResXMMPair<WriteFDiv64X,      [PdFPU1, PdFPFMA], 9, [1, 19]>;
+defm : PdWriteResYMMPair<WriteFDiv64Y,      [PdFPU1, PdFPFMA], 9, [2, 38]>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+
+defm : PdWriteResXMMPair<WriteFSqrt,        [PdFPU1, PdFPFMA], 9, [1, 21]>;
+defm : PdWriteResXMMPair<WriteFSqrtX,       [PdFPU1, PdFPFMA], 9, [1, 21]>;
+defm : PdWriteResYMMPair<WriteFSqrtY,       [PdFPU1, PdFPFMA], 9, [2, 42]>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+
+defm : PdWriteResXMMPair<WriteFSqrt64,      [PdFPU1, PdFPFMA], 9, [1, 27]>;
+defm : PdWriteResXMMPair<WriteFSqrt64X,     [PdFPU1, PdFPFMA], 9, [1, 27]>;
+defm : PdWriteResYMMPair<WriteFSqrt64Y,     [PdFPU1, PdFPFMA], 9, [2, 54]>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+
+defm : PdWriteResXMMPair<WriteFSqrt80,      [PdFPU1, PdFPFMA],  1, [1, 35]>;
+defm : PdWriteResXMMPair<WriteFSign,        [PdFPU1, PdFPFMA]>;
+
+defm : PdWriteResXMMPair<WriteFRnd,         [PdFPU1, PdFPSTO],  4>;
+defm : PdWriteResYMMPair<WriteFRndY,        [PdFPU1, PdFPSTO],  4, [2, 1], 2>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+
+def PdWriteVFRCZ : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZ], (instrs VFRCZPDrr, VFRCZPSrr,
+                                     VFRCZSDrr, VFRCZSSrr)>;
+
+def PdWriteVFRCZm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 15;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVFRCZm], (instrs VFRCZPDrm, VFRCZPSrm,
+                                      VFRCZSDrm, VFRCZSSrm)>;
+
+def PdWriteVFRCZY : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 10;
+  let ResourceCycles = [2, 1];
+  let NumMicroOps = 4;
+}
+def : InstRW<[PdWriteVFRCZY], (instrs VFRCZPSYrr, VFRCZPDYrr)>;
+
+def PdWriteVFRCZYm : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 15;
+  let ResourceCycles = [2, 1];
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVFRCZYm], (instrs VFRCZPSYrm, VFRCZPDYrm)>;
+
+defm : PdWriteResXMMPair<WriteFLogic,       [PdFPU01, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFLogicY,      [PdFPU01, PdFPFMA],  2, [2, 2]>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+
+defm : PdWriteResXMMPair<WriteFTest,        [PdFPU0, PdFPFMA, PdEX0],  1, [], 2>;
+defm : PdWriteResYMMPair<WriteFTestY,       [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+
+defm : PdWriteResXMMPair<WriteFShuffle,     [PdFPU01, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFShuffleY,    [PdFPU01, PdFPFMA],  2, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+
+def PdWriteVBROADCASTF128 : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVBROADCASTF128], (instrs VBROADCASTF128)>;
+
+defm : PdWriteResXMMPair<WriteFVarShuffle,  [PdFPU01, PdFPFMA],  3, [1, 4]>;
+defm : PdWriteResYMMPair<WriteFVarShuffleY, [PdFPU01, PdFPFMA],  3, [2, 6], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteFBlend,       [PdFPU01, PdFPFMA],  2>;
+defm : PdWriteResYMMPair<WriteFBlendY,      [PdFPU01, PdFPFMA],  2, [2, 2], 2>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+
+defm : PdWriteResXMMPair<WriteFVarBlend,    [PdFPU01, PdFPFMA],  2, [1, 4]>;
+defm : PdWriteResYMMPair<WriteFVarBlendY,   [PdFPU01, PdFPFMA],  2, [2, 6], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+
+defm : PdWriteResXMMPair<WriteFShuffle256,  [PdFPU01, PdFPFMA],  2, [], 2>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;
+
+def PdWriteVEXTRACTF128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 2;
+}
+def : InstRW<[PdWriteVEXTRACTF128rr], (instrs VEXTRACTF128rr)>;
+
+def PdWriteVEXTRACTF128mr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 7;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVEXTRACTF128mr], (instrs VEXTRACTF128mr)>;
+
+def PdWriteVPERM2F128rr : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 4;
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVPERM2F128rr], (instrs VPERM2F128rr)>;
+
+def PdWriteVPERM2F128rm : SchedWriteRes<[PdFPU01, PdFPFMA]> {
+  let Latency = 8; // 4 + 4
+  let NumMicroOps = 10;
+}
+def : InstRW<[PdWriteVPERM2F128rm], (instrs VPERM2F128rm)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteCvtSS2I,   [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+
+defm : PdWriteResXMMPair<WriteCvtPS2I,   [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtPS2IY,  [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSD2I,   [PdFPU1, PdFPSTO, PdFPFMA, PdEX0], 13, [], 2>;
+
+defm : PdWriteResXMMPair<WriteCvtPD2I,   [PdFPU1, PdFPSTO],          8, [],        2>;
+defm : PdWriteResYMMPair<WriteCvtPD2IY,  [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+
+def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>;
+
+// FIXME: f+3 ST, LD+STC latency
+defm : PdWriteResXMMPair<WriteCvtI2SS,   [PdFPU1, PdFPSTO], 4, [], 2>;
+// FIXME: .Folded version is one NumMicroOp *less*..
+
+defm : PdWriteResXMMPair<WriteCvtI2PS,   [PdFPU1, PdFPSTO], 4>;
+defm : PdWriteResYMMPair<WriteCvtI2PSY,  [PdFPU1, PdFPSTO], 4, [2, 1]>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+
+defm : PdWriteResXMMPair<WriteCvtI2SD,   [PdFPU1, PdFPSTO], 4, [], 2>;
+// FIXME: .Folded version is one NumMicroOp *less*..
+
+def WriteCVTSI642SDrr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 13;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteCVTSI642SDrr], (instrs CVTSI642SDrr, CVTSI642SSrr)>;
+
+defm : PdWriteResXMMPair<WriteCvtI2PD,   [PdFPU1, PdFPSTO], 8, [],     2>;
+defm : PdWriteResYMMPair<WriteCvtI2PDY,  [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSS2SD,  [PdFPU1, PdFPSTO], 4>;
+
+defm : PdWriteResXMMPair<WriteCvtPS2PD,  [PdFPU1, PdFPSTO], 8, [],     2>;
+defm : PdWriteResYMMPair<WriteCvtPS2PDY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 1>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;
+
+defm : PdWriteResXMMPair<WriteCvtSD2SS,  [PdFPU1, PdFPSTO], 4>;
+
+defm : PdWriteResXMMPair<WriteCvtPD2PS,  [PdFPU1, PdFPSTO],          8, [],        2>;
+defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;
+
+def WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr,
+                                                            MMX_CVTPI2PDirr)>;
+
+def WriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU1, PdFPSTO]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def : InstRW<[WriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>;
+
+defm : PdWriteResXMMPair<WriteCvtPH2PS,  [PdFPU1, PdFPSTO], 8, [],     2, 1>;
+defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU1, PdFPSTO], 8, [2, 1], 4, 3>;
+defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;
+
+defm : PdWriteRes<WriteCvtPS2PH,        [PdFPU1, PdFPSTO],          8, [],        2>;
+defm : PdWriteRes<WriteCvtPS2PHY,       [PdFPU1, PdFPSTO, PdFPFMA], 8, [2, 1, 1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+
+defm : PdWriteRes<WriteCvtPS2PHSt,      [PdFPU1, PdFPSTO, PdStore],          4, [],           3>;
+defm : PdWriteRes<WriteCvtPS2PHYSt,     [PdFPU1, PdFPSTO, PdFPFMA, PdStore], 4, [2, 1, 1, 1], 4>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector integer operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteVecLoad,             [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadX,            [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadY,            [PdLoad, PdFPU01, PdFPMAL], 5, [], 2>;
+
+defm : PdWriteRes<WriteVecLoadNT,           [PdLoad, PdFPU01, PdFPMAL], 5>;
+defm : PdWriteRes<WriteVecLoadNTY,          [PdLoad, PdFPU01, PdFPMAL], 5>;
+
+defm : PdWriteRes<WriteVecMaskedLoad,       [PdLoad, PdFPU01, PdFPMAL], 6, [1, 1, 2]>;
+defm : PdWriteRes<WriteVecMaskedLoadY,      [PdLoad, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+
+defm : PdWriteRes<WriteVecStore,            [PdStore, PdFPU1,   PdFPSTO], 2>;
+defm : PdWriteRes<WriteVecStoreX,           [PdStore, PdFPU1,   PdFPSTO]>;
+defm : PdWriteRes<WriteVecStoreY,           [PdStore, PdFPU1,   PdFPSTO], 1, [], 4>;
+
+def PdWriteVMOVDQUYmr : SchedWriteRes<[PdStore, PdFPU1,   PdFPSTO]> {
+  let NumMicroOps = 8;
+}
+def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
+
+defm : PdWriteRes<WriteVecStoreNT,          [PdStore, PdFPU1,   PdFPSTO], 2>;
+defm : PdWriteRes<WriteVecStoreNTY,         [PdStore, PdFPU1,   PdFPSTO], 2, [2, 2, 2], 4>;
+
+defm : PdWriteRes<WriteVecMaskedStore,      [PdStore, PdFPU01, PdFPMAL], 6, [1, 1, 4]>;
+defm : PdWriteRes<WriteVecMaskedStoreY,     [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+
+defm : PdWriteRes<WriteVecMove,             [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveX,            [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteRes<WriteVecMoveY,            [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
+
+defm : PdWriteRes<WriteVecMoveToGpr,        [PdFPU0, PdFPFMA, PdEX0], 10>;
+defm : PdWriteRes<WriteVecMoveFromGpr,      [PdFPU01, PdFPFMA], 10, [], 2>;
+
+defm : PdWriteResXMMPair<WriteVecALU,        [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecALUX,       [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecALUY>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+
+defm : PdWriteResXMMPair<WriteVecShift,      [PdFPU01, PdFPMAL], 3>;
+defm : PdWriteResXMMPair<WriteVecShiftX,     [PdFPU01, PdFPMAL], 3>;
+defm : X86WriteResPairUnsupported<WriteVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+
+defm : PdWriteResXMMPair<WriteVecShiftImm,   [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecShiftImmX,  [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+
+defm : PdWriteResXMMPair<WriteVecIMul,       [PdFPU0, PdFPMMA], 4>;
+defm : PdWriteResXMMPair<WriteVecIMulX,      [PdFPU0, PdFPMMA], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulY>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+
+defm : PdWriteResXMMPair<WritePMULLD,        [PdFPU0, PdFPU01, PdFPMMA, PdFPMAL], 5, [2, 1, 2, 1]>;
+defm : X86WriteResPairUnsupported<WritePMULLDY>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+
+def JWriteVPMACS : SchedWriteRes<[PdFPU0, PdFPU01, PdFPMMA, PdFPMAL]> {
+  let Latency = 4;
+  let ResourceCycles = [2, 1, 2, 1];
+}
+def : InstRW<[JWriteVPMACS], (instrs VPMACSDQHrr, VPMACSDQLrr, VPMACSSDQHrr,
+                                     VPMACSSDQLrr)>;
+
+defm : PdWriteResXMMPair<WriteMPSAD,         [PdFPU0, PdFPMMA], 9, [1, 2], 9>;
+defm : X86WriteResPairUnsupported<WriteMPSADY>;
+defm : X86WriteResPairUnsupported<WriteMPSADZ>;
+
+defm : PdWriteResXMMPair<WritePSADBW,        [PdFPU01, PdFPMAL], 4, [], 2>;
+defm : PdWriteResXMMPair<WritePSADBWX,       [PdFPU01, PdFPMAL], 4, [], 2>;
+defm : X86WriteResPairUnsupported<WritePSADBWY>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+
+defm : PdWriteResXMMPair<WritePHMINPOS,      [PdFPU0,  PdFPMAL], 4, [], 2>;
+
+defm : PdWriteResXMMPair<WriteShuffle,       [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteShuffleX,      [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResYMMPair<WriteShuffleY,      [PdFPU01, PdFPMAL], 2,   [1, 1]>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteVarShuffle,    [PdFPU01, PdFPMAL], 3, [1, 4]>;
+defm : PdWriteResXMMPair<WriteVarShuffleX,   [PdFPU01, PdFPMAL], 3, [1, 4]>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+
+defm : PdWriteResXMMPair<WriteBlend,         [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteBlendY>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+
+defm : PdWriteResXMMPair<WriteVarBlend,      [PdFPU01, PdFPMAL], 2, [1, 4]>;
+defm : X86WriteResPairUnsupported<WriteVarBlendY>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+
+defm : PdWriteResXMMPair<WriteVecLogic,      [PdFPU01, PdFPMAL], 2>;
+defm : PdWriteResXMMPair<WriteVecLogicX,     [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WriteVecLogicY>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+
+defm : PdWriteResXMMPair<WriteVecTest,       [PdFPU0, PdFPFMA, PdEX0],  1, [], 2>;
+defm : PdWriteResYMMPair<WriteVecTestY,      [PdFPU01, PdFPFMA, PdEX0], 1, [2, 2, 1], 4, 2>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+
+defm : PdWriteResXMMPair<WriteShuffle256,    [PdFPU01, PdFPMAL]>;
+defm : PdWriteResXMMPair<WriteVarShuffle256, [PdFPU01, PdFPMAL]>;
+
+defm : PdWriteResXMMPair<WriteVarVecShift,   [PdFPU01, PdFPMAL], 3>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Vector insert/extract operations.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteVecInsert,    [PdFPU01, PdFPMAL], 2, [], 2>;
+defm : PdWriteRes<WriteVecInsertLd,  [PdFPU01, PdFPMAL, PdLoad], 6, [], 2>;
+
+defm : PdWriteRes<WriteVecExtract,   [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
+defm : PdWriteRes<WriteVecExtractSt, [PdFPU1, PdFPSTO, PdStore], 13, [], 2>;
+
+def PdWriteEXTRQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+  let Latency = 3;
+}
+def : InstRW<[PdWriteEXTRQ], (instrs EXTRQ, EXTRQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE42 String instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WritePCmpIStrI, [PdFPU1, PdFPFMA, PdEX0], 14, [1, 2, 1], 7, 1>;
+defm : PdWriteResXMMPair<WritePCmpIStrM, [PdFPU1, PdFPFMA, PdEX0],  6, [1, 2, 1], 7, 2>;
+
+defm : PdWriteResXMMPair<WritePCmpEStrI, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 15, [1, 2, 6, 4, 1, 1], 27, 1>;
+defm : PdWriteResXMMPair<WritePCmpEStrM, [PdFPU1, PdStore, PdLoad, PdFPMAL, PdFPFMA, PdEX0], 10, [1, 2, 6, 4, 1, 1], 27, 1>;
+
+////////////////////////////////////////////////////////////////////////////////
+// MOVMSK Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteRes<WriteFMOVMSK, [PdFPU0, PdFPFMA, PdEX0],   10, [], 2>;
+
+defm : PdWriteRes<WriteVecMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 13, [], 2>;
+defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
+// defm : X86WriteResUnsupported<WriteVecMOVMSKZ>;
+
+defm : PdWriteRes<WriteMMXMOVMSK, [PdFPU0, PdFPFMA, PdEX0], 10, [], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AES Instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteAESIMC,    [PdFPU0, PdFPMMA], 5>;
+defm : PdWriteResXMMPair<WriteAESKeyGen, [PdFPU0, PdFPMMA], 5>;
+defm : PdWriteResXMMPair<WriteAESDecEnc, [PdFPU0, PdFPMMA], 9, [], 2>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub  instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteFHAdd,  [PdFPU0, PdFPFMA], 11, [],     3, 1>;
+defm : PdWriteResYMMPair<WriteFHAddY, [PdFPU0, PdFPFMA], 11, [2, 1], 8, 2>;
+defm : X86WriteResPairUnsupported<WriteFHAddZ>;
+
+defm : PdWriteResXMMPair<WritePHAdd,  [PdFPU01, PdFPMAL], 5, [], 3, 1>;
+defm : PdWriteResXMMPair<WritePHAddX, [PdFPU01, PdFPMAL], 2>;
+defm : X86WriteResPairUnsupported<WritePHAddY>;
+defm : X86WriteResPairUnsupported<WritePHAddZ>;
+
+def : InstRW<[WritePHAdd], (instrs PHADDDrr, PHSUBDrr,
+                                   PHADDWrr, PHSUBWrr,
+                                   PHADDSWrr, PHSUBSWrr,
+                                   VPHADDDrr, VPHSUBDrr,
+                                   VPHADDWrr, VPHSUBWrr,
+                                   VPHADDSWrr, VPHSUBSWrr)>;
+
+def : InstRW<[WritePHAdd.Folded], (instrs PHADDDrm, PHSUBDrm,
+                                          PHADDWrm, PHSUBWrm,
+                                          PHADDSWrm, PHSUBSWrm,
+                                          VPHADDDrm, VPHSUBDrm,
+                                          VPHADDWrm, VPHSUBWrm,
+                                          VPHADDSWrm, VPHSUBSWrm)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Carry-less multiplication instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : PdWriteResXMMPair<WriteCLMul, [PdFPU0, PdFPMMA], 12, [], 5, 1>;
+
+def PdWriteVPCLMULQDQrr : SchedWriteRes<[PdFPU0, PdFPMMA]> {
+  let Latency = 13;
+  let NumMicroOps = 6;
+}
+def : InstRW<[PdWriteVPCLMULQDQrr], (instrs VPCLMULQDQrr)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def PdWriteINSERTQ : SchedWriteRes<[PdFPU01, PdFPMAL]> {
+  let Latency = 3;
+  let ResourceCycles = [1, 4];
+}
+def : InstRW<[PdWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def PdWriteVBROADCASTYLd : SchedWriteRes<[PdLoad, PdFPU01, PdFPFMA]> {
+  let Latency = 6;
+  let ResourceCycles = [1, 2, 4];
+  let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm,
+                                                          VBROADCASTSSYrm)>;
+
+def PdWriteVZEROALL : SchedWriteRes<[]> {
+  let Latency = 90;
+  let NumMicroOps = 32;
+}
+def : InstRW<[PdWriteVZEROALL], (instrs VZEROALL)>;
+
+def PdWriteVZEROUPPER : SchedWriteRes<[]> {
+  let Latency = 46;
+  let NumMicroOps = 16;
+}
+def : InstRW<[PdWriteVZEROUPPER], (instrs VZEROUPPER)>;
+
+///////////////////////////////////////////////////////////////////////////////
+//  SchedWriteVariant definitions.
+///////////////////////////////////////////////////////////////////////////////
+
+def PdWriteZeroLatency : SchedWriteRes<[]> {
+  let Latency = 0;
+}
+
+def PdWriteZeroIdiom : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteALU]>
+]>;
+def : InstRW<[PdWriteZeroIdiom], (instrs SUB32rr, SUB64rr,
+                                         XOR32rr, XOR64rr)>;
+
+def PdWriteFZeroIdiom : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteFLogic]>
+]>;
+def : InstRW<[PdWriteFZeroIdiom], (instrs XORPSrr,  VXORPSrr,
+                                          XORPDrr,  VXORPDrr,
+                                          ANDNPSrr, VANDNPSrr,
+                                          ANDNPDrr, VANDNPDrr)>;
+
+// VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr "zero-idioms" have latency of 1.
+
+def PdWriteVZeroIdiomLogic : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogic]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>;
+
+def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecLogicX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomLogicX], (instrs PXORrr,  VPXORrr,
+                                                PANDNrr, VPANDNrr)>;
+
+def PdWriteVZeroIdiomALU : SchedWriteVariant<[
+  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+  SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALU]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr,   MMX_PSUBDirr,
+                                             MMX_PSUBQirr,   MMX_PSUBWirr,
+                                             MMX_PCMPGTBirr,
+                                             MMX_PCMPGTDirr,
+                                             MMX_PCMPGTWirr)>;
+
+def PdWriteVZeroIdiomALUX : SchedWriteVariant<[
+    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>,
+    SchedVar<MCSchedPredicate<TruePred>,           [WriteVecALUX]>
+]>;
+def : InstRW<[PdWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr,
+                                              PSUBDrr, VPSUBDrr,
+                                              PSUBQrr, VPSUBQrr,
+                                              PSUBWrr, VPSUBWrr,
+                                              PCMPGTBrr, VPCMPGTBrr,
+                                              PCMPGTDrr, VPCMPGTDrr,
+                                              PCMPGTWrr, VPCMPGTWrr)>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+// VPCMPGTQ, but not PCMPGTQ!
+
+def : IsZeroIdiomFunction<[
+  // GPR Zero-idioms.
+  DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+  // MMX Zero-idioms.
+  DepBreakingClass<[
+    MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr,
+    MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr,
+    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
+    MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr
+  ], ZeroIdiomPredicate>,
+
+  // SSE Zero-idioms.
+  DepBreakingClass<[
+    // fp variants.
+    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+    // int variants.
+    PXORrr, PANDNrr,
+    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
+    PCMPGTBrr, PCMPGTDrr, PCMPGTWrr
+  ], ZeroIdiomPredicate>,
+
+  // AVX Zero-idioms.
+  DepBreakingClass<[
+    // xmm fp variants.
+    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+    // xmm int variants.
+    VPXORrr, VPANDNrr,
+    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
+    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+    // ymm variants.
+    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr
+  ], ZeroIdiomPredicate>
+]>;
+
+def : IsDepBreakingFunction<[
+  // GPR
+  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+  // MMX
+  DepBreakingClass<[
+    MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr
+  ], ZeroIdiomPredicate>,
+
+  // SSE
+  DepBreakingClass<[
+    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr
+    // But not PCMPEQQrr.
+  ], ZeroIdiomPredicate>,
+
+  // AVX
+  DepBreakingClass<[
+    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr
+    // But not VPCMPEQQrr.
+  ], ZeroIdiomPredicate>
+]>;
+
+
+} // SchedModel
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 2c1a4b6c7f56f78754eeacb95d0e52c4b505187b..33a6b01546d7656597ca4e5e158f4a5f146fc0d5 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -48,12 +48,22 @@ def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM
 // part of it.
 // Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
 // access" - Agner Fog's "microarchitecture.pdf".
-def JIntegerPRF : RegisterFile<64, [GR64, CCR]>;
+def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
+                               0,  // Max moves that can be eliminated per cycle.
+                               1>; // Restrict move elimination to zero regs.
 
 // The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
 // registers. Operations on 256-bit data types are cracked into two COPs.
 // Reference: www.realworldtech.com/jaguar/4/
-def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The PRF in the floating point unit can eliminate a move from a MMX or SSE
+// register that is know to be zero (i.e. it has been zeroed using a zero-idiom
+// dependency breaking instruction, or via VZEROALL).
+// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
+// instructions" - Agner Fog's "microarchitecture.pdf"
+def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
+                          0,  // Max moves that can be eliminated per cycle.
+                          1>; // Restrict move elimination to zero regs.
 
 // The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
 // retire up to two macro-ops per cycle.
@@ -805,4 +815,24 @@ def : IsDepBreakingFunction<[
   ], ZeroIdiomPredicate>
 ]>;
 
+def : IsOptimizableRegisterMove<[
+  InstructionEquivalenceClass<[
+    // GPR variants.
+    MOV32rr, MOV64rr,
+
+    // MMX variants.
+    MMX_MOVQ64rr,
+
+    // SSE variants.
+    MOVAPSrr, MOVUPSrr,
+    MOVAPDrr, MOVUPDrr,
+    MOVDQArr, MOVDQUrr,
+
+    // AVX variants.
+    VMOVAPSrr, VMOVUPSrr,
+    VMOVAPDrr, VMOVUPDrr,
+    VMOVDQArr, VMOVDQUrr
+  ], TruePred >
+]>;
+
 } // SchedModel
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index a71e5d3959581ce762d6f2098edf09e5514c782b..008a9ec2ba3ce35ae6184fe73541fc2b5aaeadf2 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -250,7 +250,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
 
     if (Repeats.BytesLeft() > 0 &&
         DAG.getMachineFunction().getFunction().optForMinSize()) {
-      // When agressively optimizing for size, avoid generating the code to
+      // When aggressively optimizing for size, avoid generating the code to
       // handle BytesLeft.
       Repeats.AVT = MVT::i8;
     }
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index c7ddf93f8e85144064ceb658f674177e8c7feacc..720be8afa62c2c44b7e8e3e5d6985a95e083163c 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -112,11 +112,10 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   return true;
 }
 
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
@@ -125,7 +124,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of vector elements.");
 
@@ -151,12 +150,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
@@ -166,7 +163,7 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) &&
          "Unexpected number of vector elements.");
@@ -189,11 +186,13 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
 }
 
 void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask) {
   Type *MaskTy = C->getType();
   unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
   (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
+  assert((MaskTySize == 128 || MaskTySize == 256) &&
+         Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
   APInt UndefElts;
@@ -201,7 +200,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
   unsigned NumEltsPerLane = 128 / ElSize;
   assert((NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected number of vector elements.");
@@ -242,9 +241,12 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   }
 }
 
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
-  assert(C->getType()->getPrimitiveSizeInBits() == 128 &&
-         "Unexpected vector size.");
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
+  (void)MaskTySize;
+  assert(Width == 128 && Width >= MaskTySize && "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
   APInt UndefElts;
@@ -252,7 +254,7 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / 8;
   assert(NumElts == 16 && "Unexpected number of vector elements.");
 
   for (unsigned i = 0; i != NumElts; ++i) {
@@ -291,12 +293,10 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
   }
 }
 
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -307,7 +307,7 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
@@ -319,12 +319,10 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
   }
 }
 
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask) {
-  Type *MaskTy = C->getType();
-  unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
-  (void)MaskTySize;
-  assert((MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512) &&
+  assert((Width == 128 || Width == 256 || Width == 512) &&
+         C->getType()->getPrimitiveSizeInBits() >= Width &&
          "Unexpected vector size.");
   assert((ElSize == 8 || ElSize == 16 || ElSize == 32 || ElSize == 64) &&
          "Unexpected vector element size.");
@@ -335,7 +333,7 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
-  unsigned NumElts = RawMask.size();
+  unsigned NumElts = Width / ElSize;
 
   for (unsigned i = 0; i != NumElts; ++i) {
     if (UndefElts[i]) {
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index b703cbbd2b29928b11f3ec681f5354cc81010e74..b08c31935d281011e2fb0a5eb9fe886b0c0a96e8 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -26,25 +26,28 @@ class Constant;
 class MVT;
 
 /// Decode a PSHUFB mask from an IR-level vector constant.
-void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodePSHUFBMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP variable mask from an IR-level vector constant.
-void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
                         SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMILP2 variable mask from an IR-level vector constant.
 void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
+                         unsigned Width,
                          SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPPERM variable mask from an IR-level vector constant.
-void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPPERMMask(const Constant *C, unsigned Width,
+                      SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
+void DecodeVPERMVMask(const Constant *C, unsigned ElSize, unsigned Width,
                       SmallVectorImpl<int> &ShuffleMask);
 
 /// Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant.
-void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
+void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize, unsigned Width,
                        SmallVectorImpl<int> &ShuffleMask);
 
 } // llvm namespace
diff --git a/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 14e4c455a0867cfcef7ac08fc126356c976add69..b8cb11fb862eb5402037e7780f7410f3e8c54adc 100644
--- a/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -119,12 +119,6 @@ static cl::opt<bool> HardenIndirectCallsAndJumps(
              "mitigate Spectre v1.2 style attacks."),
     cl::init(true), cl::Hidden);
 
-namespace llvm {
-
-void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
-
-} // end namespace llvm
-
 namespace {
 
 class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index ddee9a692e19e7ab4f8cdd5ccc65420f4f9ca52b..b1103f823e7f97fa19b64accb3d8497deda8be57 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -52,21 +52,15 @@ enum Style {
 
 class X86Subtarget final : public X86GenSubtargetInfo {
 public:
+  // NOTE: Do not add anything new to this list. Coarse, CPU name based flags
+  // are not a good idea. We should be migrating away from these.
   enum X86ProcFamilyEnum {
     Others,
     IntelAtom,
     IntelSLM,
     IntelGLM,
     IntelGLP,
-    IntelTRM,
-    IntelHaswell,
-    IntelBroadwell,
-    IntelSkylake,
-    IntelKNL,
-    IntelSKX,
-    IntelCannonlake,
-    IntelIcelakeClient,
-    IntelIcelakeServer,
+    IntelTRM
   };
 
 protected:
@@ -229,6 +223,9 @@ protected:
   //  PMULUDQ.
   bool IsPMULLDSlow = false;
 
+  /// True if the PMADDWD instruction is slow compared to PMULLD.
+  bool IsPMADDWDSlow = false;
+
   /// True if unaligned memory accesses of 16-bytes are slow.
   bool IsUAMem16Slow = false;
 
@@ -388,6 +385,9 @@ protected:
   /// Processor has a single uop BEXTR implementation.
   bool HasFastBEXTR = false;
 
+  /// Try harder to combine to horizontal vector ops if they are fast.
+  bool HasFastHorizontalOps = false;
+
   /// Use a retpoline thunk rather than indirect calls to block speculative
   /// execution.
   bool UseRetpolineIndirectCalls = false;
@@ -615,6 +615,7 @@ public:
   bool hasPTWRITE() const { return HasPTWRITE; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isPMULLDSlow() const { return IsPMULLDSlow; }
+  bool isPMADDWDSlow() const { return IsPMADDWDSlow; }
   bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
   bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
   int getGatherOverhead() const { return GatherOverhead; }
@@ -636,6 +637,7 @@ public:
   bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasFastBEXTR() const { return HasFastBEXTR; }
+  bool hasFastHorizontalOps() const { return HasFastHorizontalOps; }
   bool hasMacroFusion() const { return HasMacroFusion; }
   bool hasERMSB() const { return HasERMSB; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 812b8b28ebdce189c8b542fc6871ba911cfab82a..6426e5b076cc8c2cc0b2793cccac324fbafdb801 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -59,21 +59,6 @@ static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
                                         "folding pass"),
                                cl::init(true), cl::Hidden);
 
-namespace llvm {
-
-void initializeWinEHStatePassPass(PassRegistry &);
-void initializeFixupLEAPassPass(PassRegistry &);
-void initializeShadowCallStackPass(PassRegistry &);
-void initializeX86CallFrameOptimizationPass(PassRegistry &);
-void initializeX86CmovConverterPassPass(PassRegistry &);
-void initializeX86ExecutionDomainFixPass(PassRegistry &);
-void initializeX86DomainReassignmentPass(PassRegistry &);
-void initializeX86AvoidSFBPassPass(PassRegistry &);
-void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
-void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
-
-} // end namespace llvm
-
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
@@ -295,13 +280,14 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
     }
   }
 
-  // Extract required-vector-width attribute.
+  // Extract min-legal-vector-width attribute.
   unsigned RequiredVectorWidth = UINT32_MAX;
-  if (F.hasFnAttribute("required-vector-width")) {
-    StringRef Val = F.getFnAttribute("required-vector-width").getValueAsString();
+  if (F.hasFnAttribute("min-legal-vector-width")) {
+    StringRef Val =
+        F.getFnAttribute("min-legal-vector-width").getValueAsString();
     unsigned Width;
     if (!Val.getAsInteger(0, Width)) {
-      Key += ",required-vector-width=";
+      Key += ",min-legal-vector-width=";
       Key += Val;
       RequiredVectorWidth = Width;
     }
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 5b21cd82b5b1f613315d07fb6f750ccf0e0fc157..f5b45da0c3dc2225c58cb014faf23892f319fc07 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -53,10 +53,6 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
-
-  bool isMachineVerifierClean() const override {
-    return false;
-  }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 4c14715b758d669dea467fb311062b2f0313d173..ebb8aca5fb146a17f41d1f29c3f103650fc26937 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -290,11 +290,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v64i8,   2 }, // psllw + pand.
     { ISD::SRL,  MVT::v64i8,   2 }, // psrlw + pand.
     { ISD::SRA,  MVT::v64i8,   4 }, // psrlw, pand, pxor, psubb.
-
-    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
-    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
-    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
-    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -308,11 +303,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v2i64,   1 },
     { ISD::SRA,  MVT::v4i64,   1 },
     { ISD::SRA,  MVT::v8i64,   1 },
-
-    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
-    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
-    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
-    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -328,15 +318,6 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SRA,  MVT::v32i8,   4 }, // psrlw, pand, pxor, psubb.
 
     { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.
-
-    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
-    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
-    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
-    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
-    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
-    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
-    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
-    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -354,7 +335,81 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::SHL,  MVT::v32i8,   4+2 }, // 2*(psllw + pand) + split.
     { ISD::SRL,  MVT::v32i8,   4+2 }, // 2*(psrlw + pand) + split.
     { ISD::SRA,  MVT::v32i8,   8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+  };
+
+  // XOP has faster vXi8 shifts.
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2() && !ST->hasXOP()) {
+    if (const auto *Entry =
+            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
 
+  static const CostTblEntry AVX512BWConstCostTable[] = {
+    { ISD::SDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::SREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v64i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::UREM, MVT::v64i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
+    { ISD::SREM, MVT::v32i16,  8 }, // vpmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
+    { ISD::UREM, MVT::v32i16,  8 }, // vpmulhuw+mul+sub sequence
+  };
+
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+      ST->hasBWI()) {
+    if (const auto *Entry =
+            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512ConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
+    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+  };
+
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+      ST->hasAVX512()) {
+    if (const auto *Entry =
+            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX2ConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::SREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v32i8,  14 }, // 2*ext+2*pmulhw sequence
+    { ISD::UREM, MVT::v32i8,  16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::SREM, MVT::v16i16,  8 }, // vpmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::UREM, MVT::v16i16,  8 }, // vpmulhuw+mul+sub sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::SREM, MVT::v8i32,  19 }, // vpmuldq+mul+sub sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+    { ISD::UREM, MVT::v8i32,  19 }, // vpmuludq+mul+sub sequence
+  };
+
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
+      ST->hasAVX2()) {
+    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry SSE2ConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
+    { ISD::SREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+    { ISD::SDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
+    { ISD::SREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
+    { ISD::UDIV, MVT::v32i8,  28+2 }, // 4*ext+4*pmulhw sequence + split.
+    { ISD::UREM, MVT::v32i8,  32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
+    { ISD::UDIV, MVT::v16i8,    14 }, // 2*ext+2*pmulhw sequence
+    { ISD::UREM, MVT::v16i8,    16 }, // 2*ext+2*pmulhw+mul+sub sequence
     { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
     { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
     { ISD::SDIV, MVT::v8i16,     6 }, // pmulhw sequence
@@ -373,7 +428,8 @@ int X86TTIImpl::getArithmeticInstrCost(
     { ISD::UREM, MVT::v4i32,    20 }, // pmuludq+mul+sub sequence
   };
 
-  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
       ST->hasSSE2()) {
     // pmuldq sequence.
     if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
@@ -385,12 +441,8 @@ int X86TTIImpl::getArithmeticInstrCost(
     if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
       return LT.first * 20;
 
-    // XOP has faster vXi8 shifts.
-    if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
-        !ST->hasXOP())
-      if (const auto *Entry =
-              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
-        return LT.first * Entry->Cost;
+    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
+      return LT.first * Entry->Cost;
   }
 
   static const CostTblEntry AVX2UniformCostTable[] = {
@@ -810,6 +862,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
 
+  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
+  if (Kind == TTI::SK_Transpose)
+    Kind = TTI::SK_PermuteTwoSrc;
+
   // For Broadcasts we are splatting the first element from the first input
   // register, so only need to reference that input and all the output
   // registers are the same.
@@ -1208,8 +1264,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i16, 2 },
     { ISD::SINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
     { ISD::SINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
-    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
 
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i1,   4 },
     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i1,  3 },
@@ -1231,9 +1285,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i32,  1 },
     { ISD::UINT_TO_FP,  MVT::v16f32, MVT::v16i32, 1 },
     { ISD::UINT_TO_FP,  MVT::v2f32,  MVT::v2i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i64, 26 },
     { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64,  5 },
-    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 12 },
-    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64, 26 },
+    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64,  5 },
+    { ISD::UINT_TO_FP,  MVT::v8f64,  MVT::v8i64,  5 },
+
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    1 },
 
     { ISD::FP_TO_UINT,  MVT::v2i32,  MVT::v2f32,  1 },
     { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f32,  1 },
@@ -1329,13 +1386,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP,  MVT::v4f32, MVT::v4i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i32, 6 },
     { ISD::UINT_TO_FP,  MVT::v8f32, MVT::v8i32, 9 },
+    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 5 },
+    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 6 },
     // The generic code to compute the scalar overhead is currently broken.
     // Workaround this limitation by estimating the scalarization overhead
     // here. We have roughly 10 instructions per scalar element.
     // Multiply that by the vector width.
     // FIXME: remove that when PR19268 is fixed.
-    { ISD::UINT_TO_FP,  MVT::v2f64, MVT::v2i64, 10 },
-    { ISD::UINT_TO_FP,  MVT::v4f64, MVT::v4i64, 20 },
     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
     { ISD::SINT_TO_FP,  MVT::v4f64, MVT::v4i64, 13 },
 
@@ -1388,6 +1445,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  3 },
     { ISD::TRUNCATE,    MVT::v16i16, MVT::v16i32, 6 },
 
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    4 },
   };
 
   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
@@ -1409,11 +1467,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
 
     { ISD::FP_TO_SINT,  MVT::v2i32,  MVT::v2f64,  3 },
 
+    { ISD::UINT_TO_FP,  MVT::f64,    MVT::i64,    6 },
+
     { ISD::ZERO_EXTEND, MVT::v4i16,  MVT::v4i8,   1 },
     { ISD::SIGN_EXTEND, MVT::v4i16,  MVT::v4i8,   6 },
     { ISD::ZERO_EXTEND, MVT::v4i32,  MVT::v4i8,   2 },
@@ -2342,11 +2402,15 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
       return TTI::TCC_Free;
     ImmIdx = 1;
     break;
-  case Instruction::Mul:
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
+    // Division by constant is typically expanded later into a different
+    // instruction sequence. This completely changes the constants.
+    // Report them as "free" to stop ConstantHoist from marking them as opaque.
+    return TTI::TCC_Free;
+  case Instruction::Mul:
   case Instruction::Or:
   case Instruction::Xor:
     ImmIdx = 1;
@@ -2719,7 +2783,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
-                                               unsigned AddressSpace) {
+                                               unsigned AddressSpace,
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2828,7 +2899,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                  unsigned Factor,
                                                  ArrayRef<unsigned> Indices,
                                                  unsigned Alignment,
-                                                 unsigned AddressSpace) {
+                                                 unsigned AddressSpace,
+                                                 bool UseMaskForCond,
+                                                 bool UseMaskForGaps) {
+
+  if (UseMaskForCond || UseMaskForGaps)
+    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -2946,7 +3024,9 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
-                                           unsigned AddressSpace) {
+                                           unsigned AddressSpace,
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
   auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
     Type *EltTy = VecTy->getVectorElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -2958,11 +3038,14 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace);
+                                            Alignment, AddressSpace,
+                                            UseMaskForCond, UseMaskForGaps);
   if (ST->hasAVX2())
     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace);
+                                          Alignment, AddressSpace,
+                                          UseMaskForCond, UseMaskForGaps);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 3df8990388202ae194803144f3a5cf19dd3a6a37..1637592c81f8a413be473f595cb2310991ee024a 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -101,13 +101,19 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
-                                 unsigned Alignment, unsigned AddressSpace);
+                                 unsigned Alignment, unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   int getIntImmCost(int64_t);
 
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index dde9c734f4928511be354bdc8ee0dc3a6f448e66..185deda97c1fc7978a059fc43ca07ae70ab804f8 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -34,10 +34,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "winehstate"
 
-namespace llvm {
-void initializeWinEHStatePassPass(PassRegistry &);
-}
-
 namespace {
 const int OverdefinedState = INT_MIN;
 
@@ -369,7 +365,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) {
 
   // Insert an unlink before all returns.
   for (BasicBlock &BB : *F) {
-    TerminatorInst *T = BB.getTerminator();
+    Instruction *T = BB.getTerminator();
     if (!isa<ReturnInst>(T))
       continue;
     Builder.SetInsertPoint(T);
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index 4357948d5ab29af274f3997728d8ec0685c2c7d8..4cb0a52961cca8522a679a816315ce83f0f599aa 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -601,7 +601,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
 }
 
 // Sets the unwind edge of an instruction to a particular successor.
-static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+static void setUnwindEdgeTo(Instruction *TI, BasicBlock *Succ) {
   if (auto *II = dyn_cast<InvokeInst>(TI))
     II->setUnwindDest(Succ);
   else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 72c850fca99fd0627538a0420cc7eedec26ddc77..f01c6a4e99bf10be8c9d7eaa63be70a384d73fc4 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -165,7 +165,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
 
         AAMDNodes AAInfo;
         I->getAAMetadata(AAInfo);
-        MemoryLocation Loc(Arg, MemoryLocation::UnknownSize, AAInfo);
+        MemoryLocation Loc(Arg, LocationSize::unknown(), AAInfo);
 
         // Skip accesses to local or constant memory as they don't impact the
         // externally visible mod/ref behavior.
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 8f8c85e1b18d2489aa81788b970f813d4f83b31b..31531beea5e78d76a95d856b0119d123d301a8d8 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -60,8 +60,17 @@ using namespace llvm;
 
 #define DEBUG_TYPE "function-import"
 
-STATISTIC(NumImportedFunctions, "Number of functions imported");
-STATISTIC(NumImportedGlobalVars, "Number of global variables imported");
+STATISTIC(NumImportedFunctionsThinLink,
+          "Number of functions thin link decided to import");
+STATISTIC(NumImportedHotFunctionsThinLink,
+          "Number of hot functions thin link decided to import");
+STATISTIC(NumImportedCriticalFunctionsThinLink,
+          "Number of critical functions thin link decided to import");
+STATISTIC(NumImportedGlobalVarsThinLink,
+          "Number of global variables thin link decided to import");
+STATISTIC(NumImportedFunctions, "Number of functions imported in backend");
+STATISTIC(NumImportedGlobalVars,
+          "Number of global variables imported in backend");
 STATISTIC(NumImportedModules, "Number of modules imported from");
 STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
 STATISTIC(NumLiveSymbols, "Number of live symbols in index");
@@ -228,11 +237,19 @@ selectCallee(const ModuleSummaryIndex &Index,
           return false;
         }
 
+        // Skip if it isn't legal to import (e.g. may reference unpromotable
+        // locals).
         if (Summary->notEligibleToImport()) {
           Reason = FunctionImporter::ImportFailureReason::NotEligible;
           return false;
         }
 
+        // Don't bother importing if we can't inline it anyway.
+        if (Summary->fflags().NoInline) {
+          Reason = FunctionImporter::ImportFailureReason::NoInline;
+          return false;
+        }
+
         return true;
       });
   if (It == CalleeSummaryList.end())
@@ -278,11 +295,13 @@ static void computeImportForReferencedGlobals(
 
     for (auto &RefSummary : VI.getSummaryList())
       if (RefSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind &&
-          // Don't try to import regular LTO summaries added to dummy module.
-          !RefSummary->modulePath().empty() &&
+          !RefSummary->notEligibleToImport() &&
           !GlobalValue::isInterposableLinkage(RefSummary->linkage()) &&
           RefSummary->refs().empty()) {
-        ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+        auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+        // Only update stat if we haven't already imported this variable.
+        if (ILI.second)
+          NumImportedGlobalVarsThinLink++;
         if (ExportLists)
           (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID());
         break;
@@ -307,6 +326,8 @@ getFailureName(FunctionImporter::ImportFailureReason Reason) {
     return "LocalLinkageNotInModule";
   case FunctionImporter::ImportFailureReason::NotEligible:
     return "NotEligible";
+  case FunctionImporter::ImportFailureReason::NoInline:
+    return "NoInline";
   }
   llvm_unreachable("invalid reason");
 }
@@ -364,6 +385,11 @@ static void computeImportForFunction(
     auto &CalleeSummary = std::get<1>(IT.first->second);
     auto &FailureInfo = std::get<2>(IT.first->second);
 
+    bool IsHotCallsite =
+        Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
+    bool IsCriticalCallsite =
+        Edge.second.getHotness() == CalleeInfo::HotnessType::Critical;
+
     const FunctionSummary *ResolvedCalleeSummary = nullptr;
     if (CalleeSummary) {
       assert(PreviouslyVisited);
@@ -435,6 +461,13 @@ static void computeImportForFunction(
       // We previously decided to import this GUID definition if it was already
       // inserted in the set of imports from the exporting module.
       bool PreviouslyImported = !ILI.second;
+      if (!PreviouslyImported) {
+        NumImportedFunctionsThinLink++;
+        if (IsHotCallsite)
+          NumImportedHotFunctionsThinLink++;
+        if (IsCriticalCallsite)
+          NumImportedCriticalFunctionsThinLink++;
+      }
 
       // Make exports in the source module.
       if (ExportLists) {
@@ -468,8 +501,6 @@ static void computeImportForFunction(
       return Threshold * ImportInstrFactor;
     };
 
-    bool IsHotCallsite =
-        Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
     const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
 
     ImportCount++;
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 5518ef8fce988d626522324a3bf80070c87990b3..3005aafd06b13b43861e366be5b5043744625238 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1710,19 +1710,25 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
           assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
         }
       }
-      new StoreInst(StoreVal, NewGV, false, 0,
-                    SI->getOrdering(), SI->getSyncScopeID(), SI);
+      StoreInst *NSI =
+          new StoreInst(StoreVal, NewGV, false, 0, SI->getOrdering(),
+                        SI->getSyncScopeID(), SI);
+      NSI->setDebugLoc(SI->getDebugLoc());
     } else {
       // Change the load into a load of bool then a select.
       LoadInst *LI = cast<LoadInst>(UI);
       LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0,
                                    LI->getOrdering(), LI->getSyncScopeID(), LI);
-      Value *NSI;
+      Instruction *NSI;
       if (IsOneZero)
         NSI = new ZExtInst(NLI, LI->getType(), "", LI);
       else
         NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
       NSI->takeName(LI);
+      // Since LI is split into two instructions, NLI and NSI both inherit the
+      // same DebugLoc
+      NLI->setDebugLoc(LI->getDebugLoc());
+      NSI->setDebugLoc(LI->getDebugLoc());
       LI->replaceAllUsesWith(NSI);
     }
     UI->eraseFromParent();
diff --git a/lib/Transforms/IPO/HotColdSplitting.cpp b/lib/Transforms/IPO/HotColdSplitting.cpp
index 810fdf418a28a64afe51883860daceeed9331d63..621ac7dc8ab80ed1524e2d433ace813bc4c8bbd9 100644
--- a/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
@@ -57,16 +58,18 @@
 
 #define DEBUG_TYPE "hotcoldsplit"
 
-STATISTIC(NumColdSESEFound,
-          "Number of cold single entry single exit (SESE) regions found.");
-STATISTIC(NumColdSESEOutlined,
-          "Number of cold single entry single exit (SESE) regions outlined.");
+STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
+STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
 
 using namespace llvm;
 
 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
                               cl::init(true), cl::Hidden);
 
+static cl::opt<int>
+    MinOutliningThreshold("min-outlining-thresh", cl::init(3), cl::Hidden,
+                          cl::desc("Code size threshold for outlining within a "
+                                   "single BB (as a multiple of TCC_Basic)"));
 
 namespace {
 
@@ -74,41 +77,24 @@ struct PostDomTree : PostDomTreeBase<BasicBlock> {
   PostDomTree(Function &F) { recalculate(F); }
 };
 
-typedef DenseSet<const BasicBlock *> DenseSetBB;
-typedef DenseMap<const BasicBlock *, uint64_t> DenseMapBBInt;
-
-// From: https://reviews.llvm.org/D22558
-// Exit is not part of the region.
-static bool isSingleEntrySingleExit(BasicBlock *Entry, const BasicBlock *Exit,
-                                    DominatorTree *DT, PostDomTree *PDT,
-                                    SmallVectorImpl<BasicBlock *> &Region) {
-  if (!DT->dominates(Entry, Exit))
-    return false;
-
-  if (!PDT->dominates(Exit, Entry))
-    return false;
-
-  for (auto I = df_begin(Entry), E = df_end(Entry); I != E;) {
-    if (*I == Exit) {
-      I.skipChildren();
-      continue;
-    }
-    if (!DT->dominates(Entry, *I))
-      return false;
-    Region.push_back(*I);
-    ++I;
-  }
-  return true;
-}
+/// A sequence of basic blocks.
+///
+/// A 0-sized SmallVector is slightly cheaper to move than a std::vector.
+using BlockSequence = SmallVector<BasicBlock *, 0>;
 
+// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
+// this function unless you modify the MBB version as well.
+//
+/// A no successor, non-return block probably ends in unreachable and is cold.
+/// Also consider a block that ends in an indirect branch to be a return block,
+/// since many targets use plain indirect branches to return.
 bool blockEndsInUnreachable(const BasicBlock &BB) {
+  if (!succ_empty(&BB))
+    return false;
   if (BB.empty())
     return true;
-  const TerminatorInst *I = BB.getTerminator();
-  if (isa<ReturnInst>(I) || isa<IndirectBrInst>(I))
-    return true;
-  // Unreachable blocks do not have any successor.
-  return succ_empty(&BB);
+  const Instruction *I = BB.getTerminator();
+  return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
 }
 
 static bool exceptionHandlingFunctions(const CallInst *CI) {
@@ -123,8 +109,7 @@ static bool exceptionHandlingFunctions(const CallInst *CI) {
          FName == "__cxa_end_catch";
 }
 
-static
-bool unlikelyExecuted(const BasicBlock &BB) {
+static bool unlikelyExecuted(const BasicBlock &BB) {
   if (blockEndsInUnreachable(BB))
     return true;
   // Exception handling blocks are unlikely executed.
@@ -145,81 +130,173 @@ bool unlikelyExecuted(const BasicBlock &BB) {
   return false;
 }
 
-static DenseSetBB getHotBlocks(Function &F) {
-
-  // Mark all cold basic blocks.
-  DenseSetBB ColdBlocks;
-  for (BasicBlock &BB : F)
-    if (unlikelyExecuted(BB))
-      ColdBlocks.insert((const BasicBlock *)&BB);
-
-  // Forward propagation: basic blocks are hot when they are reachable from the
-  // beginning of the function through a path that does not contain cold blocks.
-  SmallVector<const BasicBlock *, 8> WL;
-  DenseSetBB HotBlocks;
-
-  const BasicBlock *It = &F.front();
-  if (!ColdBlocks.count(It)) {
-    HotBlocks.insert(It);
-    // Breadth First Search to mark edges reachable from hot.
-    WL.push_back(It);
-    while (WL.size() > 0) {
-      It = WL.pop_back_val();
-
-      for (const BasicBlock *Succ : successors(It)) {
-        // Do not visit blocks that are cold.
-        if (!ColdBlocks.count(Succ) && !HotBlocks.count(Succ)) {
-          HotBlocks.insert(Succ);
-          WL.push_back(Succ);
-        }
-      }
-    }
+/// Check whether it's safe to outline \p BB.
+static bool mayExtractBlock(const BasicBlock &BB) {
+  return !BB.hasAddressTaken();
+}
+
+/// Check whether \p BB is profitable to outline (i.e. its code size cost meets
+/// the threshold set in \p MinOutliningThreshold).
+static bool isProfitableToOutline(const BasicBlock &BB,
+                                  TargetTransformInfo &TTI) {
+  int Cost = 0;
+  for (const Instruction &I : BB) {
+    if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
+      continue;
+
+    Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+
+    if (Cost >= (MinOutliningThreshold * TargetTransformInfo::TCC_Basic))
+      return true;
   }
+  return false;
+}
+
+/// Identify the maximal region of cold blocks which includes \p SinkBB.
+///
+/// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all
+/// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which
+/// cannot be outlined.
+///
+/// Return an empty sequence if the cold region is too small to outline, or if
+/// the cold region has no warm predecessors.
+static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB,
+                                           TargetTransformInfo &TTI,
+                                           DominatorTree &DT,
+                                           PostDomTree &PDT) {
+  // The maximal cold region.
+  BlockSequence ColdRegion = {};
+
+  // The ancestor farthest-away from SinkBB, and also post-dominated by it.
+  BasicBlock *MaxAncestor = &SinkBB;
+  unsigned MaxAncestorHeight = 0;
+
+  // Visit SinkBB's ancestors using inverse DFS.
+  auto PredIt = ++idf_begin(&SinkBB);
+  auto PredEnd = idf_end(&SinkBB);
+  while (PredIt != PredEnd) {
+    BasicBlock &PredBB = **PredIt;
+    bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
+
+    // If SinkBB does not post-dominate a predecessor, do not mark the
+    // predecessor (or any of its predecessors) cold.
+    if (!SinkPostDom || !mayExtractBlock(PredBB)) {
+      PredIt.skipChildren();
+      continue;
+    }
 
-  assert(WL.empty() && "work list should be empty");
-
-  DenseMapBBInt NumHotSuccessors;
-  // Back propagation: when all successors of a basic block are cold, the
-  // basic block is cold as well.
-  for (BasicBlock &BBRef : F) {
-    const BasicBlock *BB = &BBRef;
-    if (HotBlocks.count(BB)) {
-      // Keep a count of hot successors for every hot block.
-      NumHotSuccessors[BB] = 0;
-      for (const BasicBlock *Succ : successors(BB))
-        if (!ColdBlocks.count(Succ))
-          NumHotSuccessors[BB] += 1;
-
-      // Add to work list the blocks with all successors cold. Those are the
-      // root nodes in the next loop, where we will move those blocks from
-      // HotBlocks to ColdBlocks and iterate over their predecessors.
-      if (NumHotSuccessors[BB] == 0)
-        WL.push_back(BB);
+    // Keep track of the post-dominated ancestor farthest away from the sink.
+    unsigned AncestorHeight = PredIt.getPathLength();
+    if (AncestorHeight > MaxAncestorHeight) {
+      MaxAncestor = &PredBB;
+      MaxAncestorHeight = AncestorHeight;
     }
+
+    ColdRegion.push_back(&PredBB);
+    ++PredIt;
   }
 
-  while (WL.size() > 0) {
-    It = WL.pop_back_val();
-    if (ColdBlocks.count(It))
+  // CodeExtractor requires that all blocks to be extracted must be dominated
+  // by the first block to be extracted.
+  //
+  // To avoid spurious or repeated outlining, require that the max ancestor
+  // has a predecessor. By construction this predecessor is not in the cold
+  // region, i.e. its existence implies we don't outline the whole function.
+  //
+  // TODO: If MaxAncestor has no predecessors, we may be able to outline the
+  // second largest cold region that has a predecessor.
+  if (pred_empty(MaxAncestor) ||
+      MaxAncestor->getSinglePredecessor() == MaxAncestor)
+    return {};
+
+  // Filter out predecessors not dominated by the max ancestor.
+  //
+  // TODO: Blocks not dominated by the max ancestor could be extracted as
+  // other cold regions. Marking outlined calls as noreturn when appropriate
+  // and outlining more than once per function could achieve most of the win.
+  auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) {
+    return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB);
+  });
+  ColdRegion.erase(EraseIt, ColdRegion.end());
+
+  // Add SinkBB to the cold region.
+  ColdRegion.push_back(&SinkBB);
+
+  // Ensure that the first extracted block is the max ancestor.
+  if (ColdRegion[0] != MaxAncestor) {
+    auto AncestorIt = find(ColdRegion, MaxAncestor);
+    *AncestorIt = ColdRegion[0];
+    ColdRegion[0] = MaxAncestor;
+  }
+
+  // Find all successors of SinkBB dominated by SinkBB using DFS.
+  auto SuccIt = ++df_begin(&SinkBB);
+  auto SuccEnd = df_end(&SinkBB);
+  while (SuccIt != SuccEnd) {
+    BasicBlock &SuccBB = **SuccIt;
+    bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
+
+    // If SinkBB does not dominate a successor, do not mark the successor (or
+    // any of its successors) cold.
+    if (!SinkDom || !mayExtractBlock(SuccBB)) {
+      SuccIt.skipChildren();
       continue;
+    }
 
-    // Move the block from HotBlocks to ColdBlocks.
-    HotBlocks.erase(It);
-    ColdBlocks.insert(It);
+    ColdRegion.push_back(&SuccBB);
+    ++SuccIt;
+  }
+
+  if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI))
+    return {};
+
+  return ColdRegion;
+}
 
-    // Iterate over the predecessors.
-    for (const BasicBlock *Pred : predecessors(It)) {
-      if (HotBlocks.count(Pred)) {
-        NumHotSuccessors[Pred] -= 1;
+/// Get the largest cold region in \p F.
+static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
+                                          BlockFrequencyInfo *BFI,
+                                          TargetTransformInfo &TTI,
+                                          DominatorTree &DT, PostDomTree &PDT) {
+  // Keep track of the largest cold region.
+  BlockSequence LargestColdRegion = {};
+
+  for (BasicBlock &BB : F) {
+    // Identify cold blocks.
+    if (!mayExtractBlock(BB))
+      continue;
+    bool Cold =
+        PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB));
+    if (!Cold)
+      continue;
 
-        // If Pred has no more hot successors, add it to the work list.
-        if (NumHotSuccessors[Pred] == 0)
-          WL.push_back(Pred);
-      }
+    LLVM_DEBUG({
+      dbgs() << "Found cold block:\n";
+      BB.dump();
+    });
+
+    // Find a maximal cold region we can outline.
+    BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT);
+    if (ColdRegion.empty()) {
+      LLVM_DEBUG(dbgs() << "  Skipping (block not profitable to extract)\n");
+      continue;
     }
+
+    ++NumColdRegionsFound;
+
+    LLVM_DEBUG({
+      llvm::dbgs() << "Identified cold region with " << ColdRegion.size()
+                   << " blocks:\n";
+      for (BasicBlock *BB : ColdRegion)
+        BB->dump();
+    });
+
+    // TODO: Outline more than one region.
+    if (ColdRegion.size() > LargestColdRegion.size())
+      LargestColdRegion = std::move(ColdRegion);
   }
 
-  return HotBlocks;
+  return LargestColdRegion;
 }
 
 class HotColdSplitting {
@@ -233,23 +310,9 @@ public:
 
 private:
   bool shouldOutlineFrom(const Function &F) const;
-  const Function *outlineColdBlocks(Function &F, const DenseSetBB &ColdBlock,
-                                    DominatorTree *DT, PostDomTree *PDT);
-  Function *extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
-                              DominatorTree *DT, BlockFrequencyInfo *BFI,
-                              OptimizationRemarkEmitter &ORE);
-  bool isOutlineCandidate(const SmallVectorImpl<BasicBlock *> &Region,
-                          const BasicBlock *Exit) const {
-    if (!Exit)
-      return false;
-
-    // Regions with landing pads etc.
-    for (const BasicBlock *BB : Region) {
-      if (BB->isEHPad() || BB->hasAddressTaken())
-        return false;
-    }
-    return true;
-  }
+  Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
+                              BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
+                              OptimizationRemarkEmitter &ORE, unsigned Count);
   SmallPtrSet<const Function *, 2> OutlinedFunctions;
   ProfileSummaryInfo *PSI;
   function_ref<BlockFrequencyInfo *(Function &)> GetBFI;
@@ -286,6 +349,8 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
   if (F.size() <= 2)
     return false;
 
+  // TODO: Consider only skipping functions marked `optnone` or `cold`.
+
   if (F.hasAddressTaken())
     return false;
 
@@ -303,34 +368,57 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
   return true;
 }
 
-Function *
-HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
-                                    DominatorTree *DT, BlockFrequencyInfo *BFI,
-                                    OptimizationRemarkEmitter &ORE) {
+Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
+                                              DominatorTree &DT,
+                                              BlockFrequencyInfo *BFI,
+                                              TargetTransformInfo &TTI,
+                                              OptimizationRemarkEmitter &ORE,
+                                              unsigned Count) {
+  assert(!Region.empty());
   LLVM_DEBUG(for (auto *BB : Region)
           llvm::dbgs() << "\nExtracting: " << *BB;);
 
   // TODO: Pass BFI and BPI to update profile information.
-  CodeExtractor CE(Region, DT);
+  CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+                   /* BPI */ nullptr, /* AllowVarArgs */ false,
+                   /* AllowAlloca */ false,
+                   /* Suffix */ "cold." + std::to_string(Count));
 
   SetVector<Value *> Inputs, Outputs, Sinks;
   CE.findInputsOutputs(Inputs, Outputs, Sinks);
 
   // Do not extract regions that have live exit variables.
-  if (Outputs.size() > 0)
+  if (Outputs.size() > 0) {
+    LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n");
     return nullptr;
+  }
 
+  // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function.
+  Function *OrigF = Region[0]->getParent();
   if (Function *OutF = CE.extractCodeRegion()) {
     User *U = *OutF->user_begin();
     CallInst *CI = cast<CallInst>(U);
     CallSite CS(CI);
-    NumColdSESEOutlined++;
-    if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) {
+    NumColdRegionsOutlined++;
+    if (TTI.useColdCCForColdCall(*OutF)) {
       OutF->setCallingConv(CallingConv::Cold);
       CS.setCallingConv(CallingConv::Cold);
     }
     CI->setIsNoInline();
+
+    // Try to make the outlined code as small as possible on the assumption
+    // that it's cold.
+    assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) &&
+           "An outlined function should never be marked optnone");
+    OutF->addFnAttr(Attribute::MinSize);
+
     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
+                                &*Region[0]->begin())
+             << ore::NV("Original", OrigF) << " split cold code into "
+             << ore::NV("Split", OutF);
+    });
     return OutF;
   }
 
@@ -343,63 +431,34 @@ HotColdSplitting::extractColdRegion(const SmallVectorImpl<BasicBlock *> &Region,
   return nullptr;
 }
 
-// Return the function created after outlining, nullptr otherwise.
-const Function *HotColdSplitting::outlineColdBlocks(Function &F,
-                                                    const DenseSetBB &HotBlocks,
-                                                    DominatorTree *DT,
-                                                    PostDomTree *PDT) {
-  auto BFI = GetBFI(F);
-  auto &ORE = (*GetORE)(F);
-  // Walking the dominator tree allows us to find the largest
-  // cold region.
-  BasicBlock *Begin = DT->getRootNode()->getBlock();
-  for (auto I = df_begin(Begin), E = df_end(Begin); I != E; ++I) {
-    BasicBlock *BB = *I;
-    if (PSI->isColdBB(BB, BFI) || !HotBlocks.count(BB)) {
-      SmallVector<BasicBlock *, 4> ValidColdRegion, Region;
-      BasicBlock *Exit = (*PDT)[BB]->getIDom()->getBlock();
-      BasicBlock *ExitColdRegion = nullptr;
-
-      // Estimated cold region between a BB and its dom-frontier.
-      while (Exit && isSingleEntrySingleExit(BB, Exit, DT, PDT, Region) &&
-             isOutlineCandidate(Region, Exit)) {
-        ExitColdRegion = Exit;
-        ValidColdRegion = Region;
-        Region.clear();
-        // Update Exit recursively to its dom-frontier.
-        Exit = (*PDT)[Exit]->getIDom()->getBlock();
-      }
-      if (ExitColdRegion) {
-        // Do not outline a region with only one block.
-        if (ValidColdRegion.size() == 1)
-          continue;
-
-        ++NumColdSESEFound;
-        ValidColdRegion.push_back(ExitColdRegion);
-        // Candidate for outlining. FIXME: Continue outlining.
-        return extractColdRegion(ValidColdRegion, DT, BFI, ORE);
-      }
-    }
-  }
-  return nullptr;
-}
-
 bool HotColdSplitting::run(Module &M) {
+  bool Changed = false;
   for (auto &F : M) {
-    if (!shouldOutlineFrom(F))
+    if (!shouldOutlineFrom(F)) {
+      LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n");
       continue;
+    }
+
+    LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
     DominatorTree DT(F);
     PostDomTree PDT(F);
     PDT.recalculate(F);
-    DenseSetBB HotBlocks;
-    if (EnableStaticAnalyis) // Static analysis of cold blocks.
-      HotBlocks = getHotBlocks(F);
+    BlockFrequencyInfo *BFI = GetBFI(F);
+    TargetTransformInfo &TTI = GetTTI(F);
 
-    const Function *Outlined = outlineColdBlocks(F, HotBlocks, &DT, &PDT);
-    if (Outlined)
+    BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT);
+    if (ColdRegion.empty())
+      continue;
+
+    OptimizationRemarkEmitter &ORE = (*GetORE)(F);
+    Function *Outlined =
+        extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1);
+    if (Outlined) {
       OutlinedFunctions.insert(Outlined);
+      Changed = true;
+    }
   }
-  return true;
+  return Changed;
 }
 
 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 3275226925cb49c5abfdfdb8ba732d13a51600fd..66aea45323fea9a817be7946e0928dcfbe8d8c01 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -1158,10 +1158,19 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     // SCC splits and merges. To avoid this, we capture the originating caller
     // node and the SCC containing the call edge. This is a slight over
     // approximation of the possible inlining decisions that must be avoided,
-    // but is relatively efficient to store.
+    // but is relatively efficient to store. We use C != OldC to know when
+    // a new SCC is generated and the original SCC may be generated via merge
+    // in later iterations.
+    //
+    // It is also possible that even if no new SCC is generated
+    // (i.e., C == OldC), the original SCC could be split and then merged
+    // into the same one as itself. and the original SCC will be added into
+    // UR.CWorklist again, we want to catch such cases too.
+    //
     // FIXME: This seems like a very heavyweight way of retaining the inline
     // history, we should look for a more efficient way of tracking it.
-    if (C != OldC && llvm::any_of(InlinedCallees, [&](Function *Callee) {
+    if ((C != OldC || UR.CWorklist.count(OldC)) &&
+        llvm::any_of(InlinedCallees, [&](Function *Callee) {
           return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
         })) {
       LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 8c86f7cb806a0c9d394c91faf47382f3a7448b3e..733235d45a09623424d5f58d3668d6a983f2afff 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -104,8 +104,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
   bool ShouldExtractLoop = false;
 
   // Extract the loop if the entry block doesn't branch to the loop header.
-  TerminatorInst *EntryTI =
-    L->getHeader()->getParent()->getEntryBlock().getTerminator();
+  Instruction *EntryTI =
+      L->getHeader()->getParent()->getEntryBlock().getTerminator();
   if (!isa<BranchInst>(EntryTI) ||
       !cast<BranchInst>(EntryTI)->isUnconditional() ||
       EntryTI->getSuccessor(0) != L->getHeader()) {
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 3bebb96c6d35411fdcb820e3b475831efbfb7202..4c51cd131a1015d654e1de9f8b5232b390d097af 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -284,7 +284,7 @@ private:
   // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
   // dangling iterators into FnTree. The invariant that preserves this is that
   // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
-  ValueMap<Function*, FnTreeType::iterator> FNodesInTree;
+  DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree;
 };
 
 } // end anonymous namespace
@@ -425,6 +425,7 @@ bool MergeFunctions::runOnModule(Module &M) {
   } while (!Deferred.empty());
 
   FnTree.clear();
+  FNodesInTree.clear();
   GlobalNumbers.clear();
 
   return Changed;
@@ -608,7 +609,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI(
         LLVM_DEBUG(BI->print(dbgs()));
         LLVM_DEBUG(dbgs() << "\n");
       }
-    } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
+    } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) {
       LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
       LLVM_DEBUG(BI->print(dbgs()));
       LLVM_DEBUG(dbgs() << "\n");
@@ -770,6 +771,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
         GlobalNumbers.erase(G);
         // If G's address is not significant, replace it entirely.
         Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+        removeUsers(G);
         G->replaceAllUsesWith(BitcastF);
       } else {
         // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
@@ -816,6 +818,24 @@ void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
   FN.replaceBy(G);
 }
 
+// Ordering for functions that are equal under FunctionComparator
+static bool isFuncOrderCorrect(const Function *F, const Function *G) {
+  if (F->isInterposable() != G->isInterposable()) {
+    // Strong before weak, because the weak function may call the strong
+    // one, but not the other way around.
+    return !F->isInterposable();
+  }
+  if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
+    // External before local, because we definitely have to keep the external
+    // function, but may be able to drop the local one.
+    return !F->hasLocalLinkage();
+  }
+  // Impose a total order (by name) on the replacement of functions. This is
+  // important when operating on more than one module independently to prevent
+  // cycles of thunks calling each other when the modules are linked together.
+  return F->getName() <= G->getName();
+}
+
 // Insert a ComparableFunction into the FnTree, or merge it away if equal to one
 // that was already inserted.
 bool MergeFunctions::insert(Function *NewFunction) {
@@ -832,14 +852,7 @@ bool MergeFunctions::insert(Function *NewFunction) {
 
   const FunctionNode &OldF = *Result.first;
 
-  // Impose a total order (by name) on the replacement of functions. This is
-  // important when operating on more than one module independently to prevent
-  // cycles of thunks calling each other when the modules are linked together.
-  //
-  // First of all, we process strong functions before weak functions.
-  if ((OldF.getFunc()->isInterposable() && !NewFunction->isInterposable()) ||
-     (OldF.getFunc()->isInterposable() == NewFunction->isInterposable() &&
-       OldF.getFunc()->getName() > NewFunction->getName())) {
+  if (!isFuncOrderCorrect(OldF.getFunc(), NewFunction)) {
     // Swap the two functions.
     Function *F = OldF.getFunc();
     replaceFunctionInTree(*Result.first, NewFunction);
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 709222dbec002913c014968fc90afacdf1eff754..bcb19af85b29b9bd501be3b53a3c5f98a61add49 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -556,7 +556,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   };
 
   auto IsReturnBlock = [](BasicBlock *BB) {
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     return isa<ReturnInst>(TI);
   };
 
@@ -1251,7 +1251,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
   if (PSI->isFunctionEntryCold(F))
     return {false, nullptr};
 
-  if (F->user_begin() == F->user_end())
+  if (empty(F->users()))
     return {false, nullptr};
 
   OptimizationRemarkEmitter ORE(F);
@@ -1357,7 +1357,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
     return false;
   }
 
-  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
+  assert(empty(Cloner.OrigFunc->users()) &&
          "F's users should all be replaced!");
 
   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 8b00a60f5213602068e8f79c4ffff10ba698c261..19ff2a21cd2186cb0532822b1e0e405c9dc0e256 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -499,9 +499,6 @@ void PassManagerBuilder::populateModulePassManager(
   // Infer attributes about declarations if possible.
   MPM.add(createInferFunctionAttrsLegacyPass());
 
-  if (EnableHotColdSplit)
-    MPM.add(createHotColdSplittingPass());
-
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
@@ -735,6 +732,9 @@ void PassManagerBuilder::populateModulePassManager(
   // flattening of blocks.
   MPM.add(createDivRemPairsPass());
 
+  if (EnableHotColdSplit)
+    MPM.add(createHotColdSplittingPass());
+
   // LoopSink (and other loop passes since the last simplifyCFG) might have
   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
   MPM.add(createCFGSimplificationPass());
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 2caee2942213b65fbf7f2325a2baff5e3f7ac508..ae586c017471a7885c3aa9d365053097c71c89d6 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -107,7 +107,7 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) {
         continue;
 
       for (const BasicBlock &BB : *F) {
-        const TerminatorInst *TI = BB.getTerminator();
+        const Instruction *TI = BB.getTerminator();
         if (CheckUnwind && TI->mayThrow()) {
           SCCMightUnwind = true;
         } else if (CheckReturn && isa<ReturnInst>(TI)) {
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 182202fda05f1939455ae3bed9253928199d4c4c..a78e0d459c891b909d7f9766667f0f8c7cecb7d4 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -96,6 +96,13 @@ static cl::opt<std::string> SampleProfileFile(
     "sample-profile-file", cl::init(""), cl::value_desc("filename"),
     cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
 
+// The named file contains a set of transformations that may have been applied
+// to the symbol names between the program from which the sample data was
+// collected and the current program's symbols.
+static cl::opt<std::string> SampleProfileRemappingFile(
+    "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
+    cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
+
 static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
     "sample-profile-max-propagate-iterations", cl::init(100),
     cl::desc("Maximum number of iterations to go through when propagating "
@@ -183,12 +190,12 @@ private:
 class SampleProfileLoader {
 public:
   SampleProfileLoader(
-      StringRef Name, bool IsThinLTOPreLink,
+      StringRef Name, StringRef RemapName, bool IsThinLTOPreLink,
       std::function<AssumptionCache &(Function &)> GetAssumptionCache,
       std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
       : GetAC(std::move(GetAssumptionCache)),
         GetTTI(std::move(GetTargetTransformInfo)), Filename(Name),
-        IsThinLTOPreLink(IsThinLTOPreLink) {}
+        RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {}
 
   bool doInitialization(Module &M);
   bool runOnModule(Module &M, ModuleAnalysisManager *AM,
@@ -282,6 +289,9 @@ protected:
   /// Name of the profile file to load.
   std::string Filename;
 
+  /// Name of the profile remapping file to load.
+  std::string RemappingFilename;
+
   /// Flag indicating whether the profile input loaded successfully.
   bool ProfileIsValid = false;
 
@@ -311,13 +321,14 @@ public:
 
   SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
                                 bool IsThinLTOPreLink = false)
-      : ModulePass(ID), SampleLoader(Name, IsThinLTOPreLink,
-                                     [&](Function &F) -> AssumptionCache & {
-                                       return ACT->getAssumptionCache(F);
-                                     },
-                                     [&](Function &F) -> TargetTransformInfo & {
-                                       return TTIWP->getTTI(F);
-                                     }) {
+      : ModulePass(ID),
+        SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink,
+                     [&](Function &F) -> AssumptionCache & {
+                       return ACT->getAssumptionCache(F);
+                     },
+                     [&](Function &F) -> TargetTransformInfo & {
+                       return TTIWP->getTTI(F);
+                     }) {
     initializeSampleProfileLoaderLegacyPassPass(
         *PassRegistry::getPassRegistry());
   }
@@ -1286,7 +1297,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
         }
       }
     }
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     if (TI->getNumSuccessors() == 1)
       continue;
     if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
@@ -1515,11 +1526,26 @@ bool SampleProfileLoader::doInitialization(Module &M) {
   Reader = std::move(ReaderOrErr.get());
   Reader->collectFuncsToUse(M);
   ProfileIsValid = (Reader->read() == sampleprof_error::success);
+
+  if (!RemappingFilename.empty()) {
+    // Apply profile remappings to the loaded profile data if requested.
+    // For now, we only support remapping symbols encoded using the Itanium
+    // C++ ABI's name mangling scheme.
+    ReaderOrErr = SampleProfileReaderItaniumRemapper::create(
+        RemappingFilename, Ctx, std::move(Reader));
+    if (std::error_code EC = ReaderOrErr.getError()) {
+      std::string Msg = "Could not open profile remapping file: " + EC.message();
+      Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+      return false;
+    }
+    Reader = std::move(ReaderOrErr.get());
+    ProfileIsValid = (Reader->read() == sampleprof_error::success);
+  }
   return true;
 }
 
 ModulePass *llvm::createSampleProfileLoaderPass() {
-  return new SampleProfileLoaderLegacyPass(SampleProfileFile);
+  return new SampleProfileLoaderLegacyPass();
 }
 
 ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
@@ -1612,6 +1638,8 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
 
   SampleProfileLoader SampleLoader(
       ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
+      ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
+                                       : ProfileRemappingFileName,
       IsThinLTOPreLink, GetAssumptionCache, GetTTI);
 
   SampleLoader.doInitialization(M);
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a6280ec95a96152783b8a23aa881dd681153db7f..7e7a515bfc8de4869966455018cfdecc070db285 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1831,14 +1831,33 @@ static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
 /// We have an expression of the form (A & C) | (B & D). If A is a scalar or
 /// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
 /// B, it can be used as the condition operand of a select instruction.
-static Value *getSelectCondition(Value *A, Value *B,
-                                 InstCombiner::BuilderTy &Builder) {
-  // If these are scalars or vectors of i1, A can be used directly.
+Value *InstCombiner::getSelectCondition(Value *A, Value *B) {
+  // Step 1: We may have peeked through bitcasts in the caller.
+  // Exit immediately if we don't have (vector) integer types.
   Type *Ty = A->getType();
-  if (match(A, m_Not(m_Specific(B))) && Ty->isIntOrIntVectorTy(1))
-    return A;
+  if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
+    return nullptr;
+
+  // Step 2: We need 0 or all-1's bitmasks.
+  if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
+    return nullptr;
 
-  // If A and B are sign-extended, look through the sexts to find the booleans.
+  // Step 3: If B is the 'not' value of A, we have our answer.
+  if (match(A, m_Not(m_Specific(B)))) {
+    // If these are scalars or vectors of i1, A can be used directly.
+    if (Ty->isIntOrIntVectorTy(1))
+      return A;
+    return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
+  }
+
+  // If both operands are constants, see if the constants are inverse bitmasks.
+  Constant *AConst, *BConst;
+  if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
+    if (AConst == ConstantExpr::getNot(BConst))
+      return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
+
+  // Look for more complex patterns. The 'not' op may be hidden behind various
+  // casts. Look through sexts and bitcasts to find the booleans.
   Value *Cond;
   Value *NotB;
   if (match(A, m_SExt(m_Value(Cond))) &&
@@ -1854,36 +1873,29 @@ static Value *getSelectCondition(Value *A, Value *B,
   if (!Ty->isVectorTy())
     return nullptr;
 
-  // If both operands are constants, see if the constants are inverse bitmasks.
-  Constant *AC, *BC;
-  if (match(A, m_Constant(AC)) && match(B, m_Constant(BC)) &&
-      areInverseVectorBitmasks(AC, BC)) {
-    return Builder.CreateZExtOrTrunc(AC, CmpInst::makeCmpResultType(Ty));
-  }
-
   // If both operands are xor'd with constants using the same sexted boolean
   // operand, see if the constants are inverse bitmasks.
-  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
-      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
+  // TODO: Use ConstantExpr::getNot()?
+  if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
+      match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
       Cond->getType()->isIntOrIntVectorTy(1) &&
-      areInverseVectorBitmasks(AC, BC)) {
-    AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
-    return Builder.CreateXor(Cond, AC);
+      areInverseVectorBitmasks(AConst, BConst)) {
+    AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
+    return Builder.CreateXor(Cond, AConst);
   }
   return nullptr;
 }
 
 /// We have an expression of the form (A & C) | (B & D). Try to simplify this
 /// to "A' ? C : D", where A' is a boolean or vector of booleans.
-static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
-                                   InstCombiner::BuilderTy &Builder) {
+Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
+                                          Value *D) {
   // The potential condition of the select may be bitcasted. In that case, look
   // through its bitcast and the corresponding bitcast of the 'not' condition.
   Type *OrigType = A->getType();
   A = peekThroughBitcast(A, true);
   B = peekThroughBitcast(B, true);
-
-  if (Value *Cond = getSelectCondition(A, B, Builder)) {
+  if (Value *Cond = getSelectCondition(A, B)) {
     // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
     // The bitcasts will either all exist or all not exist. The builder will
     // not create unnecessary casts if the types already match.
@@ -2234,21 +2246,21 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
     // 'or' that it is replacing.
     if (Op0->hasOneUse() || Op1->hasOneUse()) {
       // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
-      if (Value *V = matchSelectFromAndOr(A, C, B, D, Builder))
+      if (Value *V = matchSelectFromAndOr(A, C, B, D))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(A, C, D, B, Builder))
+      if (Value *V = matchSelectFromAndOr(A, C, D, B))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, B, D, Builder))
+      if (Value *V = matchSelectFromAndOr(C, A, B, D))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(C, A, D, B, Builder))
+      if (Value *V = matchSelectFromAndOr(C, A, D, B))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, A, C, Builder))
+      if (Value *V = matchSelectFromAndOr(B, D, A, C))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(B, D, C, A, Builder))
+      if (Value *V = matchSelectFromAndOr(B, D, C, A))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, A, C, Builder))
+      if (Value *V = matchSelectFromAndOr(D, B, A, C))
         return replaceInstUsesWith(I, V);
-      if (Value *V = matchSelectFromAndOr(D, B, C, A, Builder))
+      if (Value *V = matchSelectFromAndOr(D, B, C, A))
         return replaceInstUsesWith(I, V);
     }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 4e404933a22b4de99ae83ca6ee6827b92a007fe3..bdd9a43be2721a9e441456060a1e127805877230 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2020,7 +2020,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   }
 
   case Intrinsic::minnum:
-  case Intrinsic::maxnum: {
+  case Intrinsic::maxnum:
+  case Intrinsic::minimum:
+  case Intrinsic::maximum: {
     Value *Arg0 = II->getArgOperand(0);
     Value *Arg1 = II->getArgOperand(1);
     // Canonicalize constants to the RHS.
@@ -2030,19 +2032,68 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return II;
     }
 
+    Intrinsic::ID IID = II->getIntrinsicID();
     Value *X, *Y;
     if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
         (Arg0->hasOneUse() || Arg1->hasOneUse())) {
       // If both operands are negated, invert the call and negate the result:
-      // minnum(-X, -Y) --> -(maxnum(X, Y))
-      // maxnum(-X, -Y) --> -(minnum(X, Y))
-      Intrinsic::ID NewIID = II->getIntrinsicID() == Intrinsic::maxnum ?
-          Intrinsic::minnum : Intrinsic::maxnum;
+      // min(-X, -Y) --> -(max(X, Y))
+      // max(-X, -Y) --> -(min(X, Y))
+      Intrinsic::ID NewIID;
+      switch (IID) {
+      case Intrinsic::maxnum:
+        NewIID = Intrinsic::minnum;
+        break;
+      case Intrinsic::minnum:
+        NewIID = Intrinsic::maxnum;
+        break;
+      case Intrinsic::maximum:
+        NewIID = Intrinsic::minimum;
+        break;
+      case Intrinsic::minimum:
+        NewIID = Intrinsic::maximum;
+        break;
+      default:
+        llvm_unreachable("unexpected intrinsic ID");
+      }
       Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
       Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
       FNeg->copyIRFlags(II);
       return FNeg;
     }
+
+    // m(m(X, C2), C1) -> m(X, C)
+    const APFloat *C1, *C2;
+    if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
+      if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
+          ((match(M->getArgOperand(0), m_Value(X)) &&
+            match(M->getArgOperand(1), m_APFloat(C2))) ||
+           (match(M->getArgOperand(1), m_Value(X)) &&
+            match(M->getArgOperand(0), m_APFloat(C2))))) {
+        APFloat Res(0.0);
+        switch (IID) {
+        case Intrinsic::maxnum:
+          Res = maxnum(*C1, *C2);
+          break;
+        case Intrinsic::minnum:
+          Res = minnum(*C1, *C2);
+          break;
+        case Intrinsic::maximum:
+          Res = maximum(*C1, *C2);
+          break;
+        case Intrinsic::minimum:
+          Res = minimum(*C1, *C2);
+          break;
+        default:
+          llvm_unreachable("unexpected intrinsic ID");
+        }
+        Instruction *NewCall = Builder.CreateBinaryIntrinsic(
+            IID, X, ConstantFP::get(Arg0->getType(), Res));
+        NewCall->copyIRFlags(II);
+        return replaceInstUsesWith(*II, NewCall);
+      }
+    }
+
     break;
   }
   case Intrinsic::fmuladd: {
@@ -3732,7 +3783,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     // Scan down this block to see if there is another stack restore in the
     // same block without an intervening call/alloca.
     BasicBlock::iterator BI(II);
-    TerminatorInst *TI = II->getParent()->getTerminator();
+    Instruction *TI = II->getParent()->getTerminator();
     bool CannotRemove = false;
     for (++BI; &*BI != TI; ++BI) {
       if (isa<AllocaInst>(BI)) {
@@ -3960,7 +4011,11 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
   auto InstCombineRAUW = [this](Instruction *From, Value *With) {
     replaceInstUsesWith(*From, With);
   };
-  LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW);
+  auto InstCombineErase = [this](Instruction *I) {
+    eraseInstFromFunction(*I);
+  };
+  LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW,
+                               InstCombineErase);
   if (Value *With = Simplifier.optimizeCall(CI)) {
     ++NumSimplified;
     return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 74f1e695ff63191fefd9936057498bc9317b8ccb..9fa27d89911dbea920ab88f843f0c08b3151b2c6 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1612,8 +1612,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
     }
 
     // (fptrunc (fneg x)) -> (fneg (fptrunc x))
-    if (BinaryOperator::isFNeg(OpI)) {
-      Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1), Ty);
+    Value *X;
+    if (match(OpI, m_FNeg(m_Value(X)))) {
+      Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
       return BinaryOperator::CreateFNegFMF(InnerTrunc, OpI);
     }
   }
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index bf8bc8818f7f39acaaf96f62c253975c2d6ba013..2ba1174517ffe662b7b8258ffa5421963b45c597 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -5281,11 +5281,71 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
   if (C->isNegative())
     Pred = I.getSwappedPredicate();
 
-  // Finally emit the new fcmp.
-  Value *X = LHSI->getOperand(1);
-  FCmpInst *NewFCI = new FCmpInst(Pred, X, RHSC);
-  NewFCI->setFastMathFlags(I.getFastMathFlags());
-  return NewFCI;
+  return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
+}
+
+/// Optimize fabs(X) compared with zero.
+static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
+  Value *X;
+  if (!match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(X))) ||
+      !match(I.getOperand(1), m_PosZeroFP()))
+    return nullptr;
+
+  auto replacePredAndOp0 = [](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
+    I->setPredicate(P);
+    I->setOperand(0, X);
+    return I;
+  };
+
+  switch (I.getPredicate()) {
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_OLT:
+    // fabs(X) >= 0.0 --> true
+    // fabs(X) <  0.0 --> false
+    llvm_unreachable("fcmp should have simplified");
+
+  case FCmpInst::FCMP_OGT:
+    // fabs(X) > 0.0 --> X != 0.0
+    return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);
+
+  case FCmpInst::FCMP_UGT:
+    // fabs(X) u> 0.0 --> X u!= 0.0
+    return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);
+
+  case FCmpInst::FCMP_OLE:
+    // fabs(X) <= 0.0 --> X == 0.0
+    return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);
+
+  case FCmpInst::FCMP_ULE:
+    // fabs(X) u<= 0.0 --> X u== 0.0
+    return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);
+
+  case FCmpInst::FCMP_OGE:
+    // fabs(X) >= 0.0 --> !isnan(X)
+    assert(!I.hasNoNaNs() && "fcmp should have simplified");
+    return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);
+
+  case FCmpInst::FCMP_ULT:
+    // fabs(X) u< 0.0 --> isnan(X)
+    assert(!I.hasNoNaNs() && "fcmp should have simplified");
+    return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);
+
+  case FCmpInst::FCMP_OEQ:
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_ONE:
+  case FCmpInst::FCMP_UNE:
+  case FCmpInst::FCMP_ORD:
+  case FCmpInst::FCMP_UNO:
+    // Look through the fabs() because it doesn't change anything but the sign.
+    // fabs(X) == 0.0 --> X == 0.0,
+    // fabs(X) != 0.0 --> X != 0.0
+    // isnan(fabs(X)) --> isnan(X)
+    // !isnan(fabs(X) --> !isnan(X)
+    return replacePredAndOp0(&I, I.getPredicate(), X);
+
+  default:
+    return nullptr;
+  }
 }
 
 Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
@@ -5357,132 +5417,89 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
         return nullptr;
     }
 
-  // Handle fcmp with constant RHS
-  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
-    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
-      switch (LHSI->getOpcode()) {
-      case Instruction::FPExt: {
-        // fcmp (fpext x), C -> fcmp x, (fptrunc C) if fptrunc is lossless
-        FPExtInst *LHSExt = cast<FPExtInst>(LHSI);
-        ConstantFP *RHSF = dyn_cast<ConstantFP>(RHSC);
-        if (!RHSF)
-          break;
-
-        const fltSemantics *Sem;
-        // FIXME: This shouldn't be here.
-        if (LHSExt->getSrcTy()->isHalfTy())
-          Sem = &APFloat::IEEEhalf();
-        else if (LHSExt->getSrcTy()->isFloatTy())
-          Sem = &APFloat::IEEEsingle();
-        else if (LHSExt->getSrcTy()->isDoubleTy())
-          Sem = &APFloat::IEEEdouble();
-        else if (LHSExt->getSrcTy()->isFP128Ty())
-          Sem = &APFloat::IEEEquad();
-        else if (LHSExt->getSrcTy()->isX86_FP80Ty())
-          Sem = &APFloat::x87DoubleExtended();
-        else if (LHSExt->getSrcTy()->isPPC_FP128Ty())
-          Sem = &APFloat::PPCDoubleDouble();
-        else
-          break;
-
-        bool Lossy;
-        APFloat F = RHSF->getValueAPF();
-        F.convert(*Sem, APFloat::rmNearestTiesToEven, &Lossy);
-
-        // Avoid lossy conversions and denormals. Zero is a special case
-        // that's OK to convert.
-        APFloat Fabs = F;
-        Fabs.clearSign();
-        if (!Lossy &&
-            ((Fabs.compare(APFloat::getSmallestNormalized(*Sem)) !=
-                 APFloat::cmpLessThan) || Fabs.isZero()))
-
-          return new FCmpInst(Pred, LHSExt->getOperand(0),
-                              ConstantFP::get(RHSC->getContext(), F));
-        break;
-      }
-      case Instruction::PHI:
-        // Only fold fcmp into the PHI if the phi and fcmp are in the same
-        // block.  If in the same block, we're encouraging jump threading.  If
-        // not, we are just pessimizing the code by making an i1 phi.
-        if (LHSI->getParent() == I.getParent())
-          if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
-            return NV;
-        break;
-      case Instruction::SIToFP:
-      case Instruction::UIToFP:
-        if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
-          return NV;
-        break;
-      case Instruction::FSub: {
-        // fcmp pred (fneg x), C -> fcmp swap(pred) x, -C
-        Value *Op;
-        if (match(LHSI, m_FNeg(m_Value(Op))))
-          return new FCmpInst(I.getSwappedPredicate(), Op,
-                              ConstantExpr::getFNeg(RHSC));
-        break;
-      }
-      case Instruction::FDiv:
-        if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
+  // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
+  // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
+  if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP())) {
+    I.setOperand(1, ConstantFP::getNullValue(Op1->getType()));
+    return &I;
+  }
+
+  // Handle fcmp with instruction LHS and constant RHS.
+  Instruction *LHSI;
+  Constant *RHSC;
+  if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
+    switch (LHSI->getOpcode()) {
+    case Instruction::PHI:
+      // Only fold fcmp into the PHI if the phi and fcmp are in the same
+      // block.  If in the same block, we're encouraging jump threading.  If
+      // not, we are just pessimizing the code by making an i1 phi.
+      if (LHSI->getParent() == I.getParent())
+        if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
           return NV;
-        break;
-      case Instruction::Load:
-        if (GetElementPtrInst *GEP =
-            dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
-          if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
-            if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
-                !cast<LoadInst>(LHSI)->isVolatile())
-              if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
-                return Res;
-        }
-        break;
-      case Instruction::Call: {
-        if (!RHSC->isNullValue())
-          break;
+      break;
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+      if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
+        return NV;
+      break;
+    case Instruction::FDiv:
+      if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
+        return NV;
+      break;
+    case Instruction::Load:
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
+        if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+          if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+              !cast<LoadInst>(LHSI)->isVolatile())
+            if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+              return Res;
+      break;
+  }
+  }
 
-        CallInst *CI = cast<CallInst>(LHSI);
-        Intrinsic::ID IID = getIntrinsicForCallSite(CI, &TLI);
-        if (IID != Intrinsic::fabs)
-          break;
+  if (Instruction *R = foldFabsWithFcmpZero(I))
+    return R;
 
-        // Various optimization for fabs compared with zero.
-        switch (Pred) {
-        default:
-          break;
-        // fabs(x) < 0 --> false
-        case FCmpInst::FCMP_OLT:
-          llvm_unreachable("handled by SimplifyFCmpInst");
-        // fabs(x) > 0 --> x != 0
-        case FCmpInst::FCMP_OGT:
-          return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0), RHSC);
-        // fabs(x) <= 0 --> x == 0
-        case FCmpInst::FCMP_OLE:
-          return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0), RHSC);
-        // fabs(x) >= 0 --> !isnan(x)
-        case FCmpInst::FCMP_OGE:
-          return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0), RHSC);
-        // fabs(x) == 0 --> x == 0
-        // fabs(x) != 0 --> x != 0
-        case FCmpInst::FCMP_OEQ:
-        case FCmpInst::FCMP_UEQ:
-        case FCmpInst::FCMP_ONE:
-        case FCmpInst::FCMP_UNE:
-          return new FCmpInst(Pred, CI->getArgOperand(0), RHSC);
-        }
-      }
-      }
+  Value *X, *Y;
+  if (match(Op0, m_FNeg(m_Value(X)))) {
+    // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
+    if (match(Op1, m_FNeg(m_Value(Y))))
+      return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
+
+    // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
+    Constant *C;
+    if (match(Op1, m_Constant(C))) {
+      Constant *NegC = ConstantExpr::getFNeg(C);
+      return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
+    }
   }
 
-  // fcmp pred (fneg x), (fneg y) -> fcmp swap(pred) x, y
-  Value *X, *Y;
-  if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
-    return new FCmpInst(I.getSwappedPredicate(), X, Y);
-
-  // fcmp (fpext x), (fpext y) -> fcmp x, y
-  if (FPExtInst *LHSExt = dyn_cast<FPExtInst>(Op0))
-    if (FPExtInst *RHSExt = dyn_cast<FPExtInst>(Op1))
-      if (LHSExt->getSrcTy() == RHSExt->getSrcTy())
-        return new FCmpInst(Pred, LHSExt->getOperand(0), RHSExt->getOperand(0));
+  if (match(Op0, m_FPExt(m_Value(X)))) {
+    // fcmp (fpext X), (fpext Y) -> fcmp X, Y
+    if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
+      return new FCmpInst(Pred, X, Y, "", &I);
+
+    // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
+    const APFloat *C;
+    if (match(Op1, m_APFloat(C))) {
+      const fltSemantics &FPSem =
+          X->getType()->getScalarType()->getFltSemantics();
+      bool Lossy;
+      APFloat TruncC = *C;
+      TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
+
+      // Avoid lossy conversions and denormals.
+      // Zero is a special case that's OK to convert.
+      APFloat Fabs = TruncC;
+      Fabs.clearSign();
+      if (!Lossy &&
+          ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
+            APFloat::cmpLessThan) || Fabs.isZero())) {
+        Constant *NewC = ConstantFP::get(X->getType(), TruncC);
+        return new FCmpInst(Pred, X, NewC, "", &I);
+      }
+    }
+  }
 
   if (I.getType()->isVectorTy())
     if (Instruction *Res = foldVectorCmp(I, Builder))
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 46c598d4bfb245e5198c1b8c1e06b30ee3cabc99..431856c9e00609f45623383138556585d8f8b0fe 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -82,8 +82,8 @@ class User;
 ///   5 -> Other instructions
 static inline unsigned getComplexity(Value *V) {
   if (isa<Instruction>(V)) {
-    if (isa<CastInst>(V) || BinaryOperator::isNeg(V) ||
-        BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V))
+    if (isa<CastInst>(V) || match(V, m_Neg(m_Value())) ||
+        match(V, m_Not(m_Value())) || match(V, m_FNeg(m_Value())))
       return 4;
     return 5;
   }
@@ -141,7 +141,7 @@ static inline Constant *SubOne(Constant *C) {
 /// uses of V and only keep uses of ~V.
 static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
   // ~(~(X)) -> X.
-  if (BinaryOperator::isNot(V))
+  if (match(V, m_Not(m_Value())))
     return true;
 
   // Constants can be considered to be not'ed values.
@@ -589,6 +589,9 @@ private:
 
   Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
                                        bool JoinedByAnd, Instruction &CxtI);
+  Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
+  Value *getSelectCondition(Value *A, Value *B);
+
 public:
   /// Inserts an instruction \p New before instruction \p Old
   ///
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 6427c818e02929ae4cdcdddfd38c830be40f0a00..c348aecb2d44a9a6054316da902ce38b9fad2185 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1157,7 +1157,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
       IRBuilder<>::FastMathFlagGuard FMFGuard(B);
       B.setFastMathFlags(I.getFastMathFlags());
       AttributeList Attrs = CallSite(Op0).getCalledFunction()->getAttributes();
-      Value *Res = emitUnaryFloatFnCall(X, TLI.getName(LibFunc_tan), B, Attrs);
+      Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
+                                        LibFunc_tanl, B, Attrs);
       if (IsCot)
         Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
       return replaceInstUsesWith(I, Res);
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 0289abe472e89894c431129cc6a448ad16679bf8..a71ebdcd346a4e3888b1b59239dcd06673b522d1 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -211,23 +211,20 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
   }
 
   // If it requires a conversion for every PHI operand, do not do it.
-  if (std::all_of(AvailablePtrVals.begin(), AvailablePtrVals.end(),
-                  [&](Value *V) {
-                    return (V->getType() != IntToPtr->getType()) ||
-                           isa<IntToPtrInst>(V);
-                  }))
+  if (all_of(AvailablePtrVals, [&](Value *V) {
+        return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V);
+      }))
     return nullptr;
 
   // If any of the operand that requires casting is a terminator
   // instruction, do not do it.
-  if (std::any_of(AvailablePtrVals.begin(), AvailablePtrVals.end(),
-                  [&](Value *V) {
-                    if (V->getType() == IntToPtr->getType())
-                      return false;
-
-                    auto *Inst = dyn_cast<Instruction>(V);
-                    return Inst && Inst->isTerminator();
-                  }))
+  if (any_of(AvailablePtrVals, [&](Value *V) {
+        if (V->getType() == IntToPtr->getType())
+          return false;
+
+        auto *Inst = dyn_cast<Instruction>(V);
+        return Inst && Inst->isTerminator();
+      }))
     return nullptr;
 
   PHINode *NewPtrPHI = PHINode::Create(
@@ -652,7 +649,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
 Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
   // We cannot create a new instruction after the PHI if the terminator is an
   // EHPad because there is no valid insertion point.
-  if (TerminatorInst *TI = Phi.getParent()->getTerminator())
+  if (Instruction *TI = Phi.getParent()->getTerminator())
     if (TI->isEHPad())
       return nullptr;
 
@@ -726,7 +723,7 @@ Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
 Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
   // We cannot create a new instruction after the PHI if the terminator is an
   // EHPad because there is no valid insertion point.
-  if (TerminatorInst *TI = PN.getParent()->getTerminator())
+  if (Instruction *TI = PN.getParent()->getTerminator())
     if (TI->isEHPad())
       return nullptr;
 
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 3d346dff2f8d635cc82dbafbdb32c93c7c7fc091..88a72bb8eb57e1e87d3fe6ce4019d64fdb1947c4 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -75,13 +75,22 @@ static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
   else
     return nullptr;
 
-  // A select operand must be a binop, and the compare constant must be the
-  // identity constant for that binop.
+  // A select operand must be a binop.
   BinaryOperator *BO;
-  if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)) ||
-      ConstantExpr::getBinOpIdentity(BO->getOpcode(), BO->getType(), true) != C)
+  if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)))
     return nullptr;
 
+  // The compare constant must be the identity constant for that binop.
+  // If this a floating-point compare with 0.0, any zero constant will do.
+  Type *Ty = BO->getType();
+  Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true);
+  if (IdC != C) {
+    if (!IdC || !CmpInst::isFPPredicate(Pred))
+      return nullptr;
+    if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP()))
+      return nullptr;
+  }
+
   // Last, match the compare variable operand with a binop operand.
   Value *Y;
   if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X))))
@@ -1651,31 +1660,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   // See if we are selecting two values based on a comparison of the two values.
   if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
     if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
-      // Transform (X == Y) ? X : Y  -> Y
-      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-        return replaceInstUsesWith(SI, FalseVal);
-      }
-      // Transform (X une Y) ? X : Y  -> X
-      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-        return replaceInstUsesWith(SI, TrueVal);
-      }
-
       // Canonicalize to use ordered comparisons by swapping the select
       // operands.
       //
@@ -1694,31 +1678,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
 
       // NOTE: if we wanted to, this is where to detect MIN/MAX
     } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
-      // Transform (X == Y) ? Y : X  -> X
-      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-          return replaceInstUsesWith(SI, FalseVal);
-      }
-      // Transform (X une Y) ? Y : X  -> Y
-      if (FCI->getPredicate() == FCmpInst::FCMP_UNE) {
-        // This is not safe in general for floating point:
-        // consider X== -0, Y== +0.
-        // It becomes safe if either operand is a nonzero constant.
-        ConstantFP *CFPt, *CFPf;
-        if (((CFPt = dyn_cast<ConstantFP>(TrueVal)) &&
-              !CFPt->getValueAPF().isZero()) ||
-            ((CFPf = dyn_cast<ConstantFP>(FalseVal)) &&
-             !CFPf->getValueAPF().isZero()))
-          return replaceInstUsesWith(SI, TrueVal);
-      }
-
       // Canonicalize to use ordered comparisons by swapping the select
       // operands.
       //
@@ -1989,10 +1948,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
     }
   }
 
-  if (BinaryOperator::isNot(CondVal)) {
-    SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+  Value *NotCond;
+  if (match(CondVal, m_Not(m_Value(NotCond)))) {
+    SI.setOperand(0, NotCond);
     SI.setOperand(1, FalseVal);
     SI.setOperand(2, TrueVal);
+    SI.swapProfMetadata();
     return &SI;
   }
 
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 045ce423ef60f3746b19bd51b75d02a8346eac32..c562d45a9e2b919aff71d45971eb0d0a1007fedd 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -725,9 +725,9 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
 
     Value *X;
     const APInt *ShOp1;
-    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
-      unsigned ShlAmt = ShOp1->getZExtValue();
-      if (ShlAmt < ShAmt) {
+    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
+      if (ShOp1->ult(ShAmt)) {
+        unsigned ShlAmt = ShOp1->getZExtValue();
         Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
         if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
           // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
@@ -740,7 +740,8 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
         APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
         return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
       }
-      if (ShlAmt > ShAmt) {
+      if (ShOp1->ugt(ShAmt)) {
+        unsigned ShlAmt = ShOp1->getZExtValue();
         Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
         if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
           // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
@@ -753,7 +754,7 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
         APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
         return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
       }
-      assert(ShlAmt == ShAmt);
+      assert(*ShOp1 == ShAmt);
       // (X << C) >>u C --> X & (-1 >>u C)
       APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
       return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 936daa828a587dbc58bd81f70d8d569632c1e173..45cacc73d63becb324ffabe1d4d5425a349e2873 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -314,11 +314,32 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     Known.One  = std::move(IKnownOne);
     break;
   }
-  case Instruction::Select:
-    // If this is a select as part of a min/max pattern, don't simplify any
-    // further in case we break the structure.
+  case Instruction::Select: {
     Value *LHS, *RHS;
-    if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
+    SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
+    if (SPF == SPF_UMAX) {
+      // UMax(A, C) == A if ...
+      // The lowest non-zero bit of DemandMask is higher than the highest
+      // non-zero bit of C.
+      const APInt *C;
+      unsigned CTZ = DemandedMask.countTrailingZeros();
+      if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
+        return LHS;
+    } else if (SPF == SPF_UMIN) {
+      // UMin(A, C) == A if ...
+      // The lowest non-zero bit of DemandMask is higher than the highest
+      // non-one bit of C.
+      // This comes from using DeMorgans on the above umax example.
+      const APInt *C;
+      unsigned CTZ = DemandedMask.countTrailingZeros();
+      if (match(RHS, m_APInt(C)) &&
+          CTZ >= C->getBitWidth() - C->countLeadingOnes())
+        return LHS;
+    }
+
+    // If this is a select as part of any other min/max pattern, don't simplify
+    // any further in case we break the structure.
+    if (SPF != SPF_UNKNOWN)
       return nullptr;
 
     if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
@@ -336,6 +357,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     Known.One = RHSKnown.One & LHSKnown.One;
     Known.Zero = RHSKnown.Zero & LHSKnown.Zero;
     break;
+  }
   case Instruction::ZExt:
   case Instruction::Trunc: {
     unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 7258127f3197116b01ed229be911b9dc0976538b..21dd7ed227af476fac845aeb974e4b62dd98164c 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -876,43 +876,62 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
   if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
     replaceInstUsesWith(IE, VecOp);
 
-  // If the inserted element was extracted from some other vector, and if the
-  // indexes are constant, try to turn this into a shufflevector operation.
-  if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
-    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
-      unsigned NumInsertVectorElts = IE.getType()->getNumElements();
-      unsigned NumExtractVectorElts =
-          EI->getOperand(0)->getType()->getVectorNumElements();
-      unsigned ExtractedIdx =
-        cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
-      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
-
-      if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
-        return replaceInstUsesWith(IE, VecOp);
-
-      if (InsertedIdx >= NumInsertVectorElts)  // Out of range insert.
-        return replaceInstUsesWith(IE, UndefValue::get(IE.getType()));
-
-      // If we are extracting a value from a vector, then inserting it right
-      // back into the same place, just use the input vector.
-      if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
-        return replaceInstUsesWith(IE, VecOp);
-
-      // If this insertelement isn't used by some other insertelement, turn it
-      // (and any insertelements it points to), into one big shuffle.
-      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.user_back())) {
-        SmallVector<Constant*, 16> Mask;
-        ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
-
-        // The proposed shuffle may be trivial, in which case we shouldn't
-        // perform the combine.
-        if (LR.first != &IE && LR.second != &IE) {
-          // We now have a shuffle of LHS, RHS, Mask.
-          if (LR.second == nullptr)
-            LR.second = UndefValue::get(LR.first->getType());
-          return new ShuffleVectorInst(LR.first, LR.second,
-                                       ConstantVector::get(Mask));
-        }
+  // If the inserted element was extracted from some other vector and both
+  // indexes are constant, try to turn this into a shuffle.
+  uint64_t InsertedIdx, ExtractedIdx;
+  Value *ExtVecOp;
+  if (match(IdxOp, m_ConstantInt(InsertedIdx)) &&
+      match(ScalarOp, m_ExtractElement(m_Value(ExtVecOp),
+                                       m_ConstantInt(ExtractedIdx)))) {
+    unsigned NumInsertVectorElts = IE.getType()->getNumElements();
+    unsigned NumExtractVectorElts = ExtVecOp->getType()->getVectorNumElements();
+    if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract.
+      return replaceInstUsesWith(IE, VecOp);
+
+    if (InsertedIdx >= NumInsertVectorElts)  // Out of range insert.
+      return replaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+
+    // If we are extracting a value from a vector, then inserting it right
+    // back into the same place, just use the input vector.
+    if (ExtVecOp == VecOp && ExtractedIdx == InsertedIdx)
+      return replaceInstUsesWith(IE, VecOp);
+
+    // TODO: Looking at the user(s) to determine if this insert is a
+    // fold-to-shuffle opportunity does not match the usual instcombine
+    // constraints. We should decide if the transform is worthy based only
+    // on this instruction and its operands, but that may not work currently.
+    //
+    // Here, we are trying to avoid creating shuffles before reaching
+    // the end of a chain of extract-insert pairs. This is complicated because
+    // we do not generally form arbitrary shuffle masks in instcombine
+    // (because those may codegen poorly), but collectShuffleElements() does
+    // exactly that.
+    //
+    // The rules for determining what is an acceptable target-independent
+    // shuffle mask are fuzzy because they evolve based on the backend's
+    // capabilities and real-world impact.
+    auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
+      if (!Insert.hasOneUse())
+        return true;
+      auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
+      if (!InsertUser)
+        return true;
+      return false;
+    };
+
+    // Try to form a shuffle from a chain of extract-insert ops.
+    if (isShuffleRootCandidate(IE)) {
+      SmallVector<Constant*, 16> Mask;
+      ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
+
+      // The proposed shuffle may be trivial, in which case we shouldn't
+      // perform the combine.
+      if (LR.first != &IE && LR.second != &IE) {
+        // We now have a shuffle of LHS, RHS, Mask.
+        if (LR.second == nullptr)
+          LR.second = UndefValue::get(LR.first->getType());
+        return new ShuffleVectorInst(LR.first, LR.second,
+                                     ConstantVector::get(Mask));
       }
     }
   }
@@ -1490,13 +1509,21 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
 
   // We are extracting a subvector from a shuffle. Remove excess elements from
   // the 1st shuffle mask to eliminate the extract.
-  //   shuf (shuf X, Y, <C0, C1, C2, C3>), undef, <0, undef, 2> -->
-  //   shuf X, Y, <C0, undef, C2>
+  //
+  // This transform is conservatively limited to identity extracts because we do
+  // not allow arbitrary shuffle mask creation as a target-independent transform
+  // (because we can't guarantee that will lower efficiently).
+  //
+  // If the extracting shuffle has an undef mask element, it transfers to the
+  // new shuffle mask. Otherwise, copy the original mask element. Example:
+  //   shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
+  //   shuf X, Y, <C0, undef, C2, undef>
   unsigned NumElts = Shuf.getType()->getVectorNumElements();
   SmallVector<Constant *, 16> NewMask(NumElts);
+  assert(NumElts < Mask->getType()->getVectorNumElements() &&
+         "Identity with extract must have less elements than its inputs");
+
   for (unsigned i = 0; i != NumElts; ++i) {
-    // If the extracting shuffle has an undef mask element, it transfers to the
-    // new shuffle mask. Otherwise, copy the original mask element.
     Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
     Constant *MaskElt = Mask->getAggregateElement(i);
     NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt;
@@ -1504,6 +1531,71 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
   return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
 }
 
+/// Try to replace a shuffle with an insertelement.
+static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) {
+  Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
+  SmallVector<int, 16> Mask = Shuf.getShuffleMask();
+
+  // The shuffle must not change vector sizes.
+  // TODO: This restriction could be removed if the insert has only one use
+  //       (because the transform would require a new length-changing shuffle).
+  int NumElts = Mask.size();
+  if (NumElts != (int)(V0->getType()->getVectorNumElements()))
+    return nullptr;
+
+  // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
+  auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
+    // We need an insertelement with a constant index.
+    if (!match(V0, m_InsertElement(m_Value(), m_Value(Scalar),
+                                   m_ConstantInt(IndexC))))
+      return false;
+
+    // Test the shuffle mask to see if it splices the inserted scalar into the
+    // operand 1 vector of the shuffle.
+    int NewInsIndex = -1;
+    for (int i = 0; i != NumElts; ++i) {
+      // Ignore undef mask elements.
+      if (Mask[i] == -1)
+        continue;
+
+      // The shuffle takes elements of operand 1 without lane changes.
+      if (Mask[i] == NumElts + i)
+        continue;
+
+      // The shuffle must choose the inserted scalar exactly once.
+      if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
+        return false;
+
+      // The shuffle is placing the inserted scalar into element i.
+      NewInsIndex = i;
+    }
+
+    assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
+
+    // Index is updated to the potentially translated insertion lane.
+    IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
+    return true;
+  };
+
+  // If the shuffle is unnecessary, insert the scalar operand directly into
+  // operand 1 of the shuffle. Example:
+  // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
+  Value *Scalar;
+  ConstantInt *IndexC;
+  if (isShufflingScalarIntoOp1(Scalar, IndexC))
+    return InsertElementInst::Create(V1, Scalar, IndexC);
+
+  // Try again after commuting shuffle. Example:
+  // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
+  // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
+  std::swap(V0, V1);
+  ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+  if (isShufflingScalarIntoOp1(Scalar, IndexC))
+    return InsertElementInst::Create(V1, Scalar, IndexC);
+
+  return nullptr;
+}
+
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
   Value *RHS = SVI.getOperand(1);
@@ -1529,6 +1621,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   if (Instruction *I = foldIdentityExtractShuffle(SVI))
     return I;
 
+  // This transform has the potential to lose undef knowledge, so it is
+  // intentionally placed after SimplifyDemandedVectorElts().
+  if (Instruction *I = foldShuffleWithInsert(SVI))
+    return I;
+
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
   unsigned LHSWidth = LHS->getType()->getVectorNumElements();
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 00ffe9e2dc2220cf5d4b166a486b7d822a502c0c..a3962a04b500cbfa0fbf169f1e9cf6aa074a3c19 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -119,6 +119,10 @@ STATISTIC(NumReassoc  , "Number of reassociations");
 DEBUG_COUNTER(VisitCounter, "instcombine-visit",
               "Controls which instructions are visited");
 
+static cl::opt<bool>
+EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
+                                              cl::init(true));
+
 static cl::opt<bool>
 EnableExpensiveCombines("expensive-combines",
                         cl::desc("Enable expensive instruction combines"));
@@ -747,8 +751,9 @@ Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
 /// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
 /// constant zero (which is the 'negate' form).
 Value *InstCombiner::dyn_castNegVal(Value *V) const {
-  if (BinaryOperator::isNeg(V))
-    return BinaryOperator::getNegArgument(V);
+  Value *NegV;
+  if (match(V, m_Neg(m_Value(NegV))))
+    return NegV;
 
   // Constants can be considered to be negated values if they can be folded.
   if (ConstantInt *C = dyn_cast<ConstantInt>(V))
@@ -2317,14 +2322,14 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
 /// The move is performed only if the block containing the call to free
 /// will be removed, i.e.:
 /// 1. it has only one predecessor P, and P has two successors
-/// 2. it contains the call and an unconditional branch
+/// 2. it contains the call, noops, and an unconditional branch
 /// 3. its successor is the same as its predecessor's successor
 ///
 /// The profitability is out-of concern here and this function should
 /// be called only if the caller knows this transformation would be
 /// profitable (e.g., for code size).
-static Instruction *
-tryToMoveFreeBeforeNullTest(CallInst &FI) {
+static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
+                                                const DataLayout &DL) {
   Value *Op = FI.getArgOperand(0);
   BasicBlock *FreeInstrBB = FI.getParent();
   BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
@@ -2337,20 +2342,34 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) {
     return nullptr;
 
   // Validate constraint #2: Does this block contains only the call to
-  //                         free and an unconditional branch?
-  // FIXME: We could check if we can speculate everything in the
-  //        predecessor block
-  if (FreeInstrBB->size() != 2)
-    return nullptr;
+  //                         free, noops, and an unconditional branch?
   BasicBlock *SuccBB;
-  if (!match(FreeInstrBB->getTerminator(), m_UnconditionalBr(SuccBB)))
+  Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
+  if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
     return nullptr;
 
+  // If there are only 2 instructions in the block, at this point,
+  // this is the call to free and unconditional.
+  // If there are more than 2 instructions, check that they are noops
+  // i.e., they won't hurt the performance of the generated code.
+  if (FreeInstrBB->size() != 2) {
+    for (const Instruction &Inst : *FreeInstrBB) {
+      if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
+        continue;
+      auto *Cast = dyn_cast<CastInst>(&Inst);
+      if (!Cast || !Cast->isNoopCast(DL))
+        return nullptr;
+    }
+  }
   // Validate the rest of constraint #1 by matching on the pred branch.
-  TerminatorInst *TI = PredBB->getTerminator();
+  Instruction *TI = PredBB->getTerminator();
   BasicBlock *TrueBB, *FalseBB;
   ICmpInst::Predicate Pred;
-  if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Op), m_Zero()), TrueBB, FalseBB)))
+  if (!match(TI, m_Br(m_ICmp(Pred,
+                             m_CombineOr(m_Specific(Op),
+                                         m_Specific(Op->stripPointerCasts())),
+                             m_Zero()),
+                      TrueBB, FalseBB)))
     return nullptr;
   if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
     return nullptr;
@@ -2361,7 +2380,17 @@ tryToMoveFreeBeforeNullTest(CallInst &FI) {
   assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
          "Broken CFG: missing edge from predecessor to successor");
 
-  FI.moveBefore(TI);
+  // At this point, we know that everything in FreeInstrBB can be moved
+  // before TI.
+  for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
+       It != End;) {
+    Instruction &Instr = *It++;
+    if (&Instr == FreeInstrBBTerminator)
+      break;
+    Instr.moveBefore(TI);
+  }
+  assert(FreeInstrBB->size() == 1 &&
+         "Only the branch instruction should remain");
   return &FI;
 }
 
@@ -2388,7 +2417,7 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
   // into
   // free(foo);
   if (MinimizeSize)
-    if (Instruction *I = tryToMoveFreeBeforeNullTest(FI))
+    if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
       return I;
 
   return nullptr;
@@ -2481,9 +2510,11 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
   unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
 
   // Shrink the condition operand if the new type is smaller than the old type.
-  // This may produce a non-standard type for the switch, but that's ok because
-  // the backend should extend back to a legal type for the target.
-  if (NewWidth > 0 && NewWidth < Known.getBitWidth()) {
+  // But do not shrink to a non-standard type, because backend can't generate 
+  // good code for that yet.
+  // TODO: We can make it aggressive again after fixing PR39569.
+  if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
+      shouldChangeType(Known.getBitWidth(), NewWidth)) {
     IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
     Builder.SetInsertPoint(&SI);
     Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
@@ -3102,7 +3133,7 @@ bool InstCombiner::run() {
     }
 
     // See if we can trivially sink this instruction to a successor basic block.
-    if (I->hasOneUse()) {
+    if (EnableCodeSinking && I->hasOneUse()) {
       BasicBlock *BB = I->getParent();
       Instruction *UserInst = cast<Instruction>(*I->user_begin());
       BasicBlock *UserParent;
@@ -3285,7 +3316,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
 
     // Recursively visit successors.  If this is a branch or switch on a
     // constant, only visit the reachable successor.
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
         bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 15eba9089cbd0662fc76bb8e9e803fac3c85d584..42b8179f80052704ef6f9e828625f2de34ce8f05 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1100,25 +1100,11 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
   return Res;
 }
 
-// Create a constant for Str so that we can pass it to the run-time lib.
-static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
-                                                    bool AllowMerging) {
-  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
-  // We use private linkage for module-local strings. If they can be merged
-  // with another one, we set the unnamed_addr attribute.
-  GlobalVariable *GV =
-      new GlobalVariable(M, StrConst->getType(), true,
-                         GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix);
-  if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
-  return GV;
-}
-
 /// Create a global describing a source location.
 static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
                                                        LocationMetadata MD) {
   Constant *LocData[] = {
-      createPrivateGlobalForString(M, MD.Filename, true),
+      createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix),
       ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
       ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
   };
@@ -1383,7 +1369,7 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
     } else {
       IRBuilder<> IRB(I);
       Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
-      TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+      Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
       InsertBefore = ThenTerm;
     }
 
@@ -1536,8 +1522,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
     Value *TagCheck =
         IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
 
-    TerminatorInst *TagCheckTerm = SplitBlockAndInsertIfThen(
-        TagCheck, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+    Instruction *TagCheckTerm =
+        SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
+                                  MDBuilder(*C).createBranchWeights(1, 100000));
     assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
     IRB.SetInsertPoint(TagCheckTerm);
     InsertBefore = TagCheckTerm;
@@ -1553,12 +1540,12 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
 
   Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
   size_t Granularity = 1ULL << Mapping.Scale;
-  TerminatorInst *CrashTerm = nullptr;
+  Instruction *CrashTerm = nullptr;
 
   if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
     // We use branch weights for the slow path check, to indicate that the slow
     // path is rarely taken. This seems to be the case for SPEC benchmarks.
-    TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen(
+    Instruction *CheckTerm = SplitBlockAndInsertIfThen(
         Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
     assert(cast<BranchInst>(CheckTerm)->isUnconditional());
     BasicBlock *NextBB = CheckTerm->getSuccessor(0);
@@ -2105,7 +2092,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
   // We shouldn't merge same module names, as this string serves as unique
   // module ID in runtime.
   GlobalVariable *ModuleName = createPrivateGlobalForString(
-      M, M.getModuleIdentifier(), /*AllowMerging*/ false);
+      M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
 
   for (size_t i = 0; i < n; i++) {
     static const uint64_t kMaxGlobalRedzone = 1 << 18;
@@ -2117,7 +2104,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool
     // if it's available, otherwise just write the name of global variable).
     GlobalVariable *Name = createPrivateGlobalForString(
         M, MD.Name.empty() ? NameForGlobal : MD.Name,
-        /*AllowMerging*/ true);
+        /*AllowMerging*/ true, kAsanGenPrefix);
 
     Type *Ty = G->getValueType();
     uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
@@ -3020,7 +3007,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
       IntptrPtrTy);
   GlobalVariable *StackDescriptionGlobal =
       createPrivateGlobalForString(*F.getParent(), DescriptionString,
-                                   /*AllowMerging*/ true);
+                                   /*AllowMerging*/ true, kAsanGenPrefix);
   Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
   IRB.CreateStore(Description, BasePlus1);
   // Write the PC to redzone[2].
@@ -3078,7 +3065,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
       //     <This is not a fake stack; unpoison the redzones>
       Value *Cmp =
           IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
-      TerminatorInst *ThenTerm, *ElseTerm;
+      Instruction *ThenTerm, *ElseTerm;
       SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
 
       IRBuilder<> IRBPoison(ThenTerm);
diff --git a/lib/Transforms/Instrumentation/CFGMST.h b/lib/Transforms/Instrumentation/CFGMST.h
index cc9b149d0b6a546e2330767a84e4d8859399f352..e178ef386e68fab63c795da286491c0a8f1c1c1c 100644
--- a/lib/Transforms/Instrumentation/CFGMST.h
+++ b/lib/Transforms/Instrumentation/CFGMST.h
@@ -119,7 +119,7 @@ public:
     static const uint32_t CriticalEdgeMultiplier = 1000;
 
     for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
+      Instruction *TI = BB->getTerminator();
       uint64_t BBWeight =
           (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
       uint64_t Weight = 2;
diff --git a/lib/Transforms/Instrumentation/CGProfile.cpp b/lib/Transforms/Instrumentation/CGProfile.cpp
index 9606b3da2475adf20745ff166b72e2566721abf6..cdcd017269061f242a495742dec7f9aebdb1b664 100644
--- a/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -88,11 +88,10 @@ void CGProfilePass::addModuleFlags(
   std::vector<Metadata *> Nodes;
 
   for (auto E : Counts) {
-    SmallVector<Metadata *, 3> Vals;
-    Vals.push_back(ValueAsMetadata::get(E.first.first));
-    Vals.push_back(ValueAsMetadata::get(E.first.second));
-    Vals.push_back(MDB.createConstant(
-        ConstantInt::get(Type::getInt64Ty(Context), E.second)));
+    Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
+                        ValueAsMetadata::get(E.first.second),
+                        MDB.createConstant(ConstantInt::get(
+                            Type::getInt64Ty(Context), E.second))};
     Nodes.push_back(MDNode::get(Context, Vals));
   }
 
diff --git a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 2c0721f73669f064bcb7c3fdcf9054639872a2ab..8f4159d3d191ce61ade22b1cd8b6aa03ccfe653a 100644
--- a/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -621,9 +621,10 @@ static BranchProbability getCHRBiasThreshold() {
 // CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >=
 // CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return
 // false.
-template<typename K, typename S, typename M>
-bool checkBias(K *Key, BranchProbability TrueProb, BranchProbability FalseProb,
-               S &TrueSet, S &FalseSet, M &BiasMap) {
+template <typename K, typename S, typename M>
+static bool checkBias(K *Key, BranchProbability TrueProb,
+                      BranchProbability FalseProb, S &TrueSet, S &FalseSet,
+                      M &BiasMap) {
   BranchProbability Threshold = getCHRBiasThreshold();
   if (TrueProb >= Threshold) {
     TrueSet.insert(Key);
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 33f220a893dff4e9009abd338627784474f6eccf..db438e78ded95b6db798bb2181801dc133f6fd53 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -144,21 +144,6 @@ OverrideOptionsFromCL(EfficiencySanitizerOptions Options) {
   return Options;
 }
 
-// Create a constant for Str so that we can pass it to the run-time lib.
-static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
-                                                    bool AllowMerging) {
-  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
-  // We use private linkage for module-local strings. If they can be merged
-  // with another one, we set the unnamed_addr attribute.
-  GlobalVariable *GV =
-    new GlobalVariable(M, StrConst->getType(), true,
-                       GlobalValue::PrivateLinkage, StrConst, "");
-  if (AllowMerging)
-    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
-  return GV;
-}
-
 /// EfficiencySanitizer: instrument each module to find performance issues.
 class EfficiencySanitizer : public ModulePass {
 public:
@@ -902,7 +887,7 @@ bool EfficiencySanitizer::instrumentFastpathWorkingSet(
   Value *OldValue = IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
   // The AND and CMP will be turned into a TEST instruction by the compiler.
   Value *Cmp = IRB.CreateICmpNE(IRB.CreateAnd(OldValue, ValueMask), ValueMask);
-  TerminatorInst *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
+  Instruction *CmpTerm = SplitBlockAndInsertIfThen(Cmp, I, false);
   // FIXME: do I need to call SetCurrentDebugLocation?
   IRB.SetInsertPoint(CmpTerm);
   // We use OR to set the shadow bits to avoid corrupting the middle 6 bits,
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 625b354cc38252b89a56589d59409705742f06a1..084e6b7e43690d50033ce4d2d32396a3d123c5e9 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -109,6 +109,8 @@ private:
   insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
   Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
 
+  void AddFlushBeforeForkAndExec();
+
   enum class GCovFileType { GCNO, GCDA };
   std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
 
@@ -468,6 +470,8 @@ bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) {
   this->TLI = &TLI;
   Ctx = &M.getContext();
 
+  AddFlushBeforeForkAndExec();
+
   if (Options.EmitNotes) emitProfileNotes();
   if (Options.EmitData) return emitProfileArcs();
   return false;
@@ -524,6 +528,38 @@ static bool shouldKeepInEntry(BasicBlock::iterator It) {
 	return false;
 }
 
+void GCOVProfiler::AddFlushBeforeForkAndExec() {
+  SmallVector<Instruction *, 2> ForkAndExecs;
+  for (auto &F : M->functions()) {
+    for (auto &I : instructions(F)) {
+      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        if (Function *Callee = CI->getCalledFunction()) {
+          LibFunc LF;
+          if (TLI->getLibFunc(*Callee, LF) &&
+              (LF == LibFunc_fork || LF == LibFunc_execl ||
+               LF == LibFunc_execle || LF == LibFunc_execlp ||
+               LF == LibFunc_execv || LF == LibFunc_execvp ||
+               LF == LibFunc_execve || LF == LibFunc_execvpe ||
+               LF == LibFunc_execvP)) {
+            ForkAndExecs.push_back(&I);
+          }
+        }
+      }
+    }
+  }
+
+  // We need to split the block after the fork/exec call
+  // because else the counters for the lines after will be
+  // the same as before the call.
+  for (auto I : ForkAndExecs) {
+    IRBuilder<> Builder(I);
+    FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+    Constant *GCOVFlush = M->getOrInsertFunction("__gcov_flush", FTy);
+    Builder.CreateCall(GCOVFlush);
+    I->getParent()->splitBasicBlock(I);
+  }
+}
+
 void GCOVProfiler::emitProfileNotes() {
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
   if (!CU_Nodes) return;
@@ -570,9 +606,14 @@ void GCOVProfiler::emitProfileNotes() {
                                                 Options.ExitBlockBeforeBody));
       GCOVFunction &Func = *Funcs.back();
 
+      // Add the function line number to the lines of the entry block
+      // to have a counter for the function definition.
+      uint32_t Line = SP->getLine();
+      Func.getBlock(&EntryBlock).getFile(SP->getFilename()).addLine(Line);
+
       for (auto &BB : F) {
         GCOVBlock &Block = Func.getBlock(&BB);
-        TerminatorInst *TI = BB.getTerminator();
+        Instruction *TI = BB.getTerminator();
         if (int successors = TI->getNumSuccessors()) {
           for (int i = 0; i != successors; ++i) {
             Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
@@ -581,7 +622,6 @@ void GCOVProfiler::emitProfileNotes() {
           Block.addEdge(Func.getReturnBlock());
         }
 
-        uint32_t Line = 0;
         for (auto &I : BB) {
           // Debug intrinsic locations correspond to the location of the
           // declaration, not necessarily any statements or expressions.
@@ -603,6 +643,7 @@ void GCOVProfiler::emitProfileNotes() {
           GCOVLines &Lines = Block.getFile(SP->getFilename());
           Lines.addLine(Loc.getLine());
         }
+        Line = 0;
       }
       EdgeDestinations += Func.getEdgeDestinations();
     }
@@ -640,7 +681,7 @@ bool GCOVProfiler::emitProfileArcs() {
       DenseMap<std::pair<BasicBlock *, BasicBlock *>, unsigned> EdgeToCounter;
       unsigned Edges = 0;
       for (auto &BB : F) {
-        TerminatorInst *TI = BB.getTerminator();
+        Instruction *TI = BB.getTerminator();
         if (isa<ReturnInst>(TI)) {
           EdgeToCounter[{&BB, nullptr}] = Edges++;
         } else {
@@ -684,7 +725,7 @@ bool GCOVProfiler::emitProfileArcs() {
           Count = Builder.CreateAdd(Count, Builder.getInt64(1));
           Builder.CreateStore(Count, Phi);
 
-          TerminatorInst *TI = BB.getTerminator();
+          Instruction *TI = BB.getTerminator();
           if (isa<ReturnInst>(TI)) {
             auto It = EdgeToCounter.find({&BB, nullptr});
             assert(It != EdgeToCounter.end());
diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 510b1b058d0f0e71fc9d2590379a1504480b8db7..91021604169a6936e642567291202fe15ec31b50 100644
--- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <sstream>
 
 using namespace llvm;
 
@@ -146,6 +147,11 @@ static cl::opt<bool>
                          cl::desc("Record stack frames with tagged allocations "
                                   "in a thread-local ring buffer"),
                          cl::Hidden, cl::init(true));
+static cl::opt<bool>
+    ClCreateFrameDescriptions("hwasan-create-frame-descriptions",
+                              cl::desc("create static frame descriptions"),
+                              cl::Hidden, cl::init(true));
+
 namespace {
 
 /// An instrumentation pass implementing detection of addressability bugs
@@ -198,8 +204,27 @@ public:
 
 private:
   LLVMContext *C;
+  std::string CurModuleUniqueId;
   Triple TargetTriple;
 
+  // Frame description is a way to pass names/sizes of local variables
+  // to the run-time w/o adding extra executable code in every function.
+  // We do this by creating a separate section with {PC,Descr} pairs and passing
+  // the section beg/end to __hwasan_init_frames() at module init time.
+  std::string createFrameString(ArrayRef<AllocaInst*> Allocas);
+  void createFrameGlobal(Function &F, const std::string &FrameString);
+  // Get the section name for frame descriptions. Currently ELF-only.
+  const char *getFrameSection() { return "__hwasan_frames"; }
+  const char *getFrameSectionBeg() { return  "__start___hwasan_frames"; }
+  const char *getFrameSectionEnd() { return  "__stop___hwasan_frames"; }
+  GlobalVariable *createFrameSectionBound(Module &M, Type *Ty,
+                                          const char *Name) {
+    auto GV = new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+                                 nullptr, Name);
+    GV->setVisibility(GlobalValue::HiddenVisibility);
+    return GV;
+  }
+
   /// This struct defines the shadow mapping using the rule:
   ///   shadow = (mem >> Scale) + Offset.
   /// If InGlobal is true, then
@@ -207,7 +232,7 @@ private:
   ///   shadow = (mem >> Scale) + &__hwasan_shadow
   /// If InTls is true, then
   ///   extern char *__hwasan_tls;
-  ///   shadow = (mem >> Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
+  ///   shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
   struct ShadowMapping {
     int Scale;
     uint64_t Offset;
@@ -271,6 +296,7 @@ bool HWAddressSanitizer::doInitialization(Module &M) {
   Mapping.init(TargetTriple);
 
   C = &(M.getContext());
+  CurModuleUniqueId = getUniqueModuleId(&M);
   IRBuilder<> IRB(*C);
   IntptrTy = IRB.getIntPtrTy(DL);
   Int8PtrTy = IRB.getInt8PtrTy();
@@ -285,6 +311,21 @@ bool HWAddressSanitizer::doInitialization(Module &M) {
                                             /*InitArgs=*/{});
     appendToGlobalCtors(M, HwasanCtorFunction, 0);
   }
+
+  // Create a call to __hwasan_init_frames.
+  if (HwasanCtorFunction) {
+    // Create a dummy frame description for the CTOR function.
+    // W/o it we would have to create the call to __hwasan_init_frames after
+    // all functions are instrumented (i.e. need to have a ModulePass).
+    createFrameGlobal(*HwasanCtorFunction, "");
+    IRBuilder<> IRBCtor(HwasanCtorFunction->getEntryBlock().getTerminator());
+    IRBCtor.CreateCall(
+        declareSanitizerInitFunction(M, "__hwasan_init_frames",
+                                     {Int8PtrTy, Int8PtrTy}),
+        {createFrameSectionBound(M, Int8Ty, getFrameSectionBeg()),
+         createFrameSectionBound(M, Int8Ty, getFrameSectionEnd())});
+  }
+
   if (!TargetTriple.isAndroid())
     appendToCompilerUsed(
         M, ThreadPtrGlobal = new GlobalVariable(
@@ -467,7 +508,7 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite,
     TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
   }
 
-  TerminatorInst *CheckTerm =
+  Instruction *CheckTerm =
       SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover,
                                 MDBuilder(*C).createBranchWeights(1, 100000));
 
@@ -676,6 +717,37 @@ Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
   return nullptr;
 }
 
+// Creates a string with a description of the stack frame (set of Allocas).
+// The string is intended to be human readable.
+// The current form is: Size1 Name1; Size2 Name2; ...
+std::string
+HWAddressSanitizer::createFrameString(ArrayRef<AllocaInst *> Allocas) {
+  std::ostringstream Descr;
+  for (auto AI : Allocas)
+    Descr << getAllocaSizeInBytes(*AI) << " " <<  AI->getName().str() << "; ";
+  return Descr.str();
+}
+
+// Creates a global in the frame section which consists of two pointers:
+// the function PC and the frame string constant.
+void HWAddressSanitizer::createFrameGlobal(Function &F,
+                                           const std::string &FrameString) {
+  Module &M = *F.getParent();
+  auto DescrGV = createPrivateGlobalForString(M, FrameString, true);
+  auto PtrPairTy = StructType::get(F.getType(), DescrGV->getType());
+  auto GV = new GlobalVariable(
+      M, PtrPairTy, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage,
+      ConstantStruct::get(PtrPairTy, (Constant *)&F, (Constant *)DescrGV),
+      "__hwasan");
+  GV->setSection(getFrameSection());
+  appendToCompilerUsed(M, GV);
+  // Put GV into the F's Comadat so that if F is deleted GV can be deleted too.
+  if (&F != HwasanCtorFunction)
+    if (auto Comdat =
+            GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
+      GV->setComdat(Comdat);
+}
+
 Value *HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB,
                                         bool WithFrameRecord) {
   if (!Mapping.InTls)
@@ -838,6 +910,9 @@ bool HWAddressSanitizer::runOnFunction(Function &F) {
   if (AllocasToInstrument.empty() && ToInstrument.empty())
     return false;
 
+  if (ClCreateFrameDescriptions && !AllocasToInstrument.empty())
+    createFrameGlobal(F, createFrameString(AllocasToInstrument));
+
   initializeCallbacks(*F.getParent());
 
   assert(!LocalDynamicShadow);
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index d52b1b928178a79a142d14a3bea357370ad24a99..eb6a3730ad9a8ec8b5134a8aaa764f922680c49c 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -14,7 +14,9 @@
 
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm-c/Initialization.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/PassRegistry.h"
 
@@ -53,6 +55,49 @@ BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
   return IP;
 }
 
+// Create a constant for Str so that we can pass it to the run-time lib.
+GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
+                                                   bool AllowMerging,
+                                                   const char *NamePrefix) {
+  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+  // We use private linkage for module-local strings. If they can be merged
+  // with another one, we set the unnamed_addr attribute.
+  GlobalVariable *GV =
+      new GlobalVariable(M, StrConst->getType(), true,
+                         GlobalValue::PrivateLinkage, StrConst, NamePrefix);
+  if (AllowMerging)
+    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
+  return GV;
+}
+
+Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
+                                        const std::string &ModuleId) {
+  if (auto Comdat = F.getComdat()) return Comdat;
+  assert(F.hasName());
+  Module *M = F.getParent();
+  std::string Name = F.getName();
+
+  // Make a unique comdat name for internal linkage things on ELF. On COFF, the
+  // name of the comdat group identifies the leader symbol of the comdat group.
+  // The linkage of the leader symbol is considered during comdat resolution,
+  // and internal symbols with the same name from different objects will not be
+  // merged.
+  if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
+    if (ModuleId.empty())
+      return nullptr;
+    Name += ModuleId;
+  }
+
+  // Make a new comdat for the function. Use the "no duplicates" selection kind
+  // for non-weak symbols if the object file format supports it.
+  Comdat *C = M->getOrInsertComdat(Name);
+  if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
+    C->setSelectionKind(Comdat::NoDuplicates);
+  F.setComdat(C);
+  return C;
+}
+
 /// initializeInstrumentation - Initialize all passes in the TransformUtils
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 6421b6efac4f7a05b1ce810b1dd70c8e5bac80e3..960c1f42900c63ddb4e02a7bf5cc342fd0eb45f5 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -90,6 +90,24 @@
 /// value. It implements the store part as a simple atomic store by storing a
 /// clean shadow.
 ///
+///                      Instrumenting inline assembly.
+///
+/// For inline assembly code LLVM has little idea about which memory locations
+/// become initialized depending on the arguments. It can be possible to figure
+/// out which arguments are meant to point to inputs and outputs, but the
+/// actual semantics can be only visible at runtime. In the Linux kernel it's
+/// also possible that the arguments only indicate the offset for a base taken
+/// from a segment register, so it's dangerous to treat any asm() arguments as
+/// pointers. We take a conservative approach generating calls to
+///   __msan_instrument_asm_load(ptr, size) and
+///   __msan_instrument_asm_store(ptr, size)
+/// , which defer the memory checking/unpoisoning to the runtime library.
+/// The latter can perform more complex address checks to figure out whether
+/// it's safe to touch the shadow memory.
+/// Like with atomic operations, we call __msan_instrument_asm_store() before
+/// the assembly call, so that changes to the shadow memory will be seen by
+/// other threads together with main memory initialization.
+///
 ///                  KernelMemorySanitizer (KMSAN) implementation.
 ///
 /// The major differences between KMSAN and MSan instrumentation are:
@@ -549,6 +567,7 @@ private:
   Value *MsanMetadataPtrForLoadN, *MsanMetadataPtrForStoreN;
   Value *MsanMetadataPtrForLoad_1_8[4];
   Value *MsanMetadataPtrForStore_1_8[4];
+  Value *MsanInstrumentAsmStoreFn, *MsanInstrumentAsmLoadFn;
 
   /// Helper to choose between different MsanMetadataPtrXxx().
   Value *getKmsanShadowOriginAccessFn(bool isStore, int size);
@@ -757,6 +776,13 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
 
+  MsanInstrumentAsmLoadFn =
+      M.getOrInsertFunction("__msan_instrument_asm_load", IRB.getVoidTy(),
+                            PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+  MsanInstrumentAsmStoreFn =
+      M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
+                            PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+
   if (CompileKernel) {
     createKernelApi(M);
   } else {
@@ -3444,37 +3470,97 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Nothing to do here.
   }
 
+  void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
+                             const DataLayout &DL, bool isOutput) {
+    // For each assembly argument, we check its value for being initialized.
+    // If the argument is a pointer, we assume it points to a single element
+    // of the corresponding type (or to a 8-byte word, if the type is unsized).
+    // Each such pointer is instrumented with a call to the runtime library.
+    Type *OpType = Operand->getType();
+    // Check the operand value itself.
+    insertShadowCheck(Operand, &I);
+    if (!OpType->isPointerTy()) {
+      assert(!isOutput);
+      return;
+    }
+    Value *Hook =
+        isOutput ? MS.MsanInstrumentAsmStoreFn : MS.MsanInstrumentAsmLoadFn;
+    Type *ElType = OpType->getPointerElementType();
+    if (!ElType->isSized())
+      return;
+    int Size = DL.getTypeStoreSize(ElType);
+    Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
+    Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+    IRB.CreateCall(Hook, {Ptr, SizeVal});
+  }
+
+  /// Get the number of output arguments returned by pointers.
+  int getNumOutputArgs(InlineAsm *IA, CallInst *CI) {
+    int NumRetOutputs = 0;
+    int NumOutputs = 0;
+    Type *RetTy = dyn_cast<Value>(CI)->getType();
+    if (!RetTy->isVoidTy()) {
+      // Register outputs are returned via the CallInst return value.
+      StructType *ST = dyn_cast_or_null<StructType>(RetTy);
+      if (ST)
+        NumRetOutputs = ST->getNumElements();
+      else
+        NumRetOutputs = 1;
+    }
+    InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+    for (size_t i = 0, n = Constraints.size(); i < n; i++) {
+      InlineAsm::ConstraintInfo Info = Constraints[i];
+      switch (Info.Type) {
+      case InlineAsm::isOutput:
+        NumOutputs++;
+        break;
+      default:
+        break;
+      }
+    }
+    return NumOutputs - NumRetOutputs;
+  }
+
   void visitAsmInstruction(Instruction &I) {
     // Conservative inline assembly handling: check for poisoned shadow of
     // asm() arguments, then unpoison the result and all the memory locations
     // pointed to by those arguments.
+    // An inline asm() statement in C++ contains lists of input and output
+    // arguments used by the assembly code. These are mapped to operands of the
+    // CallInst as follows:
+    //  - nR register outputs ("=r) are returned by value in a single structure
+    //  (SSA value of the CallInst);
+    //  - nO other outputs ("=m" and others) are returned by pointer as first
+    // nO operands of the CallInst;
+    //  - nI inputs ("r", "m" and others) are passed to CallInst as the
+    // remaining nI operands.
+    // The total number of asm() arguments in the source is nR+nO+nI, and the
+    // corresponding CallInst has nO+nI+1 operands (the last operand is the
+    // function to be called).
+    const DataLayout &DL = F.getParent()->getDataLayout();
     CallInst *CI = dyn_cast<CallInst>(&I);
-
-    for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+    IRBuilder<> IRB(&I);
+    InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+    int OutputArgs = getNumOutputArgs(IA, CI);
+    // The last operand of a CallInst is the function itself.
+    int NumOperands = CI->getNumOperands() - 1;
+
+    // Check input arguments. Doing so before unpoisoning output arguments, so
+    // that we won't overwrite uninit values before checking them.
+    for (int i = OutputArgs; i < NumOperands; i++) {
       Value *Operand = CI->getOperand(i);
-      if (Operand->getType()->isSized())
-        insertShadowCheck(Operand, &I);
+      instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
     }
-    setShadow(&I, getCleanShadow(&I));
-    setOrigin(&I, getCleanOrigin());
-    IRBuilder<> IRB(&I);
-    IRB.SetInsertPoint(I.getNextNode());
-    for (size_t i = 0, n = CI->getNumOperands(); i < n; i++) {
+    // Unpoison output arguments. This must happen before the actual InlineAsm
+    // call, so that the shadow for memory published in the asm() statement
+    // remains valid.
+    for (int i = 0; i < OutputArgs; i++) {
       Value *Operand = CI->getOperand(i);
-      Type *OpType = Operand->getType();
-      if (!OpType->isPointerTy())
-        continue;
-      Type *ElType = OpType->getPointerElementType();
-      if (!ElType->isSized())
-        continue;
-      Value *ShadowPtr, *OriginPtr;
-      std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
-          Operand, IRB, ElType, /*Alignment*/ 1, /*isStore*/ true);
-      Value *CShadow = getCleanShadow(ElType);
-      IRB.CreateStore(
-          CShadow,
-          IRB.CreatePointerCast(ShadowPtr, CShadow->getType()->getPointerTo()));
+      instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
     }
+
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
   }
 
   void visitInstruction(Instruction &I) {
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 307b7eaa2196471fc680fe0c61182fd3bbef2f3c..876ae23dfd29db28875eec38573d6d53751e0c89 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -141,6 +141,11 @@ static cl::opt<std::string>
                        cl::value_desc("filename"),
                        cl::desc("Specify the path of profile data file. This is"
                                 "mainly for test purpose."));
+static cl::opt<std::string> PGOTestProfileRemappingFile(
+    "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
+    cl::value_desc("filename"),
+    cl::desc("Specify the path of profile remapping file. This is mainly for "
+             "test purpose."));
 
 // Command line option to disable value profiling. The default is false:
 // i.e. value profiling is enabled by default. This is for debug purpose.
@@ -581,7 +586,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
   std::vector<char> Indexes;
   JamCRC JC;
   for (auto &BB : F) {
-    const TerminatorInst *TI = BB.getTerminator();
+    const Instruction *TI = BB.getTerminator();
     for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
       BasicBlock *Succ = TI->getSuccessor(I);
       auto BI = findBBInfo(Succ);
@@ -693,7 +698,7 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
 
   // Instrument the SrcBB if it has a single successor,
   // otherwise, the DestBB if this is not a critical edge.
-  TerminatorInst *TI = SrcBB->getTerminator();
+  Instruction *TI = SrcBB->getTerminator();
   if (TI->getNumSuccessors() <= 1)
     return SrcBB;
   if (!E->IsCritical)
@@ -854,7 +859,7 @@ public:
         FreqAttr(FFA_Normal) {}
 
   // Read counts for the instrumented BB from profile.
-  bool readCounters(IndexedInstrProfReader *PGOReader);
+  bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros);
 
   // Populate the counts for all BBs.
   void populateCounters();
@@ -899,6 +904,7 @@ public:
     FuncInfo.dumpInfo(Str);
   }
 
+  uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
 private:
   Function &F;
   Module *M;
@@ -1008,7 +1014,7 @@ void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
 // Read the profile from ProfileFileName and assign the value to the
 // instrumented BB and the edges. This function also updates ProgramMaxCount.
 // Return true if the profile are successfully read, and false on errors.
-bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
+bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros) {
   auto &Ctx = M->getContext();
   Expected<InstrProfRecord> Result =
       PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
@@ -1048,6 +1054,7 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader) {
     LLVM_DEBUG(dbgs() << "  " << I << ": " << CountFromProfile[I] << "\n");
     ValueSum += CountFromProfile[I];
   }
+  AllZeros = (ValueSum == 0);
 
   LLVM_DEBUG(dbgs() << "SUM =  " << ValueSum << "\n");
 
@@ -1162,7 +1169,7 @@ void PGOUseFunc::setBranchWeights() {
   // Generate MD_prof metadata for every branch instruction.
   LLVM_DEBUG(dbgs() << "\nSetting branch weights.\n");
   for (auto &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (TI->getNumSuccessors() < 2)
       continue;
     if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
@@ -1208,7 +1215,7 @@ void PGOUseFunc::annotateIrrLoopHeaderWeights() {
     // to become an irreducible loop header after the indirectbr tail
     // duplication.
     if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
-      TerminatorInst *TI = BB.getTerminator();
+      Instruction *TI = BB.getTerminator();
       const UseBBInfo &BBCountInfo = getBBInfo(&BB);
       setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
     }
@@ -1429,13 +1436,14 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M,
 }
 
 static bool annotateAllFunctions(
-    Module &M, StringRef ProfileFileName,
+    Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
     function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
     function_ref<BlockFrequencyInfo *(Function &)> LookupBFI) {
   LLVM_DEBUG(dbgs() << "Read in profile counters: ");
   auto &Ctx = M.getContext();
   // Read the counter array from file.
-  auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName);
+  auto ReaderOrErr =
+      IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
   if (Error E = ReaderOrErr.takeError()) {
     handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
       Ctx.diagnose(
@@ -1471,8 +1479,15 @@ static bool annotateAllFunctions(
     // later in getInstrBB() to avoid invalidating it.
     SplitIndirectBrCriticalEdges(F, BPI, BFI);
     PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI);
-    if (!Func.readCounters(PGOReader.get()))
+    bool AllZeros = false;
+    if (!Func.readCounters(PGOReader.get(), AllZeros))
       continue;
+    if (AllZeros) {
+      F.setEntryCount(ProfileCount(0, Function::PCT_Real));
+      if (Func.getProgramMaxCount() != 0)
+        ColdFunctions.push_back(&F);
+      continue;
+    }
     Func.populateCounters();
     Func.setBranchWeights();
     Func.annotateValueSites();
@@ -1529,10 +1544,14 @@ static bool annotateAllFunctions(
   return true;
 }
 
-PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename)
-    : ProfileFileName(std::move(Filename)) {
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
+                                             std::string RemappingFilename)
+    : ProfileFileName(std::move(Filename)),
+      ProfileRemappingFileName(std::move(RemappingFilename)) {
   if (!PGOTestProfileFile.empty())
     ProfileFileName = PGOTestProfileFile;
+  if (!PGOTestProfileRemappingFile.empty())
+    ProfileRemappingFileName = PGOTestProfileRemappingFile;
 }
 
 PreservedAnalyses PGOInstrumentationUse::run(Module &M,
@@ -1547,7 +1566,8 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
     return &FAM.getResult<BlockFrequencyAnalysis>(F);
   };
 
-  if (!annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI))
+  if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
+                            LookupBPI, LookupBFI))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
@@ -1564,7 +1584,7 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
   };
 
-  return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI);
+  return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI);
 }
 
 static std::string getSimpleNodeName(const BasicBlock *Node) {
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 2a055920c3eea6a262a80be57101d31400db1fea..7f683ad089fbad766a3e694ab6ecd31e3a1b928d 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Mangler.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/CommandLine.h"
@@ -211,16 +212,14 @@ private:
                              bool IsLeafFunc = true);
   Function *CreateInitCallsForSections(Module &M, const char *InitFunctionName,
                                        Type *Ty, const char *Section);
-  std::pair<GlobalVariable *, GlobalVariable *>
-  CreateSecStartEnd(Module &M, const char *Section, Type *Ty);
+  std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section,
+                                                Type *Ty);
 
   void SetNoSanitizeMetadata(Instruction *I) {
     I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
                    MDNode::get(*C, None));
   }
 
-  Comdat *GetOrCreateFunctionComdat(Function &F);
-
   std::string getSectionName(const std::string &Section) const;
   std::string getSectionStart(const std::string &Section) const;
   std::string getSectionEnd(const std::string &Section) const;
@@ -252,7 +251,7 @@ private:
 
 } // namespace
 
-std::pair<GlobalVariable *, GlobalVariable *>
+std::pair<Value *, Value *>
 SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section,
                                            Type *Ty) {
   GlobalVariable *SecStart =
@@ -263,33 +262,28 @@ SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section,
       new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
                          nullptr, getSectionEnd(Section));
   SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+  IRBuilder<> IRB(M.getContext());
+  Value *SecEndPtr = IRB.CreatePointerCast(SecEnd, Ty);
+  if (TargetTriple.getObjectFormat() != Triple::COFF)
+    return std::make_pair(IRB.CreatePointerCast(SecStart, Ty), SecEndPtr);
 
-  return std::make_pair(SecStart, SecEnd);
+  // Account for the fact that on windows-msvc __start_* symbols actually
+  // point to a uint64_t before the start of the array.
+  auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
+  auto GEP = IRB.CreateGEP(SecStartI8Ptr,
+                           ConstantInt::get(IntptrTy, sizeof(uint64_t)));
+  return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEndPtr);
 }
 
-
 Function *SanitizerCoverageModule::CreateInitCallsForSections(
     Module &M, const char *InitFunctionName, Type *Ty,
     const char *Section) {
-  IRBuilder<> IRB(M.getContext());
   auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
   auto SecStart = SecStartEnd.first;
   auto SecEnd = SecStartEnd.second;
   Function *CtorFunc;
-  Value *SecStartPtr = nullptr;
-  // Account for the fact that on windows-msvc __start_* symbols actually
-  // point to a uint64_t before the start of the array.
-  if (TargetTriple.getObjectFormat() == Triple::COFF) {
-    auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
-    auto GEP = IRB.CreateGEP(SecStartI8Ptr,
-                             ConstantInt::get(IntptrTy, sizeof(uint64_t)));
-    SecStartPtr = IRB.CreatePointerCast(GEP, Ty);
-  } else {
-    SecStartPtr = IRB.CreatePointerCast(SecStart, Ty);
-  }
   std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
-      M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty},
-      {SecStartPtr, IRB.CreatePointerCast(SecEnd, Ty)});
+      M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
 
   if (TargetTriple.supportsCOMDAT()) {
     // Use comdat to dedup CtorFunc.
@@ -298,6 +292,26 @@ Function *SanitizerCoverageModule::CreateInitCallsForSections(
   } else {
     appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
   }
+
+  if (TargetTriple.getObjectFormat() == Triple::COFF) {
+    // In COFF files, if the contructors are set as COMDAT (they are because
+    // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
+    // functions and data) is used, the constructors get stripped. To prevent
+    // this, give the constructors weak ODR linkage and tell the linker to
+    // always include the sancov constructor. This way the linker can
+    // deduplicate the constructors but always leave one copy.
+    CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
+    SmallString<20> PartialIncDirective("/include:");
+    // Get constructor's mangled name in order to support i386.
+    SmallString<40> MangledName;
+    Mangler().getNameWithPrefix(MangledName, CtorFunc, true);
+    Twine IncDirective = PartialIncDirective + MangledName;
+    Metadata *Args[1] = {MDString::get(*C, IncDirective.str())};
+    MDNode *MetadataNode = MDNode::get(*C, Args);
+    NamedMDNode *NamedMetadata =
+        M.getOrInsertNamedMetadata("llvm.linker.options");
+    NamedMetadata->addOperand(MetadataNode);
+  }
   return CtorFunc;
 }
 
@@ -412,20 +426,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
     Function *InitFunction = declareSanitizerInitFunction(
         M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy});
     IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
-    Value *SecStartPtr = nullptr;
-    // Account for the fact that on windows-msvc __start_pc_table actually
-    // points to a uint64_t before the start of the PC table.
-    if (TargetTriple.getObjectFormat() == Triple::COFF) {
-      auto SecStartI8Ptr = IRB.CreatePointerCast(SecStartEnd.first, Int8PtrTy);
-      auto GEP = IRB.CreateGEP(SecStartI8Ptr,
-                               ConstantInt::get(IntptrTy, sizeof(uint64_t)));
-      SecStartPtr = IRB.CreatePointerCast(GEP, IntptrPtrTy);
-    } else {
-      SecStartPtr = IRB.CreatePointerCast(SecStartEnd.first, IntptrPtrTy);
-    }
-    IRBCtor.CreateCall(
-        InitFunction,
-        {SecStartPtr, IRB.CreatePointerCast(SecStartEnd.second, IntptrPtrTy)});
+    IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
   }
   // We don't reference these arrays directly in any of our runtime functions,
   // so we need to prevent them from being dead stripped.
@@ -569,31 +570,21 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
   return true;
 }
 
-Comdat *SanitizerCoverageModule::GetOrCreateFunctionComdat(Function &F) {
-  if (auto Comdat = F.getComdat()) return Comdat;
-  if (!TargetTriple.isOSBinFormatELF()) return nullptr;
-  assert(F.hasName());
-  std::string Name = F.getName();
-  if (F.hasLocalLinkage()) {
-    if (CurModuleUniqueId.empty()) return nullptr;
-    Name += CurModuleUniqueId;
-  }
-  auto Comdat = CurModule->getOrInsertComdat(Name);
-  F.setComdat(Comdat);
-  return Comdat;
-}
-
 GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
     size_t NumElements, Function &F, Type *Ty, const char *Section) {
   ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
   auto Array = new GlobalVariable(
       *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
       Constant::getNullValue(ArrayTy), "__sancov_gen_");
-  if (auto Comdat = GetOrCreateFunctionComdat(F))
-    Array->setComdat(Comdat);
+
+  if (TargetTriple.supportsCOMDAT())
+    if (auto Comdat =
+            GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
+      Array->setComdat(Comdat);
   Array->setSection(getSectionName(Section));
   Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize()
                                         : Ty->getPrimitiveSizeInBits() / 8);
+  GlobalsToAppendToUsed.push_back(Array);
   GlobalsToAppendToCompilerUsed.push_back(Array);
   MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
   Array->addMetadata(LLVMContext::MD_associated, *MD);
@@ -631,14 +622,14 @@ SanitizerCoverageModule::CreatePCArray(Function &F,
 
 void SanitizerCoverageModule::CreateFunctionLocalArrays(
     Function &F, ArrayRef<BasicBlock *> AllBlocks) {
-  if (Options.TracePCGuard) {
+  if (Options.TracePCGuard)
     FunctionGuardArray = CreateFunctionLocalArrayInSection(
         AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName);
-    GlobalsToAppendToUsed.push_back(FunctionGuardArray);
-  }
+
   if (Options.Inline8bitCounters)
     Function8bitCounterArray = CreateFunctionLocalArrayInSection(
         AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
+
   if (Options.PCTable)
     FunctionPCsArray = CreatePCArray(F, AllBlocks);
 }
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 464805051c65fd3e63fa41f48fccdddeffb923ba..52a5e8c96abe39d44f9193cb4addefefd2ed990b 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -266,13 +266,10 @@ llvm::objcarc::FindDependencies(DependenceKind Flavor,
   for (const BasicBlock *BB : Visited) {
     if (BB == StartBB)
       continue;
-    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
-    for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
-      const BasicBlock *Succ = *SI;
+    for (const BasicBlock *Succ : successors(BB))
       if (Succ != StartBB && !Visited.count(Succ)) {
         DependingInsts.insert(reinterpret_cast<Instruction *>(-1));
         return;
       }
-    }
   }
 }
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 21e2848030fc4a6212de5c3b09eb8622d1827d1c..6ffaadc2b5fc52d1fd38ee09557c3342291de903 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -914,8 +914,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
           GetRCIdentityRoot(PN->getIncomingValue(i));
         if (IsNullOrUndef(Incoming))
           HasNull = true;
-        else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back())
-                   .getNumSuccessors() != 1) {
+        else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() !=
+                 1) {
           HasCriticalEdges = true;
           break;
         }
@@ -1084,18 +1084,15 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
            "Unknown top down sequence state.");
 
     const Value *Arg = I->first;
-    const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
     bool SomeSuccHasSame = false;
     bool AllSuccsHaveSame = true;
     bool NotAllSeqEqualButKnownSafe = false;
 
-    succ_const_iterator SI(TI), SE(TI, false);
-
-    for (; SI != SE; ++SI) {
+    for (const BasicBlock *Succ : successors(BB)) {
       // If VisitBottomUp has pointer information for this successor, take
       // what we know about it.
       const DenseMap<const BasicBlock *, BBState>::iterator BBI =
-        BBStates.find(*SI);
+          BBStates.find(Succ);
       assert(BBI != BBStates.end());
       const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
       const Sequence SuccSSeq = SuccS.GetSeq();
@@ -1414,21 +1411,20 @@ ComputePostOrders(Function &F,
   BasicBlock *EntryBB = &F.getEntryBlock();
   BBState &MyStates = BBStates[EntryBB];
   MyStates.SetAsEntry();
-  TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back());
+  Instruction *EntryTI = EntryBB->getTerminator();
   SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
   Visited.insert(EntryBB);
   OnStack.insert(EntryBB);
   do {
   dfs_next_succ:
     BasicBlock *CurrBB = SuccStack.back().first;
-    TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back());
-    succ_iterator SE(TI, false);
+    succ_iterator SE(CurrBB->getTerminator(), false);
 
     while (SuccStack.back().second != SE) {
       BasicBlock *SuccBB = *SuccStack.back().second++;
       if (Visited.insert(SuccBB).second) {
-        TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back());
-        SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI)));
+        SuccStack.push_back(
+            std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
         BBStates[CurrBB].addSucc(SuccBB);
         BBState &SuccStates = BBStates[SuccBB];
         SuccStates.addPred(CurrBB);
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 883d2e17350df880b95cfca010234294ff801d47..b0602d96798c3119299abd9b57b5f65ed749306b 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -103,7 +103,7 @@ struct BlockInfoType {
   BasicBlock *BB = nullptr;
 
   /// Cache of BB->getTerminator().
-  TerminatorInst *Terminator = nullptr;
+  Instruction *Terminator = nullptr;
 
   /// Post-order numbering of reverse control flow graph.
   unsigned PostOrder;
@@ -206,7 +206,7 @@ bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
   return removeDeadInstructions();
 }
 
-static bool isUnconditionalBranch(TerminatorInst *Term) {
+static bool isUnconditionalBranch(Instruction *Term) {
   auto *BR = dyn_cast<BranchInst>(Term);
   return BR && BR->isUnconditional();
 }
@@ -277,7 +277,7 @@ void AggressiveDeadCodeElimination::initialize() {
     // treat all edges to a block already seen as loop back edges
     // and mark the branch live it if there is a back edge.
     for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
-      TerminatorInst *Term = BB->getTerminator();
+      Instruction *Term = BB->getTerminator();
       if (isLive(Term))
         continue;
 
@@ -643,7 +643,7 @@ void AggressiveDeadCodeElimination::computeReversePostOrder() {
 
 void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
                                                       BasicBlock *Target) {
-  TerminatorInst *PredTerm = BB->getTerminator();
+  Instruction *PredTerm = BB->getTerminator();
   // Collect the live debug info scopes attached to this instruction.
   if (const DILocation *DL = PredTerm->getDebugLoc())
     collectLiveScopes(*DL);
diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 54385155cd2dc7041003b13154d7c5649c774ec6..b9e8e3424ccfe61e80587adad135b83daf69b056 100644
--- a/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -197,7 +197,7 @@ static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
       isa<IndirectBrInst>(Preds[1]->getTerminator()))
     return false;
 
-  // BasicBlock::canSplitPredecessors is more agressive, so checking for
+  // BasicBlock::canSplitPredecessors is more aggressive, so checking for
   // BasicBlock::isEHPad as well.
   if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
     return false;
@@ -248,7 +248,7 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
   ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
   assert(RI && "`musttail` call must be followed by `ret` instruction");
 
-  TerminatorInst *TI = SplitBB->getTerminator();
+  Instruction *TI = SplitBB->getTerminator();
   Value *V = NewCI;
   if (BCI)
     V = cloneInstForMustTail(BCI, TI, V);
@@ -461,10 +461,9 @@ static bool tryToSplitOnPredicatedArgument(CallSite CS, DominatorTree *DT) {
     PredsCS.push_back({Pred, Conditions});
   }
 
-  if (std::all_of(PredsCS.begin(), PredsCS.end(),
-                  [](const std::pair<BasicBlock *, ConditionsTy> &P) {
-                    return P.second.empty();
-                  }))
+  if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+        return P.second.empty();
+      }))
     return false;
 
   splitCallSite(CS, PredsCS, DT);
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 545b0060c1392e2fbccaf88cf505870c442a011d..69112f3cee277049585fe1092de6c8bd894276a7 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -643,7 +643,7 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
   for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
     BasicBlock *Pred = *I;
     if (Pred == BB) continue;
-    TerminatorInst *PredTI = Pred->getTerminator();
+    Instruction *PredTI = Pred->getTerminator();
     if (PredTI->getNumSuccessors() != 1)
       continue;
 
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index d6c2824a299f4447df140e161f6281cafa83722a..c080c2a1813de93c49048cf0f401bf7570ff224e 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -2341,7 +2341,7 @@ bool GVN::splitCriticalEdges() {
   if (toSplit.empty())
     return false;
   do {
-    std::pair<TerminatorInst*, unsigned> Edge = toSplit.pop_back_val();
+    std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
     SplitCriticalEdge(Edge.first, Edge.second,
                       CriticalEdgeSplittingOptions(DT));
   } while (!toSplit.empty());
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index 3043df9cca75c5d2ae842b027f15c2e1c0cea772..0797ce9adeaff1c72e1199afba6b0b6a97945ea9 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -577,7 +577,7 @@ private:
   // Returns the edge via which an instruction in BB will get the values from.
 
   // Returns true when the values are flowing out to each edge.
-  bool valueAnticipable(CHIArgs C, TerminatorInst *TI) const {
+  bool valueAnticipable(CHIArgs C, Instruction *TI) const {
     if (TI->getNumSuccessors() > (unsigned)size(C))
       return false; // Not enough args in this CHI.
 
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index c35478e220b6efc793eac467303a9234a7b6ddbe..ec51ad71abc517e92c16ddca63d1437a6f0a0441 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -595,41 +595,20 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
             !isSafeToExpand(ExitValue, *SE))
           continue;
 
-        // Computing the value outside of the loop brings no benefit if :
-        //  - it is definitely used inside the loop in a way which can not be
-        //    optimized away.
-        //  - no use outside of the loop can take advantage of hoisting the
-        //    computation out of the loop
+        // Computing the value outside of the loop brings no benefit if it is
+        // definitely used inside the loop in a way which can not be optimized
+        // away.
         if (ExitValue->getSCEVType()>=scMulExpr) {
           bool HasHardInternalUses = false;
-          bool HasSoftExternalUses = false;
           for (auto *IB : Inst->users()) {
             Instruction *UseInstr = cast<Instruction>(IB);
             unsigned Opc = UseInstr->getOpcode();
-            if (L->contains(UseInstr)) {
-              if (Opc == Instruction::Call)
-                HasHardInternalUses = true;
-            } else {
-              if (Opc == Instruction::PHI) {
-                // Do not count the Phi as a use. LCSSA may have inserted
-                // plenty of trivial ones.
-                for (auto *PB : UseInstr->users()) {
-                  unsigned PhiOpc = cast<Instruction>(PB)->getOpcode();
-                  if (PhiOpc != Instruction::Call &&
-                      PhiOpc != Instruction::Ret) {
-                    HasSoftExternalUses = true;
-                    break;
-                  }
-                }
-                continue;
-              }
-              if (Opc != Instruction::Call && Opc != Instruction::Ret) {
-                HasSoftExternalUses = true;
-                break;
-              }
+            if (L->contains(UseInstr) && Opc == Instruction::Call) {
+              HasHardInternalUses = true;
+              break;
             }
           }
-          if (HasHardInternalUses && !HasSoftExternalUses)
+          if (HasHardInternalUses)
             continue;
         }
 
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index 1fc8f3988f963fd538ef3c736419a3a828c92f53..849ff71e198b6033402d0db5e8712e44cbbbd83a 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -947,7 +947,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
 /// Since we can pick an arbitrary destination, we pick the successor with the
 /// fewest predecessors.  This should reduce the in-degree of the others.
 static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) {
-  TerminatorInst *BBTerm = BB->getTerminator();
+  Instruction *BBTerm = BB->getTerminator();
   unsigned MinSucc = 0;
   BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
   // Compute the successor with the minimum number of predecessors.
@@ -988,7 +988,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
   // because now the condition in this block can be threaded through
   // predecessors of our predecessor block.
   if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
-    const TerminatorInst *TI = SinglePred->getTerminator();
+    const Instruction *TI = SinglePred->getTerminator();
     if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 &&
         SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
       // If SinglePred was a loop header, BB becomes one.
@@ -1080,7 +1080,7 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
     std::vector<DominatorTree::UpdateType> Updates;
 
     // Fold the branch/switch.
-    TerminatorInst *BBTerm = BB->getTerminator();
+    Instruction *BBTerm = BB->getTerminator();
     Updates.reserve(BBTerm->getNumSuccessors());
     for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
       if (i == BestSucc) continue;
@@ -1549,7 +1549,7 @@ FindMostPopularDest(BasicBlock *BB,
   // successor list.
   if (!SamePopularity.empty()) {
     SamePopularity.push_back(MostPopularDest);
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     for (unsigned i = 0; ; ++i) {
       assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
 
@@ -1669,7 +1669,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
       }
 
       // Finally update the terminator.
-      TerminatorInst *Term = BB->getTerminator();
+      Instruction *Term = BB->getTerminator();
       BranchInst::Create(OnlyDest, Term);
       Term->eraseFromParent();
       DTU->applyUpdates(Updates);
@@ -2006,7 +2006,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
   // Update the terminator of PredBB to jump to NewBB instead of BB.  This
   // eliminates predecessors from BB, which requires us to simplify any PHI
   // nodes in BB.
-  TerminatorInst *PredTerm = PredBB->getTerminator();
+  Instruction *PredTerm = PredBB->getTerminator();
   for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
     if (PredTerm->getSuccessor(i) == BB) {
       BB->removePredecessor(PredBB, true);
@@ -2115,7 +2115,7 @@ BasicBlock *JumpThreadingPass::SplitBlockPreds(BasicBlock *BB,
 }
 
 bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
-  const TerminatorInst *TI = BB->getTerminator();
+  const Instruction *TI = BB->getTerminator();
   assert(TI->getNumSuccessors() > 1 && "not a split");
 
   MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
@@ -2538,7 +2538,7 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
     if (!SI)
       continue;
     // Expand the select.
-    TerminatorInst *Term =
+    Instruction *Term =
         SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
     BasicBlock *SplitBB = SI->getParent();
     BasicBlock *NewBB = Term->getParent();
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index bb918cf717d2a4cb1efc691497961a88ea40263d..7789cb923450d6c377b8fb7c5a0bc5ee3b1d0214 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -103,10 +103,10 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
                                   const LoopSafetyInfo *SafetyInfo,
                                   TargetTransformInfo *TTI, bool &FreeInLoop);
 static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  LoopSafetyInfo *SafetyInfo,
+                  ICFLoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
-                 const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
+                 const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE, bool FreeInLoop);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
@@ -123,6 +123,9 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
                             const LoopInfo *LI,
                             const LoopSafetyInfo *SafetyInfo);
 
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+                             AliasSetTracker *AST);
+
 namespace {
 struct LoopInvariantCodeMotion {
   using ASTrackerMapTy = DenseMap<Loop *, std::unique_ptr<AliasSetTracker>>;
@@ -267,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
   BasicBlock *Preheader = L->getLoopPreheader();
 
   // Compute loop safety information.
-  LoopSafetyInfo SafetyInfo;
+  ICFLoopSafetyInfo SafetyInfo(DT);
   SafetyInfo.computeLoopSafetyInfo(L);
 
   // We want to visit all of the instructions in this loop... that are not parts
@@ -374,7 +377,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
 bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                       DominatorTree *DT, TargetLibraryInfo *TLI,
                       TargetTransformInfo *TTI, Loop *CurLoop,
-                      AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+                      AliasSetTracker *CurAST, ICFLoopSafetyInfo *SafetyInfo,
                       OptimizationRemarkEmitter *ORE) {
 
   // Verify inputs.
@@ -404,8 +407,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
         salvageDebugInfo(I);
         ++II;
-        CurAST->deleteValue(&I);
-        I.eraseFromParent();
+        eraseInstruction(I, *SafetyInfo, CurAST);
         Changed = true;
         continue;
       }
@@ -422,8 +424,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
           if (!FreeInLoop) {
             ++II;
-            CurAST->deleteValue(&I);
-            I.eraseFromParent();
+            eraseInstruction(I, *SafetyInfo, CurAST);
           }
           Changed = true;
         }
@@ -440,7 +441,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
 ///
 bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                        DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
-                       AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+                       AliasSetTracker *CurAST, ICFLoopSafetyInfo *SafetyInfo,
                        OptimizationRemarkEmitter *ORE) {
   // Verify inputs.
   assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
@@ -459,14 +460,10 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
     if (inSubLoop(BB, CurLoop, LI))
       continue;
 
-    // Keep track of whether the prefix of instructions visited so far are such
-    // that the next instruction visited is guaranteed to execute if the loop
-    // is entered.
-    bool IsMustExecute = CurLoop->getHeader() == BB;
     // Keep track of whether the prefix instructions could have written memory.
-    // TODO: This and IsMustExecute may be done smarter if we keep track of all
-    // throwing and mem-writing operations in every block, e.g. using something
-    // similar to isGuaranteedToExecute.
+    // TODO: This may be done smarter if we keep track of all throwing and
+    // mem-writing operations in every block, e.g. using something similar to
+    // isGuaranteedToExecute.
     bool IsMemoryNotModified = CurLoop->getHeader() == BB;
 
     for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
@@ -480,10 +477,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
                           << '\n');
         CurAST->copyValue(&I, C);
         I.replaceAllUsesWith(C);
-        if (isInstructionTriviallyDead(&I, TLI)) {
-          CurAST->deleteValue(&I);
-          I.eraseFromParent();
-        }
+        if (isInstructionTriviallyDead(&I, TLI))
+          eraseInstruction(I, *SafetyInfo, CurAST);
         Changed = true;
         continue;
       }
@@ -494,10 +489,9 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       //
       if (CurLoop->hasLoopInvariantOperands(&I) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, true, ORE) &&
-          (IsMustExecute ||
-           isSafeToExecuteUnconditionally(
-               I, DT, CurLoop, SafetyInfo, ORE,
-               CurLoop->getLoopPreheader()->getTerminator()))) {
+          isSafeToExecuteUnconditionally(
+              I, DT, CurLoop, SafetyInfo, ORE,
+              CurLoop->getLoopPreheader()->getTerminator())) {
         hoist(I, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
         continue;
@@ -512,14 +506,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
         auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
         ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+        SafetyInfo->insertInstructionTo(I.getParent());
         ReciprocalDivisor->insertBefore(&I);
 
         auto Product =
             BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
         Product->setFastMathFlags(I.getFastMathFlags());
+        SafetyInfo->insertInstructionTo(I.getParent());
         Product->insertAfter(&I);
         I.replaceAllUsesWith(Product);
-        I.eraseFromParent();
+        eraseInstruction(I, *SafetyInfo, CurAST);
 
         hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
@@ -530,15 +526,13 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
       if (((I.use_empty() &&
             match(&I, m_Intrinsic<Intrinsic::invariant_start>())) ||
            isGuard(&I)) &&
-          IsMustExecute && IsMemoryNotModified &&
-          CurLoop->hasLoopInvariantOperands(&I)) {
+          IsMemoryNotModified && CurLoop->hasLoopInvariantOperands(&I) &&
+          SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) {
         hoist(I, DT, CurLoop, SafetyInfo, ORE);
         Changed = true;
         continue;
       }
 
-      if (IsMustExecute)
-        IsMustExecute = isGuaranteedToTransferExecutionToSuccessor(&I);
       if (IsMemoryNotModified)
         IsMemoryNotModified = !I.mayWriteToMemory();
     }
@@ -693,7 +687,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
         for (Value *Op : CI->arg_operands())
           if (Op->getType()->isPointerTy() &&
               pointerInvalidatedByLoop(
-                  MemoryLocation(Op, MemoryLocation::UnknownSize, AAMDNodes()),
+                  MemoryLocation(Op, LocationSize::unknown(), AAMDNodes()),
                   CurAST, CurLoop, AA))
             return false;
         return true;
@@ -798,7 +792,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
 static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
                                   const LoopSafetyInfo *SafetyInfo,
                                   TargetTransformInfo *TTI, bool &FreeInLoop) {
-  const auto &BlockColors = SafetyInfo->BlockColors;
+  const auto &BlockColors = SafetyInfo->getBlockColors();
   bool IsFree = isFreeInLoop(I, CurLoop, TTI);
   for (const User *U : I.users()) {
     const Instruction *UI = cast<Instruction>(U);
@@ -833,7 +827,7 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
                             const LoopSafetyInfo *SafetyInfo) {
   Instruction *New;
   if (auto *CI = dyn_cast<CallInst>(&I)) {
-    const auto &BlockColors = SafetyInfo->BlockColors;
+    const auto &BlockColors = SafetyInfo->getBlockColors();
 
     // Sinking call-sites need to be handled differently from other
     // instructions.  The cloned call-site needs a funclet bundle operand
@@ -888,6 +882,14 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
   return New;
 }
 
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+                             AliasSetTracker *AST) {
+  if (AST)
+    AST->deleteValue(&I);
+  SafetyInfo.removeInstruction(&I);
+  I.eraseFromParent();
+}
+
 static Instruction *sinkThroughTriviallyReplaceablePHI(
     PHINode *TPN, Instruction *I, LoopInfo *LI,
     SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
@@ -913,7 +915,7 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
   // it require updating BlockColors for all offspring blocks accordingly. By
   // skipping such corner case, we can make updating BlockColors after splitting
   // predecessor fairly simple.
-  if (!SafetyInfo->BlockColors.empty() && BB->getFirstNonPHI()->isEHPad())
+  if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
     return false;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *BBPred = *PI;
@@ -967,7 +969,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
   // LE:
   //   %p = phi [%p1, %LE.split], [%p2, %LE.split2]
   //
-  auto &BlockColors = SafetyInfo->BlockColors;
+  const auto &BlockColors = SafetyInfo->getBlockColors();
   SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
   while (!PredBBs.empty()) {
     BasicBlock *PredBB = *PredBBs.begin();
@@ -979,14 +981,11 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
       // Since we do not allow splitting EH-block with BlockColors in
       // canSplitPredecessors(), we can simply assign predecessor's color to
       // the new block.
-      if (!BlockColors.empty()) {
+      if (!BlockColors.empty())
         // Grab a reference to the ColorVector to be inserted before getting the
         // reference to the vector we are copying because inserting the new
         // element in BlockColors might cause the map to be reallocated.
-        ColorVector &ColorsForNewBlock = BlockColors[NewPred];
-        ColorVector &ColorsForOldBlock = BlockColors[PredBB];
-        ColorsForNewBlock = ColorsForOldBlock;
-      }
+        SafetyInfo->copyColors(NewPred, PredBB);
     }
     PredBBs.remove(PredBB);
   }
@@ -998,7 +997,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
 /// position, and may either delete it or move it to outside of the loop.
 ///
 static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
-                 const Loop *CurLoop, LoopSafetyInfo *SafetyInfo,
+                 const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo,
                  OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
   LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   ORE->emit([&]() {
@@ -1089,7 +1088,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
     Instruction *New = sinkThroughTriviallyReplaceablePHI(PN, &I, LI, SunkCopies,
                                                           SafetyInfo, CurLoop);
     PN->replaceAllUsesWith(New);
-    PN->eraseFromParent();
+    eraseInstruction(*PN, *SafetyInfo, nullptr);
     Changed = true;
   }
   return Changed;
@@ -1099,7 +1098,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
 /// is safe to hoist, this instruction is called to do the dirty work.
 ///
 static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
-                  LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
+                  ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
   auto *Preheader = CurLoop->getLoopPreheader();
   LLVM_DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
                     << "\n");
@@ -1116,9 +1115,11 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
       // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
       // time in isGuaranteedToExecute if we don't actually have anything to
       // drop.  It is a compile time optimization, not required for correctness.
-      !isGuaranteedToExecute(I, DT, CurLoop, SafetyInfo))
+      !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
     I.dropUnknownNonDebugMetadata();
 
+  SafetyInfo->removeInstruction(&I);
+  SafetyInfo->insertInstructionTo(Preheader);
   // Move the new node to the Preheader, before its terminator.
   I.moveBefore(Preheader->getTerminator());
 
@@ -1150,7 +1151,7 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst,
     return true;
 
   bool GuaranteedToExecute =
-      isGuaranteedToExecute(Inst, DT, CurLoop, SafetyInfo);
+      SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop);
 
   if (!GuaranteedToExecute) {
     auto *LI = dyn_cast<LoadInst>(&Inst);
@@ -1179,6 +1180,7 @@ class LoopPromoter : public LoadAndStorePromoter {
   int Alignment;
   bool UnorderedAtomic;
   AAMDNodes AATags;
+  ICFLoopSafetyInfo &SafetyInfo;
 
   Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
     if (Instruction *I = dyn_cast<Instruction>(V))
@@ -1201,11 +1203,13 @@ public:
                SmallVectorImpl<BasicBlock *> &LEB,
                SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
                AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
-               bool UnorderedAtomic, const AAMDNodes &AATags)
+               bool UnorderedAtomic, const AAMDNodes &AATags,
+               ICFLoopSafetyInfo &SafetyInfo)
       : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
         LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
         LI(li), DL(std::move(dl)), Alignment(alignment),
-        UnorderedAtomic(UnorderedAtomic), AATags(AATags) {}
+        UnorderedAtomic(UnorderedAtomic), AATags(AATags), SafetyInfo(SafetyInfo)
+      {}
 
   bool isInstInList(Instruction *I,
                     const SmallVectorImpl<Instruction *> &) const override {
@@ -1242,7 +1246,10 @@ public:
     // Update alias analysis.
     AST.copyValue(LI, V);
   }
-  void instructionDeleted(Instruction *I) const override { AST.deleteValue(I); }
+  void instructionDeleted(Instruction *I) const override {
+    SafetyInfo.removeInstruction(I);
+    AST.deleteValue(I);
+  }
 };
 
 
@@ -1280,7 +1287,7 @@ bool llvm::promoteLoopAccessesToScalars(
     SmallVectorImpl<BasicBlock *> &ExitBlocks,
     SmallVectorImpl<Instruction *> &InsertPts, PredIteratorCache &PIC,
     LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
-    Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
+    Loop *CurLoop, AliasSetTracker *CurAST, ICFLoopSafetyInfo *SafetyInfo,
     OptimizationRemarkEmitter *ORE) {
   // Verify inputs.
   assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
@@ -1408,7 +1415,7 @@ bool llvm::promoteLoopAccessesToScalars(
 
         if (!DereferenceableInPH || !SafeToInsertStore ||
             (InstAlignment > Alignment)) {
-          if (isGuaranteedToExecute(*UI, DT, CurLoop, SafetyInfo)) {
+          if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) {
             DereferenceableInPH = true;
             SafeToInsertStore = true;
             Alignment = std::max(Alignment, InstAlignment);
@@ -1499,7 +1506,7 @@ bool llvm::promoteLoopAccessesToScalars(
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
                         InsertPts, PIC, *CurAST, *LI, DL, Alignment,
-                        SawUnorderedAtomic, AATags);
+                        SawUnorderedAtomic, AATags, *SafetyInfo);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
@@ -1519,7 +1526,7 @@ bool llvm::promoteLoopAccessesToScalars(
 
   // If the SSAUpdater didn't use the load in the preheader, just zap it now.
   if (PreheaderLoad->use_empty())
-    PreheaderLoad->eraseFromParent();
+    eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST);
 
   return true;
 }
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 68abf9719a94298cddca75873d80db9369adcd94..241dbed30e1b7af32c524571061d70def9ed8a53 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -320,7 +320,7 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
 
   // The following transforms hoist stores/memsets into the loop pre-header.
   // Give up if the loop has instructions may throw.
-  LoopSafetyInfo SafetyInfo;
+  SimpleLoopSafetyInfo SafetyInfo;
   SafetyInfo.computeLoopSafetyInfo(CurLoop);
   if (SafetyInfo.anyBlockMayThrow())
     return MadeChange;
@@ -928,10 +928,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     Type *Int8PtrTy = DestInt8PtrTy;
 
     Module *M = TheStore->getModule();
+    StringRef FuncName = "memset_pattern16";
     Value *MSP =
-        M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
+        M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                Int8PtrTy, Int8PtrTy, IntPtr);
-    inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
+    inferLibFuncAttributes(M, FuncName, *TLI);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
     // an constant array of 16-bytes.  Plop the value into a mergable global.
diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index 21c8512b26600ba47d5c8cfdd3d6e30ea0a87712..7a4ae2eb30307a590ad805f4da4c37e17203456f 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -339,16 +339,10 @@ public:
 
   bool currentLimitations();
 
-  bool hasInnerLoopReduction() { return InnerLoopHasReduction; }
-
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
-  bool containsUnsafeInstructionsInHeader(BasicBlock *BB);
-  bool areAllUsesReductions(Instruction *Ins, Loop *L);
-  bool containsUnsafeInstructionsInLatch(BasicBlock *BB);
-  bool findInductionAndReductions(Loop *L,
-                                  SmallVector<PHINode *, 8> &Inductions,
-                                  SmallVector<PHINode *, 8> &Reductions);
+  bool containsUnsafeInstructions(BasicBlock *BB);
+  bool findInductions(Loop *L, SmallVector<PHINode *, 8> &Inductions);
 
   Loop *OuterLoop;
   Loop *InnerLoop;
@@ -358,7 +352,6 @@ private:
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
-  bool InnerLoopHasReduction = false;
 };
 
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -391,11 +384,9 @@ class LoopInterchangeTransform {
 public:
   LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
-                           BasicBlock *LoopNestExit,
-                           bool InnerLoopContainsReductions)
+                           BasicBlock *LoopNestExit)
       : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
-        LoopExit(LoopNestExit),
-        InnerLoopHasReduction(InnerLoopContainsReductions) {}
+        LoopExit(LoopNestExit) {}
 
   /// Interchange OuterLoop and InnerLoop.
   bool transform();
@@ -420,7 +411,6 @@ private:
   LoopInfo *LI;
   DominatorTree *DT;
   BasicBlock *LoopExit;
-  bool InnerLoopHasReduction;
 };
 
 // Main LoopInterchange Pass.
@@ -571,7 +561,7 @@ struct LoopInterchange : public LoopPass {
     });
 
     LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
-                                 LoopNestExit, LIL.hasInnerLoopReduction());
+                                 LoopNestExit);
     LIT.transform();
     LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
     LoopsInterchanged++;
@@ -581,42 +571,12 @@ struct LoopInterchange : public LoopPass {
 
 } // end anonymous namespace
 
-bool LoopInterchangeLegality::areAllUsesReductions(Instruction *Ins, Loop *L) {
-  return llvm::none_of(Ins->users(), [=](User *U) -> bool {
-    auto *UserIns = dyn_cast<PHINode>(U);
-    RecurrenceDescriptor RD;
-    return !UserIns || !RecurrenceDescriptor::isReductionPHI(UserIns, L, RD);
+bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
+  return any_of(*BB, [](const Instruction &I) {
+    return I.mayHaveSideEffects() || I.mayReadFromMemory();
   });
 }
 
-bool LoopInterchangeLegality::containsUnsafeInstructionsInHeader(
-    BasicBlock *BB) {
-  for (Instruction &I : *BB) {
-    // Load corresponding to reduction PHI's are safe while concluding if
-    // tightly nested.
-    if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
-      if (!areAllUsesReductions(L, InnerLoop))
-        return true;
-    } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
-      return true;
-  }
-  return false;
-}
-
-bool LoopInterchangeLegality::containsUnsafeInstructionsInLatch(
-    BasicBlock *BB) {
-  for (Instruction &I : *BB) {
-    // Stores corresponding to reductions are safe while concluding if tightly
-    // nested.
-    if (StoreInst *L = dyn_cast<StoreInst>(&I)) {
-      if (!isa<PHINode>(L->getOperand(0)))
-        return true;
-    } else if (I.mayHaveSideEffects() || I.mayReadFromMemory())
-      return true;
-  }
-  return false;
-}
-
 bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
   BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
@@ -640,8 +600,8 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
   // We do not have any basic block in between now make sure the outer header
   // and outer loop latch doesn't contain any unsafe instructions.
-  if (containsUnsafeInstructionsInHeader(OuterLoopHeader) ||
-      containsUnsafeInstructionsInLatch(OuterLoopLatch))
+  if (containsUnsafeInstructions(OuterLoopHeader) ||
+      containsUnsafeInstructions(OuterLoopLatch))
     return false;
 
   LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
@@ -673,9 +633,8 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
   return true;
 }
 
-bool LoopInterchangeLegality::findInductionAndReductions(
-    Loop *L, SmallVector<PHINode *, 8> &Inductions,
-    SmallVector<PHINode *, 8> &Reductions) {
+bool LoopInterchangeLegality::findInductions(
+    Loop *L, SmallVector<PHINode *, 8> &Inductions) {
   if (!L->getLoopLatch() || !L->getLoopPredecessor())
     return false;
   for (PHINode &PHI : L->getHeader()->phis()) {
@@ -683,11 +642,8 @@ bool LoopInterchangeLegality::findInductionAndReductions(
     InductionDescriptor ID;
     if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
       Inductions.push_back(&PHI);
-    else if (RecurrenceDescriptor::isReductionPHI(&PHI, L, RD))
-      Reductions.push_back(&PHI);
     else {
-      LLVM_DEBUG(
-          dbgs() << "Failed to recognize PHI as an induction or reduction.\n");
+      LLVM_DEBUG(dbgs() << "Failed to recognize PHI as an induction.\n");
       return false;
     }
   }
@@ -737,8 +693,7 @@ bool LoopInterchangeLegality::currentLimitations() {
 
   PHINode *InnerInductionVar;
   SmallVector<PHINode *, 8> Inductions;
-  SmallVector<PHINode *, 8> Reductions;
-  if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+  if (!findInductions(InnerLoop, Inductions)) {
     LLVM_DEBUG(
         dbgs() << "Only inner loops with induction or reduction PHI nodes "
                << "are supported currently.\n");
@@ -766,12 +721,9 @@ bool LoopInterchangeLegality::currentLimitations() {
     });
     return true;
   }
-  if (Reductions.size() > 0)
-    InnerLoopHasReduction = true;
 
   InnerInductionVar = Inductions.pop_back_val();
-  Reductions.clear();
-  if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+  if (!findInductions(OuterLoop, Inductions)) {
     LLVM_DEBUG(
         dbgs() << "Only outer loops with induction or reduction PHI nodes "
                << "are supported currently.\n");
@@ -785,20 +737,6 @@ bool LoopInterchangeLegality::currentLimitations() {
     return true;
   }
 
-  // Outer loop cannot have reduction because then loops will not be tightly
-  // nested.
-  if (!Reductions.empty()) {
-    LLVM_DEBUG(dbgs() << "Outer loops with reductions are not supported "
-                      << "currently.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "ReductionsOuter",
-                                      OuterLoop->getStartLoc(),
-                                      OuterLoop->getHeader())
-             << "Outer loops with reductions cannot be interchangeed "
-                "currently.";
-    });
-    return true;
-  }
   // TODO: Currently we handle only loops with 1 induction variable.
   if (Inductions.size() != 1) {
     LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
@@ -1449,34 +1387,13 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   // replaced by Inners'.
   updateIncomingBlock(OuterLoopLatchSuccessor, OuterLoopLatch, InnerLoopLatch);
 
-  // Now update the reduction PHIs in the inner and outer loop headers.
-  SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
-  for (PHINode &PHI : drop_begin(InnerLoopHeader->phis(), 1))
-    InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
-  for (PHINode &PHI : drop_begin(OuterLoopHeader->phis(), 1))
-    OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
-
-  for (PHINode *PHI : OuterLoopPHIs)
-    PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
-
-  // Move the PHI nodes from the inner loop header to the outer loop header.
-  // We have to deal with one kind of PHI nodes:
-  //  1) PHI nodes that are part of inner loop-only reductions.
-  // We only have to move the PHI node and update the incoming blocks.
-  for (PHINode *PHI : InnerLoopPHIs) {
-    PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
-    for (BasicBlock *InBB : PHI->blocks()) {
-      if (InnerLoop->contains(InBB))
-        continue;
-
-      assert(!isa<PHINode>(PHI->getIncomingValueForBlock(InBB)) &&
-             "Unexpected incoming PHI node, reductions in outer loop are not "
-             "supported yet");
-      PHI->replaceAllUsesWith(PHI->getIncomingValueForBlock(InBB));
-      PHI->eraseFromParent();
-      break;
-    }
-  }
+  // Make sure we have no other PHIs.
+  auto InnerPhis = drop_begin(InnerLoopHeader->phis(), 1);
+  auto OuterPhis = drop_begin(OuterLoopHeader->phis(), 1);
+  (void) InnerPhis;
+  (void) OuterPhis;
+  assert(begin(InnerPhis) == end(InnerPhis) && "Unexpected PHIs in inner loop");
+  assert(begin(OuterPhis) == end(OuterPhis) && "Unexpected PHis in outer loop");
 
   // Update the incoming blocks for moved PHI nodes.
   updateIncomingBlock(OuterLoopHeader, InnerLoopPreHeader, OuterLoopPreHeader);
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index cbb6594cf8f43c118e5a76c1106a15f1ea5f5401..ccaf10142d51318b660e4a1fc45c0ce460ca1d96 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -178,6 +178,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -196,6 +197,9 @@
 
 #define DEBUG_TYPE "loop-predication"
 
+STATISTIC(TotalConsidered, "Number of guards considered");
+STATISTIC(TotalWidened, "Number of checks widened");
+
 using namespace llvm;
 
 static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
@@ -574,6 +578,8 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
   LLVM_DEBUG(dbgs() << "Processing guard:\n");
   LLVM_DEBUG(Guard->dump());
 
+  TotalConsidered++;
+
   IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
 
   // The guard condition is expected to be in form of:
@@ -615,6 +621,8 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
   if (NumWidened == 0)
     return false;
 
+  TotalWidened += NumWidened;
+
   // Emit the new guard condition
   Builder.SetInsertPoint(Guard);
   Value *LastCheck = nullptr;
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index eeaad39dc1d18f825d2d68293814d08bd96e59ef..fd22128f7fe6b849c6083623081a934df2e5a3d5 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -15,6 +15,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Support/Debug.h"
@@ -40,12 +42,19 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
   const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
   const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
 
-  bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, SQ,
-                              false, Threshold, false);
+  Optional<MemorySSAUpdater> MSSAU;
+  if (AR.MSSA)
+    MSSAU = MemorySSAUpdater(AR.MSSA);
+  bool Changed = LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+                              MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+                              SQ, false, Threshold, false);
 
   if (!Changed)
     return PreservedAnalyses::all();
 
+  if (AR.MSSA && VerifyMemorySSA)
+    AR.MSSA->verifyMemorySSA();
+
   return getLoopPassPreservedAnalyses();
 }
 
@@ -68,6 +77,10 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    if (EnableMSSALoopDependency) {
+      AU.addRequired<MemorySSAWrapperPass>();
+      AU.addPreserved<MemorySSAWrapperPass>();
+    }
     getLoopAnalysisUsage(AU);
   }
 
@@ -84,8 +97,14 @@ public:
     auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
     auto *SE = SEWP ? &SEWP->getSE() : nullptr;
     const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
-    return LoopRotation(L, LI, TTI, AC, DT, SE, SQ, false, MaxHeaderSize,
-                        false);
+    Optional<MemorySSAUpdater> MSSAU;
+    if (EnableMSSALoopDependency) {
+      MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+      MSSAU = MemorySSAUpdater(MSSA);
+    }
+    return LoopRotation(L, LI, TTI, AC, DT, SE,
+                        MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
+                        false, MaxHeaderSize, false);
   }
 };
 }
@@ -96,6 +115,7 @@ INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
                     false)
 
diff --git a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index ed37fc8825de8406037574692756e9639a245f31..6cac3787311794de4dd69c9ecd8c4c09e035e2c4 100644
--- a/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-simplifycfg"
 
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
-                            ScalarEvolution &SE, MemorySSAUpdater *MSSAU) {
+static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
+                                        LoopInfo &LI, MemorySSAUpdater *MSSAU) {
   bool Changed = false;
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
   // Copy blocks into a temporary array to avoid iterator invalidation issues
@@ -63,14 +63,25 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
     // Merge Succ into Pred and delete it.
     MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU);
 
-    SE.forgetTopmostLoop(&L);
-
     Changed = true;
   }
 
   return Changed;
 }
 
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+                            ScalarEvolution &SE, MemorySSAUpdater *MSSAU) {
+  bool Changed = false;
+
+  // Eliminate unconditional branches by merging blocks into their predecessors.
+  Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
+
+  if (Changed)
+    SE.forgetTopmostLoop(&L);
+
+  return Changed;
+}
+
 PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
                                            LoopStandardAnalysisResults &AR,
                                            LPMUpdater &) {
diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp
index db502e1c5db89d26d2eff96e9fc0581a3a16546b..ce6ecea2dc2a8dcf6bdd3059175b095811d32b4f 100644
--- a/lib/Transforms/Scalar/LoopSink.cpp
+++ b/lib/Transforms/Scalar/LoopSink.cpp
@@ -202,6 +202,13 @@ static bool sinkInstruction(Loop &L, Instruction &I,
   if (BBsToSinkInto.empty())
     return false;
 
+  // Return if any of the candidate blocks to sink into is non-cold.
+  if (BBsToSinkInto.size() > 1) {
+    for (auto *BB : BBsToSinkInto)
+      if (!LoopBlockNumber.count(BB))
+        return false;
+  }
+
   // Copy the final BBs into a vector and sort them using the total ordering
   // of the loop block numbers as iterating the set doesn't give a useful
   // order. No need to stable sort as the block numbers are a total ordering.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index d980cde49b6472c456f6683f8b948a55b54c0563..d10dae124a79477cd383c4c41584e2d942a386fa 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -540,7 +540,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
         }
       }
 
-      TerminatorInst *TI = BB->getTerminator();
+      Instruction *TI = BB->getTerminator();
 
       // Add in the live successors by first checking whether we have terminator
       // that may be simplified based on the values simplified by this call.
@@ -1333,23 +1333,20 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
     Loop *ParentL = L.getParentLoop();
 #endif
 
-    // The API here is quite complex to call, but there are only two interesting
-    // states we support: partial and full (or "simple") unrolling. However, to
-    // enable these things we actually pass "None" in for the optional to avoid
-    // providing an explicit choice.
-    Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam,
-        AllowPeeling;
     // Check if the profile summary indicates that the profiled application
     // has a huge working set size, in which case we disable peeling to avoid
     // bloating it further.
+    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
     if (PSI && PSI->hasHugeWorkingSetSize())
-      AllowPeeling = false;
+      LocalAllowPeeling = false;
     std::string LoopName = L.getName();
-    LoopUnrollResult Result =
-        tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE,
-                        /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
-                        /*Threshold*/ None, AllowPartialParam, RuntimeParam,
-                        UpperBoundParam, AllowPeeling);
+    // The API here is quite complex to call and we allow to select some
+    // flavors of unrolling during construction time (by setting UnrollOpts).
+    LoopUnrollResult Result = tryToUnrollLoop(
+        &L, DT, &LI, SE, TTI, AC, ORE,
+        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*Count*/ None,
+        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
+        UnrollOpts.AllowUpperBound, LocalAllowPeeling);
     Changed |= Result != LoopUnrollResult::Unmodified;
 
     // The parent must not be damaged by unrolling!
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index f67bff7fe93db5b90878c8e34ad11d797827df40..4a089dfa7dbf3716ab56fa9d9b867de75576c320 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -189,7 +189,7 @@ namespace {
     BasicBlock *loopPreheader = nullptr;
 
     bool SanitizeMemory;
-    LoopSafetyInfo SafetyInfo;
+    SimpleLoopSafetyInfo SafetyInfo;
 
     // LoopBlocks contains all of the basic blocks of the loop, including the
     // preheader of the loop, the body of the loop, and the exit blocks of the
@@ -246,11 +246,11 @@ namespace {
     bool TryTrivialLoopUnswitch(bool &Changed);
 
     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
-                              TerminatorInst *TI = nullptr);
+                              Instruction *TI = nullptr);
     void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
-                                  BasicBlock *ExitBlock, TerminatorInst *TI);
+                                  BasicBlock *ExitBlock, Instruction *TI);
     void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
-                                     TerminatorInst *TI);
+                                     Instruction *TI);
 
     void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
                                               Constant *Val, bool isEqual);
@@ -258,8 +258,7 @@ namespace {
     void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
                                         BasicBlock *TrueDest,
                                         BasicBlock *FalseDest,
-                                        BranchInst *OldBranch,
-                                        TerminatorInst *TI);
+                                        BranchInst *OldBranch, Instruction *TI);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
 
@@ -713,7 +712,7 @@ bool LoopUnswitch::processCurrentLoop() {
   // loop.
   for (Loop::block_iterator I = currentLoop->block_begin(),
          E = currentLoop->block_end(); I != E; ++I) {
-    TerminatorInst *TI = (*I)->getTerminator();
+    Instruction *TI = (*I)->getTerminator();
 
     // Unswitching on a potentially uninitialized predicate is not
     // MSan-friendly. Limit this to the cases when the original predicate is
@@ -722,7 +721,7 @@ bool LoopUnswitch::processCurrentLoop() {
     // This is a workaround for the discrepancy between LLVM IR and MSan
     // semantics. See PR28054 for more details.
     if (SanitizeMemory &&
-        !isGuaranteedToExecute(*TI, DT, currentLoop, &SafetyInfo))
+        !SafetyInfo.isGuaranteedToExecute(*TI, DT, currentLoop))
       continue;
 
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -876,7 +875,7 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
 /// simplify the loop.  If we decide that this is profitable,
 /// unswitch the loop, reprocess the pieces, then return true.
 bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
-                                        TerminatorInst *TI) {
+                                        Instruction *TI) {
   // Check to see if it would be profitable to unswitch current loop.
   if (!BranchesInfo.CostAllowsUnswitching()) {
     LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
@@ -931,7 +930,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
                                                   BasicBlock *TrueDest,
                                                   BasicBlock *FalseDest,
                                                   BranchInst *OldBranch,
-                                                  TerminatorInst *TI) {
+                                                  Instruction *TI) {
   assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
   assert(TrueDest != FalseDest && "Branch targets should be different");
   // Insert a conditional branch on LIC to the two preheaders.  The original
@@ -996,7 +995,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
 /// outside of the loop and updating loop info.
 void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
                                             BasicBlock *ExitBlock,
-                                            TerminatorInst *TI) {
+                                            Instruction *TI) {
   LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
                     << loopHeader->getName() << " [" << L->getBlocks().size()
                     << " blocks] in Function "
@@ -1054,7 +1053,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
 /// condition.
 bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
   BasicBlock *CurrentBB = currentLoop->getHeader();
-  TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+  Instruction *CurrentTerm = CurrentBB->getTerminator();
   LLVMContext &Context = CurrentBB->getContext();
 
   // If loop header has only one reachable successor (currently via an
@@ -1227,7 +1226,7 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
 /// Split it into loop versions and test the condition outside of either loop.
 /// Return the loops created as Out1/Out2.
 void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
-                                               Loop *L, TerminatorInst *TI) {
+                                               Loop *L, Instruction *TI) {
   Function *F = loopHeader->getParent();
   LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
                     << loopHeader->getName() << " [" << L->getBlocks().size()
diff --git a/lib/Transforms/Scalar/MergeICmps.cpp b/lib/Transforms/Scalar/MergeICmps.cpp
index f68662b488cbb053839698715f54f0f6c09e3277..69fd8b163a070d1fa483927b27214c28d1b903b5 100644
--- a/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/lib/Transforms/Scalar/MergeICmps.cpp
@@ -98,7 +98,7 @@ BCEAtom visitICmpLoadOperand(Value *const Val) {
     Value *const Addr = LoadI->getOperand(0);
     if (auto *const GEP = dyn_cast<GetElementPtrInst>(Addr)) {
       LLVM_DEBUG(dbgs() << "GEP\n");
-      if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+      if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
         LLVM_DEBUG(dbgs() << "used outside of block\n");
         return {};
       }
@@ -283,8 +283,9 @@ BCECmpBlock visitICmp(const ICmpInst *const CmpI,
     if (!Lhs.Base()) return {};
     auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
     if (!Rhs.Base()) return {};
+    const auto &DL = CmpI->getModule()->getDataLayout();
     return BCECmpBlock(std::move(Lhs), std::move(Rhs),
-                       CmpI->getOperand(0)->getType()->getScalarSizeInBits());
+                       DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
   }
   return {};
 }
diff --git a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 3464b759280f6c7d911a42749c0046b5e7a1a70b..ee21feca8d2c02deeeb89061015851818f878010 100644
--- a/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -211,6 +211,7 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
 
   auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
                                 &BB->front());
+  NewPN->applyMergedLocation(S0->getDebugLoc(), S1->getDebugLoc());
   NewPN->addIncoming(Opd1, S0->getParent());
   NewPN->addIncoming(Opd2, S1->getParent());
   return NewPN;
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index ed9f868af6182af271b77b5e8deffddf3077132f..9803bcb485d26994f17c82a20a8809bdd2f9b818 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -777,7 +777,7 @@ private:
 
   // Reachability handling.
   void updateReachableEdge(BasicBlock *, BasicBlock *);
-  void processOutgoingEdges(TerminatorInst *, BasicBlock *);
+  void processOutgoingEdges(Instruction *, BasicBlock *);
   Value *findConditionEquivalence(Value *) const;
 
   // Elimination.
@@ -1086,9 +1086,13 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
   CongruenceClass *CC = ValueToClass.lookup(V);
   if (CC) {
     if (CC->getLeader() && CC->getLeader() != I) {
-      // Don't add temporary instructions to the user lists.
-      if (!AllTempInstructions.count(I))
-        addAdditionalUsers(V, I);
+      // If we simplified to something else, we need to communicate
+      // that we're users of the value we simplified to.
+      if (I != V) {
+        // Don't add temporary instructions to the user lists.
+        if (!AllTempInstructions.count(I))
+          addAdditionalUsers(V, I);
+      }
       return createVariableOrConstant(CC->getLeader());
     }
     if (CC->getDefiningExpr()) {
@@ -1751,7 +1755,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
     return true;
   });
   // If we are left with no operands, it's dead.
-  if (Filtered.begin() == Filtered.end()) {
+  if (empty(Filtered)) {
     // If it has undef at this point, it means there are no-non-undef arguments,
     // and thus, the value of the phi node must be undef.
     if (HasUndef) {
@@ -2483,7 +2487,7 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const {
 }
 
 // Process the outgoing edges of a block for reachability.
-void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
+void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
   // Evaluate reachability of terminator instruction.
   BranchInst *BR;
   if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
@@ -3133,7 +3137,7 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
       auto *Symbolized = createUnknownExpression(I);
       performCongruenceFinding(I, Symbolized);
     }
-    processOutgoingEdges(dyn_cast<TerminatorInst>(I), I->getParent());
+    processOutgoingEdges(I, I->getParent());
   }
 }
 
diff --git a/lib/Transforms/Scalar/PlaceSafepoints.cpp b/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 7f9aad2488367f93f55bfda2c2c9e93dd4d72e31..fd2eb85fd7bf5326ff9c83006e6d9910e8e10ce5 100644
--- a/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -105,7 +105,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass {
 
   /// The output of the pass - gives a list of each backedge (described by
   /// pointing at the branch) which need a poll inserted.
-  std::vector<TerminatorInst *> PollLocations;
+  std::vector<Instruction *> PollLocations;
 
   /// True unless we're running spp-no-calls in which case we need to disable
   /// the call-dependent placement opts.
@@ -348,7 +348,7 @@ bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
     // Safepoint insertion would involve creating a new basic block (as the
     // target of the current backedge) which does the safepoint (of all live
     // variables) and branches to the true header
-    TerminatorInst *Term = Pred->getTerminator();
+    Instruction *Term = Pred->getTerminator();
 
     LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
 
@@ -535,7 +535,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) {
 
     // Insert a poll at each point the analysis pass identified
     // The poll location must be the terminator of a loop latch block.
-    for (TerminatorInst *Term : PollLocations) {
+    for (Instruction *Term : PollLocations) {
       // We are inserting a poll, the function is modified
       Modified = true;
 
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 03cd7c10150885ef4d0ef92b8473545c46e9f489..61b6d7ca259300f1f5435fe30752c878b2590d01 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -63,6 +63,7 @@
 
 using namespace llvm;
 using namespace reassociate;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "reassociate"
 
@@ -125,10 +126,10 @@ XorOpnd::XorOpnd(Value *V) {
     Value *V0 = I->getOperand(0);
     Value *V1 = I->getOperand(1);
     const APInt *C;
-    if (match(V0, PatternMatch::m_APInt(C)))
+    if (match(V0, m_APInt(C)))
       std::swap(V0, V1);
 
-    if (match(V1, PatternMatch::m_APInt(C))) {
+    if (match(V1, m_APInt(C))) {
       ConstPart = *C;
       SymbolicPart = V0;
       isOr = (I->getOpcode() == Instruction::Or);
@@ -204,10 +205,10 @@ unsigned ReassociatePass::getRank(Value *V) {
   for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
     Rank = std::max(Rank, getRank(I->getOperand(i)));
 
-  // If this is a not or neg instruction, do not count it for rank.  This
+  // If this is a 'not' or 'neg' instruction, do not count it for rank. This
   // assures us that X and ~X will have the same rank.
-  if (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I) &&
-      !BinaryOperator::isFNeg(I))
+  if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
+      !match(I, m_FNeg(m_Value())))
     ++Rank;
 
   LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
@@ -573,8 +574,8 @@ static bool LinearizeExprTree(BinaryOperator *I,
       // If this is a multiply expression, turn any internal negations into
       // multiplies by -1 so they can be reassociated.
       if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op))
-        if ((Opcode == Instruction::Mul && BinaryOperator::isNeg(BO)) ||
-            (Opcode == Instruction::FMul && BinaryOperator::isFNeg(BO))) {
+        if ((Opcode == Instruction::Mul && match(BO, m_Neg(m_Value()))) ||
+            (Opcode == Instruction::FMul && match(BO, m_FNeg(m_Value())))) {
           LLVM_DEBUG(dbgs()
                      << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
           BO = LowerNegateToMultiply(BO);
@@ -854,7 +855,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
   // Okay, we need to materialize a negated version of V with an instruction.
   // Scan the use lists of V to see if we have one already.
   for (User *U : V->users()) {
-    if (!BinaryOperator::isNeg(U) && !BinaryOperator::isFNeg(U))
+    if (!match(U, m_Neg(m_Value())) && !match(U, m_FNeg(m_Value())))
       continue;
 
     // We found one!  Now we have to make sure that the definition dominates
@@ -899,7 +900,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
 /// Return true if we should break up this subtract of X-Y into (X + -Y).
 static bool ShouldBreakUpSubtract(Instruction *Sub) {
   // If this is a negation, we can't split it up!
-  if (BinaryOperator::isNeg(Sub) || BinaryOperator::isFNeg(Sub))
+  if (match(Sub, m_Neg(m_Value())) || match(Sub, m_FNeg(m_Value()))) 
     return false;
 
   // Don't breakup X - undef.
@@ -1113,8 +1114,8 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
     // First, check for X and ~X in the operand list.
     assert(i < Ops.size());
-    if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
-      Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+    Value *X;
+    if (match(Ops[i].Op, m_Not(m_Value(X)))) {    // Cannot occur for ^.
       unsigned FoundX = FindInOperandList(Ops, i, X);
       if (FoundX != i) {
         if (Opcode == Instruction::And)   // ...&X&~X = 0
@@ -1304,7 +1305,7 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
     Value *V = Ops[i].Op;
     const APInt *C;
     // TODO: Support non-splat vectors.
-    if (match(V, PatternMatch::m_APInt(C))) {
+    if (match(V, m_APInt(C))) {
       ConstOpnd ^= *C;
     } else {
       XorOpnd O(V);
@@ -1460,27 +1461,22 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
     }
 
     // Check for X and -X or X and ~X in the operand list.
-    if (!BinaryOperator::isNeg(TheOp) && !BinaryOperator::isFNeg(TheOp) &&
-        !BinaryOperator::isNot(TheOp))
+    Value *X;
+    if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
+        !match(TheOp, m_FNeg(m_Value(X))))
       continue;
 
-    Value *X = nullptr;
-    if (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp))
-      X = BinaryOperator::getNegArgument(TheOp);
-    else if (BinaryOperator::isNot(TheOp))
-      X = BinaryOperator::getNotArgument(TheOp);
-
     unsigned FoundX = FindInOperandList(Ops, i, X);
     if (FoundX == i)
       continue;
 
     // Remove X and -X from the operand list.
     if (Ops.size() == 2 &&
-        (BinaryOperator::isNeg(TheOp) || BinaryOperator::isFNeg(TheOp)))
+        (match(TheOp, m_Neg(m_Value())) || match(TheOp, m_FNeg(m_Value()))))
       return Constant::getNullValue(X->getType());
 
     // Remove X and ~X from the operand list.
-    if (Ops.size() == 2 && BinaryOperator::isNot(TheOp))
+    if (Ops.size() == 2 && match(TheOp, m_Not(m_Value())))
       return Constant::getAllOnesValue(X->getType());
 
     Ops.erase(Ops.begin()+i);
@@ -1494,7 +1490,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
     e -= 2;  // Removed two elements.
 
     // if X and ~X we append -1 to the operand list.
-    if (BinaryOperator::isNot(TheOp)) {
+    if (match(TheOp, m_Not(m_Value()))) {
       Value *V = Constant::getAllOnesValue(X->getType());
       Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
       e += 1;
@@ -2058,7 +2054,7 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
       RedoInsts.insert(I);
       MadeChange = true;
       I = NI;
-    } else if (BinaryOperator::isNeg(I)) {
+    } else if (match(I, m_Neg(m_Value()))) {
       // Otherwise, this is a negation.  See if the operand is a multiply tree
       // and if this is not an inner node of a multiply tree.
       if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
@@ -2082,7 +2078,7 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
       RedoInsts.insert(I);
       MadeChange = true;
       I = NI;
-    } else if (BinaryOperator::isFNeg(I)) {
+    } else if (match(I, m_FNeg(m_Value()))) {
       // Otherwise, this is a negation.  See if the operand is a multiply tree
       // and if this is not an inner node of a multiply tree.
       if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 5e23a8a3dcd24cb92c514e3f9629fe5f230c92bf..cf2ce03049afc33edaceb91466c55c93d2a43486 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1851,13 +1851,13 @@ static void relocationViaAlloca(
     StoreInst *Store = new StoreInst(Def, Alloca);
     if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
       if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
-        // InvokeInst is a TerminatorInst so the store need to be inserted
-        // into its normal destination block.
+        // InvokeInst is a terminator so the store need to be inserted into its
+        // normal destination block.
         BasicBlock *NormalDest = Invoke->getNormalDest();
         Store->insertBefore(NormalDest->getFirstNonPHI());
       } else {
         assert(!Inst->isTerminator() &&
-               "The only TerminatorInst that can produce a value is "
+               "The only terminator that can produce a value is "
                "InvokeInst which is handled above.");
         Store->insertAfter(Inst);
       }
@@ -2584,7 +2584,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
   // increase the liveset of any statepoint we move over.  This is profitable
   // as long as all statepoints are in rare blocks.  If we had in-register
   // lowering for live values this would be a much safer transform.
-  auto getConditionInst = [](TerminatorInst *TI) -> Instruction* {
+  auto getConditionInst = [](Instruction *TI) -> Instruction * {
     if (auto *BI = dyn_cast<BranchInst>(TI))
       if (BI->isConditional())
         return dyn_cast<Instruction>(BI->getCondition());
@@ -2592,7 +2592,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
     return nullptr;
   };
   for (BasicBlock &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (auto *Cond = getConditionInst(TI))
       // TODO: Handle more than just ICmps here.  We should be able to move
       // most instructions without side effects or memory access.
@@ -2675,7 +2675,7 @@ static SetVector<Value *> computeKillSet(BasicBlock *BB) {
 /// Check that the items in 'Live' dominate 'TI'.  This is used as a basic
 /// sanity check for the liveness computation.
 static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
-                          TerminatorInst *TI, bool TermOkay = false) {
+                          Instruction *TI, bool TermOkay = false) {
   for (Value *V : Live) {
     if (auto *I = dyn_cast<Instruction>(V)) {
       // The terminator can be a member of the LiveOut set.  LLVM's definition
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index d024e03b80a12e269e38249330a56d1a375cf542..1f98128f923a6301593d5c69b97af4d8b9d4d874 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -563,7 +563,7 @@ private:
 
   // getFeasibleSuccessors - Return a vector of booleans to indicate which
   // successors are reachable from a given terminator instruction.
-  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
+  void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs);
 
   // OperandChangedState - This method is invoked on all of the users of an
   // instruction that was just changed state somehow.  Based on this
@@ -604,7 +604,7 @@ private:
   // Terminators
 
   void visitReturnInst(ReturnInst &I);
-  void visitTerminatorInst(TerminatorInst &TI);
+  void visitTerminator(Instruction &TI);
 
   void visitCastInst(CastInst &I);
   void visitSelectInst(SelectInst &I);
@@ -615,7 +615,7 @@ private:
 
   void visitCatchSwitchInst(CatchSwitchInst &CPI) {
     markOverdefined(&CPI);
-    visitTerminatorInst(CPI);
+    visitTerminator(CPI);
   }
 
   // Instructions that cannot be folded away.
@@ -630,12 +630,12 @@ private:
 
   void visitInvokeInst    (InvokeInst &II) {
     visitCallSite(&II);
-    visitTerminatorInst(II);
+    visitTerminator(II);
   }
 
   void visitCallSite      (CallSite CS);
-  void visitResumeInst    (TerminatorInst &I) { /*returns void*/ }
-  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+  void visitResumeInst    (ResumeInst &I) { /*returns void*/ }
+  void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
   void visitFenceInst     (FenceInst &I) { /*returns void*/ }
 
   void visitInstruction(Instruction &I) {
@@ -650,7 +650,7 @@ private:
 
 // getFeasibleSuccessors - Return a vector of booleans to indicate which
 // successors are reachable from a given terminator instruction.
-void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
                                        SmallVectorImpl<bool> &Succs) {
   Succs.resize(TI.getNumSuccessors());
   if (auto *BI = dyn_cast<BranchInst>(&TI)) {
@@ -837,7 +837,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) {
   }
 }
 
-void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+void SCCPSolver::visitTerminator(Instruction &TI) {
   SmallVector<bool, 16> SuccFeasible;
   getFeasibleSuccessors(TI, SuccFeasible);
 
@@ -1017,8 +1017,9 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
 
 // Handle ICmpInst instruction.
 void SCCPSolver::visitCmpInst(CmpInst &I) {
-  LatticeVal &IV = ValueState[&I];
-  if (IV.isOverdefined()) return;
+  // Do not cache this lookup, getValueState calls later in the function might
+  // invalidate the reference.
+  if (ValueState[&I].isOverdefined()) return;
 
   Value *Op1 = I.getOperand(0);
   Value *Op2 = I.getOperand(1);
@@ -1046,7 +1047,8 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
   }
 
   // If operands are still unknown, wait for it to resolve.
-  if (!V1State.isOverdefined() && !V2State.isOverdefined() && !IV.isConstant())
+  if (!V1State.isOverdefined() && !V2State.isOverdefined() &&
+      !ValueState[&I].isConstant())
     return;
 
   markOverdefined(&I);
@@ -1228,6 +1230,8 @@ CallOverdefined:
       SmallVector<Constant*, 8> Operands;
       for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
            AI != E; ++AI) {
+        if (AI->get()->getType()->isStructTy())
+          return markOverdefined(I); // Can't handle struct args.
         LatticeVal State = getValueState(*AI);
 
         if (State.isUnknown())
@@ -1612,7 +1616,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
     // Check to see if we have a branch or switch on an undefined value.  If so
     // we force the branch to go one way or the other to make the successor
     // values live.  It doesn't really matter which way we force it.
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (auto *BI = dyn_cast<BranchInst>(TI)) {
       if (!BI->isConditional()) continue;
       if (!getValueState(BI->getCondition()).isUnknown())
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 6e991409bf079683553237cca3ebe00ca5a95fb5..a8b9ee566395a3b6cabc74ba05eaa2afef4795ad 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -1211,7 +1211,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
   // predecessor blocks. The only thing to watch out for is that we can't put
   // a possibly trapping load in the predecessor if it is a critical edge.
   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
-    TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+    Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
     Value *InVal = PN.getIncomingValue(Idx);
 
     // If the value is produced by the terminator of the predecessor (an
@@ -1275,7 +1275,7 @@ static void speculatePHINodeLoads(PHINode &PN) {
       continue;
     }
 
-    TerminatorInst *TI = Pred->getTerminator();
+    Instruction *TI = Pred->getTerminator();
     IRBuilderTy PredBuilder(TI);
 
     LoadInst *Load = PredBuilder.CreateLoad(
@@ -1400,8 +1400,8 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
   if (Ty == TargetTy)
     return buildGEP(IRB, BasePtr, Indices, NamePrefix);
 
-  // Pointer size to use for the indices.
-  unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());
+  // Offset size to use for the indices.
+  unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
 
   // See if we can descend into a struct and locate a field with the correct
   // type.
@@ -1413,7 +1413,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
 
     if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
       ElementTy = ArrayTy->getElementType();
-      Indices.push_back(IRB.getIntN(PtrSize, 0));
+      Indices.push_back(IRB.getIntN(OffsetSize, 0));
     } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
       ElementTy = VectorTy->getElementType();
       Indices.push_back(IRB.getInt32(0));
@@ -2377,7 +2377,7 @@ private:
 #endif
 
     return getAdjustedPtr(IRB, DL, &NewAI,
-                          APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+                          APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
                           PointerTy,
 #ifndef NDEBUG
                           Twine(OldName) + "."
@@ -2899,8 +2899,8 @@ private:
     unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
 
     // Compute the relative offset for the other pointer within the transfer.
-    unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
-    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
+    unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
+    APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
     unsigned OtherAlign =
       IsDest ? II.getSourceAlignment() : II.getDestAlignment();
     OtherAlign =  MinAlign(OtherAlign ? OtherAlign : 1,
diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 17035f469dae04b6e7c5eb4d15c3dc8dae19945e..368f0925abac0011e658acf2c11ecff2265f4b4c 100644
--- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GuardUtils.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -59,6 +60,7 @@ using namespace llvm;
 
 STATISTIC(NumBranches, "Number of branches unswitched");
 STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
@@ -70,6 +72,11 @@ static cl::opt<int>
     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
                       cl::desc("The cost threshold for unswitching a loop."));
 
+static cl::opt<bool> UnswitchGuards(
+    "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
+    cl::desc("If enabled, simple loop unswitching will also consider "
+             "llvm.experimental.guard intrinsics as unswitch candidates."));
+
 /// Collect all of the loop invariant input values transitively used by the
 /// homogeneous instruction graph from a given root.
 ///
@@ -783,7 +790,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
                      [](Instruction &I) { return I.mayHaveSideEffects(); }))
       return Changed;
 
-    TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+    Instruction *CurrentTerm = CurrentBB->getTerminator();
 
     if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
       // Don't bother trying to unswitch past a switch with a constant
@@ -1792,10 +1799,10 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
   } while (!DomWorklist.empty());
 }
 
-static bool unswitchNontrivialInvariants(
-    Loop &L, TerminatorInst &TI, ArrayRef<Value *> Invariants,
-    DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC,
-    function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+static void unswitchNontrivialInvariants(
+    Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
+    SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
+    AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
     ScalarEvolution *SE) {
   auto *ParentBB = TI.getParent();
   BranchInst *BI = dyn_cast<BranchInst>(&TI);
@@ -1851,17 +1858,6 @@ static bool unswitchNontrivialInvariants(
   // whatever reason).
   assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
 
-  SmallVector<BasicBlock *, 4> ExitBlocks;
-  L.getUniqueExitBlocks(ExitBlocks);
-
-  // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
-  // don't know how to split those exit blocks.
-  // FIXME: We should teach SplitBlock to handle this and remove this
-  // restriction.
-  for (auto *ExitBB : ExitBlocks)
-    if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI()))
-      return false;
-
   // Compute the parent loop now before we start hacking on things.
   Loop *ParentL = L.getParentLoop();
 
@@ -2048,6 +2044,18 @@ static bool unswitchNontrivialInvariants(
     assert(UnswitchedSuccBBs.size() == 1 &&
            "Only one possible unswitched block for a branch!");
     BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+
+    // When considering multiple partially-unswitched invariants
+    // we cant just go replace them with constants in both branches.
+    //
+    // For 'AND' we infer that true branch ("continue") means true
+    // for each invariant operand.
+    // For 'OR' we can infer that false branch ("continue") means false
+    // for each invariant operand.
+    // So it happens that for multiple-partial case we dont replace
+    // in the unswitched branch.
+    bool ReplaceUnswitched = FullUnswitch || (Invariants.size() == 1);
+
     ConstantInt *UnswitchedReplacement =
         Direction ? ConstantInt::getTrue(BI->getContext())
                   : ConstantInt::getFalse(BI->getContext());
@@ -2067,7 +2075,8 @@ static bool unswitchNontrivialInvariants(
         // unswitched if in the cloned blocks.
         if (DT.dominates(LoopPH, UserI->getParent()))
           U->set(ContinueReplacement);
-        else if (DT.dominates(ClonedPH, UserI->getParent()))
+        else if (ReplaceUnswitched &&
+                 DT.dominates(ClonedPH, UserI->getParent()))
           U->set(UnswitchedReplacement);
       }
   }
@@ -2145,7 +2154,6 @@ static bool unswitchNontrivialInvariants(
   UnswitchCB(IsStillLoop, SibLoops);
 
   ++NumBranches;
-  return true;
 }
 
 /// Recursively compute the cost of a dominator subtree based on the per-block
@@ -2181,6 +2189,77 @@ computeDomSubtreeCost(DomTreeNode &N,
   return Cost;
 }
 
+/// Turns a llvm.experimental.guard intrinsic into implicit control flow branch,
+/// making the following replacement:
+///
+///   --code before guard--
+///   call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+///   --code after guard--
+///
+/// into
+///
+///   --code before guard--
+///   br i1 %cond, label %guarded, label %deopt
+///
+/// guarded:
+///   --code after guard--
+///
+/// deopt:
+///   call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+///   unreachable
+///
+/// It also makes all relevant DT and LI updates, so that all structures are in
+/// valid state after this transform.
+static BranchInst *
+turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
+                    SmallVectorImpl<BasicBlock *> &ExitBlocks,
+                    DominatorTree &DT, LoopInfo &LI) {
+  SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+  LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
+  BasicBlock *CheckBB = GI->getParent();
+
+  // Remove all CheckBB's successors from DomTree. A block can be seen among
+  // successors more than once, but for DomTree it should be added only once.
+  SmallPtrSet<BasicBlock *, 4> Successors;
+  for (auto *Succ : successors(CheckBB))
+    if (Successors.insert(Succ).second)
+      DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
+
+  Instruction *DeoptBlockTerm =
+      SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
+  BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+  // SplitBlockAndInsertIfThen inserts control flow that branches to
+  // DeoptBlockTerm if the condition is true.  We want the opposite.
+  CheckBI->swapSuccessors();
+
+  BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
+  GuardedBlock->setName("guarded");
+  CheckBI->getSuccessor(1)->setName("deopt");
+
+  // We now have a new exit block.
+  ExitBlocks.push_back(CheckBI->getSuccessor(1));
+
+  GI->moveBefore(DeoptBlockTerm);
+  GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
+
+  // Add new successors of CheckBB into DomTree.
+  for (auto *Succ : successors(CheckBB))
+    DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
+
+  // Now the blocks that used to be CheckBB's successors are GuardedBlock's
+  // successors.
+  for (auto *Succ : Successors)
+    DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
+
+  // Make proper changes to DT.
+  DT.applyUpdates(DTUpdates);
+  // Inform LI of a new loop block.
+  L.addBasicBlockToLoop(GuardedBlock, LI);
+
+  ++NumGuards;
+  return CheckBI;
+}
+
 static bool
 unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       AssumptionCache &AC, TargetTransformInfo &TTI,
@@ -2188,12 +2267,31 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       ScalarEvolution *SE) {
   // Collect all invariant conditions within this loop (as opposed to an inner
   // loop which would be handled when visiting that inner loop).
-  SmallVector<std::pair<TerminatorInst *, TinyPtrVector<Value *>>, 4>
+  SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
       UnswitchCandidates;
+
+  // Whether or not we should also collect guards in the loop.
+  bool CollectGuards = false;
+  if (UnswitchGuards) {
+    auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
+        Intrinsic::getName(Intrinsic::experimental_guard));
+    if (GuardDecl && !GuardDecl->use_empty())
+      CollectGuards = true;
+  }
+
   for (auto *BB : L.blocks()) {
     if (LI.getLoopFor(BB) != &L)
       continue;
 
+    if (CollectGuards)
+      for (auto &I : *BB)
+        if (isGuard(&I)) {
+          auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
+          // TODO: Support AND, OR conditions and partial unswitching.
+          if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
+            UnswitchCandidates.push_back({&I, {Cond}});
+        }
+
     if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
       // We can only consider fully loop-invariant switch conditions as we need
       // to completely eliminate the switch after unswitching.
@@ -2241,6 +2339,19 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
   if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
     return false;
 
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L.getUniqueExitBlocks(ExitBlocks);
+
+  // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
+  // don't know how to split those exit blocks.
+  // FIXME: We should teach SplitBlock to handle this and remove this
+  // restriction.
+  for (auto *ExitBB : ExitBlocks)
+    if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) {
+      dbgs() << "Cannot unswitch because of cleanuppad in exit block\n";
+      return false;
+    }
+
   LLVM_DEBUG(
       dbgs() << "Considering " << UnswitchCandidates.size()
              << " non-trivial loop invariant conditions for unswitching.\n");
@@ -2298,7 +2409,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
   SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
   // Given a terminator which might be unswitched, computes the non-duplicated
   // cost for that terminator.
-  auto ComputeUnswitchedCost = [&](TerminatorInst &TI, bool FullUnswitch) {
+  auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) {
     BasicBlock &BB = *TI.getParent();
     SmallPtrSet<BasicBlock *, 4> Visited;
 
@@ -2345,15 +2456,18 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
     // Now scale the cost by the number of unique successors minus one. We
     // subtract one because there is already at least one copy of the entire
     // loop. This is computing the new cost of unswitching a condition.
-    assert(Visited.size() > 1 &&
+    // Note that guards always have 2 unique successors that are implicit and
+    // will be materialized if we decide to unswitch it.
+    int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
+    assert(SuccessorsCount > 1 &&
            "Cannot unswitch a condition without multiple distinct successors!");
-    return Cost * (Visited.size() - 1);
+    return Cost * (SuccessorsCount - 1);
   };
-  TerminatorInst *BestUnswitchTI = nullptr;
+  Instruction *BestUnswitchTI = nullptr;
   int BestUnswitchCost;
   ArrayRef<Value *> BestUnswitchInvariants;
   for (auto &TerminatorAndInvariants : UnswitchCandidates) {
-    TerminatorInst &TI = *TerminatorAndInvariants.first;
+    Instruction &TI = *TerminatorAndInvariants.first;
     ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
     BranchInst *BI = dyn_cast<BranchInst>(&TI);
     int CandidateCost = ComputeUnswitchedCost(
@@ -2374,11 +2488,17 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
     return false;
   }
 
-  LLVM_DEBUG(dbgs() << "  Trying to unswitch non-trivial (cost = "
+  // If the best candidate is a guard, turn it into a branch.
+  if (isGuard(BestUnswitchTI))
+    BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
+                                         ExitBlocks, DT, LI);
+
+  LLVM_DEBUG(dbgs() << "  Unswitching non-trivial (cost = "
                     << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
                     << "\n");
-  return unswitchNontrivialInvariants(
-      L, *BestUnswitchTI, BestUnswitchInvariants, DT, LI, AC, UnswitchCB, SE);
+  unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
+                               ExitBlocks, DT, LI, AC, UnswitchCB, SE);
+  return true;
 }
 
 /// Unswitch control flow predicated on loop invariant conditions.
diff --git a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 2061db13639a25b573a3e8039db101f488be376c..b5089b006bdd1131a32ae03c1ddf088488d8ffd6 100644
--- a/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -640,12 +640,12 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
   Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
   switch (C.CandidateKind) {
   case Candidate::Add:
-  case Candidate::Mul:
+  case Candidate::Mul: {
     // C = Basis + Bump
-    if (BinaryOperator::isNeg(Bump)) {
+    Value *NegBump;
+    if (match(Bump, m_Neg(m_Value(NegBump)))) {
       // If Bump is a neg instruction, emit C = Basis - (-Bump).
-      Reduced =
-          Builder.CreateSub(Basis.Ins, BinaryOperator::getNegArgument(Bump));
+      Reduced = Builder.CreateSub(Basis.Ins, NegBump);
       // We only use the negative argument of Bump, and Bump itself may be
       // trivially dead.
       RecursivelyDeleteTriviallyDeadInstructions(Bump);
@@ -662,6 +662,7 @@ void StraightLineStrengthReduce::rewriteCandidateWithBasis(
       Reduced = Builder.CreateAdd(Basis.Ins, Bump);
     }
     break;
+  }
   case Candidate::GEP:
     {
       Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index f58f79f8b1448709b71604abc1bb2cacb43e7e7d..0db762d846f225add2026a9195db667036a2d2c0 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
@@ -596,7 +597,8 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
 
 /// Add the real PHI value as soon as everything is set up
 void StructurizeCFG::setPhiValues() {
-  SSAUpdater Updater;
+  SmallVector<PHINode *, 8> InsertedPhis;
+  SSAUpdater Updater(&InsertedPhis);
   for (const auto &AddedPhi : AddedPhis) {
     BasicBlock *To = AddedPhi.first;
     const BBVector &From = AddedPhi.second;
@@ -632,11 +634,31 @@ void StructurizeCFG::setPhiValues() {
     DeletedPhis.erase(To);
   }
   assert(DeletedPhis.empty());
+
+  // Simplify any phis inserted by the SSAUpdater if possible
+  bool Changed;
+  do {
+    Changed = false;
+
+    SimplifyQuery Q(Func->getParent()->getDataLayout());
+    Q.DT = DT;
+    for (size_t i = 0; i < InsertedPhis.size(); ++i) {
+      PHINode *Phi = InsertedPhis[i];
+      if (Value *V = SimplifyInstruction(Phi, Q)) {
+        Phi->replaceAllUsesWith(V);
+        Phi->eraseFromParent();
+        InsertedPhis[i] = InsertedPhis.back();
+        InsertedPhis.pop_back();
+        i--;
+        Changed = true;
+      }
+    }
+  } while (Changed);
 }
 
 /// Remove phi values from all successors and then remove the terminator.
 void StructurizeCFG::killTerminator(BasicBlock *BB) {
-  TerminatorInst *Term = BB->getTerminator();
+  Instruction *Term = BB->getTerminator();
   if (!Term)
     return;
 
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 6a77a2d414f8bd021960ed781b8492804345fd68..0f6db21f73b60eb5ea57e7b457b79054e8139df9 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -702,7 +702,7 @@ static bool foldReturnAndProcessPred(
   SmallVector<BranchInst*, 8> UncondBranchPreds;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *Pred = *PI;
-    TerminatorInst *PTI = Pred->getTerminator();
+    Instruction *PTI = Pred->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
       if (BI->isUnconditional())
         UncondBranchPreds.push_back(BI);
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 99914fcf81b226aea0436ca96baba47983733468..11a0114150ffb29a8964d7ed97440d8443425919 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -52,7 +52,7 @@ void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU) {
   assert((pred_begin(BB) == pred_end(BB) ||
          // Can delete self loop.
          BB->getSinglePredecessor() == BB) && "Block is not dead!");
-  TerminatorInst *BBTerm = BB->getTerminator();
+  Instruction *BBTerm = BB->getTerminator();
   std::vector<DominatorTree::UpdateType> Updates;
 
   // Loop through all of our successors and make sure they know that one
@@ -270,7 +270,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
   unsigned SuccNum = GetSuccessorNumber(BB, Succ);
 
   // If this is a critical edge, let SplitCriticalEdge do it.
-  TerminatorInst *LatchTerm = BB->getTerminator();
+  Instruction *LatchTerm = BB->getTerminator();
   if (SplitCriticalEdge(
           LatchTerm, SuccNum,
           CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA()))
@@ -298,7 +298,7 @@ llvm::SplitAllCriticalEdges(Function &F,
                             const CriticalEdgeSplittingOptions &Options) {
   unsigned NumBroken = 0;
   for (BasicBlock &BB : F) {
-    TerminatorInst *TI = BB.getTerminator();
+    Instruction *TI = BB.getTerminator();
     if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
       for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
         if (SplitCriticalEdge(TI, i, Options))
@@ -705,16 +705,17 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
   return cast<ReturnInst>(NewRet);
 }
 
-TerminatorInst *
-llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
-                                bool Unreachable, MDNode *BranchWeights,
-                                DominatorTree *DT, LoopInfo *LI) {
+Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond,
+                                             Instruction *SplitBefore,
+                                             bool Unreachable,
+                                             MDNode *BranchWeights,
+                                             DominatorTree *DT, LoopInfo *LI) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
-  TerminatorInst *HeadOldTerm = Head->getTerminator();
+  Instruction *HeadOldTerm = Head->getTerminator();
   LLVMContext &C = Head->getContext();
   BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
-  TerminatorInst *CheckTerm;
+  Instruction *CheckTerm;
   if (Unreachable)
     CheckTerm = new UnreachableInst(C, ThenBlock);
   else
@@ -749,12 +750,12 @@ llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
 }
 
 void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
-                                         TerminatorInst **ThenTerm,
-                                         TerminatorInst **ElseTerm,
+                                         Instruction **ThenTerm,
+                                         Instruction **ElseTerm,
                                          MDNode *BranchWeights) {
   BasicBlock *Head = SplitBefore->getParent();
   BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
-  TerminatorInst *HeadOldTerm = Head->getTerminator();
+  Instruction *HeadOldTerm = Head->getTerminator();
   LLVMContext &C = Head->getContext();
   BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
   BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 63b37e37943344ae87d772b8eeba07f9ebbc9a32..fafc9aaba5c9cc4b095e9d7ca7538ac3aae8e9ae 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -130,7 +130,7 @@ static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
 }
 
 BasicBlock *
-llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
+llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
                         const CriticalEdgeSplittingOptions &Options) {
   if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
     return nullptr;
@@ -318,7 +318,7 @@ findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
   BasicBlock *IBB = nullptr;
   for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
     BasicBlock *PredBB = PN->getIncomingBlock(Pred);
-    TerminatorInst *PredTerm = PredBB->getTerminator();
+    Instruction *PredTerm = PredBB->getTerminator();
     switch (PredTerm->getOpcode()) {
     case Instruction::IndirectBr:
       if (IBB)
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 6eb39e5b959dfee5a371815ed4704a154061194c..3466dedd3236e1b1cc1d37d40f3aa69258996b14 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -121,6 +121,14 @@ static bool setNonLazyBind(Function &F) {
   return true;
 }
 
+bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
+                                  const TargetLibraryInfo &TLI) {
+  Function *F = M->getFunction(Name);
+  if (!F)
+    return false;
+  return inferLibFuncAttributes(*F, TLI);
+}
+
 bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
   LibFunc TheLibFunc;
   if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
@@ -757,6 +765,24 @@ bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
   }
 }
 
+StringRef llvm::getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+                                LibFunc DoubleFn, LibFunc FloatFn,
+                                LibFunc LongDoubleFn) {
+  assert(hasUnaryFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
+         "Cannot get name for unavailable function!");
+
+  switch (Ty->getTypeID()) {
+  case Type::HalfTyID:
+    llvm_unreachable("No name for HalfTy!");
+  case Type::FloatTyID:
+    return TLI->getName(FloatFn);
+  case Type::DoubleTyID:
+    return TLI->getName(DoubleFn);
+  default:
+    return TLI->getName(LongDoubleFn);
+  }
+}
+
 //- Emit LibCalls ------------------------------------------------------------//
 
 Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
@@ -770,11 +796,12 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef StrlenName = TLI->getName(LibFunc_strlen);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
+  Constant *StrLen = M->getOrInsertFunction(StrlenName, DL.getIntPtrType(Context),
                                             B.getInt8PtrTy());
-  inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
-  CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
+  inferLibFuncAttributes(M, StrlenName, *TLI);
+  CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), StrlenName);
   if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
 
@@ -787,13 +814,14 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef StrChrName = TLI->getName(LibFunc_strchr);
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
   Constant *StrChr =
-      M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
-  inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
+      M->getOrInsertFunction(StrChrName, I8Ptr, I8Ptr, I32Ty);
+  inferLibFuncAttributes(M, StrChrName, *TLI);
   CallInst *CI = B.CreateCall(
-      StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
+      StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, StrChrName);
   if (const Function *F = dyn_cast<Function>(StrChr->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
@@ -805,13 +833,14 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef StrNCmpName = TLI->getName(LibFunc_strncmp);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
+  Value *StrNCmp = M->getOrInsertFunction(StrNCmpName, B.getInt32Ty(),
                                           B.getInt8PtrTy(), B.getInt8PtrTy(),
                                           DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
+  inferLibFuncAttributes(M, StrNCmpName, *TLI);
   CallInst *CI = B.CreateCall(
-      StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
+      StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, StrNCmpName);
 
   if (const Function *F = dyn_cast<Function>(StrNCmp->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -827,7 +856,7 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
-  inferLibFuncAttributes(*M->getFunction(Name), *TLI);
+  inferLibFuncAttributes(M, Name, *TLI);
   CallInst *CI =
       B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
   if (const Function *F = dyn_cast<Function>(StrCpy->stripPointerCasts()))
@@ -844,9 +873,9 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
                                           Len->getType());
-  inferLibFuncAttributes(*M->getFunction(Name), *TLI);
+  inferLibFuncAttributes(M, Name, *TLI);
   CallInst *CI = B.CreateCall(
-      StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
+      StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, Name);
   if (const Function *F = dyn_cast<Function>(StrNCpy->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
@@ -881,12 +910,13 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef MemChrName = TLI->getName(LibFunc_memchr);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
+  Value *MemChr = M->getOrInsertFunction(MemChrName, B.getInt8PtrTy(),
                                          B.getInt8PtrTy(), B.getInt32Ty(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
-  CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
+  inferLibFuncAttributes(M, MemChrName, *TLI);
+  CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, MemChrName);
 
   if (const Function *F = dyn_cast<Function>(MemChr->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -900,13 +930,14 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef MemCmpName = TLI->getName(LibFunc_memcmp);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
+  Value *MemCmp = M->getOrInsertFunction(MemCmpName, B.getInt32Ty(),
                                          B.getInt8PtrTy(), B.getInt8PtrTy(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
+  inferLibFuncAttributes(M, MemCmpName, *TLI);
   CallInst *CI = B.CreateCall(
-      MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
+      MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, MemCmpName);
 
   if (const Function *F = dyn_cast<Function>(MemCmp->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -929,10 +960,10 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
   }
 }
 
-Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
-                                  const AttributeList &Attrs) {
-  SmallString<20> NameBuffer;
-  appendTypeSuffix(Op, Name, NameBuffer);
+static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
+                                         IRBuilder<> &B,
+                                         const AttributeList &Attrs) {
+  assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
 
   Module *M = B.GetInsertBlock()->getModule();
   Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
@@ -951,8 +982,29 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
   return CI;
 }
 
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+                                  const AttributeList &Attrs) {
+  SmallString<20> NameBuffer;
+  appendTypeSuffix(Op, Name, NameBuffer);
+
+  return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+                                  LibFunc DoubleFn, LibFunc FloatFn,
+                                  LibFunc LongDoubleFn, IRBuilder<> &B,
+                                  const AttributeList &Attrs) {
+  // Get the name of the function according to TLI.
+  StringRef Name = getUnaryFloatFn(TLI, Op->getType(),
+                                   DoubleFn, FloatFn, LongDoubleFn);
+
+  return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
 Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
                                    IRBuilder<> &B, const AttributeList &Attrs) {
+  assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
+
   SmallString<20> NameBuffer;
   appendTypeSuffix(Op1, Name, NameBuffer);
 
@@ -973,14 +1025,15 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty());
-  inferLibFuncAttributes(*M->getFunction("putchar"), *TLI);
+  StringRef PutCharName = TLI->getName(LibFunc_putchar);
+  Value *PutChar = M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
+  inferLibFuncAttributes(M, PutCharName, *TLI);
   CallInst *CI = B.CreateCall(PutChar,
                               B.CreateIntCast(Char,
                               B.getInt32Ty(),
                               /*isSigned*/true,
                               "chari"),
-                              "putchar");
+                              PutCharName);
 
   if (const Function *F = dyn_cast<Function>(PutChar->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -993,10 +1046,11 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef PutsName = TLI->getName(LibFunc_puts);
   Value *PutS =
-      M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
-  inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
-  CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
+      M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
+  inferLibFuncAttributes(M, PutsName, *TLI);
+  CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
   if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
   return CI;
@@ -1008,13 +1062,14 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
+  StringRef FPutcName = TLI->getName(LibFunc_fputc);
+  Constant *F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(), B.getInt32Ty(),
                                        File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
+    inferLibFuncAttributes(M, FPutcName, *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
                          "chari");
-  CallInst *CI = B.CreateCall(F, {Char, File}, "fputc");
+  CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1027,12 +1082,13 @@ Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Constant *F = M->getOrInsertFunction("fputc_unlocked", B.getInt32Ty(),
+  StringRef FPutcUnlockedName = TLI->getName(LibFunc_fputc_unlocked);
+  Constant *F = M->getOrInsertFunction(FPutcUnlockedName, B.getInt32Ty(),
                                        B.getInt32Ty(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction("fputc_unlocked"), *TLI);
+    inferLibFuncAttributes(M, FPutcUnlockedName, *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari");
-  CallInst *CI = B.CreateCall(F, {Char, File}, "fputc_unlocked");
+  CallInst *CI = B.CreateCall(F, {Char, File}, FPutcUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1049,8 +1105,8 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction(
       FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
-  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
+    inferLibFuncAttributes(M, FPutsName, *TLI);
+  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1067,8 +1123,8 @@ Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
   Constant *F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(),
                                        B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FPutsUnlockedName), *TLI);
-  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs_unlocked");
+    inferLibFuncAttributes(M, FPutsUnlockedName, *TLI);
+  CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1088,7 +1144,7 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
+    inferLibFuncAttributes(M, FWriteName, *TLI);
   CallInst *CI =
       B.CreateCall(F, {castToCStr(Ptr, B), Size,
                        ConstantInt::get(DL.getIntPtrType(Context), 1), File});
@@ -1104,11 +1160,12 @@ Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef MallocName = TLI->getName(LibFunc_malloc);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *Malloc = M->getOrInsertFunction("malloc", B.getInt8PtrTy(),
+  Value *Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
                                          DL.getIntPtrType(Context));
-  inferLibFuncAttributes(*M->getFunction("malloc"), *TLI);
-  CallInst *CI = B.CreateCall(Malloc, Num, "malloc");
+  inferLibFuncAttributes(M, MallocName, *TLI);
+  CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
 
   if (const Function *F = dyn_cast<Function>(Malloc->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -1122,12 +1179,13 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef CallocName = TLI.getName(LibFunc_calloc);
   const DataLayout &DL = M->getDataLayout();
   IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
-  Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
+  Value *Calloc = M->getOrInsertFunction(CallocName, Attrs, B.getInt8PtrTy(),
                                          PtrType, PtrType);
-  inferLibFuncAttributes(*M->getFunction("calloc"), TLI);
-  CallInst *CI = B.CreateCall(Calloc, {Num, Size}, "calloc");
+  inferLibFuncAttributes(M, CallocName, TLI);
+  CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
 
   if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
     CI->setCallingConv(F->getCallingConv());
@@ -1149,7 +1207,7 @@ Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FWriteUnlockedName), *TLI);
+    inferLibFuncAttributes(M, FWriteUnlockedName, *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
@@ -1163,11 +1221,12 @@ Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef FGetCUnlockedName = TLI->getName(LibFunc_fgetc_unlocked);
   Constant *F =
-      M->getOrInsertFunction("fgetc_unlocked", B.getInt32Ty(), File->getType());
+      M->getOrInsertFunction(FGetCUnlockedName, B.getInt32Ty(), File->getType());
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction("fgetc_unlocked"), *TLI);
-  CallInst *CI = B.CreateCall(F, File, "fgetc_unlocked");
+    inferLibFuncAttributes(M, FGetCUnlockedName, *TLI);
+  CallInst *CI = B.CreateCall(F, File, FGetCUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1180,12 +1239,13 @@ Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
+  StringRef FGetSUnlockedName = TLI->getName(LibFunc_fgets_unlocked);
   Constant *F =
-      M->getOrInsertFunction("fgets_unlocked", B.getInt8PtrTy(),
+      M->getOrInsertFunction(FGetSUnlockedName, B.getInt8PtrTy(),
                              B.getInt8PtrTy(), B.getInt32Ty(), File->getType());
-  inferLibFuncAttributes(*M->getFunction("fgets_unlocked"), *TLI);
+  inferLibFuncAttributes(M, FGetSUnlockedName, *TLI);
   CallInst *CI =
-      B.CreateCall(F, {castToCStr(Str, B), Size, File}, "fgets_unlocked");
+      B.CreateCall(F, {castToCStr(Str, B), Size, File}, FGetSUnlockedName);
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
     CI->setCallingConv(Fn->getCallingConv());
@@ -1206,7 +1266,7 @@ Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
       DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
 
   if (File->getType()->isPointerTy())
-    inferLibFuncAttributes(*M->getFunction(FReadUnlockedName), *TLI);
+    inferLibFuncAttributes(M, FReadUnlockedName, *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
 
   if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts()))
diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp
index 6d18d0614611eab5d7957c2bb4b1274ff74a06f2..4db579156d90aa538008434d1b90c0dec6765cd6 100644
--- a/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -177,8 +177,8 @@ static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {
     InsertBefore = &*std::next(CS.getInstruction()->getIterator());
 
   // Bitcast the return value to the correct type.
-  auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),
-                                RetTy, "", InsertBefore);
+  auto *Cast = CastInst::CreateBitOrPointerCast(CS.getInstruction(), RetTy, "",
+                                                InsertBefore);
   if (RetBitCast)
     *RetBitCast = Cast;
 
@@ -270,8 +270,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
   // Create an if-then-else structure. The original instruction is moved into
   // the "else" block, and a clone of the original instruction is placed in the
   // "then" block.
-  TerminatorInst *ThenTerm = nullptr;
-  TerminatorInst *ElseTerm = nullptr;
+  Instruction *ThenTerm = nullptr;
+  Instruction *ElseTerm = nullptr;
   SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm,
                                 BranchWeights);
   BasicBlock *ThenBlock = ThenTerm->getParent();
@@ -321,12 +321,14 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
                             const char **FailureReason) {
   assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");
 
+  auto &DL = Callee->getParent()->getDataLayout();
+
   // Check the return type. The callee's return value type must be bitcast
   // compatible with the call site's type.
   Type *CallRetTy = CS.getInstruction()->getType();
   Type *FuncRetTy = Callee->getReturnType();
   if (CallRetTy != FuncRetTy)
-    if (!CastInst::isBitCastable(FuncRetTy, CallRetTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
       if (FailureReason)
         *FailureReason = "Return type mismatch";
       return false;
@@ -351,7 +353,7 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
     Type *ActualTy = CS.getArgument(I)->getType();
     if (FormalTy == ActualTy)
       continue;
-    if (!CastInst::isBitCastable(ActualTy, FormalTy)) {
+    if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
       if (FailureReason)
         *FailureReason = "Argument type mismatch";
       return false;
@@ -396,8 +398,8 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
     Type *FormalTy = CalleeType->getParamType(ArgNo);
     Type *ActualTy = Arg->getType();
     if (FormalTy != ActualTy) {
-      auto *Cast = CastInst::Create(Instruction::BitCast, Arg, FormalTy, "",
-                                    CS.getInstruction());
+      auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "",
+                                                    CS.getInstruction());
       CS.setArgument(ArgNo, Cast);
     }
   }
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index a9257a8c6702660c6fa99168d229afa08316fc83..000af808945ae599d2089d5b5a3610a91989a610 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -365,7 +365,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
   }
 
   // Finally, clone over the terminator.
-  const TerminatorInst *OldTI = BB->getTerminator();
+  const Instruction *OldTI = BB->getTerminator();
   bool TerminatorDone = false;
   if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
     if (BI->isConditional()) {
@@ -414,7 +414,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
           CodeInfo->OperandBundleCallSites.push_back(NewInst);
 
     // Recursively clone any reachable successor blocks.
-    const TerminatorInst *TI = BB->getTerminator();
+    const Instruction *TI = BB->getTerminator();
     for (const BasicBlock *Succ : successors(TI))
       ToClone.push_back(Succ);
   }
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 7f26c53ecf3e5e082de8145fbef69dbeb7f264c7..419e1db08bfd5320e654215daffb999812295431 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -57,6 +57,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
 #include <iterator>
@@ -167,14 +168,22 @@ static bool isBlockValidForExtraction(const BasicBlock &BB,
       continue;
     }
 
-    if (const CallInst *CI = dyn_cast<CallInst>(I))
-      if (const Function *F = CI->getCalledFunction())
-        if (F->getIntrinsicID() == Intrinsic::vastart) {
+    if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (const Function *F = CI->getCalledFunction()) {
+        auto IID = F->getIntrinsicID();
+        if (IID == Intrinsic::vastart) {
           if (AllowVarArgs)
             continue;
           else
             return false;
         }
+
+        // Currently, we miscompile outlined copies of eh_typid_for. There are
+        // proposals for fixing this in llvm.org/PR39545.
+        if (IID == Intrinsic::eh_typeid_for)
+          return false;
+      }
+    }
   }
 
   return true;
@@ -228,19 +237,21 @@ buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
 CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
                              bool AggregateArgs, BlockFrequencyInfo *BFI,
                              BranchProbabilityInfo *BPI, bool AllowVarArgs,
-                             bool AllowAlloca)
+                             bool AllowAlloca, std::string Suffix)
     : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
       BPI(BPI), AllowVarArgs(AllowVarArgs),
-      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)) {}
+      Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
+      Suffix(Suffix) {}
 
 CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
                              BlockFrequencyInfo *BFI,
-                             BranchProbabilityInfo *BPI)
+                             BranchProbabilityInfo *BPI, std::string Suffix)
     : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
       BPI(BPI), AllowVarArgs(false),
       Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
                                      /* AllowVarArgs */ false,
-                                     /* AllowAlloca */ false)) {}
+                                     /* AllowAlloca */ false)),
+      Suffix(Suffix) {}
 
 /// definedInRegion - Return true if the specified value is defined in the
 /// extracted region.
@@ -566,7 +577,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
     // changing them to branch to NewBB instead.
     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
       if (Blocks.count(PN->getIncomingBlock(i))) {
-        TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator();
+        Instruction *TI = PN->getIncomingBlock(i)->getTerminator();
         TI->replaceUsesOfWith(OldPred, NewBB);
       }
 
@@ -669,10 +680,14 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
                   FunctionType::get(RetTy, paramTy,
                                     AllowVarArgs && oldFunction->isVarArg());
 
+  std::string SuffixToUse =
+      Suffix.empty()
+          ? (header->getName().empty() ? "extracted" : header->getName().str())
+          : Suffix;
   // Create the new function
   Function *newFunction = Function::Create(
       funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
-      oldFunction->getName() + "_" + header->getName(), M);
+      oldFunction->getName() + "." + SuffixToUse, M);
   // If the old function is no-throw, so is the new one.
   if (oldFunction->doesNotThrow())
     newFunction->setDoesNotThrow();
@@ -778,7 +793,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
       Value *Idx[2];
       Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
       Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
-      TerminatorInst *TI = newFunction->begin()->getTerminator();
+      Instruction *TI = newFunction->begin()->getTerminator();
       GetElementPtrInst *GEP = GetElementPtrInst::Create(
           StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
       RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);
@@ -808,10 +823,10 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
   for (unsigned i = 0, e = Users.size(); i != e; ++i)
     // The BasicBlock which contains the branch is not in the region
     // modify the branch target to a new block
-    if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i]))
-      if (!Blocks.count(TI->getParent()) &&
-          TI->getParent()->getParent() == oldFunction)
-        TI->replaceUsesOfWith(header, newHeader);
+    if (Instruction *I = dyn_cast<Instruction>(Users[i]))
+      if (I->isTerminator() && !Blocks.count(I->getParent()) &&
+          I->getParent()->getParent() == oldFunction)
+        I->replaceUsesOfWith(header, newHeader);
 
   return newFunction;
 }
@@ -972,7 +987,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
 
   unsigned switchVal = 0;
   for (BasicBlock *Block : Blocks) {
-    TerminatorInst *TI = Block->getTerminator();
+    Instruction *TI = Block->getTerminator();
     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
       if (!Blocks.count(TI->getSuccessor(i))) {
         BasicBlock *OldTarget = TI->getSuccessor(i);
@@ -1078,7 +1093,7 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
   using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
 
   // Update the branch weights for the exit block.
-  TerminatorInst *TI = CodeReplacer->getTerminator();
+  Instruction *TI = CodeReplacer->getTerminator();
   SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
 
   // Block Frequency distribution with dummy node.
@@ -1267,24 +1282,53 @@ Function *CodeExtractor::extractCodeRegion() {
   // Look at all successors of the codeReplacer block.  If any of these blocks
   // had PHI nodes in them, we need to update the "from" block to be the code
   // replacer, not the original block in the extracted region.
-  std::vector<BasicBlock *> Succs(succ_begin(codeReplacer),
-                                  succ_end(codeReplacer));
-  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
-    for (BasicBlock::iterator I = Succs[i]->begin(); isa<PHINode>(I); ++I) {
-      PHINode *PN = cast<PHINode>(I);
-      std::set<BasicBlock*> ProcessedPreds;
-      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
-        if (Blocks.count(PN->getIncomingBlock(i))) {
-          if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second)
-            PN->setIncomingBlock(i, codeReplacer);
-          else {
-            // There were multiple entries in the PHI for this block, now there
-            // is only one, so remove the duplicated entries.
-            PN->removeIncomingValue(i, false);
-            --i; --e;
-          }
+  for (BasicBlock *SuccBB : successors(codeReplacer)) {
+    for (PHINode &PN : SuccBB->phis()) {
+      Value *IncomingCodeReplacerVal = nullptr;
+      SmallVector<unsigned, 2> IncomingValsToRemove;
+      for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) {
+        BasicBlock *IncomingBB = PN.getIncomingBlock(I);
+
+        // Ignore incoming values from outside of the extracted region.
+        if (!Blocks.count(IncomingBB))
+          continue;
+
+        // Ensure that there is only one incoming value from codeReplacer.
+        if (!IncomingCodeReplacerVal) {
+          PN.setIncomingBlock(I, codeReplacer);
+          IncomingCodeReplacerVal = PN.getIncomingValue(I);
+        } else {
+          assert(IncomingCodeReplacerVal == PN.getIncomingValue(I) &&
+                 "PHI has two incompatbile incoming values from codeRepl");
+          IncomingValsToRemove.push_back(I);
         }
+      }
+
+      for (unsigned I : reverse(IncomingValsToRemove))
+        PN.removeIncomingValue(I, /*DeletePHIIfEmpty=*/false);
+    }
+  }
+
+  // Erase debug info intrinsics. Variable updates within the new function are
+  // invisible to debuggers. This could be improved by defining a DISubprogram
+  // for the new function.
+  for (BasicBlock &BB : *newFunction) {
+    auto BlockIt = BB.begin();
+    // Remove debug info intrinsics from the new function.
+    while (BlockIt != BB.end()) {
+      Instruction *Inst = &*BlockIt;
+      ++BlockIt;
+      if (isa<DbgInfoIntrinsic>(Inst))
+        Inst->eraseFromParent();
     }
+    // Remove debug info intrinsics which refer to values in the new function
+    // from the old function.
+    SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
+    for (Instruction &I : BB)
+      findDbgUsers(DbgUsers, &I);
+    for (DbgVariableIntrinsic *DVI : DbgUsers)
+      DVI->eraseFromParent();
+  }
 
   LLVM_DEBUG(if (verifyFunction(*newFunction))
                  report_fatal_error("verifyFunction failed!"));
diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp
index c9c96fbe5da09cdb4413b664056e0e8b2a940b23..762a374c135c81b1651ed49569bcc37e07de3c93 100644
--- a/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -37,7 +37,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
 
     // Branches and invokes do not escape, only unwind, resume, and return
     // do.
-    TerminatorInst *TI = CurBB->getTerminator();
+    Instruction *TI = CurBB->getTerminator();
     if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
       continue;
 
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
index 3c6c9c9a5df42824c9f6fae5d8780a6e90d14c35..d9778f4a1fb73ac1e21a3c10a5a4de327616d000 100644
--- a/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -232,7 +232,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
   if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
     return false;
 
-  TerminatorInst *TBB = LastCondBlock->getTerminator();
+  Instruction *TBB = LastCondBlock->getTerminator();
   BasicBlock *PS1 = TBB->getSuccessor(0);
   BasicBlock *PS2 = TBB->getSuccessor(1);
   BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
@@ -325,7 +325,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
 bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
                                          BasicBlock *Block1,
                                          BasicBlock *Block2) {
-  TerminatorInst *PTI2 = Head2->getTerminator();
+  Instruction *PTI2 = Head2->getTerminator();
   Instruction *PBI2 = &Head2->front();
 
   bool eq1 = (Block1 == Head1);
@@ -421,7 +421,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
   if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock))
     return false;
 
-  TerminatorInst *PTI2 = SecondEntryBlock->getTerminator();
+  Instruction *PTI2 = SecondEntryBlock->getTerminator();
   Instruction *PBI2 = &SecondEntryBlock->front();
 
   if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,
diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index 69203f9f248554ce6cf7b03c59e5107d3bf0f4d2..a717d9b728198e56cfe96d2375bf90176b312014 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -410,8 +410,6 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
   switch (TyL->getTypeID()) {
   default:
     llvm_unreachable("Unknown type!");
-    // Fall through in Release mode.
-    LLVM_FALLTHROUGH;
   case Type::IntegerTyID:
     return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
                       cast<IntegerType>(TyR)->getBitWidth());
@@ -867,8 +865,8 @@ int FunctionComparator::compare() {
     if (int Res = cmpBasicBlocks(BBL, BBR))
       return Res;
 
-    const TerminatorInst *TermL = BBL->getTerminator();
-    const TerminatorInst *TermR = BBR->getTerminator();
+    const Instruction *TermL = BBL->getTerminator();
+    const Instruction *TermR = BBR->getTerminator();
 
     assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
     for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
@@ -938,7 +936,7 @@ FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
     for (auto &Inst : *BB) {
       H.add(Inst.getOpcode());
     }
-    const TerminatorInst *Term = BB->getTerminator();
+    const Instruction *Term = BB->getTerminator();
     for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
       if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
         continue;
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index f8226f529ee682a64005b08108d3f41150b484b2..bda2ee2d8a3f3e5a9b4f3625367bf1071bda8d4a 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -2247,7 +2247,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   // Change the branch that used to go to AfterCallBB to branch to the first
   // basic block of the inlined function.
   //
-  TerminatorInst *Br = OrigBB->getTerminator();
+  Instruction *Br = OrigBB->getTerminator();
   assert(Br && Br->getOpcode() == Instruction::Br &&
          "splitBasicBlock broken!");
   Br->setOperand(0, &*FirstNewBlock);
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index a1f8e7484bcf9f6c0c9606eee8f91b3c5ba3e2c5..53d444b309d5d972171d8598d95b8f683521f5f4 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PredIteratorCache.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils.h"
@@ -201,6 +202,21 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
       SSAUpdate.RewriteUse(*UseToRewrite);
     }
 
+    SmallVector<DbgValueInst *, 4> DbgValues;
+    llvm::findDbgValues(DbgValues, I);
+
+    // Update pre-existing debug value uses that reside outside the loop.
+    auto &Ctx = I->getContext();
+    for (auto DVI : DbgValues) {
+      BasicBlock *UserBB = DVI->getParent();
+      if (InstBB == UserBB || L->contains(UserBB))
+        continue;
+      // We currently only handle debug values residing in blocks where we have
+      // inserted a PHI instruction.
+      if (Value *V = SSAUpdate.FindValueForBlock(UserBB))
+        DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
+    }
+
     // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
     // to post-process them to keep LCSSA form.
     for (PHINode *InsertedPN : InsertedPHIs) {
diff --git a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 9832a6f24e1f33629a86a35cae8cd822399f3fa2..e1592c86763653e1f381ce84bcf066127aa3695e 100644
--- a/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -487,7 +487,7 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
   MDNode *BranchWeights =
       MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
 
-  TerminatorInst *NewInst =
+  Instruction *NewInst =
       SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
   BasicBlock *CallBB = NewInst->getParent();
   CallBB->setName("cdce.call");
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 879145cea6b21d654304bf0237004af037dcf9c7..0dcd7371210d75a475719a6fde4c19b37f5d63a1 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -105,7 +105,7 @@ STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
 bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
                                   const TargetLibraryInfo *TLI,
                                   DomTreeUpdater *DTU) {
-  TerminatorInst *T = BB->getTerminator();
+  Instruction *T = BB->getTerminator();
   IRBuilder<> Builder(T);
 
   // Branch - See if we are conditional jumping on constant
@@ -2101,7 +2101,7 @@ static bool markAliveBlocks(Function &F,
       }
     }
 
-    TerminatorInst *Terminator = BB->getTerminator();
+    Instruction *Terminator = BB->getTerminator();
     if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
       // Turn invokes that call 'nounwind' functions into ordinary calls.
       Value *Callee = II->getCalledValue();
@@ -2176,14 +2176,14 @@ static bool markAliveBlocks(Function &F,
 }
 
 void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
-  TerminatorInst *TI = BB->getTerminator();
+  Instruction *TI = BB->getTerminator();
 
   if (auto *II = dyn_cast<InvokeInst>(TI)) {
     changeToCall(II, DTU);
     return;
   }
 
-  TerminatorInst *NewTI;
+  Instruction *NewTI;
   BasicBlock *UnwindDest;
 
   if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
@@ -2260,7 +2260,7 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI,
       continue;
     }
     if (DTU) {
-      // Remove the TerminatorInst of BB to clear the successor list of BB.
+      // Remove the terminator of BB to clear the successor list of BB.
       if (BB->getTerminator())
         BB->getInstList().pop_back();
       new UnreachableInst(BB->getContext(), BB);
@@ -2315,7 +2315,15 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
         K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
         break;
       case LLVMContext::MD_range:
-        K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
+
+        // If K does move, use most generic range. Otherwise keep the range of
+        // K.
+        if (DoesKMove)
+          // FIXME: If K does move, we should drop the range info and nonnull.
+          //        Currently this function is used with DoesKMove in passes
+          //        doing hoisting/sinking and the current behavior of using the
+          //        most generic range is correct in those cases.
+          K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
         break;
       case LLVMContext::MD_fpmath:
         K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
@@ -2529,6 +2537,47 @@ void llvm::dropDebugUsers(Instruction &I) {
     DII->eraseFromParent();
 }
 
+void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
+                                    BasicBlock *BB) {
+  // Since we are moving the instructions out of its basic block, we do not
+  // retain their original debug locations (DILocations) and debug intrinsic
+  // instructions (dbg.values).
+  //
+  // Doing so would degrade the debugging experience and adversely affect the
+  // accuracy of profiling information.
+  //
+  // Currently, when hoisting the instructions, we take the following actions:
+  // - Remove their dbg.values.
+  // - Set their debug locations to the values from the insertion point.
+  //
+  // As per PR39141 (comment #8), the more fundamental reason why the dbg.values
+  // need to be deleted, is because there will not be any instructions with a
+  // DILocation in either branch left after performing the transformation. We
+  // can only insert a dbg.value after the two branches are joined again.
+  //
+  // See PR38762, PR39243 for more details.
+  //
+  // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to
+  // encode predicated DIExpressions that yield different results on different
+  // code paths.
+  for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+    Instruction *I = &*II;
+    I->dropUnknownNonDebugMetadata();
+    if (I->isUsedByMetadata())
+      dropDebugUsers(*I);
+    if (isa<DbgVariableIntrinsic>(I)) {
+      // Remove DbgInfo Intrinsics.
+      II = I->eraseFromParent();
+      continue;
+    }
+    I->setDebugLoc(InsertPt->getDebugLoc());
+    ++II;
+  }
+  DomBlock->getInstList().splice(InsertPt->getIterator(), BB->getInstList(),
+                                 BB->begin(),
+                                 BB->getTerminator()->getIterator());
+}
+
 namespace {
 
 /// A potential constituent of a bitreverse or bswap expression. See
diff --git a/lib/Transforms/Utils/LoopRotationUtils.cpp b/lib/Transforms/Utils/LoopRotationUtils.cpp
index a6320d8dbf4e9431ef3a87b4624eef9720c0e150..41f14a8346178fdd1da9cb50d1a30a8308b01bd6 100644
--- a/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -20,6 +20,8 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -54,6 +56,7 @@ class LoopRotate {
   AssumptionCache *AC;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  MemorySSAUpdater *MSSAU;
   const SimplifyQuery &SQ;
   bool RotationOnly;
   bool IsUtilMode;
@@ -61,10 +64,11 @@ class LoopRotate {
 public:
   LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
              const TargetTransformInfo *TTI, AssumptionCache *AC,
-             DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ,
-             bool RotationOnly, bool IsUtilMode)
+             DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+             const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode)
       : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
-        SQ(SQ), RotationOnly(RotationOnly), IsUtilMode(IsUtilMode) {}
+        MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
+        IsUtilMode(IsUtilMode) {}
   bool processLoop(Loop *L);
 
 private:
@@ -269,6 +273,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     SE->forgetTopmostLoop(L);
 
   LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
 
   // Find new Loop header. NewHeader is a Header's one and only successor
   // that is inside loop.  Header's other successor is outside the
@@ -299,7 +305,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
   // For the rest of the instructions, either hoist to the OrigPreheader if
   // possible or create a clone in the OldPreHeader if not.
-  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
+  Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
 
   // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
   using DbgIntrinsicHash =
@@ -385,6 +391,12 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
   // remove the corresponding incoming values from the PHI nodes in OrigHeader.
   LoopEntryBranch->eraseFromParent();
 
+  // Update MemorySSA before the rewrite call below changes the 1:1
+  // instruction:cloned_instruction_or_value mapping in ValueMap.
+  if (MSSAU) {
+    ValueMap[OrigHeader] = OrigPreheader;
+    MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, ValueMap);
+  }
 
   SmallVector<PHINode*, 2> InsertedPHIs;
   // If there were any uses of instructions in the duplicated block outside the
@@ -411,6 +423,12 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
     Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
     DT->applyUpdates(Updates);
+
+    if (MSSAU) {
+      MSSAU->applyUpdates(Updates, *DT);
+      if (VerifyMemorySSA)
+        MSSAU->getMemorySSA()->verifyMemorySSA();
+    }
   }
 
   // At this point, we've finished our major CFG changes.  As part of cloning
@@ -433,7 +451,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // Split the edge to form a real preheader.
     BasicBlock *NewPH = SplitCriticalEdge(
         OrigPreheader, NewHeader,
-        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+        CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
     NewPH->setName(NewHeader->getName() + ".lr.ph");
 
     // Preserve canonical loop form, which means that 'Exit' should have only
@@ -452,7 +470,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       SplitLatchEdge |= L->getLoopLatch() == ExitPred;
       BasicBlock *ExitSplit = SplitCriticalEdge(
           ExitPred, Exit,
-          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+          CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
       ExitSplit->moveBefore(Exit);
     }
     assert(SplitLatchEdge &&
@@ -467,17 +485,27 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
     // With our CFG finalized, update DomTree if it is available.
     if (DT) DT->deleteEdge(OrigPreheader, Exit);
+
+    // Update MSSA too, if available.
+    if (MSSAU)
+      MSSAU->removeEdge(OrigPreheader, Exit);
   }
 
   assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
   assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
 
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   // Now that the CFG and DomTree are in a consistent state again, try to merge
   // the OrigHeader block into OrigLatch.  This will succeed if they are
   // connected by an unconditional branch.  This is just a cleanup so the
   // emitted code isn't too gross in this common case.
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-  MergeBlockIntoPredecessor(OrigHeader, &DTU, LI);
+  MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
 
   LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
 
@@ -586,9 +614,14 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
                     << LastExit->getName() << "\n");
 
   // Hoist the instructions from Latch into LastExit.
+  Instruction *FirstLatchInst = &*(Latch->begin());
   LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
                                  Latch->begin(), Jmp->getIterator());
 
+  // Update MemorySSA
+  if (MSSAU)
+    MSSAU->moveAllAfterMergeBlocks(Latch, LastExit, FirstLatchInst);
+
   unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
   BasicBlock *Header = Jmp->getSuccessor(0);
   assert(Header == L->getHeader() && "expected a backward branch");
@@ -604,6 +637,10 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
   if (DT)
     DT->eraseNode(Latch);
   Latch->eraseFromParent();
+
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+
   return true;
 }
 
@@ -636,11 +673,16 @@ bool LoopRotate::processLoop(Loop *L) {
 /// The utility to convert a loop into a loop with bottom test.
 bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
                         AssumptionCache *AC, DominatorTree *DT,
-                        ScalarEvolution *SE, const SimplifyQuery &SQ,
-                        bool RotationOnly = true,
+                        ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+                        const SimplifyQuery &SQ, bool RotationOnly = true,
                         unsigned Threshold = unsigned(-1),
                         bool IsUtilMode = true) {
-  LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, SQ, RotationOnly, IsUtilMode);
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+  LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
+                IsUtilMode);
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
 
   return LR.processLoop(L);
 }
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index fc59cafa33151a180bed3e1e10c8a82d88448a1c..380f4fca54d9ed12dab65fbf969b455ce7034cef 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -435,7 +435,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
   unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
   MDNode *LoopMD = nullptr;
   for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
-    TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
+    Instruction *TI = BackedgeBlocks[i]->getTerminator();
     if (!LoopMD)
       LoopMD = TI->getMetadata(LoopMDKind);
     TI->setMetadata(LoopMDKind, nullptr);
@@ -488,7 +488,7 @@ ReprocessLoop:
                         << P->getName() << "\n");
 
       // Zap the dead pred's terminator and replace it with unreachable.
-      TerminatorInst *TI = P->getTerminator();
+      Instruction *TI = P->getTerminator();
       changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA);
       Changed = true;
     }
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index a8ec75c0bafbc63befea1f98a4cc69fe5dc80381..877e0e4dcf900a54e6a409421910ae855c9a4eee 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -781,7 +781,7 @@ LoopUnrollResult llvm::UnrollLoop(
         // there is no such latch.
         NewIDom = Latches.back();
         for (BasicBlock *IterLatch : Latches) {
-          TerminatorInst *Term = IterLatch->getTerminator();
+          Instruction *Term = IterLatch->getTerminator();
           if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
             NewIDom = IterLatch;
             break;
diff --git a/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 1ce2f844489685dded0c481d5a9ec7abfd4e83c3..8949c603a841e79d3c02d3f517c90911410642ce 100644
--- a/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -72,7 +72,7 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
   for (BasicBlock *BB : ForeBlocks) {
     if (BB == SubLoopPreHeader)
       continue;
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
       if (!ForeBlocks.count(TI->getSuccessor(i)))
         return false;
@@ -761,7 +761,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
   }
 
   // Check the loop safety info for exceptions.
-  LoopSafetyInfo LSI;
+  SimpleLoopSafetyInfo LSI;
   LSI.computeLoopSafetyInfo(L);
   if (LSI.anyBlockMayThrow()) {
     LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 03006ef3a2d37c7c6034da11696bf87c263d282a..661b4fa5bcb70118ff4dbc21170817e44aecbf0d 100644
--- a/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -301,7 +301,7 @@ static void createMemMoveLoop(Instruction *InsertBefore,
   // the appropriate conditional branches when the loop is built.
   ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
                                       SrcAddr, DstAddr, "compare_src_dst");
-  TerminatorInst *ThenTerm, *ElseTerm;
+  Instruction *ThenTerm, *ElseTerm;
   SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
                                 &ElseTerm);
 
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
index 9d9624850fbd814eda7d97de25fd50cdf9e13880..585ce6b4c1188c0b2395e1396fe749bee0040fef 100644
--- a/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -522,7 +522,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
     if (isa<PredicateWithEdge>(ValInfo)) {
       IRBuilder<> B(getBranchTerminator(ValInfo));
       Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
-      if (IF->user_begin() == IF->user_end())
+      if (empty(IF->users()))
         CreatedDeclarations.insert(IF);
       CallInst *PIC =
           B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
@@ -534,7 +534,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
              "Should not have gotten here without it being an assume");
       IRBuilder<> B(PAssume->AssumeInst);
       Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
-      if (IF->user_begin() == IF->user_end())
+      if (empty(IF->users()))
         CreatedDeclarations.insert(IF);
       CallInst *PIC = B.CreateCall(IF, Op);
       PredicateMap.insert({PIC, ValInfo});
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 4a1fd8d571aa137ff73d3d03046d9bbe4804a1d3..9e5fb0e7172d4d2d601dac8fe2c4d240401db0d5 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -64,6 +64,11 @@ bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
   return getAvailableVals(AV).count(BB);
 }
 
+Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const {
+  AvailableValsTy::iterator AVI = getAvailableVals(AV).find(BB);
+  return (AVI != getAvailableVals(AV).end()) ? AVI->second : nullptr;
+}
+
 void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
   assert(ProtoType && "Need to initialize SSAUpdater");
   assert(ProtoType == V->getType() &&
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index ebbcf80025459e672a0c16da95348804e2673ca6..849f9ee1d19dde4749c66a67148f7e5f1c308bb1 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -175,13 +175,13 @@ class SimplifyCFGOpt {
   const SimplifyCFGOptions &Options;
   bool Resimplify;
 
-  Value *isValueEqualityComparison(TerminatorInst *TI);
+  Value *isValueEqualityComparison(Instruction *TI);
   BasicBlock *GetValueEqualityComparisonCases(
-      TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
-  bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
+      Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases);
+  bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI,
                                                      BasicBlock *Pred,
                                                      IRBuilder<> &Builder);
-  bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+  bool FoldValueComparisonIntoPredecessors(Instruction *TI,
                                            IRBuilder<> &Builder);
 
   bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
@@ -219,7 +219,7 @@ public:
 /// Return true if it is safe to merge these two
 /// terminator instructions together.
 static bool
-SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2,
+SafeToMergeTerminators(Instruction *SI1, Instruction *SI2,
                        SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
   if (SI1 == SI2)
     return false; // Can't merge with self!
@@ -670,7 +670,7 @@ private:
 
 } // end anonymous namespace
 
-static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
+static void EraseTerminatorAndDCECond(Instruction *TI) {
   Instruction *Cond = nullptr;
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     Cond = dyn_cast<Instruction>(SI->getCondition());
@@ -688,7 +688,7 @@ static void EraseTerminatorInstAndDCECond(TerminatorInst *TI) {
 
 /// Return true if the specified terminator checks
 /// to see if a value is equal to constant integer value.
-Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
+Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
   Value *CV = nullptr;
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     // Do not permit merging of large switch instructions into their
@@ -716,7 +716,7 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
 /// Given a value comparison instruction,
 /// decode all of the 'cases' that it represents and return the 'default' block.
 BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
-    TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
+    Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     Cases.reserve(SI->getNumCases());
     for (auto Case : SI->cases())
@@ -806,7 +806,7 @@ static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
 /// determines the outcome of this comparison. If so, simplify TI. This does a
 /// very limited form of jump threading.
 bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
-    TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
+    Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
   Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
   if (!PredVal)
     return false; // Not a value comparison in predecessor.
@@ -854,7 +854,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
                         << "Through successor TI: " << *TI << "Leaving: " << *NI
                         << "\n");
 
-      EraseTerminatorInstAndDCECond(TI);
+      EraseTerminatorAndDCECond(TI);
       return true;
     }
 
@@ -936,7 +936,7 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
                     << "Through successor TI: " << *TI << "Leaving: " << *NI
                     << "\n");
 
-  EraseTerminatorInstAndDCECond(TI);
+  EraseTerminatorAndDCECond(TI);
   return true;
 }
 
@@ -971,10 +971,10 @@ static inline bool HasBranchWeights(const Instruction *I) {
   return false;
 }
 
-/// Get Weights of a given TerminatorInst, the default weight is at the front
+/// Get Weights of a given terminator, the default weight is at the front
 /// of the vector. If TI is a conditional eq, we need to swap the branch-weight
 /// metadata.
-static void GetBranchWeights(TerminatorInst *TI,
+static void GetBranchWeights(Instruction *TI,
                              SmallVectorImpl<uint64_t> &Weights) {
   MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
   assert(MD);
@@ -1008,7 +1008,7 @@ static void FitWeights(MutableArrayRef<uint64_t> Weights) {
 /// (either a switch or a branch on "X == c").
 /// See if any of the predecessors of the terminator block are value comparisons
 /// on the same value.  If so, and if safe to do so, fold them together.
-bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
+bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(Instruction *TI,
                                                          IRBuilder<> &Builder) {
   BasicBlock *BB = TI->getParent();
   Value *CV = isValueEqualityComparison(TI); // CondVal
@@ -1020,7 +1020,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
     BasicBlock *Pred = Preds.pop_back_val();
 
     // See if the predecessor is a comparison with the same value.
-    TerminatorInst *PTI = Pred->getTerminator();
+    Instruction *PTI = Pred->getTerminator();
     Value *PCV = isValueEqualityComparison(PTI); // PredCondVal
 
     if (PCV == CV && TI != PTI) {
@@ -1197,7 +1197,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
         setBranchWeights(NewSI, MDWeights);
       }
 
-      EraseTerminatorInstAndDCECond(PTI);
+      EraseTerminatorAndDCECond(PTI);
 
       // Okay, last check.  If BB is still a successor of PSI, then we must
       // have an infinite loop case.  If so, add an infinitely looping block
@@ -1413,7 +1413,7 @@ HoistTerminator:
   for (BasicBlock *Succ : successors(BB1))
     AddPredecessorToBlock(Succ, BIParent, BB1);
 
-  EraseTerminatorInstAndDCECond(BI);
+  EraseTerminatorAndDCECond(BI);
   return true;
 }
 
@@ -2247,7 +2247,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
 
     // Loop over all of the edges from PredBB to BB, changing them to branch
     // to EdgeBB instead.
-    TerminatorInst *PredBBTI = PredBB->getTerminator();
+    Instruction *PredBBTI = PredBB->getTerminator();
     for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
       if (PredBBTI->getSuccessor(i) == BB) {
         BB->removePredecessor(PredBB);
@@ -2375,24 +2375,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
 
   // Move all 'aggressive' instructions, which are defined in the
   // conditional parts of the if's up to the dominating block.
-  if (IfBlock1) {
-    for (auto &I : *IfBlock1) {
-      I.dropUnknownNonDebugMetadata();
-      dropDebugUsers(I);
-    }
-    DomBlock->getInstList().splice(InsertPt->getIterator(),
-                                   IfBlock1->getInstList(), IfBlock1->begin(),
-                                   IfBlock1->getTerminator()->getIterator());
-  }
-  if (IfBlock2) {
-    for (auto &I : *IfBlock2) {
-      I.dropUnknownNonDebugMetadata();
-      dropDebugUsers(I);
-    }
-    DomBlock->getInstList().splice(InsertPt->getIterator(),
-                                   IfBlock2->getInstList(), IfBlock2->begin(),
-                                   IfBlock2->getTerminator()->getIterator());
-  }
+  if (IfBlock1)
+    hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
+  if (IfBlock2)
+    hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);
 
   while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
     // Change the PHI node into a select instruction.
@@ -2408,7 +2394,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
   // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
   // has been flattened.  Change DomBlock to jump directly to our new block to
   // avoid other simplifycfg's kicking in on the diamond.
-  TerminatorInst *OldTI = DomBlock->getTerminator();
+  Instruction *OldTI = DomBlock->getTerminator();
   Builder.SetInsertPoint(OldTI);
   Builder.CreateBr(BB);
   OldTI->eraseFromParent();
@@ -2442,7 +2428,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
     TrueSucc->removePredecessor(BI->getParent());
     FalseSucc->removePredecessor(BI->getParent());
     Builder.CreateRetVoid();
-    EraseTerminatorInstAndDCECond(BI);
+    EraseTerminatorAndDCECond(BI);
     return true;
   }
 
@@ -2498,7 +2484,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
                     << "\n  " << *BI << "NewRet = " << *RI << "TRUEBLOCK: "
                     << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
 
-  EraseTerminatorInstAndDCECond(BI);
+  EraseTerminatorAndDCECond(BI);
 
   return true;
 }
@@ -2822,7 +2808,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, unsigned BonusInstThreshold) {
       }
       // Change PBI from Conditional to Unconditional.
       BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI);
-      EraseTerminatorInstAndDCECond(PBI);
+      EraseTerminatorAndDCECond(PBI);
       PBI = New_PBI;
     }
 
@@ -3417,7 +3403,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
 // Takes care of updating the successors and removing the old terminator.
 // Also makes sure not to introduce new successors by assuming that edges to
 // non-successor TrueBBs and FalseBBs aren't reachable.
-static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
+static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                        BasicBlock *TrueBB, BasicBlock *FalseBB,
                                        uint32_t TrueWeight,
                                        uint32_t FalseWeight) {
@@ -3472,7 +3458,7 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
       Builder.CreateBr(FalseBB);
   }
 
-  EraseTerminatorInstAndDCECond(OldTerm);
+  EraseTerminatorAndDCECond(OldTerm);
   return true;
 }
 
@@ -3715,7 +3701,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
     BasicBlock *NewBB =
         BB->splitBasicBlock(BI->getIterator(), "switch.early.test");
     // Remove the uncond branch added to the old block.
-    TerminatorInst *OldTI = BB->getTerminator();
+    Instruction *OldTI = BB->getTerminator();
     Builder.SetInsertPoint(OldTI);
 
     if (TrueWhenEqual)
@@ -3759,7 +3745,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
   }
 
   // Erase the old branch instruction.
-  EraseTerminatorInstAndDCECond(BI);
+  EraseTerminatorAndDCECond(BI);
 
   LLVM_DEBUG(dbgs() << "  ** 'icmp' chain result is:\n" << *BB << '\n');
   return true;
@@ -4007,7 +3993,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {
     if (UnwindDest == nullptr) {
       removeUnwindEdge(PredBB);
     } else {
-      TerminatorInst *TI = PredBB->getTerminator();
+      Instruction *TI = PredBB->getTerminator();
       TI->replaceUsesOfWith(BB, UnwindDest);
     }
   }
@@ -4076,7 +4062,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
   SmallVector<BranchInst *, 8> CondBranchPreds;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *P = *PI;
-    TerminatorInst *PTI = P->getTerminator();
+    Instruction *PTI = P->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
       if (BI->isUnconditional())
         UncondBranchPreds.push_back(P);
@@ -4181,7 +4167,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
 
   SmallVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB));
   for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
-    TerminatorInst *TI = Preds[i]->getTerminator();
+    Instruction *TI = Preds[i]->getTerminator();
     IRBuilder<> Builder(TI);
     if (auto *BI = dyn_cast<BranchInst>(TI)) {
       if (BI->isUnconditional()) {
@@ -4193,10 +4179,10 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
       } else {
         if (BI->getSuccessor(0) == BB) {
           Builder.CreateBr(BI->getSuccessor(1));
-          EraseTerminatorInstAndDCECond(BI);
+          EraseTerminatorAndDCECond(BI);
         } else if (BI->getSuccessor(1) == BB) {
           Builder.CreateBr(BI->getSuccessor(0));
-          EraseTerminatorInstAndDCECond(BI);
+          EraseTerminatorAndDCECond(BI);
           Changed = true;
         }
       }
@@ -4438,7 +4424,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
     SplitBlock(&*NewDefault, &NewDefault->front());
     auto *OldTI = NewDefault->getTerminator();
     new UnreachableInst(SI->getContext(), OldTI);
-    EraseTerminatorInstAndDCECond(OldTI);
+    EraseTerminatorAndDCECond(OldTI);
     return true;
   }
 
@@ -4649,12 +4635,12 @@ GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
   SmallDenseMap<Value *, Constant *> ConstantPool;
   ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
   for (Instruction &I :CaseDest->instructionsWithoutDebug()) {
-    if (TerminatorInst *T = dyn_cast<TerminatorInst>(&I)) {
+    if (I.isTerminator()) {
       // If the terminator is a simple branch, continue to the next block.
-      if (T->getNumSuccessors() != 1 || T->isExceptionalTerminator())
+      if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
         return false;
       Pred = CaseDest;
-      CaseDest = T->getSuccessor(0);
+      CaseDest = I.getSuccessor(0);
     } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
       // Instruction is side-effect free and constant.
 
@@ -5274,7 +5260,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
 
   // Figure out the corresponding result for each case value and phi node in the
   // common destination, as well as the min and max case values.
-  assert(SI->case_begin() != SI->case_end());
+  assert(!empty(SI->cases()));
   SwitchInst::CaseIt CI = SI->case_begin();
   ConstantInt *MinCaseVal = CI->getCaseValue();
   ConstantInt *MaxCaseVal = CI->getCaseValue();
@@ -5663,14 +5649,14 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
   if (IBI->getNumDestinations() == 0) {
     // If the indirectbr has no successors, change it to unreachable.
     new UnreachableInst(IBI->getContext(), IBI);
-    EraseTerminatorInstAndDCECond(IBI);
+    EraseTerminatorAndDCECond(IBI);
     return true;
   }
 
   if (IBI->getNumDestinations() == 1) {
     // If the indirectbr has one successor, change it to a direct branch.
     BranchInst::Create(IBI->getDestination(0), IBI);
-    EraseTerminatorInstAndDCECond(IBI);
+    EraseTerminatorAndDCECond(IBI);
     return true;
   }
 
@@ -5892,7 +5878,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
-      TerminatorInst *Succ0TI = BI->getSuccessor(0)->getTerminator();
+      Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
       if (Succ0TI->getNumSuccessors() == 1 &&
           Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
         if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
@@ -5901,7 +5887,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
     // execute Successor #1 if it branches to Successor #0.
-    TerminatorInst *Succ1TI = BI->getSuccessor(1)->getTerminator();
+    Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
         Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
       if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
@@ -5991,7 +5977,7 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
   for (PHINode &PHI : BB->phis())
     for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
       if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
-        TerminatorInst *T = PHI.getIncomingBlock(i)->getTerminator();
+        Instruction *T = PHI.getIncomingBlock(i)->getTerminator();
         IRBuilder<> Builder(T);
         if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
           BB->removePredecessor(PHI.getIncomingBlock(i));
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 51fda1c620b60a4d0433c30b799cdd204beb6284..7faf291e73d9fdbb5e40318dceabb3b59ba187a5 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -108,6 +108,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
   Value *IVSrc = nullptr;
   const unsigned OperIdx = 0;
   const SCEV *FoldedExpr = nullptr;
+  bool MustDropExactFlag = false;
   switch (UseInst->getOpcode()) {
   default:
     return nullptr;
@@ -140,6 +141,11 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
                            APInt::getOneBitSet(BitWidth, D->getZExtValue()));
     }
     FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+    // We might have 'exact' flag set at this point which will no longer be
+    // correct after we make the replacement.
+    if (UseInst->isExact() &&
+        SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
+      MustDropExactFlag = true;
   }
   // We have something that might fold it's operand. Compare SCEVs.
   if (!SE->isSCEVable(UseInst->getType()))
@@ -155,6 +161,9 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
   UseInst->setOperand(OperIdx, IVSrc);
   assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
 
+  if (MustDropExactFlag)
+    UseInst->dropPoisonGeneratingFlags();
+
   ++NumElimOperand;
   Changed = true;
   if (IVOperand->use_empty())
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 3789181a898252eeee20216f78a2e1b016ead4b0..a50575b025601c4ecbf8d14bea8a671b06344b3d 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -923,8 +923,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
 }
 
 /// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
-static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
-                               const TargetLibraryInfo &TLI) {
+Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) {
   // This has to be a memset of zeros (bzero).
   auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
   if (!FillValue || FillValue->getZExtValue() != 0)
@@ -944,7 +943,7 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
     return nullptr;
 
   LibFunc Func;
-  if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+  if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
       Func != LibFunc_malloc)
     return nullptr;
 
@@ -959,18 +958,18 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
   IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
   Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
                              Malloc->getArgOperand(0), Malloc->getAttributes(),
-                             B, TLI);
+                             B, *TLI);
   if (!Calloc)
     return nullptr;
 
   Malloc->replaceAllUsesWith(Calloc);
-  Malloc->eraseFromParent();
+  eraseFromParent(Malloc);
 
   return Calloc;
 }
 
 Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
-  if (auto *Calloc = foldMallocMemset(CI, B, *TLI))
+  if (auto *Calloc = foldMallocMemset(CI, B))
     return Calloc;
 
   // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
@@ -1220,6 +1219,9 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       StringRef ExpName;
       Intrinsic::ID ID;
       Value *ExpFn;
+      LibFunc LibFnFloat;
+      LibFunc LibFnDouble;
+      LibFunc LibFnLongDouble;
 
       switch (LibFn) {
       default:
@@ -1227,10 +1229,16 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       case LibFunc_expf:  case LibFunc_exp:  case LibFunc_expl:
         ExpName = TLI->getName(LibFunc_exp);
         ID = Intrinsic::exp;
+        LibFnFloat = LibFunc_expf;
+        LibFnDouble = LibFunc_exp;
+        LibFnLongDouble = LibFunc_expl;
         break;
       case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
         ExpName = TLI->getName(LibFunc_exp2);
         ID = Intrinsic::exp2;
+        LibFnFloat = LibFunc_exp2f;
+        LibFnDouble = LibFunc_exp2;
+        LibFnLongDouble = LibFunc_exp2l;
         break;
       }
 
@@ -1239,14 +1247,16 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
       ExpFn = BaseFn->doesNotAccessMemory()
               ? B.CreateCall(Intrinsic::getDeclaration(Mod, ID, Ty),
                              FMul, ExpName)
-              : emitUnaryFloatFnCall(FMul, ExpName, B, BaseFn->getAttributes());
+              : emitUnaryFloatFnCall(FMul, TLI, LibFnDouble, LibFnFloat,
+                                     LibFnLongDouble, B,
+                                     BaseFn->getAttributes());
 
       // Since the new exp{,2}() is different from the original one, dead code
       // elimination cannot be trusted to remove it, since it may have side
       // effects (e.g., errno).  When the only consumer for the original
       // exp{,2}() is pow(), then it has to be explicitly erased.
       BaseFn->replaceAllUsesWith(ExpFn);
-      BaseFn->eraseFromParent();
+      eraseFromParent(BaseFn);
 
       return ExpFn;
     }
@@ -1276,7 +1286,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
         return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
                             FMul, "exp2");
       else
-        return emitUnaryFloatFnCall(FMul, TLI->getName(LibFunc_exp2), B, Attrs);
+        return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
+                                    LibFunc_exp2l, B, Attrs);
     }
   }
 
@@ -1284,7 +1295,8 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
   // TODO: There is no exp10() intrinsic yet, but some day there shall be one.
   if (match(Base, m_SpecificFP(10.0)) &&
       hasUnaryFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
-    return emitUnaryFloatFnCall(Expo, TLI->getName(LibFunc_exp10), B, Attrs);
+    return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
+                                LibFunc_exp10l, B, Attrs);
 
   return nullptr;
 }
@@ -1305,7 +1317,8 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
     // TODO: We also should check that the target can in fact lower the sqrt()
     // libcall. We currently have no way to ask this question, so we ask if
     // the target has a sqrt() libcall, which is not exactly the same.
-    return emitUnaryFloatFnCall(V, TLI->getName(LibFunc_sqrt), B, Attrs);
+    return emitUnaryFloatFnCall(V, TLI, LibFunc_sqrt, LibFunc_sqrtf,
+                                LibFunc_sqrtl, B, Attrs);
 
   return nullptr;
 }
@@ -2591,7 +2604,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
       if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) {
         // If we were able to further simplify, remove the now redundant call.
         SimplifiedCI->replaceAllUsesWith(V);
-        SimplifiedCI->eraseFromParent();
+        eraseFromParent(SimplifiedCI);
         return V;
       }
     }
@@ -2670,15 +2683,20 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
 LibCallSimplifier::LibCallSimplifier(
     const DataLayout &DL, const TargetLibraryInfo *TLI,
     OptimizationRemarkEmitter &ORE,
-    function_ref<void(Instruction *, Value *)> Replacer)
+    function_ref<void(Instruction *, Value *)> Replacer,
+    function_ref<void(Instruction *)> Eraser)
     : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE),
-      UnsafeFPShrink(false), Replacer(Replacer) {}
+      UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
 
 void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
   // Indirect through the replacer used in this instance.
   Replacer(I, With);
 }
 
+void LibCallSimplifier::eraseFromParent(Instruction *I) {
+  Eraser(I);
+}
+
 // TODO:
 //   Additional cases that we need to add to this file:
 //
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 7e11504c0e04f27293c53a561214b57154880b45..755ad32a7bf5079b871a0b09c53a9f756d508bfe 100644
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -817,11 +817,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   if (!LAI->canVectorizeMemory())
     return false;
 
-  if (LAI->hasVariantStoreToLoopInvariantAddress()) {
+  if (LAI->hasMultipleStoresToLoopInvariantAddress()) {
     ORE->emit(createMissedAnalysis("CantVectorizeStoreToLoopInvariantAddress")
-              << "write of variant value to a loop invariant address could not "
+              << "multiple writes to a loop invariant address could not "
                  "be vectorized");
-    LLVM_DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+    LLVM_DEBUG(
+        dbgs() << "LV: We don't allow multiple stores to a uniform address\n");
     return false;
   }
 
@@ -1133,4 +1134,59 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   return Result;
 }
 
+bool LoopVectorizationLegality::canFoldTailByMasking() {
+
+  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+  if (!PrimaryInduction) {
+    ORE->emit(createMissedAnalysis("NoPrimaryInduction")
+              << "Missing a primary induction variable in the loop, which is "
+              << "needed in order to fold tail by masking as required.");
+    LLVM_DEBUG(dbgs() << "LV: No primary induction, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle reductions when tail is folded by masking.
+  if (!Reductions.empty()) {
+    ORE->emit(createMissedAnalysis("ReductionFoldingTailByMasking")
+              << "Cannot fold tail by masking in the presence of reductions.");
+    LLVM_DEBUG(dbgs() << "LV: Loop has reductions, cannot fold tail by "
+                      << "masking.\n");
+    return false;
+  }
+
+  // TODO: handle outside users when tail is folded by masking.
+  for (auto *AE : AllowedExit) {
+    // Check that all users of allowed exit values are inside the loop.
+    for (User *U : AE->users()) {
+      Instruction *UI = cast<Instruction>(U);
+      if (TheLoop->contains(UI))
+        continue;
+      ORE->emit(createMissedAnalysis("LiveOutFoldingTailByMasking")
+                << "Cannot fold tail by masking in the presence of live outs.");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop has an "
+                        << "outside user for : " << *UI << '\n');
+      return false;
+    }
+  }
+
+  // The list of pointers that we can safely read and write to remains empty.
+  SmallPtrSet<Value *, 8> SafePointers;
+
+  // Check and mark all blocks for predication, including those that ordinarily
+  // do not need predication such as the header block.
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    if (!blockCanBePredicated(BB, SafePointers)) {
+      ORE->emit(createMissedAnalysis("NoCFGForSelect", BB->getTerminator())
+                << "control flow cannot be substituted for a select");
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as required.\n");
+      return false;
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+  return true;
+}
+
 } // namespace llvm
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c7c4568377bc5228e943e010b59fbe711626e795..c9c70b5c5364a0d9522cae79ebb7bd763a251f87 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,12 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+/// An interleave-group may need masking if it resides in a block that needs
+/// predication, or in order to mask away gaps. 
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
 /// We don't interleave loops with a known constant trip count below this
 /// number.
 static const unsigned TinyTripCountInterleaveThreshold = 128;
@@ -408,8 +414,10 @@ public:
   /// Construct the vector value of a scalarized value \p V one lane at a time.
   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
 
-  /// Try to vectorize the interleaved access group that \p Instr belongs to.
-  void vectorizeInterleaveGroup(Instruction *Instr);
+  /// Try to vectorize the interleaved access group that \p Instr belongs to,
+  /// optionally masking the vector operations if \p BlockInMask is non-null.
+  void vectorizeInterleaveGroup(Instruction *Instr,
+                                VectorParts *BlockInMask = nullptr);
 
   /// Vectorize Load and Store instructions, optionally masking the vector
   /// operations if \p BlockInMask is non-null.
@@ -1099,7 +1107,7 @@ public:
   // through scalar predication or masked load/store or masked gather/scatter.
   // Superset of instructions that return true for isScalarWithPredication.
   bool isPredicatedInst(Instruction *I) {
-    if (!Legal->blockNeedsPredication(I->getParent()))
+    if (!blockNeedsPredication(I->getParent()))
       return false;
     // Loads and stores that need some form of masked operation are predicated
     // instructions.
@@ -1112,6 +1120,11 @@ public:
   /// access that can be widened.
   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
+  /// Returns true if \p I is a memory instruction in an interleaved-group
+  /// of memory accesses that can be vectorized with wide vector loads/stores
+  /// and shuffles.
+  bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
+
   /// Check if \p Instr belongs to any interleaved access group.
   bool isAccessInterleaved(Instruction *Instr) {
     return InterleaveInfo.isInterleaved(Instr);
@@ -1123,9 +1136,20 @@ public:
   }
 
   /// Returns true if an interleaved group requires a scalar iteration
-  /// to handle accesses with gaps.
+  /// to handle accesses with gaps, and there is nothing preventing us from
+  /// creating a scalar epilogue.
   bool requiresScalarEpilogue() const {
-    return InterleaveInfo.requiresScalarEpilogue();
+    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
+  }
+
+  /// Returns true if a scalar epilogue is not allowed due to optsize.
+  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+
+  /// Returns true if all loop blocks should be masked to fold tail loop.
+  bool foldTailByMasking() const { return FoldTailByMasking; }
+
+  bool blockNeedsPredication(BasicBlock *BB) {
+    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
 private:
@@ -1178,7 +1202,6 @@ private:
   /// Load: scalar load + broadcast.
   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
   /// element)
-  /// TODO: Test the extra cost of the extract when loop variant value stored.
   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
@@ -1212,6 +1235,18 @@ private:
   /// vectorization as a predicated block.
   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
 
+  /// Records whether it is allowed to have the original scalar loop execute at
+  /// least once. This may be needed as a fallback loop in case runtime 
+  /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or doesn't divide by the VF,
+  /// or as a peel-loop to handle gaps in interleave-groups.
+  /// Under optsize and when the trip count is very small we don't allow any
+  /// iterations to execute in the scalar loop.
+  bool IsScalarEpilogueAllowed = true;
+
+  /// All blocks of loop are to be masked to fold tail of scalar iterations.
+  bool FoldTailByMasking = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -1918,6 +1953,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
                                      "reverse");
 }
 
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+  // If an override option has been passed in for interleaved accesses, use it.
+  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+    return EnableMaskedInterleavedMemAccesses;
+
+  return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
 // Try to vectorize the interleave group that \p Instr belongs to.
 //
 // E.g. Translate following interleaved load group (factor = 3):
@@ -1946,7 +1992,8 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
+void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
+                                                   VectorParts *BlockInMask) {
   const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr);
   assert(Group && "Fail to get an interleaved access group.");
 
@@ -1968,6 +2015,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   SmallVector<Value *, 2> NewPtrs;
   unsigned Index = Group->getIndex(Instr);
 
+  VectorParts Mask;
+  bool IsMaskForCondRequired = BlockInMask;
+  if (IsMaskForCondRequired) {
+    Mask = *BlockInMask;
+    // TODO: extend the masked interleaved-group support to reversed access.
+    assert(!Group->isReverse() && "Reversed masked interleave-group "
+                                  "not supported.");
+  }
+
   // If the group is reverse, adjust the index to refer to the last vector lane
   // instead of the first. We adjust the index from the first vector lane,
   // rather than directly getting the pointer for lane VF - 1, because the
@@ -2006,13 +2062,39 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   setDebugLocFromInst(Builder, Instr);
   Value *UndefVec = UndefValue::get(VecTy);
 
+  Value *MaskForGaps = nullptr;
+  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+    assert(MaskForGaps && "Mask for Gaps is required but it is null");
+  }
+
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
-      auto *NewLoad = Builder.CreateAlignedLoad(
-          NewPtrs[Part], Group->getAlignment(), "wide.vec");
+      Instruction *NewLoad;
+      if (IsMaskForCondRequired || MaskForGaps) {
+        assert(useMaskedInterleavedAccesses(*TTI) &&
+               "masked interleaved groups are not allowed.");
+        Value *GroupMask = MaskForGaps;
+        if (IsMaskForCondRequired) {
+          auto *Undefs = UndefValue::get(Mask[Part]->getType());
+          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+          Value *ShuffledMask = Builder.CreateShuffleVector(
+              Mask[Part], Undefs, RepMask, "interleaved.mask");
+          GroupMask = MaskForGaps
+                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+                                                MaskForGaps)
+                          : ShuffledMask;
+        }
+        NewLoad =
+            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+                                     GroupMask, UndefVec, "wide.masked.vec");
+      }
+      else
+        NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], 
+          Group->getAlignment(), "wide.vec");
       Group->addMetadata(NewLoad);
       NewLoads.push_back(NewLoad);
     }
@@ -2079,8 +2161,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                               "interleaved.vec");
 
-    Instruction *NewStoreInstr =
-        Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment());
+    Instruction *NewStoreInstr;
+    if (IsMaskForCondRequired) {
+      auto *Undefs = UndefValue::get(Mask[Part]->getType());
+      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+      Value *ShuffledMask = Builder.CreateShuffleVector(
+          Mask[Part], Undefs, RepMask, "interleaved.mask");
+      NewStoreInstr = Builder.CreateMaskedStore(
+          IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask);
+    }
+    else
+      NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], 
+        Group->getAlignment());
 
     Group->addMetadata(NewStoreInstr);
   }
@@ -2298,6 +2390,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
   if (TripCount)
     return TripCount;
 
+  assert(L && "Create Trip Count for null loop.");
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   // Find the loop boundaries.
   ScalarEvolution *SE = PSE.getSE();
@@ -2347,12 +2440,26 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
   Value *TC = getOrCreateTripCount(L);
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
 
+  Type *Ty = TC->getType();
+  Constant *Step = ConstantInt::get(Ty, VF * UF);
+
+  // If the tail is to be folded by masking, round the number of iterations N
+  // up to a multiple of Step instead of rounding down. This is done by first
+  // adding Step-1 and then rounding down. Note that it's ok if this addition
+  // overflows: the vector induction variable will eventually wrap to zero given
+  // that it starts at zero and its Step is a power of two; the loop will then
+  // exit, with the last early-exit vector comparison also producing all-true.
+  if (Cost->foldTailByMasking()) {
+    assert(isPowerOf2_32(VF * UF) &&
+           "VF*UF must be a power of 2 when folding tail by masking");
+    TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
+  }
+
   // Now we need to generate the expression for the part of the loop that the
   // vectorized body will execute. This is equal to N - (N % Step) if scalar
   // iterations are not required for correctness, or N - Step, otherwise. Step
   // is equal to the vectorization factor (number of SIMD elements) times the
   // unroll factor (number of SIMD instructions).
-  Constant *Step = ConstantInt::get(TC->getType(), VF * UF);
   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
 
   // If there is a non-reversed interleaved group that may speculatively access
@@ -2415,8 +2522,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
   // of zero. In this case we will also jump to the scalar loop.
   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
                                           : ICmpInst::ICMP_ULT;
-  Value *CheckMinIters = Builder.CreateICmp(
-      P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
+
+  // If tail is to be folded, vector loop takes care of all iterations.
+  Value *CheckMinIters = Builder.getFalse();
+  if (!Cost->foldTailByMasking())
+    CheckMinIters = Builder.CreateICmp(
+        P, Count, ConstantInt::get(Count->getType(), VF * UF),
+        "min.iters.check");
 
   BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
   // Update dominator tree immediately if the generated block is a
@@ -2445,6 +2557,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
     if (C->isZero())
       return;
 
+  assert(!Cost->foldTailByMasking() &&
+         "Cannot SCEV check stride or overflow when folding tail");
   // Create a new block containing the stride check.
   BB->setName("vector.scevcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2477,6 +2591,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   if (!MemRuntimeCheck)
     return;
 
+  assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail");
   // Create a new block containing the memory check.
   BB->setName("vector.memcheck");
   auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
@@ -2745,9 +2860,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN =
-      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                      CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
+  // If tail is to be folded, we know we don't need to run the remainder.
+  Value *CmpN = Builder.getTrue();
+  if (!Cost->foldTailByMasking())
+    CmpN =
+        CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
+                        CountRoundDown, "cmp.n", MiddleBlock->getTerminator());
   ReplaceInstWithInst(MiddleBlock->getTerminator(),
                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
 
@@ -2906,6 +3024,10 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
        !TTI.supportsEfficientVectorElementLoadStore()))
     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
 
+  // Some targets keep addresses scalar.
+  if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+    return Cost;
+
   if (CallInst *CI = dyn_cast<CallInst>(I)) {
     SmallVector<const Value *, 4> Operands(CI->arg_operands());
     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
@@ -4221,7 +4343,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
 }
 
 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
-  if (!Legal->blockNeedsPredication(I->getParent()))
+  if (!blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
   default:
@@ -4253,6 +4375,35 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
   return false;
 }
 
+bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
+                                                               unsigned VF) {
+  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+  assert(getWideningDecision(I, VF) == CM_Unknown &&
+         "Decision should not be set yet.");
+  auto *Group = getInterleavedAccessGroup(I);
+  assert(Group && "Must have a group.");
+
+  // Check if masking is required.
+  // A Group may need masking for one of two reasons: it resides in a block that
+  // needs predication, or it was decided to use masking to deal with gaps.
+  bool PredicatedAccessRequiresMasking = 
+      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+  bool AccessWithGapsRequiresMasking = 
+      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+    return true;
+
+  // If masked interleaving is required, we expect that the user/target had
+  // enabled it, because otherwise it either wouldn't have been created or
+  // it should have been invalidated by the CostModel.
+  assert(useMaskedInterleavedAccesses(TTI) &&
+         "Masked interleave-groups for predicated accesses are not enabled.");
+
+  auto *Ty = getMemInstValueType(I);
+  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
+                          : TTI.isLegalMaskedStore(Ty);
+}
+
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   // Get and ensure we have a valid memory instruction.
@@ -4487,39 +4638,78 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
     return None;
   }
 
+  if (!PSE.getUnionPredicate().getPredicates().empty()) {
+    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+              << "runtime SCEV checks needed. Enable vectorization of this "
+                 "loop with '#pragma clang loop vectorize(enable)' when "
+                 "compiling with -Os/-Oz");
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
+    return None;
+  }
+
+  // FIXME: Avoid specializing for stride==1 instead of bailing out.
+  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
+              << "runtime stride == 1 checks needed. Enable vectorization of "
+                 "this loop with '#pragma clang loop vectorize(enable)' when "
+                 "compiling with -Os/-Oz");
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+    return None;
+  }
+
   // If we optimize the program for size, avoid creating the tail loop.
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
-  // If we don't know the precise trip count, don't try to vectorize.
-  if (TC < 2) {
-    ORE->emit(
-        createMissedAnalysis("UnknownLoopCountComplexCFG")
-        << "unable to calculate the loop count due to complex control flow");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+  if (TC == 1) {
+    ORE->emit(createMissedAnalysis("SingleIterationLoop")
+              << "loop trip count is one, irrelevant for vectorization");
+    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
     return None;
   }
 
+  // Record that scalar epilogue is not allowed.
+  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+
+  IsScalarEpilogueAllowed = !OptForSize;
+
+  // We don't create an epilogue when optimizing for size.
+  // Invalidate interleave groups that require an epilogue if we can't mask
+  // the interleave-group.
+  if (!useMaskedInterleavedAccesses(TTI)) 
+    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
-  if (TC % MaxVF != 0) {
-    // If the trip count that we found modulo the vectorization factor is not
-    // zero then we require a tail.
-    // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
-    // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
-    //        smaller MaxVF that does not require a scalar epilog.
-
-    ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
-              << "cannot optimize for size and vectorize at the "
-                 "same time. Enable vectorization of this loop "
-                 "with '#pragma clang loop vectorize(enable)' "
-                 "when compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+  if (TC > 0 && TC % MaxVF == 0) {
+    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    return MaxVF;
+  }
+
+  // If we don't know the precise trip count, or if the trip count that we
+  // found modulo the vectorization factor is not zero, try to fold the tail
+  // by masking.
+  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+  if (Legal->canFoldTailByMasking()) {
+    FoldTailByMasking = true;
+    return MaxVF;
+  }
+
+  if (TC == 0) {
+    ORE->emit(
+        createMissedAnalysis("UnknownLoopCountComplexCFG")
+        << "unable to calculate the loop count due to complex control flow");
     return None;
   }
 
-  return MaxVF;
+  ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+            << "cannot optimize for size and vectorize at the same time. "
+               "Enable vectorization of this loop with '#pragma clang loop "
+               "vectorize(enable)' when compiling with -Os/-Oz");
+  return None;
 }
 
 unsigned
@@ -4655,7 +4845,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the loop.
-    for (Instruction &I : *BB) {
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
       Type *T = I.getType();
 
       // Skip ignored values.
@@ -4757,6 +4947,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
   // fit without causing spills. All of this is rounded down if necessary to be
   // a power of two. We want power of two interleave count to simplify any
   // addressing operations or alignment considerations.
+  // We also want power of two interleave counts to ensure that the induction
+  // variable of the vector loop wraps to zero, when tail is folded by masking;
+  // this currently happens when OptForSize, in which case IC is set to 1 above.
   unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) /
                               R.MaxLocalUsers);
 
@@ -4882,7 +5075,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   using IntervalMap = DenseMap<Instruction *, unsigned>;
 
   // Maps instruction to its index.
-  DenseMap<unsigned, Instruction *> IdxToInstr;
+  SmallVector<Instruction *, 64> IdxToInstr;
   // Marks the end of each interval.
   IntervalMap EndPoint;
   // Saves the list of instruction indices that are used in the loop.
@@ -4891,10 +5084,9 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   // defined outside the loop, such as arguments and constants.
   SmallPtrSet<Value *, 8> LoopInvariants;
 
-  unsigned Index = 0;
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
-    for (Instruction &I : *BB) {
-      IdxToInstr[Index++] = &I;
+    for (Instruction &I : BB->instructionsWithoutDebug()) {
+      IdxToInstr.push_back(&I);
 
       // Save the end location of each USE.
       for (Value *U : I.operands()) {
@@ -4911,7 +5103,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
         }
 
         // Overwrite previous end points.
-        EndPoint[Instr] = Index;
+        EndPoint[Instr] = IdxToInstr.size();
         Ends.insert(Instr);
       }
     }
@@ -4948,7 +5140,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
   };
 
-  for (unsigned int i = 0; i < Index; ++i) {
+  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
     Instruction *I = IdxToInstr[i];
 
     // Remove all of the instructions that end at this location.
@@ -5044,7 +5236,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
   // determine if it would be better to not if-convert the blocks they are in.
   // If so, we also record the instructions to scalarize.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!Legal->blockNeedsPredication(BB))
+    if (!blockNeedsPredication(BB))
       continue;
     for (Instruction &I : *BB)
       if (isScalarWithPredication(&I)) {
@@ -5209,7 +5401,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
     // unconditionally executed. For the scalar case, we may not always execute
     // the predicated block. Thus, scale the block's cost by the probability of
     // executing it.
-    if (VF == 1 && Legal->blockNeedsPredication(BB))
+    if (VF == 1 && blockNeedsPredication(BB))
       BlockCost.first /= getReciprocalPredBlockProb();
 
     Cost.first += BlockCost.first;
@@ -5256,6 +5448,7 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
 
 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                                  unsigned VF) {
+  assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
   Type *ValTy = getMemInstValueType(I);
   auto SE = PSE.getSE();
 
@@ -5271,9 +5464,11 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // Get the cost of the scalar memory instruction and address computation.
   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
+  // Don't pass *I here, since it is scalar but will actually be part of a
+  // vectorized loop where the user of it is a vectorized instruction.
   Cost += VF *
           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
-                              AS, I);
+                              AS);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5372,13 +5567,19 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   }
 
   // Calculate the cost of the whole interleaved group.
-  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
-                                                 Group->getFactor(), Indices,
-                                                 Group->getAlignment(), AS);
-
-  if (Group->isReverse())
+  bool UseMaskForGaps = 
+      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
+
+  if (Group->isReverse()) {
+    // TODO: Add support for reversed masked interleaved access.
+    assert(!Legal->isMaskRequired(I) &&
+           "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  }
   return Cost;
 }
 
@@ -5480,7 +5681,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
           continue;
 
         NumAccesses = Group->getNumMembers();
-        InterleaveCost = getInterleaveGroupCost(&I, VF);
+        if (interleavedAccessCanBeWidened(&I, VF))
+          InterleaveCost = getInterleaveGroupCost(&I, VF);
       }
 
       unsigned GatherScatterCost =
@@ -5626,7 +5828,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // First-order recurrences are replaced by vector shuffles inside the loop.
     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                VectorTy, VF - 1, VectorTy);
+                                VectorTy, VF - 1, ToVectorTy(RetTy, 1));
 
     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
     // converted into select instructions. We require N - 1 selects per phi
@@ -5895,6 +6097,16 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
   if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
     return NoVectorization;
 
+  // Invalidate interleave groups if all blocks of loop will be predicated.
+  if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+      !useMaskedInterleavedAccesses(*TTI)) {
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+           "which requires masked-interleaved support.\n");
+    CM.InterleaveInfo.reset();
+  }
+
   if (UserVF) {
     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
@@ -5951,6 +6163,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
                          &ILV,   CallbackILV};
   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+  State.TripCount = ILV.getOrCreateTripCount(nullptr);
 
   //===------------------------------------------------===//
   //
@@ -6131,9 +6344,17 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
   // load/store/gather/scatter. Initialize BlockMask to no-mask.
   VPValue *BlockMask = nullptr;
 
-  // Loop incoming mask is all-one.
-  if (OrigLoop->getHeader() == BB)
+  if (OrigLoop->getHeader() == BB) {
+    if (!CM.blockNeedsPredication(BB))
+      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
+    // Introduce the early-exit compare IV <= BTC to form header block mask.
+    // This is used instead of IV < TC because TC may wrap, unlike BTC.
+    VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
     return BlockMaskCache[BB] = BlockMask;
+  }
 
   // This is the block mask. We OR all incoming edges.
   for (auto *Predecessor : predecessors(BB)) {
@@ -6153,7 +6374,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
 }
 
 VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
-                                                           VFRange &Range) {
+                                                           VFRange &Range,
+                                                           VPlanPtr &Plan) {
   const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I);
   if (!IG)
     return nullptr;
@@ -6175,7 +6397,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I,
   assert(I == IG->getInsertPos() &&
          "Generating a recipe for an adjunct member of an interleave group");
 
-  return new VPInterleaveRecipe(IG);
+  VPValue *Mask = nullptr;
+  if (Legal->isMaskRequired(I))
+    Mask = createBlockInMask(I->getParent(), Plan);
+
+  return new VPInterleaveRecipe(IG, Mask);
 }
 
 VPWidenMemoryInstructionRecipe *
@@ -6443,7 +6669,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
   VPRecipeBase *Recipe = nullptr;
   // Check if Instr should belong to an interleave memory recipe, or already
   // does. In the latter case Instr is irrelevant.
-  if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+  if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) {
     VPBB->appendRecipe(Recipe);
     return true;
   }
@@ -6494,6 +6720,11 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
       NeedDef.insert(Branch->getCondition());
   }
 
+  // If the tail is to be folded by masking, the primary induction variable
+  // needs to be represented in VPlan for it to model early-exit masking.
+  if (CM.foldTailByMasking())
+    NeedDef.insert(Legal->getPrimaryInduction());
+
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
   // example, original induction update instructions can become dead because we
@@ -6670,6 +6901,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
   O << " +\n"
     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
   IG->getInsertPos()->printAsOperand(O, false);
+  if (User) {
+    O << ", ";
+    User->getOperand(0)->printAsOperand(O);
+  }
   O << "\\l\"";
   for (unsigned i = 0; i < IG->getFactor(); ++i)
     if (Instruction *I = IG->getMember(i))
@@ -6732,7 +6967,15 @@ void VPBlendRecipe::execute(VPTransformState &State) {
 
 void VPInterleaveRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Interleave group being replicated.");
-  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+  if (!User)
+    return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+
+  // Last (and currently only) operand is a mask.
+  InnerLoopVectorizer::VectorParts MaskValues(State.UF);
+  VPValue *Mask = User->getOperand(User->getNumOperands() - 1);
+  for (unsigned Part = 0; Part < State.UF; ++Part)
+    MaskValues[Part] = State.get(Mask, Part);
+  State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues);
 }
 
 void VPReplicateRecipe::execute(VPTransformState &State) {
@@ -7031,7 +7274,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Analyze interleaved memory accesses.
   if (UseInterleaved) {
-    IAI.analyzeInterleaving();
+    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
   }
 
   // Use the cost model.
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 79b575b78cdd6525a7598dc9d94c92a1bb20c70d..3592df3ede3d13f187717c122af372fa98694a69 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1536,12 +1536,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       // Check for terminator values (e.g. invoke).
       for (unsigned j = 0; j < VL.size(); ++j)
         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
-          TerminatorInst *Term = dyn_cast<TerminatorInst>(
-              cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
-          if (Term) {
-            LLVM_DEBUG(
-                dbgs()
-                << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+          Instruction *Term = dyn_cast<Instruction>(
+              cast<PHINode>(VL[j])->getIncomingValueForBlock(
+                  PH->getIncomingBlock(i)));
+          if (Term && Term->isTerminator()) {
+            LLVM_DEBUG(dbgs()
+                       << "SLP: Need to swizzle PHINodes (terminator use).\n");
             BS.cancelScheduling(VL, VL0);
             newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
             return;
@@ -3652,7 +3652,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
       if (PHINode *PH = dyn_cast<PHINode>(User)) {
         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
           if (PH->getIncomingValue(i) == Scalar) {
-            TerminatorInst *IncomingTerminator =
+            Instruction *IncomingTerminator =
                 PH->getIncomingBlock(i)->getTerminator();
             if (isa<CatchSwitchInst>(IncomingTerminator)) {
               Builder.SetInsertPoint(VecI->getParent(),
@@ -3960,7 +3960,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
     ScheduleEnd = I->getNextNode();
     if (isOneOf(S, I) != I)
       CheckSheduleForI(I);
-    assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+    assert(ScheduleEnd && "tried to vectorize a terminator?");
     LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
     return true;
   }
@@ -3996,7 +3996,7 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
         ScheduleEnd = I->getNextNode();
         if (isOneOf(S, I) != I)
           CheckSheduleForI(I);
-        assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
+        assert(ScheduleEnd && "tried to vectorize a terminator?");
         LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I
                           << "\n");
         return true;
@@ -5126,9 +5126,12 @@ class HorizontalReduction {
     /// Checks if the reduction operation can be vectorized.
     bool isVectorizable() const {
       return LHS && RHS &&
-             // We currently only support adds && min/max reductions.
+             // We currently only support add/mul/logical && min/max reductions.
              ((Kind == RK_Arithmetic &&
-               (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
+               (Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
+                Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
+                Opcode == Instruction::And || Opcode == Instruction::Or ||
+                Opcode == Instruction::Xor)) ||
               ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
                (Kind == RK_Min || Kind == RK_Max)) ||
               (Opcode == Instruction::ICmp &&
diff --git a/lib/Transforms/Vectorize/VPRecipeBuilder.h b/lib/Transforms/Vectorize/VPRecipeBuilder.h
index f43a8bb123b19b99d2ce9a3912105d4aba2cfa9e..15d38ac9c84c89e8630c54a5676638e66052ba44 100644
--- a/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -69,7 +69,8 @@ public:
   /// \return value is <true, nullptr>, as it is handled by another recipe.
   /// \p Range.End may be decreased to ensure same decision from \p Range.Start
   /// to \p Range.End.
-  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
+                                            VPlanPtr &Plan);
 
   /// Check if \I is a memory instruction to be widened for \p Range.Start and
   /// potentially masked. Such instructions are handled by a recipe that takes
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index 09d20fbdefe543a5a703f22759b6677871e4a9e9..a3c15a36b05d77d8ba344d07fe7880942cbc1109 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -303,6 +303,13 @@ void VPInstruction::generateInstruction(VPTransformState &State,
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ICmpULE: {
+    Value *IV = State.get(getOperand(0), Part);
+    Value *TC = State.get(getOperand(1), Part);
+    Value *V = Builder.CreateICmpULE(IV, TC);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -328,6 +335,9 @@ void VPInstruction::print(raw_ostream &O) const {
   case VPInstruction::Not:
     O << "not";
     break;
+  case VPInstruction::ICmpULE:
+    O << "icmp ule";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -342,6 +352,15 @@ void VPInstruction::print(raw_ostream &O) const {
 /// LoopVectorBody basic-block was created for this. Introduce additional
 /// basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  // -1. Check if the backedge taken count is needed, and if so build it.
+  if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+    Value *TC = State->TripCount;
+    IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+    auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+                                   "trip.count.minus.1");
+    Value2VPValue[TCMO] = BackedgeTakenCount;
+  }
+
   // 0. Set the reverse mapping from VPValues to Values for code generation.
   for (auto &Entry : Value2VPValue)
     State->VPValue2Value[Entry.second] = Entry.first;
@@ -469,8 +488,11 @@ void VPlanPrinter::dump() {
   OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
   if (!Plan.getName().empty())
     OS << "\\n" << DOT::EscapeString(Plan.getName());
-  if (!Plan.Value2VPValue.empty()) {
+  if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) {
     OS << ", where:";
+    if (Plan.BackedgeTakenCount)
+      OS << "\\n"
+         << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount";
     for (auto Entry : Plan.Value2VPValue) {
       OS << "\\n" << *Entry.second;
       OS << DOT::EscapeString(" := ");
@@ -543,8 +565,10 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
     if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
       CBI->printAsOperand(OS);
       OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
-    } else
+    } else {
       CBV->printAsOperand(OS);
+      OS << "\"";
+    }
   }
 
   bumpIndent(-2);
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index c3123b416008951430ee9c901682bae98ee49e56..9daaea1acdedd082396cc69d92742f38f5797b3e 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -317,6 +317,9 @@ struct VPTransformState {
   /// Values they correspond to.
   VPValue2ValueTy VPValue2Value;
 
+  /// Hold the trip count of the scalar loop.
+  Value *TripCount = nullptr;
+
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
@@ -607,7 +610,7 @@ class VPInstruction : public VPUser, public VPRecipeBase {
 
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1 };
+  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
 
 private:
   typedef unsigned char OpcodeTy;
@@ -769,10 +772,14 @@ public:
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
   const InterleaveGroup *IG;
+  std::unique_ptr<VPUser> User;
 
 public:
-  VPInterleaveRecipe(const InterleaveGroup *IG)
-      : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+  VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
+      : VPRecipeBase(VPInterleaveSC), IG(IG) {
+    if (Mask) // Create a VPInstruction to register as a user of the mask.
+      User.reset(new VPUser({Mask}));
+  }
   ~VPInterleaveRecipe() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1111,6 +1118,10 @@ private:
   // (operators '==' and '<').
   SmallPtrSet<VPValue *, 16> VPExternalDefs;
 
+  /// Represents the backedge taken count of the original loop, for folding
+  /// the tail.
+  VPValue *BackedgeTakenCount = nullptr;
+
   /// Holds a mapping between Values and their corresponding VPValue inside
   /// VPlan.
   Value2VPValueTy Value2VPValue;
@@ -1128,7 +1139,10 @@ public:
     if (Entry)
       VPBlockBase::deleteCFG(Entry);
     for (auto &MapEntry : Value2VPValue)
-      delete MapEntry.second;
+      if (MapEntry.second != BackedgeTakenCount)
+        delete MapEntry.second;
+    if (BackedgeTakenCount)
+      delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
     for (VPValue *Def : VPExternalDefs)
       delete Def;
     for (VPValue *CBV : VPCBVs)
@@ -1143,6 +1157,13 @@ public:
 
   VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
 
+  /// The backedge taken count of the original loop.
+  VPValue *getOrCreateBackedgeTakenCount() {
+    if (!BackedgeTakenCount)
+      BackedgeTakenCount = new VPValue();
+    return BackedgeTakenCount;
+  }
+
   void addVF(unsigned VF) { VFs.insert(VF); }
 
   bool hasVF(unsigned VF) { return VFs.count(VF); }
diff --git a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index b6307acb9474bc53007d54a99d4aa7fd6d45bf49..0f42694e193bc2fa84f63c9c383f028a97c16d30 100644
--- a/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -268,7 +268,7 @@ VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
     // Set VPBB successors. We create empty VPBBs for successors if they don't
     // exist already. Recipes will be created when the successor is visited
     // during the RPO traversal.
-    TerminatorInst *TI = BB->getTerminator();
+    Instruction *TI = BB->getTerminator();
     assert(TI && "Terminator expected.");
     unsigned NumSuccs = TI->getNumSuccessors();
 
diff --git a/lib/XRay/BlockIndexer.cpp b/lib/XRay/BlockIndexer.cpp
index 98e91f7de5487d0ce44627af24238a926ae93452..4dbe2d2717ad1cd88a445197870665249a33f164 100644
--- a/lib/XRay/BlockIndexer.cpp
+++ b/lib/XRay/BlockIndexer.cpp
@@ -39,6 +39,16 @@ Error BlockIndexer::visit(CustomEventRecord &R) {
   return Error::success();
 }
 
+Error BlockIndexer::visit(CustomEventRecordV5 &R) {
+  CurrentBlock.Records.push_back(&R);
+  return Error::success();
+}
+
+Error BlockIndexer::visit(TypedEventRecord &R) {
+  CurrentBlock.Records.push_back(&R);
+  return Error::success();
+}
+
 Error BlockIndexer::visit(CallArgRecord &R) {
   CurrentBlock.Records.push_back(&R);
   return Error::success();
diff --git a/lib/XRay/BlockPrinter.cpp b/lib/XRay/BlockPrinter.cpp
index c8b65fc12d766d711cbd457c836c27e0e7f30c20..0acebee0cbdd253943be032548bec574c8457808 100644
--- a/lib/XRay/BlockPrinter.cpp
+++ b/lib/XRay/BlockPrinter.cpp
@@ -68,6 +68,24 @@ Error BlockPrinter::visit(CustomEventRecord &R) {
   return E;
 }
 
+Error BlockPrinter::visit(CustomEventRecordV5 &R) {
+  if (CurrentState == State::Metadata)
+    OS << "\n";
+  CurrentState = State::CustomEvent;
+  OS << "*  ";
+  auto E = RP.visit(R);
+  return E;
+}
+
+Error BlockPrinter::visit(TypedEventRecord &R) {
+  if (CurrentState == State::Metadata)
+    OS << "\n";
+  CurrentState = State::CustomEvent;
+  OS << "*  ";
+  auto E = RP.visit(R);
+  return E;
+}
+
 // Function call printing.
 Error BlockPrinter::visit(FunctionRecord &R) {
   if (CurrentState == State::Metadata)
diff --git a/lib/XRay/BlockVerifier.cpp b/lib/XRay/BlockVerifier.cpp
index 62be1a87ab5b94e715f668b3ee944755d114ea74..5e949ec4e46afc3d34d1427335418b2f95a01bf1 100644
--- a/lib/XRay/BlockVerifier.cpp
+++ b/lib/XRay/BlockVerifier.cpp
@@ -43,6 +43,8 @@ StringRef recordToString(BlockVerifier::State R) {
     return "CallArg";
   case BlockVerifier::State::EndOfBuffer:
     return "EndOfBuffer";
+  case BlockVerifier::State::TypedEvent:
+    return "TypedEvent";
   case BlockVerifier::State::StateMax:
   case BlockVerifier::State::Unknown:
     return "Unknown";
@@ -75,27 +77,34 @@ Error BlockVerifier::transition(State To) {
                        {State::NewCPUId,
                         {mask(State::NewCPUId) | mask(State::TSCWrap) |
                          mask(State::CustomEvent) | mask(State::Function) |
-                         mask(State::EndOfBuffer)}},
+                         mask(State::EndOfBuffer) | mask(State::TypedEvent)}},
 
                        {State::TSCWrap,
                         {mask(State::TSCWrap) | mask(State::NewCPUId) |
                          mask(State::CustomEvent) | mask(State::Function) |
-                         mask(State::EndOfBuffer)}},
+                         mask(State::EndOfBuffer) | mask(State::TypedEvent)}},
 
                        {State::CustomEvent,
                         {mask(State::CustomEvent) | mask(State::TSCWrap) |
                          mask(State::NewCPUId) | mask(State::Function) |
-                         mask(State::EndOfBuffer)}},
+                         mask(State::EndOfBuffer) | mask(State::TypedEvent)}},
+
+                       {State::TypedEvent,
+                        {mask(State::TypedEvent) | mask(State::TSCWrap) |
+                         mask(State::NewCPUId) | mask(State::Function) |
+                         mask(State::EndOfBuffer) | mask(State::CustomEvent)}},
 
                        {State::Function,
                         {mask(State::Function) | mask(State::TSCWrap) |
                          mask(State::NewCPUId) | mask(State::CustomEvent) |
-                         mask(State::CallArg) | mask(State::EndOfBuffer)}},
+                         mask(State::CallArg) | mask(State::EndOfBuffer) |
+                         mask(State::TypedEvent)}},
 
                        {State::CallArg,
                         {mask(State::CallArg) | mask(State::Function) |
                          mask(State::TSCWrap) | mask(State::NewCPUId) |
-                         mask(State::CustomEvent) | mask(State::EndOfBuffer)}},
+                         mask(State::CustomEvent) | mask(State::EndOfBuffer) |
+                         mask(State::TypedEvent)}},
 
                        {State::EndOfBuffer, {}}}};
 
@@ -145,6 +154,14 @@ Error BlockVerifier::visit(CustomEventRecord &) {
   return transition(State::CustomEvent);
 }
 
+Error BlockVerifier::visit(CustomEventRecordV5 &) {
+  return transition(State::CustomEvent);
+}
+
+Error BlockVerifier::visit(TypedEventRecord &) {
+  return transition(State::TypedEvent);
+}
+
 Error BlockVerifier::visit(CallArgRecord &) {
   return transition(State::CallArg);
 }
@@ -169,6 +186,7 @@ Error BlockVerifier::verify() {
   case State::EndOfBuffer:
   case State::NewCPUId:
   case State::CustomEvent:
+  case State::TypedEvent:
   case State::Function:
   case State::CallArg:
   case State::TSCWrap:
diff --git a/lib/XRay/FDRRecordProducer.cpp b/lib/XRay/FDRRecordProducer.cpp
index 4b010f1fa624b7fbc619f29eca6feadea8c2bef3..122578010c435acbed71546374a45224f6c7630a 100644
--- a/lib/XRay/FDRRecordProducer.cpp
+++ b/lib/XRay/FDRRecordProducer.cpp
@@ -53,14 +53,15 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
   case MetadataRecordKinds::WalltimeMarkerKind:
     return make_unique<WallclockRecord>();
   case MetadataRecordKinds::CustomEventMarkerKind:
+    if (Header.Version >= 5)
+      return make_unique<CustomEventRecordV5>();
     return make_unique<CustomEventRecord>();
   case MetadataRecordKinds::CallArgumentKind:
     return make_unique<CallArgRecord>();
   case MetadataRecordKinds::BufferExtentsKind:
     return make_unique<BufferExtents>();
   case MetadataRecordKinds::TypedEventMarkerKind:
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Encountered an unsupported TypedEventMarker.");
+    return make_unique<TypedEventRecord>();
   case MetadataRecordKinds::PidKind:
     return make_unique<PIDRecord>();
   case MetadataRecordKinds::EnumEndMarker:
@@ -84,6 +85,12 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
   // the rest of the bytes.
   auto PreReadOffset = OffsetPtr;
   uint8_t FirstByte = E.getU8(&OffsetPtr);
+  if (OffsetPtr == PreReadOffset)
+    return createStringError(
+        std::make_error_code(std::errc::executable_format_error),
+        "Failed reading one byte from offset %d.", OffsetPtr);
+
+  // Set up our result record.
   std::unique_ptr<Record> R;
 
   // For metadata records, handle especially here.
diff --git a/lib/XRay/FDRRecords.cpp b/lib/XRay/FDRRecords.cpp
index 66d17ffcb53f760bcd3b5a0e9e19b662d63e7e0c..2b68a73686fbf95713cb71e2e96922e633c2f3ea 100644
--- a/lib/XRay/FDRRecords.cpp
+++ b/lib/XRay/FDRRecords.cpp
@@ -26,6 +26,8 @@ Error PIDRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error NewBufferRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error EndBufferRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 Error FunctionRecord::apply(RecordVisitor &V) { return V.visit(*this); }
+Error CustomEventRecordV5::apply(RecordVisitor &V) { return V.visit(*this); }
+Error TypedEventRecord::apply(RecordVisitor &V) { return V.visit(*this); }
 
 } // namespace xray
 } // namespace llvm
diff --git a/lib/XRay/FDRTraceExpander.cpp b/lib/XRay/FDRTraceExpander.cpp
index 8e15db52ce6ce9347516729ff2ae64b34acbfff5..a6e1521da87f340a1e8a82bc85fd8e5de09b0f72 100644
--- a/lib/XRay/FDRTraceExpander.cpp
+++ b/lib/XRay/FDRTraceExpander.cpp
@@ -12,10 +12,11 @@ namespace llvm {
 namespace xray {
 
 void TraceExpander::resetCurrentRecord() {
-  if (BuildingFunction)
+  if (BuildingRecord)
     C(CurrentRecord);
-  BuildingFunction = false;
+  BuildingRecord = false;
   CurrentRecord.CallArgs.clear();
+  CurrentRecord.Data.clear();
 }
 
 Error TraceExpander::visit(BufferExtents &) {
@@ -36,9 +37,48 @@ Error TraceExpander::visit(TSCWrapRecord &R) {
   return Error::success();
 }
 
-Error TraceExpander::visit(CustomEventRecord &) {
-  // TODO: Support custom event records in the future.
+Error TraceExpander::visit(CustomEventRecord &R) {
   resetCurrentRecord();
+  if (!IgnoringRecords) {
+    CurrentRecord.TSC = R.tsc();
+    CurrentRecord.CPU = R.cpu();
+    CurrentRecord.PId = PID;
+    CurrentRecord.TId = TID;
+    CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
+    CurrentRecord.Data = R.data();
+    BuildingRecord = true;
+  }
+  return Error::success();
+}
+
+Error TraceExpander::visit(CustomEventRecordV5 &R) {
+  resetCurrentRecord();
+  if (!IgnoringRecords) {
+    BaseTSC += R.delta();
+    CurrentRecord.TSC = BaseTSC;
+    CurrentRecord.CPU = CPUId;
+    CurrentRecord.PId = PID;
+    CurrentRecord.TId = TID;
+    CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
+    CurrentRecord.Data = R.data();
+    BuildingRecord = true;
+  }
+  return Error::success();
+}
+
+Error TraceExpander::visit(TypedEventRecord &R) {
+  resetCurrentRecord();
+  if (!IgnoringRecords) {
+    BaseTSC += R.delta();
+    CurrentRecord.TSC = BaseTSC;
+    CurrentRecord.CPU = CPUId;
+    CurrentRecord.PId = PID;
+    CurrentRecord.TId = TID;
+    CurrentRecord.RecordType = R.eventType();
+    CurrentRecord.Type = RecordTypes::TYPED_EVENT;
+    CurrentRecord.Data = R.data();
+    BuildingRecord = true;
+  }
   return Error::success();
 }
 
@@ -78,7 +118,7 @@ Error TraceExpander::visit(FunctionRecord &R) {
     CurrentRecord.PId = PID;
     CurrentRecord.TId = TID;
     CurrentRecord.CPU = CPUId;
-    BuildingFunction = true;
+    BuildingRecord = true;
   }
   return Error::success();
 }
diff --git a/lib/XRay/FDRTraceWriter.cpp b/lib/XRay/FDRTraceWriter.cpp
index d0206e775a85db54eab3b8f9ec468e5aff4f461a..d5f9697998638396c53aa04d1f2daf55635503f0 100644
--- a/lib/XRay/FDRTraceWriter.cpp
+++ b/lib/XRay/FDRTraceWriter.cpp
@@ -94,9 +94,28 @@ Error FDRTraceWriter::visit(TSCWrapRecord &R) {
 }
 
 Error FDRTraceWriter::visit(CustomEventRecord &R) {
-  if (auto E = writeMetadata<5u>(OS, R.size(), R.tsc()))
+  if (auto E = writeMetadata<5u>(OS, R.size(), R.tsc(), R.cpu()))
     return E;
-  ArrayRef<char> Bytes(R.data().data(), R.data().size());
+  auto D = R.data();
+  ArrayRef<char> Bytes(D.data(), D.size());
+  OS.write(Bytes);
+  return Error::success();
+}
+
+Error FDRTraceWriter::visit(CustomEventRecordV5 &R) {
+  if (auto E = writeMetadata<5u>(OS, R.size(), R.delta()))
+    return E;
+  auto D = R.data();
+  ArrayRef<char> Bytes(D.data(), D.size());
+  OS.write(Bytes);
+  return Error::success();
+}
+
+Error FDRTraceWriter::visit(TypedEventRecord &R) {
+  if (auto E = writeMetadata<7u>(OS, R.size(), R.delta(), R.eventType()))
+    return E;
+  auto D = R.data();
+  ArrayRef<char> Bytes(D.data(), D.size());
   OS.write(Bytes);
   return Error::success();
 }
@@ -127,7 +146,7 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
   OS.write(TypeRecordFuncId);
   OS.write(R.delta());
   return Error::success();
-} // namespace xray
+}
 
 } // namespace xray
 } // namespace llvm
diff --git a/lib/XRay/FileHeaderReader.cpp b/lib/XRay/FileHeaderReader.cpp
index 967e85f30d2f44096bc54c06293f954e820636bb..0b3fb8b6f692db919c4ad96134812949f9cceb0a 100644
--- a/lib/XRay/FileHeaderReader.cpp
+++ b/lib/XRay/FileHeaderReader.cpp
@@ -63,11 +63,6 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
   // Manually advance the offset pointer 16 bytes, after getting a raw memcpy
   // from the underlying data.
   OffsetPtr += 16;
-  if (FileHeader.Version != 1 && FileHeader.Version != 2 &&
-      FileHeader.Version != 3)
-    return createStringError(std::make_error_code(std::errc::invalid_argument),
-                             "Unsupported XRay file version: %d at offset %d",
-                             FileHeader.Version, OffsetPtr);
   return std::move(FileHeader);
 }
 
diff --git a/lib/XRay/Profile.cpp b/lib/XRay/Profile.cpp
index fdd1953ab0f02e12487a5e205f0ac5c9d1b6ebe7..e8a082884d69e84b87999161476a39f9bc63a024 100644
--- a/lib/XRay/Profile.cpp
+++ b/lib/XRay/Profile.cpp
@@ -374,6 +374,12 @@ Expected<Profile> profileFromTrace(const Trace &T) {
       }
 
       break;
+
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      // TODO: Support an extension point to allow handling of custom and typed
+      // events in profiles.
+      break;
     }
   }
 
diff --git a/lib/XRay/RecordInitializer.cpp b/lib/XRay/RecordInitializer.cpp
index 7f9fd4c9627c4d529b3e64a4e2f9d67de1d09f98..cc9dd4609498c28e47c8a174d0bae0c249b3226e 100644
--- a/lib/XRay/RecordInitializer.cpp
+++ b/lib/XRay/RecordInitializer.cpp
@@ -20,7 +20,7 @@ Error RecordInitializer::visit(BufferExtents &R) {
   auto PreReadOffset = OffsetPtr;
   R.Size = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read buffer extent at offset %d.",
                              OffsetPtr);
 
@@ -39,14 +39,14 @@ Error RecordInitializer::visit(WallclockRecord &R) {
   R.Seconds = E.getU64(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read wall clock 'seconds' field at offset %d.", OffsetPtr);
 
   PreReadOffset = OffsetPtr;
   R.Nanos = E.getU32(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read wall clock 'nanos' field at offset %d.", OffsetPtr);
 
   // Align to metadata record size boundary.
@@ -65,13 +65,13 @@ Error RecordInitializer::visit(NewCPUIDRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.CPUId = E.getU16(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read CPU id at offset %d.", OffsetPtr);
 
   PreReadOffset = OffsetPtr;
   R.TSC = E.getU64(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read CPU TSC at offset %d.", OffsetPtr);
 
   OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
@@ -88,7 +88,7 @@ Error RecordInitializer::visit(TSCWrapRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.BaseTSC = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read TSC wrap record at offset %d.",
                              OffsetPtr);
 
@@ -108,16 +108,128 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
   R.Size = E.getSigned(&OffsetPtr, sizeof(int32_t));
   if (PreReadOffset == OffsetPtr)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event record size field offset %d.", OffsetPtr);
 
   PreReadOffset = OffsetPtr;
   R.TSC = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Cannot read a custom event TSC field at offset %d.", OffsetPtr);
 
+  // For version 4 onwards, of the FDR log, we want to also capture the CPU ID
+  // of the custom event.
+  if (Version >= 4) {
+    PreReadOffset = OffsetPtr;
+    R.CPU = E.getU16(&OffsetPtr);
+    if (PreReadOffset == OffsetPtr)
+      return createStringError(
+          std::make_error_code(std::errc::invalid_argument),
+          "Missing CPU field at offset %d", OffsetPtr);
+  }
+
+  assert(OffsetPtr > BeginOffset &&
+         OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize);
+  OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
+
+  // Next we read in a fixed chunk of data from the given offset.
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Cannot read %d bytes of custom event data from offset %d.", R.Size,
+        OffsetPtr);
+
+  std::vector<uint8_t> Buffer;
+  Buffer.resize(R.Size);
+  if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading data into buffer of size %d at offset %d.", R.Size,
+        OffsetPtr);
+  R.Data.assign(Buffer.begin(), Buffer.end());
+  return Error::success();
+}
+
+Error RecordInitializer::visit(CustomEventRecordV5 &R) {
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr,
+                                    MetadataRecord::kMetadataBodySize))
+    return createStringError(std::make_error_code(std::errc::bad_address),
+                             "Invalid offset for a custom event record (%d).",
+                             OffsetPtr);
+
+  auto BeginOffset = OffsetPtr;
+  auto PreReadOffset = OffsetPtr;
+
+  R.Size = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a custom event record size field offset %d.", OffsetPtr);
+
+  PreReadOffset = OffsetPtr;
+  R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a custom event record TSC delta field at offset %d.",
+        OffsetPtr);
+
+  assert(OffsetPtr > BeginOffset &&
+         OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize);
+  OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
+
+  // Next we read in a fixed chunk of data from the given offset.
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr, R.Size))
+    return createStringError(
+        std::make_error_code(std::errc::bad_address),
+        "Cannot read %d bytes of custom event data from offset %d.", R.Size,
+        OffsetPtr);
+
+  std::vector<uint8_t> Buffer;
+  Buffer.resize(R.Size);
+  if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Failed reading data into buffer of size %d at offset %d.", R.Size,
+        OffsetPtr);
+  R.Data.assign(Buffer.begin(), Buffer.end());
+  return Error::success();
+}
+
+Error RecordInitializer::visit(TypedEventRecord &R) {
+  if (!E.isValidOffsetForDataOfSize(OffsetPtr,
+                                    MetadataRecord::kMetadataBodySize))
+    return createStringError(std::make_error_code(std::errc::bad_address),
+                             "Invalid offset for a typed event record (%d).",
+                             OffsetPtr);
+
+  auto BeginOffset = OffsetPtr;
+  auto PreReadOffset = OffsetPtr;
+
+  R.Size = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a typed event record size field offset %d.", OffsetPtr);
+
+  PreReadOffset = OffsetPtr;
+  R.Delta = E.getSigned(&OffsetPtr, sizeof(int32_t));
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a typed event record TSC delta field at offset %d.",
+        OffsetPtr);
+
+  PreReadOffset = OffsetPtr;
+  R.EventType = E.getU16(&OffsetPtr);
+  if (PreReadOffset == OffsetPtr)
+    return createStringError(
+        std::make_error_code(std::errc::invalid_argument),
+        "Cannot read a typed event record type field at offset %d.", OffsetPtr);
+
+  assert(OffsetPtr > BeginOffset &&
+         OffsetPtr - BeginOffset <= MetadataRecord::kMetadataBodySize);
   OffsetPtr += MetadataRecord::kMetadataBodySize - (OffsetPtr - BeginOffset);
 
   // Next we read in a fixed chunk of data from the given offset.
@@ -131,7 +243,7 @@ Error RecordInitializer::visit(CustomEventRecord &R) {
   Buffer.resize(R.Size);
   if (E.getU8(&OffsetPtr, Buffer.data(), R.Size) != Buffer.data())
     return createStringError(
-        std::make_error_code(std::errc::bad_message),
+        std::make_error_code(std::errc::invalid_argument),
         "Failed reading data into buffer of size %d at offset %d.", R.Size,
         OffsetPtr);
   R.Data.assign(Buffer.begin(), Buffer.end());
@@ -148,7 +260,7 @@ Error RecordInitializer::visit(CallArgRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.Arg = E.getU64(&OffsetPtr);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read a call arg record at offset %d.",
                              OffsetPtr);
 
@@ -166,7 +278,7 @@ Error RecordInitializer::visit(PIDRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.PID = E.getSigned(&OffsetPtr, 4);
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read a process ID record at offset %d.",
                              OffsetPtr);
 
@@ -184,7 +296,7 @@ Error RecordInitializer::visit(NewBufferRecord &R) {
   auto PreReadOffset = OffsetPtr;
   R.TID = E.getSigned(&OffsetPtr, sizeof(int32_t));
   if (PreReadOffset == OffsetPtr)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Cannot read a new buffer record at offset %d.",
                              OffsetPtr);
 
@@ -234,7 +346,7 @@ Error RecordInitializer::visit(FunctionRecord &R) {
     R.Kind = static_cast<RecordTypes>(FunctionType);
     break;
   default:
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Unknown function record type '%d' at offset %d.",
                              FunctionType, BeginOffset);
   }
@@ -243,7 +355,7 @@ Error RecordInitializer::visit(FunctionRecord &R) {
   PreReadOffset = OffsetPtr;
   R.Delta = E.getU32(&OffsetPtr);
   if (OffsetPtr == PreReadOffset)
-    return createStringError(std::make_error_code(std::errc::bad_message),
+    return createStringError(std::make_error_code(std::errc::invalid_argument),
                              "Failed reading TSC delta from offset %d.",
                              OffsetPtr);
   assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
diff --git a/lib/XRay/RecordPrinter.cpp b/lib/XRay/RecordPrinter.cpp
index 09b25ddba25d979b496f5ee8073c391f142b85b9..71ea7d0e969f0f9ef470a467871beead2b3524b4 100644
--- a/lib/XRay/RecordPrinter.cpp
+++ b/lib/XRay/RecordPrinter.cpp
@@ -35,8 +35,24 @@ Error RecordPrinter::visit(TSCWrapRecord &R) {
 }
 
 Error RecordPrinter::visit(CustomEventRecord &R) {
-  OS << formatv("<Custom Event: tsc = {0}, size = {1}, data = '{2}'>", R.tsc(),
-                R.size(), R.data())
+  OS << formatv(
+            "<Custom Event: tsc = {0}, cpu = {1}, size = {2}, data = '{3}'>",
+            R.tsc(), R.cpu(), R.size(), R.data())
+     << Delim;
+  return Error::success();
+}
+
+Error RecordPrinter::visit(CustomEventRecordV5 &R) {
+  OS << formatv("<Custom Event: delta = +{0}, size = {1}, data = '{2}'>",
+                R.delta(), R.size(), R.data())
+     << Delim;
+  return Error::success();
+}
+
+Error RecordPrinter::visit(TypedEventRecord &R) {
+  OS << formatv(
+            "<Typed Event: delta = +{0}, type = {1}, size = {2}, data = '{3}'",
+            R.delta(), R.eventType(), R.size(), R.data())
      << Delim;
   return Error::success();
 }
@@ -65,21 +81,25 @@ Error RecordPrinter::visit(FunctionRecord &R) {
   // FIXME: Support symbolization here?
   switch (R.recordType()) {
   case RecordTypes::ENTER:
-    OS << formatv("<Function Enter: #{0} delta = +{0}>", R.functionId(),
+    OS << formatv("<Function Enter: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
   case RecordTypes::ENTER_ARG:
-    OS << formatv("<Function Enter With Arg: #{0} delta = +{0}>",
+    OS << formatv("<Function Enter With Arg: #{0} delta = +{1}>",
                   R.functionId(), R.delta());
     break;
   case RecordTypes::EXIT:
-    OS << formatv("<Function Exit: #{0} delta = +{0}>", R.functionId(),
+    OS << formatv("<Function Exit: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
   case RecordTypes::TAIL_EXIT:
-    OS << formatv("<Function Tail Exit: #{0} delta = +{0}>", R.functionId(),
+    OS << formatv("<Function Tail Exit: #{0} delta = +{1}>", R.functionId(),
                   R.delta());
     break;
+  case RecordTypes::CUSTOM_EVENT:
+  case RecordTypes::TYPED_EVENT:
+    // TODO: Flag as a bug?
+    break;
   }
   OS << Delim;
   return Error::success();
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index 1d7c723864d13e02edf80821186aa57bea1871c5..4f28f3f754c1ffaf27522e73d8759def033793c4 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -247,6 +247,17 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
 /// ThreadBuffer: BufferExtents NewBuffer WallClockTime Pid NewCPUId
 ///               FunctionSequence
 /// EOB: *deprecated*
+///
+/// In Version 4, we make the following changes:
+///
+/// CustomEventRecord now includes the CPU data.
+///
+/// In Version 5, we make the following changes:
+///
+/// CustomEventRecord and TypedEventRecord now use TSC delta encoding similar to
+/// what FunctionRecord instances use, and we no longer need to include the CPU
+/// id in the CustomEventRecord.
+///
 Error loadFDRLog(StringRef Data, bool IsLittleEndian,
                  XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
 
@@ -310,12 +321,11 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
   {
     for (auto &PTB : Index) {
       auto &Blocks = PTB.second;
-      llvm::sort(
-          Blocks,
-          [](const BlockIndexer::Block &L, const BlockIndexer::Block &R) {
-            return (L.WallclockTime->seconds() < R.WallclockTime->seconds() &&
-                    L.WallclockTime->nanos() < R.WallclockTime->nanos());
-          });
+      llvm::sort(Blocks, [](const BlockIndexer::Block &L,
+                            const BlockIndexer::Block &R) {
+        return (L.WallclockTime->seconds() < R.WallclockTime->seconds() &&
+                L.WallclockTime->nanos() < R.WallclockTime->nanos());
+      });
       auto Adder = [&](const XRayRecord &R) { Records.push_back(R); };
       TraceExpander Expander(Adder, FileHeader.Version);
       for (auto &B : Blocks) {
@@ -353,8 +363,9 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
   Records.clear();
   std::transform(Trace.Records.begin(), Trace.Records.end(),
                  std::back_inserter(Records), [&](const YAMLXRayRecord &R) {
-                   return XRayRecord{R.RecordType, R.CPU, R.Type, R.FuncId,
-                                     R.TSC,        R.TId, R.PId,  R.CallArgs};
+                   return XRayRecord{R.RecordType, R.CPU,      R.Type,
+                                     R.FuncId,     R.TSC,      R.TId,
+                                     R.PId,        R.CallArgs, R.Data};
                  });
   return Error::success();
 }
@@ -435,7 +446,7 @@ Expected<Trace> llvm::xray::loadTrace(const DataExtractor &DE, bool Sort) {
     }
     break;
   case FLIGHT_DATA_RECORDER_FORMAT:
-    if (Version == 1 || Version == 2 || Version == 3) {
+    if (Version >= 1 && Version <= 5) {
       if (auto E = loadFDRLog(DE.getData(), DE.isLittleEndian(), T.FileHeader,
                               T.Records))
         return std::move(E);
diff --git a/test/Analysis/ConstantFolding/min-max.ll b/test/Analysis/ConstantFolding/min-max.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b872e4a1aca8c45f18b37de96d65b3803f83202c
--- /dev/null
+++ b/test/Analysis/ConstantFolding/min-max.ll
@@ -0,0 +1,136 @@
+; RUN: opt -instcombine -S -o - %s | FileCheck %s
+; Tests that constant folding of min and max operations works as expected.
+
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+
+; CHECK: define float @minnum_float() {
+define float @minnum_float() {
+  ; CHECK-NEXT: ret float 5.000000e+00
+  %1 = call float @llvm.minnum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that minnum constant folds to propagate non-NaN or smaller argument
+; CHECK: define <4 x float> @minnum_float_vec() {
+define <4 x float> @minnum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 5.000000e+00,
+  ; CHECK-SAME:                  float 4.200000e+01, float 5.000000e+00>
+  %1 = call <4 x float> @llvm.minnum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that minnum constant folds to propagate one of its argument zeros
+; CHECK: define <4 x float> @minnum_float_zeros_vec() {
+define <4 x float> @minnum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float {{-?}}0.000000e+00,
+  ; CHECK-SAME:                  float {{-?}}0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.minnum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
+
+; CHECK: define float @maxnum_float() {
+define float @maxnum_float() {
+  ; CHECK-NEXT: ret float 4.200000e+01
+  %1 = call float @llvm.maxnum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that maxnum constant folds to propagate non-NaN or greater argument
+; CHECK: define <4 x float> @maxnum_float_vec() {
+define <4 x float> @maxnum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 5.000000e+00,
+  ; CHECK-SAME:                  float 4.200000e+01, float 4.200000e+01>
+  %1 = call <4 x float> @llvm.maxnum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that maxnum constant folds to propagate one of its argument zeros
+; CHECK: define <4 x float> @maxnum_float_zeros_vec() {
+define <4 x float> @maxnum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float {{-?}}0.000000e+00,
+  ; CHECK-SAME:                  float {{-?}}0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.maxnum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
+
+; CHECK: define float @minimum_float() {
+define float @minimum_float() {
+  ; CHECK-NEXT: ret float 5.000000e+00
+  %1 = call float @llvm.minimum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that minimum propagates its NaN or smaller argument
+; CHECK: define <4 x float> @minimum_float_vec() {
+define <4 x float> @minimum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
+  ; CHECK-SAME:                  float 0x7FF8000000000000, float 5.000000e+00>
+  %1 = call <4 x float> @llvm.minimum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that minimum treats -0.0 as smaller than 0.0 while constant folding
+; CHECK: define <4 x float> @minimum_float_zeros_vec() {
+define <4 x float> @minimum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float -0.000000e+00,
+  ; CHECK-SAME:                  float -0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.minimum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
+
+; CHECK: define float @maximum_float() {
+define float @maximum_float() {
+  ; CHECK-NEXT: ret float 4.200000e+01
+  %1 = call float @llvm.maximum.f32(float 5.0, float 42.0)
+  ret float %1
+}
+
+; Check that maximum propagates its NaN or greater argument
+; CHECK: define <4 x float> @maximum_float_vec() {
+define <4 x float> @maximum_float_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000,
+  ; CHECK-SAME:                  float 0x7FF8000000000000, float 4.200000e+01>
+  %1 = call <4 x float> @llvm.maximum.v4f32(
+    <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 42., float 42.>,
+    <4 x float> <float 0x7FF8000000000000, float 5., float 0x7FF8000000000000, float 5.>
+  )
+  ret <4 x float> %1
+}
+
+; Check that maximum treats -0.0 as smaller than 0.0 while constant folding
+; CHECK: define <4 x float> @maximum_float_zeros_vec() {
+define <4 x float> @maximum_float_zeros_vec() {
+  ; CHECK-NEXT: ret <4 x float> <float 0.000000e+00, float 0.000000e+00,
+  ; CHECK-SAME:                  float 0.000000e+00, float -0.000000e+00>
+  %1 = call <4 x float> @llvm.maximum.v4f32(
+    <4 x float> <float 0.0, float -0.0, float 0.0, float -0.0>,
+    <4 x float> <float 0.0, float 0.0, float -0.0, float -0.0>
+  )
+  ret <4 x float> %1
+}
diff --git a/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll b/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..355ed520575fb0c6ba59fbb21675fd8399a0c5ff
--- /dev/null
+++ b/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s
+
+define void @broadcast() {
+; CHECK-LABEL: 'broadcast'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> zeroinitializer
+  %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> zeroinitializer
+  %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> zeroinitializer
+  %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> zeroinitializer
+
+  %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> zeroinitializer
+  %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> zeroinitializer
+  %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> zeroinitializer
+
+  %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> zeroinitializer
+  %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> zeroinitializer
+
+  %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> zeroinitializer
+  %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> zeroinitializer
+
+  ret void
+}
diff --git a/test/Analysis/CostModel/AArch64/vector-reduce.ll b/test/Analysis/CostModel/AArch64/vector-reduce.ll
index 5bf50764e2ed8b5e9a29da7b7fcec3034ebb76a7..c268a18e7f8c6d0c439569c976530cd788c67048 100644
--- a/test/Analysis/CostModel/AArch64/vector-reduce.ll
+++ b/test/Analysis/CostModel/AArch64/vector-reduce.ll
@@ -47,7 +47,7 @@ define i32 @add.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: umin.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umin.i8.v8i8
 ; CODE:       uminv b0, v0.8b
 define i8 @umin.i8.v8i8(<8 x i8> %v) {
@@ -56,7 +56,7 @@ define i8 @umin.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: umin.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umin.i8.v16i8
 ; CODE:       uminv b0, v0.16b
 define i8 @umin.i8.v16i8(<16 x i8> %v) {
@@ -65,7 +65,7 @@ define i8 @umin.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: umin.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umin.i16.v4i16
 ; CODE:       uminv h0, v0.4h
 define i16 @umin.i16.v4i16(<4 x i16> %v) {
@@ -74,7 +74,7 @@ define i16 @umin.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: umin.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umin.i16.v8i16
 ; CODE:       uminv h0, v0.8h
 define i16 @umin.i16.v8i16(<8 x i16> %v) {
@@ -83,7 +83,7 @@ define i16 @umin.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: umin.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umin.i32.v4i32
 ; CODE:       uminv s0, v0.4s
 define i32 @umin.i32.v4i32(<4 x i32> %v) {
@@ -92,7 +92,7 @@ define i32 @umin.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: umax.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: umax.i8.v8i8
 ; CODE:       umaxv b0, v0.8b
 define i8 @umax.i8.v8i8(<8 x i8> %v) {
@@ -101,7 +101,7 @@ define i8 @umax.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: umax.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: umax.i8.v16i8
 ; CODE:       umaxv b0, v0.16b
 define i8 @umax.i8.v16i8(<16 x i8> %v) {
@@ -110,7 +110,7 @@ define i8 @umax.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: umax.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: umax.i16.v4i16
 ; CODE:       umaxv h0, v0.4h
 define i16 @umax.i16.v4i16(<4 x i16> %v) {
@@ -119,7 +119,7 @@ define i16 @umax.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: umax.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: umax.i16.v8i16
 ; CODE:       umaxv h0, v0.8h
 define i16 @umax.i16.v8i16(<8 x i16> %v) {
@@ -128,7 +128,7 @@ define i16 @umax.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: umax.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: umax.i32.v4i32
 ; CODE:       umaxv s0, v0.4s
 define i32 @umax.i32.v4i32(<4 x i32> %v) {
@@ -137,7 +137,7 @@ define i32 @umax.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: smin.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smin.i8.v8i8
 ; CODE:       sminv b0, v0.8b
 define i8 @smin.i8.v8i8(<8 x i8> %v) {
@@ -146,7 +146,7 @@ define i8 @smin.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: smin.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smin.i8.v16i8
 ; CODE:       sminv b0, v0.16b
 define i8 @smin.i8.v16i8(<16 x i8> %v) {
@@ -155,7 +155,7 @@ define i8 @smin.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: smin.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smin.i16.v4i16
 ; CODE:       sminv h0, v0.4h
 define i16 @smin.i16.v4i16(<4 x i16> %v) {
@@ -164,7 +164,7 @@ define i16 @smin.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: smin.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smin.i16.v8i16
 ; CODE:       sminv h0, v0.8h
 define i16 @smin.i16.v8i16(<8 x i16> %v) {
@@ -173,7 +173,7 @@ define i16 @smin.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: smin.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smin.i32.v4i32
 ; CODE:       sminv s0, v0.4s
 define i32 @smin.i32.v4i32(<4 x i32> %v) {
@@ -182,7 +182,7 @@ define i32 @smin.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: smax.i8.v8i8
-; COST:       Found an estimated cost of 157 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> %v)
 ; CODE-LABEL: smax.i8.v8i8
 ; CODE:       smaxv b0, v0.8b
 define i8 @smax.i8.v8i8(<8 x i8> %v) {
@@ -191,7 +191,7 @@ define i8 @smax.i8.v8i8(<8 x i8> %v) {
 }
 
 ; COST-LABEL: smax.i8.v16i8
-; COST:       Found an estimated cost of 388 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v)
+; COST:       Found an estimated cost of 744 for instruction: %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %v)
 ; CODE-LABEL: smax.i8.v16i8
 ; CODE:       smaxv b0, v0.16b
 define i8 @smax.i8.v16i8(<16 x i8> %v) {
@@ -200,7 +200,7 @@ define i8 @smax.i8.v16i8(<16 x i8> %v) {
 }
 
 ; COST-LABEL: smax.i16.v4i16
-; COST:       Found an estimated cost of 58 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v)
+; COST:       Found an estimated cost of 92 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> %v)
 ; CODE-LABEL: smax.i16.v4i16
 ; CODE:       smaxv h0, v0.4h
 define i16 @smax.i16.v4i16(<4 x i16> %v) {
@@ -209,7 +209,7 @@ define i16 @smax.i16.v4i16(<4 x i16> %v) {
 }
 
 ; COST-LABEL: smax.i16.v8i16
-; COST:       Found an estimated cost of 157 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v)
+; COST:       Found an estimated cost of 280 for instruction: %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %v)
 ; CODE-LABEL: smax.i16.v8i16
 ; CODE:       smaxv h0, v0.8h
 define i16 @smax.i16.v8i16(<8 x i16> %v) {
@@ -218,7 +218,7 @@ define i16 @smax.i16.v8i16(<8 x i16> %v) {
 }
 
 ; COST-LABEL: smax.i32.v4i32
-; COST:       Found an estimated cost of 58 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %v)
 ; CODE-LABEL: smax.i32.v4i32
 ; CODE:       smaxv s0, v0.4s
 define i32 @smax.i32.v4i32(<4 x i32> %v) {
@@ -227,7 +227,7 @@ define i32 @smax.i32.v4i32(<4 x i32> %v) {
 }
 
 ; COST-LABEL: fmin.f32.v4f32
-; COST:       Found an estimated cost of 58 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmin.f32.v4f32
 ; CODE:       fminnmv s0, v0.4s
 define float @fmin.f32.v4f32(<4 x float> %v) {
@@ -236,7 +236,7 @@ define float @fmin.f32.v4f32(<4 x float> %v) {
 }
 
 ; COST-LABEL: fmax.f32.v4f32
-; COST:       Found an estimated cost of 58 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
+; COST:       Found an estimated cost of 62 for instruction: %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
 ; CODE-LABEL: fmax.f32.v4f32
 ; CODE:       fmaxnmv s0, v0.4s
 define float @fmax.f32.v4f32(<4 x float> %v) {
diff --git a/test/Analysis/CostModel/ARM/shuffle.ll b/test/Analysis/CostModel/ARM/shuffle.ll
index c92d668804648a038d7ae695df398664dc80533f..a6a4235256433e80aa7c164d8e197259867b2bbc 100644
--- a/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/test/Analysis/CostModel/ARM/shuffle.ll
@@ -1,39 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
 ; RUN: opt < %s  -cost-model -analyze -mtriple=thumbv7-apple-ios6.0.0 -mcpu=swift | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios6.0.0"
 
-; CHECK: shuffle
-define void @shuffle() {
+define void @broadcast() {
+; CHECK-LABEL: 'broadcast'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> zeroinitializer
+  %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> zeroinitializer
+  %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> zeroinitializer
+  %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> zeroinitializer
 
+  %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> zeroinitializer
+  %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> zeroinitializer
+  %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> zeroinitializer
 
-  ;; Reverse shuffles should be lowered to vrev and possibly a vext (for
-  ;; quadwords)
+  %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> zeroinitializer
+  %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> zeroinitializer
 
-    ; Vector values
-  ; CHECK: cost of 1 {{.*}} shuffle
+  %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> zeroinitializer
+  %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> zeroinitializer
+
+  ret void
+}
+
+;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords)
+define void @reverse() {
+; CHECK-LABEL: 'reverse'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v7 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v9 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v10 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v12 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v14 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v17 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
   %v7 = shufflevector <2 x i8> undef, <2 x i8>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v8 = shufflevector <4 x i8> undef, <4 x i8>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v9 = shufflevector <8 x i8> undef, <8 x i8>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v10 = shufflevector <16 x i8> undef, <16 x i8>undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v11 = shufflevector <2 x i16> undef, <2 x i16>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v12 = shufflevector <4 x i16> undef, <4 x i16>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v13 = shufflevector <8 x i16> undef, <8 x i16>undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v14 = shufflevector <2 x i32> undef, <2 x i32>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v15 = shufflevector <4 x i32> undef, <4 x i32>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 
-  ; CHECK: cost of 1 {{.*}} shuffle
   %v16 = shufflevector <2 x float> undef, <2 x float>undef, <2 x i32> <i32 1, i32 0>
-  ; CHECK: cost of 2 {{.*}} shuffle
   %v17 = shufflevector <4 x float> undef, <4 x float>undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 
   ret void
diff --git a/test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll b/test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6cd4ead76a5845b9ab46ee3a975adfe1205e7b57
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmp-tofp-scalar.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Costs for conversion of i1 to fp.
+
+define float @fun0(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = uitofp i1 %cmp to float
+  ret float %v
+
+; CHECK: fun0
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = uitofp i1 %cmp to float
+}
+
+define double @fun1(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = uitofp i1 %cmp to double
+  ret double %v
+
+; CHECK: fun1
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = uitofp i1 %cmp to double
+}
diff --git a/test/Analysis/CostModel/SystemZ/cmp-tofp.ll b/test/Analysis/CostModel/SystemZ/cmp-tofp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f50e3ea23cfe797b695ed08eedb1df5e289e1835
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmp-tofp.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Costs for conversion of i1 vectors to vectors of double.
+
+define <2 x double> @fun0(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = uitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun0
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = uitofp <2 x i1> %cmp to <2 x double>
+}
+
+define <2 x double> @fun1(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = sitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun1
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sitofp <2 x i1> %cmp to <2 x double>
+}
+
+define <2 x double> @fun2(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = uitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun2
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = uitofp <2 x i1> %cmp to <2 x double>
+}
+
+define <2 x double> @fun3(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = sitofp <2 x i1> %cmp to <2 x double>
+  ret <2 x double> %v
+
+; CHECK: fun3
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sitofp <2 x i1> %cmp to <2 x double>
+}
diff --git a/test/Analysis/CostModel/SystemZ/div-pow2.ll b/test/Analysis/CostModel/SystemZ/div-pow2.ll
deleted file mode 100644
index 9ef2dd71e8fa747244bcc7328cb00edfa95f0612..0000000000000000000000000000000000000000
--- a/test/Analysis/CostModel/SystemZ/div-pow2.ll
+++ /dev/null
@@ -1,154 +0,0 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
-
-; Scalar sdiv
-
-define i64 @fun0(i64 %a) {
-  %r = sdiv i64 %a, 2
-  ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, 2
-}
-
-define i64 @fun1(i64 %a) {
-  %r = sdiv i64 %a, -4
-  ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, -4
-}
-
-define i32 @fun2(i32 %a) {
-  %r = sdiv i32 %a, 8
-  ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, 8
-}
-
-define i32 @fun3(i32 %a) {
-  %r = sdiv i32 %a, -16
-  ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, -16
-}
-
-define i16 @fun4(i16 %a) {
-  %r = sdiv i16 %a, 32
-  ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, 32
-}
-
-define i16 @fun5(i16 %a) {
-  %r = sdiv i16 %a, -64
-  ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, -64
-}
-
-define i8 @fun6(i8 %a) {
-  %r = sdiv i8 %a, 64
-  ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, 64
-}
-
-define i8 @fun7(i8 %a) {
-  %r = sdiv i8 %a, -128
-  ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, -128
-}
-
-
-; Vector sdiv
-
-define <2 x i64> @fun8(<2 x i64> %a) {
-  %r = sdiv <2 x i64> %a, <i64 2, i64 2>
-  ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 2, i64 2>
-}
-
-define <2 x i64> @fun9(<2 x i64> %a) {
-  %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
-  ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
-}
-
-define <4 x i32> @fun10(<4 x i32> %a) {
-  %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
-  ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
-}
-
-define <4 x i32> @fun11(<4 x i32> %a) {
-  %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
-  ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 -16
-}
-
-define <8 x i16> @fun12(<8 x i16> %a) {
-  %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
-  ret <8 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 32
-}
-
-define <8 x i16> @fun13(<8 x i16> %a) {
-  %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
-  ret <8 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 -64
-}
-
-define <16 x i8> @fun14(<16 x i8> %a) {
-  %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
-  ret <16 x i8> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 64
-}
-
-define <16 x i8> @fun15(<16 x i8> %a) {
-  %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
-  ret <16 x i8> %r
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 -128
-}
-
-; Scalar udiv
-
-define i64 @fun16(i64 %a) {
-  %r = udiv i64 %a, 2
-  ret i64 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i64 %a, 2
-}
-
-define i32 @fun17(i32 %a) {
-  %r = udiv i32 %a, 8
-  ret i32 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i32 %a, 8
-}
-
-define i16 @fun18(i16 %a) {
-  %r = udiv i16 %a, 32
-  ret i16 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i16 %a, 32
-}
-
-define i8 @fun19(i8 %a) {
-  %r = udiv i8 %a, 128
-  ret i8 %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i8 %a, -128
-}
-
-; Vector udiv
-
-define <2 x i64> @fun20(<2 x i64> %a) {
-  %r = udiv <2 x i64> %a, <i64 2, i64 2>
-  ret <2 x i64> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i64> %a, <i64 2
-}
-
-define <4 x i32> @fun21(<4 x i32> %a) {
-  %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
-  ret <4 x i32> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i32> %a, <i32 8
-}
-
-define <8 x i16> @fun22(<8 x i16> %a) {
-  %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
-  ret <8 x i16> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i16> %a, <i16 32
-}
-
-define <16 x i8> @fun23(<16 x i8> %a) {
-  %r = udiv <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
-  ret <16 x i8> %r
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <16 x i8> %a, <i8 -128
-}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-const.ll b/test/Analysis/CostModel/SystemZ/divrem-const.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0889d0f96afd9f869215238ab6a6b50d5342195a
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/divrem-const.ll
@@ -0,0 +1,291 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefix=COST
+
+; Check that all divide/remainder instructions are implemented by cheaper instructions.
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -o - | FileCheck %s
+; CHECK-NOT: dsg
+; CHECK-NOT: dl
+
+; Check costs of divisions/remainders by a vector of constants that is *not*
+; a power of two. A sequence containing a multiply and shifts will replace
+; the divide instruction.
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a) {
+  %r = sdiv i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i64 %a, 20
+}
+
+define i32 @fun1(i32 %a) {
+  %r = sdiv i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i32 %a, 20
+}
+
+define i16 @fun2(i16 %a) {
+  %r = sdiv i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i16 %a, 20
+}
+
+define i8 @fun3(i8 %a) {
+  %r = sdiv i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = sdiv i8 %a, 20
+}
+
+; Vector sdiv
+
+define <2 x i64> @fun4(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 20, i64 21>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = sdiv <2 x i64>
+}
+
+define <4 x i32> @fun5(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 20, i32 20, i32 20, i32 20>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = sdiv <4 x i32>
+}
+
+define <2 x i32> @fun6(<2 x i32> %a) {
+  %r = sdiv <2 x i32> %a, <i32 20, i32 21>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = sdiv <2 x i32>
+}
+
+define <8 x i16> @fun7(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = sdiv <8 x i16>
+}
+
+define <4 x i16> @fun8(<4 x i16> %a) {
+  %r = sdiv <4 x i16> %a, <i16 20, i16 20, i16 20, i16 21>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = sdiv <4 x i16>
+}
+
+define <16 x i8> @fun9(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = sdiv <16 x i8>
+}
+
+define <8 x i8> @fun10(<8 x i8> %a) {
+  %r = sdiv <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = sdiv <8 x i8>
+}
+
+; Scalar udiv
+
+define i64 @fun11(i64 %a) {
+  %r = udiv i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i64 %a, 20
+}
+
+define i32 @fun12(i32 %a) {
+  %r = udiv i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i32 %a, 20
+}
+
+define i16 @fun13(i16 %a) {
+  %r = udiv i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i16 %a, 20
+}
+
+define i8 @fun14(i8 %a) {
+  %r = udiv i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = udiv i8
+}
+
+; Vector udiv
+
+define <2 x i64> @fun15(<2 x i64> %a) {
+  %r = udiv <2 x i64> %a, <i64 20, i64 20>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = udiv <2 x i64>
+}
+
+define <4 x i32> @fun16(<4 x i32> %a) {
+  %r = udiv <4 x i32> %a, <i32 20, i32 20, i32 20, i32 21>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <4 x i32>
+}
+
+define <2 x i32> @fun17(<2 x i32> %a) {
+  %r = udiv <2 x i32> %a, <i32 20, i32 20>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = udiv <2 x i32>
+}
+
+define <8 x i16> @fun18(<8 x i16> %a) {
+  %r = udiv <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 21>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = udiv <8 x i16>
+}
+
+define <4 x i16> @fun19(<4 x i16> %a) {
+  %r = udiv <4 x i16> %a, <i16 20, i16 20, i16 20, i16 20>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <4 x i16>
+}
+
+define <16 x i8> @fun20(<16 x i8> %a) {
+  %r = udiv <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = udiv <16 x i8>
+}
+
+define <8 x i8> @fun21(<8 x i8> %a) {
+  %r = udiv <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = udiv <8 x i8>
+}
+
+; Scalar srem
+
+define i64 @fun22(i64 %a) {
+  %r = srem i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i64
+}
+
+define i32 @fun23(i32 %a) {
+  %r = srem i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i32
+}
+
+define i16 @fun24(i16 %a) {
+  %r = srem i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i16
+}
+
+define i8 @fun25(i8 %a) {
+  %r = srem i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = srem i8
+}
+
+; Vector srem
+
+define <2 x i64> @fun26(<2 x i64> %a) {
+  %r = srem <2 x i64> %a, <i64 20, i64 21>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = srem <2 x i64>
+}
+
+define <4 x i32> @fun27(<4 x i32> %a) {
+  %r = srem <4 x i32> %a, <i32 20, i32 20, i32 20, i32 20>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = srem <4 x i32>
+}
+
+define <2 x i32> @fun28(<2 x i32> %a) {
+  %r = srem <2 x i32> %a, <i32 20, i32 21>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = srem <2 x i32>
+}
+
+define <8 x i16> @fun29(<8 x i16> %a) {
+  %r = srem <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = srem <8 x i16>
+}
+
+define <4 x i16> @fun30(<4 x i16> %a) {
+  %r = srem <4 x i16> %a, <i16 20, i16 20, i16 20, i16 21>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = srem <4 x i16>
+}
+
+define <16 x i8> @fun31(<16 x i8> %a) {
+  %r = srem <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = srem <16 x i8>
+}
+
+define <8 x i8> @fun32(<8 x i8> %a) {
+  %r = srem <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = srem <8 x i8>
+}
+
+; Scalar urem
+
+define i64 @fun33(i64 %a) {
+  %r = urem i64 %a, 20
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i64
+}
+
+define i32 @fun34(i32 %a) {
+  %r = urem i32 %a, 20
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i32
+}
+
+define i16 @fun35(i16 %a) {
+  %r = urem i16 %a, 20
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i16
+}
+
+define i8 @fun36(i8 %a) {
+  %r = urem i8 %a, 20
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 10 for instruction:   %r = urem i8
+}
+
+; Vector urem
+
+define <2 x i64> @fun37(<2 x i64> %a) {
+  %r = urem <2 x i64> %a, <i64 20, i64 20>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 24 for instruction:   %r = urem <2 x i64>
+}
+
+define <4 x i32> @fun38(<4 x i32> %a) {
+  %r = urem <4 x i32> %a, <i32 20, i32 20, i32 20, i32 21>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <4 x i32>
+}
+
+define <2 x i32> @fun39(<2 x i32> %a) {
+  %r = urem <2 x i32> %a, <i32 20, i32 20>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 25 for instruction:   %r = urem <2 x i32>
+}
+
+define <8 x i16> @fun40(<8 x i16> %a) {
+  %r = urem <8 x i16> %a, <i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 20, i16 21>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = urem <8 x i16>
+}
+
+define <4 x i16> @fun41(<4 x i16> %a) {
+  %r = urem <4 x i16> %a, <i16 20, i16 20, i16 20, i16 20>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <4 x i16>
+}
+
+define <16 x i8> @fun42(<16 x i8> %a) {
+  %r = urem <16 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 21>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 193 for instruction:   %r = urem <16 x i8>
+}
+
+define <8 x i8> @fun43(<8 x i8> %a) {
+  %r = urem <8 x i8> %a, <i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20, i8 20>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 97 for instruction:   %r = urem <8 x i8>
+}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-pow2.ll b/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ad67ef9405fe5488c2db1fc88fb39c4a4002544e
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/divrem-pow2.ll
@@ -0,0 +1,383 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefix=COST
+
+; Check that all divide/remainder instructions are implemented by cheaper instructions.
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -o - | FileCheck %s
+; CHECK-NOT: dsg
+; CHECK-NOT: dl
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a) {
+  %r = sdiv i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, 2
+}
+
+define i64 @fun1(i64 %a) {
+  %r = sdiv i64 %a, -4
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i64 %a, -4
+}
+
+define i32 @fun2(i32 %a) {
+  %r = sdiv i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, 8
+}
+
+define i32 @fun3(i32 %a) {
+  %r = sdiv i32 %a, -16
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i32 %a, -16
+}
+
+define i16 @fun4(i16 %a) {
+  %r = sdiv i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, 32
+}
+
+define i16 @fun5(i16 %a) {
+  %r = sdiv i16 %a, -64
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i16 %a, -64
+}
+
+define i8 @fun6(i8 %a) {
+  %r = sdiv i8 %a, 64
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, 64
+}
+
+define i8 @fun7(i8 %a) {
+  %r = sdiv i8 %a, -128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv i8 %a, -128
+}
+
+; Vector sdiv
+
+define <2 x i64> @fun8(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <2 x i64> @fun9(<2 x i64> %a) {
+  %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i64> %a, <i64 -4, i64 -4>
+}
+
+define <4 x i32> @fun10(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <4 x i32> @fun11(<4 x i32> %a) {
+  %r = sdiv <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i32> %a, <i32 -16
+}
+
+define <2 x i32> @fun12(<2 x i32> %a) {
+  %r = sdiv <2 x i32> %a, <i32 -16, i32 -16>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <2 x i32> %a, <i32 -16
+}
+
+define <8 x i16> @fun13(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 32
+}
+
+define <8 x i16> @fun14(<8 x i16> %a) {
+  %r = sdiv <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i16> %a, <i16 -64
+}
+
+define <4 x i16> @fun15(<4 x i16> %a) {
+  %r = sdiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun16(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 64
+}
+
+define <16 x i8> @fun17(<16 x i8> %a) {
+  %r = sdiv <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun18(<8 x i8> %a) {
+  %r = sdiv <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = sdiv <8 x i8> %a, <i8 -128
+}
+
+; Scalar udiv
+
+define i64 @fun19(i64 %a) {
+  %r = udiv i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i64 %a, 2
+}
+
+define i32 @fun20(i32 %a) {
+  %r = udiv i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i32 %a, 8
+}
+
+define i16 @fun21(i16 %a) {
+  %r = udiv i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i16 %a, 32
+}
+
+define i8 @fun22(i8 %a) {
+  %r = udiv i8 %a, 128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv i8 %a, -128
+}
+
+; Vector udiv
+
+define <2 x i64> @fun23(<2 x i64> %a) {
+  %r = udiv <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i64> %a, <i64 2
+}
+
+define <4 x i32> @fun24(<4 x i32> %a) {
+  %r = udiv <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i32> %a, <i32 8
+}
+
+define <2 x i32> @fun25(<2 x i32> %a) {
+  %r = udiv <2 x i32> %a, <i32 8, i32 8>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <2 x i32> %a, <i32 8
+}
+
+define <8 x i16> @fun26(<8 x i16> %a) {
+  %r = udiv <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i16> %a, <i16 32
+}
+
+define <4 x i16> @fun27(<4 x i16> %a) {
+  %r = udiv <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun28(<16 x i8> %a) {
+  %r = udiv <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun29(<8 x i8> %a) {
+  %r = udiv <8 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = udiv <8 x i8> %a, <i8 -128
+}
+
+; Scalar srem
+
+define i64 @fun30(i64 %a) {
+  %r = srem i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i64 %a, 2
+}
+
+define i64 @fun31(i64 %a) {
+  %r = srem i64 %a, -4
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i64 %a, -4
+}
+
+define i32 @fun32(i32 %a) {
+  %r = srem i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i32 %a, 8
+}
+
+define i32 @fun33(i32 %a) {
+  %r = srem i32 %a, -16
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i32 %a, -16
+}
+
+define i16 @fun34(i16 %a) {
+  %r = srem i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i16 %a, 32
+}
+
+define i16 @fun35(i16 %a) {
+  %r = srem i16 %a, -64
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i16 %a, -64
+}
+
+define i8 @fun36(i8 %a) {
+  %r = srem i8 %a, 64
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i8 %a, 64
+}
+
+define i8 @fun37(i8 %a) {
+  %r = srem i8 %a, -128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem i8 %a, -128
+}
+
+; Vector srem
+
+define <2 x i64> @fun38(<2 x i64> %a) {
+  %r = srem <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i64> %a, <i64 2, i64 2>
+}
+
+define <2 x i64> @fun39(<2 x i64> %a) {
+  %r = srem <2 x i64> %a, <i64 -4, i64 -4>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i64> %a, <i64 -4, i64 -4>
+}
+
+define <4 x i32> @fun40(<4 x i32> %a) {
+  %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+}
+
+define <4 x i32> @fun41(<4 x i32> %a) {
+  %r = srem <4 x i32> %a, <i32 -16, i32 -16, i32 -16, i32 -16>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i32> %a, <i32 -16
+}
+
+define <2 x i32> @fun42(<2 x i32> %a) {
+  %r = srem <2 x i32> %a, <i32 -16, i32 -16>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <2 x i32> %a, <i32 -16
+}
+
+define <8 x i16> @fun43(<8 x i16> %a) {
+  %r = srem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i16> %a, <i16 32
+}
+
+define <8 x i16> @fun44(<8 x i16> %a) {
+  %r = srem <8 x i16> %a, <i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64, i16 -64>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i16> %a, <i16 -64
+}
+
+define <4 x i16> @fun45(<4 x i16> %a) {
+  %r = srem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun46(<16 x i8> %a) {
+  %r = srem <16 x i8> %a, <i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64, i8 64>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <16 x i8> %a, <i8 64
+}
+
+define <16 x i8> @fun47(<16 x i8> %a) {
+  %r = srem <16 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun48(<8 x i8> %a) {
+  %r = srem <8 x i8> %a, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 4 for instruction:   %r = srem <8 x i8> %a, <i8 -128
+}
+
+; Scalar urem
+
+define i64 @fun49(i64 %a) {
+  %r = urem i64 %a, 2
+  ret i64 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i64 %a, 2
+}
+
+define i32 @fun50(i32 %a) {
+  %r = urem i32 %a, 8
+  ret i32 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i32 %a, 8
+}
+
+define i16 @fun51(i16 %a) {
+  %r = urem i16 %a, 32
+  ret i16 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i16 %a, 32
+}
+
+define i8 @fun52(i8 %a) {
+  %r = urem i8 %a, 128
+  ret i8 %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem i8 %a, -128
+}
+
+; Vector urem
+
+define <2 x i64> @fun53(<2 x i64> %a) {
+  %r = urem <2 x i64> %a, <i64 2, i64 2>
+  ret <2 x i64> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <2 x i64> %a, <i64 2
+}
+
+define <4 x i32> @fun54(<4 x i32> %a) {
+  %r = urem <4 x i32> %a, <i32 8, i32 8, i32 8, i32 8>
+  ret <4 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <4 x i32> %a, <i32 8
+}
+
+define <2 x i32> @fun55(<2 x i32> %a) {
+  %r = urem <2 x i32> %a, <i32 8, i32 8>
+  ret <2 x i32> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <2 x i32> %a, <i32 8
+}
+
+define <8 x i16> @fun56(<8 x i16> %a) {
+  %r = urem <8 x i16> %a, <i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32, i16 32>
+  ret <8 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <8 x i16> %a, <i16 32
+}
+
+define <4 x i16> @fun57(<4 x i16> %a) {
+  %r = urem <4 x i16> %a, <i16 32, i16 32, i16 32, i16 32>
+  ret <4 x i16> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <4 x i16> %a, <i16 32
+}
+
+define <16 x i8> @fun58(<16 x i8> %a) {
+  %r = urem <16 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <16 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <16 x i8> %a, <i8 -128
+}
+
+define <8 x i8> @fun59(<8 x i8> %a) {
+  %r = urem <8 x i8> %a, <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>
+  ret <8 x i8> %r
+; COST: Cost Model: Found an estimated cost of 1 for instruction:   %r = urem <8 x i8> %a, <i8 -128
+}
diff --git a/test/Analysis/CostModel/SystemZ/divrem-reg.ll b/test/Analysis/CostModel/SystemZ/divrem-reg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0cb1293cf3bf75d1c28274f9cd4ace36ed15268c
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/divrem-reg.ll
@@ -0,0 +1,286 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; Check costs of divisions by register
+;
+; Note: Vectorization of division/remainder is temporarily disabled for high
+; vectorization factors by returning 1000.
+
+; Scalar sdiv
+
+define i64 @fun0(i64 %a, i64 %b) {
+  %r = sdiv i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = sdiv i64
+}
+
+define i32 @fun1(i32 %a, i32 %b) {
+  %r = sdiv i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = sdiv i32 %a, %b
+}
+
+define i16 @fun2(i16 %a, i16 %b) {
+  %r = sdiv i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = sdiv i16 %a, %b
+}
+
+define i8 @fun3(i8 %a, i8 %b) {
+  %r = sdiv i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = sdiv i8 %a, %b
+}
+
+; Vector sdiv
+
+define <2 x i64> @fun4(<2 x i64> %a, <2 x i64> %b) {
+  %r = sdiv <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 47 for instruction:   %r = sdiv <2 x i64>
+}
+
+define <4 x i32> @fun5(<4 x i32> %a, <4 x i32> %b) {
+  %r = sdiv <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = sdiv <4 x i32>
+}
+
+define <2 x i32> @fun6(<2 x i32> %a, <2 x i32> %b) {
+  %r = sdiv <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = sdiv <2 x i32>
+}
+
+define <8 x i16> @fun7(<8 x i16> %a, <8 x i16> %b) {
+  %r = sdiv <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = sdiv <8 x i16>
+}
+
+define <4 x i16> @fun8(<4 x i16> %a, <4 x i16> %b) {
+  %r = sdiv <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = sdiv <4 x i16>
+}
+
+define <16 x i8> @fun9(<16 x i8> %a, <16 x i8> %b) {
+  %r = sdiv <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = sdiv <16 x i8>
+}
+
+define <8 x i8> @fun10(<8 x i8> %a, <8 x i8> %b) {
+  %r = sdiv <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = sdiv <8 x i8>
+}
+
+; Scalar udiv
+
+define i64 @fun11(i64 %a, i64 %b) {
+  %r = udiv i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = udiv i64 %a, %b
+}
+
+define i32 @fun12(i32 %a, i32 %b) {
+  %r = udiv i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = udiv i32
+}
+
+define i16 @fun13(i16 %a, i16 %b) {
+  %r = udiv i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = udiv i16
+}
+
+define i8 @fun14(i8 %a, i8 %b) {
+  %r = udiv i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = udiv i8
+}
+
+; Vector udiv
+
+define <2 x i64> @fun15(<2 x i64> %a, <2 x i64> %b) {
+  %r = udiv <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %r = udiv <2 x i64>
+}
+
+define <4 x i32> @fun16(<4 x i32> %a, <4 x i32> %b) {
+  %r = udiv <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = udiv <4 x i32>
+}
+
+define <2 x i32> @fun17(<2 x i32> %a, <2 x i32> %b) {
+  %r = udiv <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = udiv <2 x i32>
+}
+
+define <8 x i16> @fun18(<8 x i16> %a, <8 x i16> %b) {
+  %r = udiv <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = udiv <8 x i16>
+}
+
+define <4 x i16> @fun19(<4 x i16> %a, <4 x i16> %b) {
+  %r = udiv <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = udiv <4 x i16>
+}
+
+define <16 x i8> @fun20(<16 x i8> %a, <16 x i8> %b) {
+  %r = udiv <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = udiv <16 x i8>
+}
+
+define <8 x i8> @fun21(<8 x i8> %a, <8 x i8> %b) {
+  %r = udiv <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = udiv <8 x i8>
+}
+
+; Scalar srem
+
+define i64 @fun22(i64 %a, i64 %b) {
+  %r = srem i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %r = srem i64
+}
+
+define i32 @fun23(i32 %a, i32 %b) {
+  %r = srem i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = srem i32
+}
+
+define i16 @fun24(i16 %a, i16 %b) {
+  %r = srem i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = srem i16
+}
+
+define i8 @fun25(i8 %a, i8 %b) {
+  %r = srem i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = srem i8
+}
+
+; Vector srem
+
+define <2 x i64> @fun26(<2 x i64> %a, <2 x i64> %b) {
+  %r = srem <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 47 for instruction:   %r = srem <2 x i64>
+}
+
+define <4 x i32> @fun27(<4 x i32> %a, <4 x i32> %b) {
+  %r = srem <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = srem <4 x i32>
+}
+
+define <2 x i32> @fun28(<2 x i32> %a, <2 x i32> %b) {
+  %r = srem <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = srem <2 x i32>
+}
+
+define <8 x i16> @fun29(<8 x i16> %a, <8 x i16> %b) {
+  %r = srem <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: ost Model: Found an estimated cost of 1000 for instruction:   %r = srem <8 x i16>
+}
+
+define <4 x i16> @fun30(<4 x i16> %a, <4 x i16> %b) {
+  %r = srem <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = srem <4 x i16>
+}
+
+define <16 x i8> @fun31(<16 x i8> %a, <16 x i8> %b) {
+  %r = srem <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = srem <16 x i8>
+}
+
+define <8 x i8> @fun32(<8 x i8> %a, <8 x i8> %b) {
+  %r = srem <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = srem <8 x i8>
+}
+
+; Scalar urem
+
+define i64 @fun33(i64 %a, i64 %b) {
+  %r = urem i64 %a, %b
+  ret i64 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = urem i64
+}
+
+define i32 @fun34(i32 %a, i32 %b) {
+  %r = urem i32 %a, %b
+  ret i32 %r
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %r = urem i32
+}
+
+define i16 @fun35(i16 %a, i16 %b) {
+  %r = urem i16 %a, %b
+  ret i16 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = urem i16
+}
+
+define i8 @fun36(i8 %a, i8 %b) {
+  %r = urem i8 %a, %b
+  ret i8 %r
+; CHECK: Cost Model: Found an estimated cost of 23 for instruction:   %r = urem i8
+}
+
+; Vector urem
+
+define <2 x i64> @fun37(<2 x i64> %a, <2 x i64> %b) {
+  %r = urem <2 x i64> %a, %b
+  ret <2 x i64> %r
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %r = urem <2 x i64>
+}
+
+define <4 x i32> @fun38(<4 x i32> %a, <4 x i32> %b) {
+  %r = urem <4 x i32> %a, %b
+  ret <4 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 98 for instruction:   %r = urem <4 x i32>
+}
+
+define <2 x i32> @fun39(<2 x i32> %a, <2 x i32> %b) {
+  %r = urem <2 x i32> %a, %b
+  ret <2 x i32> %r
+; CHECK: Cost Model: Found an estimated cost of 50 for instruction:   %r = urem <2 x i32>
+}
+
+define <8 x i16> @fun40(<8 x i16> %a, <8 x i16> %b) {
+  %r = urem <8 x i16> %a, %b
+  ret <8 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = urem <8 x i16>
+}
+
+define <4 x i16> @fun41(<4 x i16> %a, <4 x i16> %b) {
+  %r = urem <4 x i16> %a, %b
+  ret <4 x i16> %r
+; CHECK: Cost Model: Found an estimated cost of 106 for instruction:   %r = urem <4 x i16>
+}
+
+define <16 x i8> @fun42(<16 x i8> %a, <16 x i8> %b) {
+  %r = urem <16 x i8> %a, %b
+  ret <16 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = urem <16 x i8>
+}
+
+define <8 x i8> @fun43(<8 x i8> %a, <8 x i8> %b) {
+  %r = urem <8 x i8> %a, %b
+  ret <8 x i8> %r
+; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %r = urem <8 x i8>
+}
diff --git a/test/Analysis/CostModel/SystemZ/fp-cast.ll b/test/Analysis/CostModel/SystemZ/fp-cast.ll
index 4ea5a5033d737fed651faf80f7d265b699df1c92..20feefb8025946412f37a1648adf8d55f76dd92f 100644
--- a/test/Analysis/CostModel/SystemZ/fp-cast.ll
+++ b/test/Analysis/CostModel/SystemZ/fp-cast.ll
@@ -539,3 +539,49 @@ define void @uitofp() {
 
   ret void;
 }
+
+define void @sitofp_extload(i16 *%src16, i8 *%src8) {
+  %ld16 = load i16, i16 *%src16
+  %v6 = sitofp i16 %ld16 to fp128
+  %v7 = sitofp i16 %ld16 to double
+  %v8 = sitofp i16 %ld16 to float
+
+  %ld8 = load i8, i8 *%src8
+  %v9 = sitofp i8 %ld8 to fp128
+  %v10 = sitofp i8 %ld8 to double
+  %v11 = sitofp i8 %ld8 to float
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld16 = load i16, i16* %src16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = sitofp i16 %ld16 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = sitofp i16 %ld16 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = sitofp i16 %ld16 to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld8 = load i8, i8* %src8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = sitofp i8 %ld8 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = sitofp i8 %ld8 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = sitofp i8 %ld8 to float
+
+  ret void;
+}
+
+define void @uitofp_extload(i16 *%src16, i8 *%src8) {
+  %ld16 = load i16, i16 *%src16
+  %v6 = uitofp i16 %ld16 to fp128
+  %v7 = uitofp i16 %ld16 to double
+  %v8 = uitofp i16 %ld16 to float
+
+  %ld8 = load i8, i8 *%src8
+  %v9 = uitofp i8 %ld8 to fp128
+  %v10 = uitofp i8 %ld8 to double
+  %v11 = uitofp i8 %ld8 to float
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld16 = load i16, i16* %src16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = uitofp i16 %ld16 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = uitofp i16 %ld16 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = uitofp i16 %ld16 to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %ld8 = load i8, i8* %src8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = uitofp i8 %ld8 to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = uitofp i8 %ld8 to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = uitofp i8 %ld8 to float
+
+  ret void;
+}
diff --git a/test/Analysis/CostModel/SystemZ/int-arith.ll b/test/Analysis/CostModel/SystemZ/int-arith.ll
index 3ecf4342b949ef1a3c7291d7d04c5ba531ef1c7d..f9a55dfe742ed42525a441086b88827c6b0a0926 100644
--- a/test/Analysis/CostModel/SystemZ/int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/int-arith.ll
@@ -2,9 +2,6 @@
 ;
 ; Note: The scalarized vector instructions costs are not including any
 ; extracts, due to the undef operands.
-;
-; Note: Vectorization of division/remainder is temporarily disabled for high
-; vectorization factors by returning 1000.
 
 define void @add() {
   %res0 = add i8 undef, undef
@@ -143,187 +140,3 @@ define void @mul() {
 
   ret void;
 }
-
-define void @sdiv() {
-  %res0 = sdiv i8 undef, undef
-  %res1 = sdiv i16 undef, undef
-  %res2 = sdiv i32 undef, undef
-  %res3 = sdiv i64 undef, undef
-  %res4 = sdiv <2 x i8> undef, undef
-  %res5 = sdiv <2 x i16> undef, undef
-  %res6 = sdiv <2 x i32> undef, undef
-  %res7 = sdiv <2 x i64> undef, undef
-  %res8 = sdiv <4 x i8> undef, undef
-  %res9 = sdiv <4 x i16> undef, undef
-  %res10 = sdiv <4 x i32> undef, undef
-  %res11 = sdiv <4 x i64> undef, undef
-  %res12 = sdiv <8 x i8> undef, undef
-  %res13 = sdiv <8 x i16> undef, undef
-  %res14 = sdiv <8 x i32> undef, undef
-  %res15 = sdiv <8 x i64> undef, undef
-  %res16 = sdiv <16 x i8> undef, undef
-  %res17 = sdiv <16 x i16> undef, undef
-  %res18 = sdiv <16 x i32> undef, undef
-  %res19 = sdiv <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = sdiv i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = sdiv i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = sdiv i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sdiv i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = sdiv <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = sdiv <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = sdiv <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = sdiv <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = sdiv <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = sdiv <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = sdiv <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = sdiv <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = sdiv <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = sdiv <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = sdiv <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = sdiv <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = sdiv <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = sdiv <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = sdiv <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = sdiv <16 x i64> undef, undef
-
-  ret void;
-}
-
-define void @srem() {
-  %res0 = srem i8 undef, undef
-  %res1 = srem i16 undef, undef
-  %res2 = srem i32 undef, undef
-  %res3 = srem i64 undef, undef
-  %res4 = srem <2 x i8> undef, undef
-  %res5 = srem <2 x i16> undef, undef
-  %res6 = srem <2 x i32> undef, undef
-  %res7 = srem <2 x i64> undef, undef
-  %res8 = srem <4 x i8> undef, undef
-  %res9 = srem <4 x i16> undef, undef
-  %res10 = srem <4 x i32> undef, undef
-  %res11 = srem <4 x i64> undef, undef
-  %res12 = srem <8 x i8> undef, undef
-  %res13 = srem <8 x i16> undef, undef
-  %res14 = srem <8 x i32> undef, undef
-  %res15 = srem <8 x i64> undef, undef
-  %res16 = srem <16 x i8> undef, undef
-  %res17 = srem <16 x i16> undef, undef
-  %res18 = srem <16 x i32> undef, undef
-  %res19 = srem <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = srem i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = srem i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = srem i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = srem i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = srem <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = srem <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = srem <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = srem <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = srem <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = srem <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = srem <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = srem <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = srem <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = srem <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = srem <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = srem <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = srem <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = srem <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = srem <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = srem <16 x i64> undef, undef
-
-  ret void;
-}
-
-define void @udiv() {
-  %res0 = udiv i8 undef, undef
-  %res1 = udiv i16 undef, undef
-  %res2 = udiv i32 undef, undef
-  %res3 = udiv i64 undef, undef
-  %res4 = udiv <2 x i8> undef, undef
-  %res5 = udiv <2 x i16> undef, undef
-  %res6 = udiv <2 x i32> undef, undef
-  %res7 = udiv <2 x i64> undef, undef
-  %res8 = udiv <4 x i8> undef, undef
-  %res9 = udiv <4 x i16> undef, undef
-  %res10 = udiv <4 x i32> undef, undef
-  %res11 = udiv <4 x i64> undef, undef
-  %res12 = udiv <8 x i8> undef, undef
-  %res13 = udiv <8 x i16> undef, undef
-  %res14 = udiv <8 x i32> undef, undef
-  %res15 = udiv <8 x i64> undef, undef
-  %res16 = udiv <16 x i8> undef, undef
-  %res17 = udiv <16 x i16> undef, undef
-  %res18 = udiv <16 x i32> undef, undef
-  %res19 = udiv <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = udiv i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = udiv i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = udiv i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = udiv i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = udiv <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = udiv <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = udiv <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = udiv <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = udiv <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = udiv <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = udiv <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = udiv <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = udiv <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = udiv <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = udiv <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = udiv <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = udiv <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = udiv <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = udiv <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = udiv <16 x i64> undef, undef
-
-  ret void;
-}
-
-define void @urem() {
-  %res0 = urem i8 undef, undef
-  %res1 = urem i16 undef, undef
-  %res2 = urem i32 undef, undef
-  %res3 = urem i64 undef, undef
-  %res4 = urem <2 x i8> undef, undef
-  %res5 = urem <2 x i16> undef, undef
-  %res6 = urem <2 x i32> undef, undef
-  %res7 = urem <2 x i64> undef, undef
-  %res8 = urem <4 x i8> undef, undef
-  %res9 = urem <4 x i16> undef, undef
-  %res10 = urem <4 x i32> undef, undef
-  %res11 = urem <4 x i64> undef, undef
-  %res12 = urem <8 x i8> undef, undef
-  %res13 = urem <8 x i16> undef, undef
-  %res14 = urem <8 x i32> undef, undef
-  %res15 = urem <8 x i64> undef, undef
-  %res16 = urem <16 x i8> undef, undef
-  %res17 = urem <16 x i16> undef, undef
-  %res18 = urem <16 x i32> undef, undef
-  %res19 = urem <16 x i64> undef, undef
-
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = urem i8 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = urem i16 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = urem i32 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = urem i64 undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = urem <2 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = urem <2 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = urem <2 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = urem <2 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = urem <4 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = urem <4 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = urem <4 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = urem <4 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res12 = urem <8 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res13 = urem <8 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res14 = urem <8 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res15 = urem <8 x i64> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res16 = urem <16 x i8> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res17 = urem <16 x i16> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res18 = urem <16 x i32> undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1000 for instruction:   %res19 = urem <16 x i64> undef, undef
-
-  ret void;
-}
diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
index 1b6a50d303f252aca8c0eeba138961ead01130a0..d5c097ced6254bd86033b4012db8d58271afc720 100644
--- a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@@ -1,4 +1,7 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 \
+; RUN:  | FileCheck %s -check-prefixes=CHECK,Z13
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z14 \
+; RUN:  | FileCheck %s -check-prefixes=CHECK,Z14
 ;
 ; Test that loads into operations that can fold one memory operand get zero
 ; cost. In the case that both operands are loaded, one load should get a cost
@@ -19,6 +22,35 @@ define void @add() {
   %li64_1 = load i64, i64* undef
   add i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  add i32 %tr, undef
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  add i32 %sext_0, undef
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  add i64 %sext_1, undef
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  add i64 %sext_2, undef
+
+  ; Zero-extended loads
+  %li32_3 = load i32, i32* undef
+  %zext_0 = zext i32 %li32_3 to i64
+  add i64 %zext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  add i32 %sext_3, undef
+
   ret void;
 
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
@@ -31,9 +63,29 @@ define void @add() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = add i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = add i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = add i32 %sext_0, undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = add i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = add i64 %sext_2, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = add i64 %zext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = add i32 %sext_3, undef
 }
 
-define void @sub() {
+define void @sub_lhs_mem() {
   %li32 = load i32, i32* undef
   sub i32 %li32, undef
 
@@ -48,18 +100,132 @@ define void @sub() {
   %li64_1 = load i64, i64* undef
   sub i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sub i32 %tr, undef
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  sub i32 %sext_0, undef
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  sub i64 %sext_1, undef
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  sub i64 %sext_2, undef
+
+  ; Zero-extended loads
+  %li32_3 = load i32, i32* undef
+  %zext_0 = zext i32 %li32_3 to i64
+  sub i64 %zext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  sub i32 %sext_3, undef
+
   ret void;
 
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; A sub LHS loaded operand is *not* foldable.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = sub i32 %li32, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = sub i32 %li32_0, %li32_1
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sub i64 %li64, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = sub i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = sub i32 %sext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = sub i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i64 %sext_2, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = sub i64 %zext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = sub i32 %sext_3, undef
+}
+
+define void @sub_rhs_mem() {
+  %li32 = load i32, i32* undef
+  sub i32 undef, %li32
+
+  %li64 = load i64, i64* undef
+  sub i64 undef, %li64
+
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sub i32 undef, %tr
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  sub i32 undef, %sext_0
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  sub i64 undef, %sext_1
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  sub i64 undef, %sext_2
+
+  ; Zero-extended loads
+  %li32_3 = load i32, i32* undef
+  %zext_0 = zext i32 %li32_3 to i64
+  sub i64 undef, %zext_0
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  sub i32 undef, %sext_3
+
+  ret void;
+
+; A sub RHS loaded operand is foldable.
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = sub i32 undef, %li32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = sub i64 undef, %li64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sub i32 undef, %tr
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i32 undef, %sext_0
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = sub i64 undef, %sext_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = sub i64 undef, %sext_2
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = sub i64 undef, %zext_0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = sub i32 undef, %sext_3
 }
 
 define void @mul() {
@@ -77,6 +243,35 @@ define void @mul() {
   %li64_1 = load i64, i64* undef
   mul i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  mul i32 %tr, undef
+
+  ; Sign-extended loads
+  %li16_0 = load i16, i16* undef
+  %sext_0 = sext i16 %li16_0 to i32
+  mul i32 %sext_0, undef
+
+  %li16_1 = load i16, i16* undef
+  %sext_1 = sext i16 %li16_1 to i64
+  mul i64 %sext_1, undef
+
+  %li32_2 = load i32, i32* undef
+  %sext_2 = sext i32 %li32_2 to i64
+  mul i64 %sext_2, undef
+
+  ; Zero-extended loads are *not* folded
+  %li16_2 = load i16, i16* undef
+  %zext_0 = zext i16 %li16_2 to i32
+  mul i32 %zext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li16_3 = load i16, i16* undef
+  %sext_3 = sext i16 %li16_3 to i32
+  %sext_4 = sext i16 %li16_3 to i32
+  mul i32 %sext_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = mul i32 %li32, undef
@@ -88,62 +283,202 @@ define void @mul() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = mul i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = mul i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li16_0 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i16 %li16_0 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = mul i32 %sext_0, undef
+; Z13:   Cost Model: Found an estimated cost of 1 for instruction:   %li16_1 = load i16, i16* undef
+; Z14:   Cost Model: Found an estimated cost of 0 for instruction:   %li16_1 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i16 %li16_1 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = mul i64 %sext_1, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = mul i64 %sext_2, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_2 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %zext_0 = zext i16 %li16_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = mul i32 %zext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16_3 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_3 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_4 = sext i16 %li16_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = mul i32 %sext_3, undef
 }
 
-define void @sdiv() {
+define void @sdiv_lhs(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
-  sdiv i32 %li32, undef
+  sdiv i32 %li32, %arg32
 
   %li32_0 = load i32, i32* undef
   %li32_1 = load i32, i32* undef
   sdiv i32 %li32_0, %li32_1
 
   %li64 = load i64, i64* undef
-  sdiv i64 %li64, undef
+  sdiv i64 %li64, %arg64
 
   %li64_0 = load i64, i64* undef
   %li64_1 = load i64, i64* undef
   sdiv i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sdiv i32 %tr, undef
+
+  ; Sign-extended loads
+  %li32_2 = load i32, i32* undef
+  %sext_0 = sext i32 %li32_2 to i64
+  sdiv i64 %sext_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li32_3 = load i32, i32* undef
+  %sext_1 = sext i32 %li32_3 to i64
+  %sext_2 = sext i32 %li32_3 to i64
+  sdiv i64 %sext_1, undef
+
   ret void;
+
+; An sdiv loaded dividend (lhs) operand is *not* foldable.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = sdiv i32 %li32, %arg32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %3 = sdiv i64 %li64, %arg64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = sdiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = sdiv i32 %tr, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = sdiv i64 %sext_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %7 = sdiv i64 %sext_1, undef
+}
+
+define void @sdiv_rhs(i32 %arg32, i64 %arg64) {
+  %li32 = load i32, i32* undef
+  sdiv i32 %arg32, %li32
+
+  %li64 = load i64, i64* undef
+  sdiv i64 %arg64, %li64
+
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr = trunc i64 %li64_2 to i32
+  sdiv i32 undef, %tr
+
+  ; Sign-extended loads
+  %li32_2 = load i32, i32* undef
+  %sext_0 = sext i32 %li32_2 to i64
+  sdiv i64 undef, %sext_0
+
+  ; Loads with multiple uses are *not* folded
+  %li32_3 = load i32, i32* undef
+  %sext_1 = sext i32 %li32_3 to i64
+  %sext_2 = sext i32 %li32_3 to i64
+  sdiv i64 undef, %sext_1
+
+  ret void;
+
+; An sdiv loaded divisor (rhs) operand is foldable.
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = sdiv i32 %li32, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = sdiv i32 %arg32, %li32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sdiv i64 %li64, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sdiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %2 = sdiv i64 %arg64, %li64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = sdiv i32 undef, %tr
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_2 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_0 = sext i32 %li32_2 to i64
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %4 = sdiv i64 undef, %sext_0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_1 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %sext_2 = sext i32 %li32_3 to i64
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %5 = sdiv i64 undef, %sext_1
 }
 
-define void @udiv() {
+define void @udiv_lhs(i32 %arg32, i64 %arg64) {
   %li32 = load i32, i32* undef
-  udiv i32 %li32, undef
+  udiv i32 %li32, %arg32
 
   %li32_0 = load i32, i32* undef
   %li32_1 = load i32, i32* undef
   udiv i32 %li32_0, %li32_1
 
   %li64 = load i64, i64* undef
-  udiv i64 %li64, undef
+  udiv i64 %li64, %arg64
 
   %li64_0 = load i64, i64* undef
   %li64_1 = load i64, i64* undef
   udiv i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  udiv i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  udiv i64 %li64_3, undef
+
+  ret void;
+
+; An udiv loaded dividend (lhs) operand is *not* foldable.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = udiv i32 %li32, %arg32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = udiv i64 %li64, %arg64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %5 = udiv i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %6 = udiv i64 %li64_3, undef
+}
+
+define void @udiv_rhs(i32 %arg32, i64 %arg64) {
+  %li32 = load i32, i32* undef
+  udiv i32 %arg32, %li32
+
+  %li64 = load i64, i64* undef
+  udiv i64 %arg64, %li64
+
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  udiv i32 undef, %tr_0
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  udiv i64 undef, %li64_3
+
   ret void;
+
+; An udiv loaded divisor (rhs) operand is foldable.
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = udiv i32 %li32, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %1 = udiv i32 %arg32, %li32
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = udiv i64 %li64, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %2 = udiv i64 %arg64, %li64
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %3 = udiv i32 undef, %tr_0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %4 = udiv i64 undef, %li64_3
 }
 
 define void @and() {
@@ -161,6 +496,16 @@ define void @and() {
   %li64_1 = load i64, i64* undef
   and i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  and i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  and i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = and i32 %li32, undef
@@ -172,6 +517,12 @@ define void @and() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = and i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = and i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = and i64 %li64_3, undef
 }
 
 define void @or() {
@@ -189,6 +540,16 @@ define void @or() {
   %li64_1 = load i64, i64* undef
   or i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  or i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  or i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = or i32 %li32, undef
@@ -200,6 +561,12 @@ define void @or() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = or i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = or i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = or i64 %li64_3, undef
 }
 
 define void @xor() {
@@ -217,6 +584,16 @@ define void @xor() {
   %li64_1 = load i64, i64* undef
   xor i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  xor i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  xor i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = xor i32 %li32, undef
@@ -228,6 +605,12 @@ define void @xor() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = xor i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = xor i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = xor i64 %li64_3, undef
 }
 
 define void @icmp() {
@@ -245,6 +628,16 @@ define void @icmp() {
   %li64_1 = load i64, i64* undef
   icmp eq i64 %li64_0, %li64_1
 
+  ; Truncated load
+  %li64_2 = load i64, i64* undef
+  %tr_0 = trunc i64 %li64_2 to i32
+  icmp eq i32 %tr_0, undef
+
+  ; Loads with multiple uses are *not* folded
+  %li64_3 = load i64, i64* undef
+  %tr_1 = trunc i64 %li64_3 to i32
+  icmp eq i64 %li64_3, undef
+
   ret void;
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = icmp eq i32 %li32, undef
@@ -256,4 +649,10 @@ define void @icmp() {
 ; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
 ; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = icmp eq i64 %li64_0, %li64_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_2 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_0 = trunc i64 %li64_2 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = icmp eq i32 %tr_0, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_3 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %tr_1 = trunc i64 %li64_3 to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = icmp eq i64 %li64_3, undef
 }
diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll
index 63fb25dd2f6b5ed4c33426e084fdffc8158dc29e..724f9872417244ead4e0c54b281d409dd21a0649 100644
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@@ -136,24 +136,157 @@ define i32 @udiv() {
 }
 
 define i32 @sdiv_const() {
-; CHECK-LABEL: 'sdiv_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'sdiv_const'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'sdiv_const'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'sdiv_const'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'sdiv_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'sdiv_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'sdiv_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'sdiv_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'sdiv_const'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, <i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sdiv_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7
@@ -161,17 +294,17 @@ define i32 @sdiv_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 7
@@ -198,24 +331,119 @@ define i32 @sdiv_const() {
 }
 
 define i32 @udiv_const() {
-; CHECK-LABEL: 'udiv_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'udiv_const'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'udiv_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'udiv_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'udiv_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'udiv_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'udiv_const'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = udiv <2 x i64> undef, <i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'udiv_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, 7
@@ -223,17 +451,17 @@ define i32 @udiv_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 7
@@ -274,9 +502,9 @@ define i32 @sdiv_uniformconst() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'sdiv_uniformconst'
@@ -293,9 +521,9 @@ define i32 @sdiv_uniformconst() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'sdiv_uniformconst'
@@ -312,9 +540,9 @@ define i32 @sdiv_uniformconst() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'sdiv_uniformconst'
@@ -331,9 +559,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'sdiv_uniformconst'
@@ -350,9 +578,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'sdiv_uniformconst'
@@ -369,9 +597,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'sdiv_uniformconst'
@@ -388,9 +616,9 @@ define i32 @sdiv_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'sdiv_uniformconst'
@@ -407,9 +635,9 @@ define i32 @sdiv_uniformconst() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'sdiv_uniformconst'
@@ -426,9 +654,9 @@ define i32 @sdiv_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = sdiv i64 undef, 7
@@ -469,9 +697,9 @@ define i32 @udiv_uniformconst() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'udiv_uniformconst'
@@ -488,9 +716,9 @@ define i32 @udiv_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'udiv_uniformconst'
@@ -507,9 +735,9 @@ define i32 @udiv_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'udiv_uniformconst'
@@ -526,9 +754,9 @@ define i32 @udiv_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'udiv_uniformconst'
@@ -545,9 +773,9 @@ define i32 @udiv_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'udiv_uniformconst'
@@ -564,9 +792,9 @@ define i32 @udiv_uniformconst() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'udiv_uniformconst'
@@ -583,9 +811,9 @@ define i32 @udiv_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = udiv i64 undef, 7
diff --git a/test/Analysis/CostModel/X86/reduce-add.ll b/test/Analysis/CostModel/X86/reduce-add.ll
new file mode 100644
index 0000000000000000000000000000000000000000..97f7a75ffa2c9e106c98213e0e99a974c5464275
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-add.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'reduce_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.add.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.add.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.add.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.add.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.add.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-and.ll b/test/Analysis/CostModel/X86/reduce-and.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1dfa0953c286e2d5ceec9f3be962d39102f30da8
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-and.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.and.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.and.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.and.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.and.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.and.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.and.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.and.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.and.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.and.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-mul.ll b/test/Analysis/CostModel/X86/reduce-mul.ll
new file mode 100644
index 0000000000000000000000000000000000000000..97e67a92f8f351cd8198deb74d735c1bcd313cc7
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-mul.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 81 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 154 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i64'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i64'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i64'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 58 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i32'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i32'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i32'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 129 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 275 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 93 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 239 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 202 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 360 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 137 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 172 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 241 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 178 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 197 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-or.ll b/test/Analysis/CostModel/X86/reduce-or.ll
new file mode 100644
index 0000000000000000000000000000000000000000..13814ac2b76efb094876d36ce723ba3a7d09dc71
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-or.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.or.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.or.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.or.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.or.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.or.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.or.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.or.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.or.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.or.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduce-smax.ll b/test/Analysis/CostModel/X86/reduce-smax.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5426c7f9c80b072d9df9ee54f5318c0fedd3891b
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-smax.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-smin.ll b/test/Analysis/CostModel/X86/reduce-smin.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b8076a98513266b217ada124e11a1986815292c1
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-smin.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umax.ll b/test/Analysis/CostModel/X86/reduce-umax.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6b947ebc225b6e077a809c22e0fce796359d776d
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-umax.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-umin.ll b/test/Analysis/CostModel/X86/reduce-umin.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0fe9029bc826c41755be3c97abfadbaef19abc8a
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-umin.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE2-LABEL: 'reduce_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 146 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i64'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE2-LABEL: 'reduce_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i32'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 101 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 133 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 112 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 117 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 97 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 111 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 75 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 162 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 173 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 194 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 253 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v128i8(<128 x i8>)
diff --git a/test/Analysis/CostModel/X86/reduce-xor.ll b/test/Analysis/CostModel/X86/reduce-xor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f8e82d05aa7ccaecaba8cb75ee734d7c86386a05
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduce-xor.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -mtriple=x86_64-apple-darwin -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+
+define i32 @reduce_i64(i32 %arg) {
+; SSE-LABEL: 'reduce_i64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i64'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i64'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1  = call i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64> undef)
+  %V2  = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> undef)
+  %V4  = call i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64> undef)
+  %V8  = call i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64> undef)
+  %V16 = call i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i32(i32 %arg) {
+; SSE-LABEL: 'reduce_i32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'reduce_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V2  = call i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32> undef)
+  %V4  = call i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> undef)
+  %V8  = call i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32> undef)
+  %V16 = call i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32> undef)
+  %V32 = call i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i16(i32 %arg) {
+; SSE2-LABEL: 'reduce_i16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i16'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V4  = call i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16> undef)
+  %V8  = call i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16> undef)
+  %V16 = call i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16> undef)
+  %V32 = call i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16> undef)
+  %V64 = call i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i8(i32 %arg) {
+; SSE2-LABEL: 'reduce_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 52 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 118 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 121 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i8'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V8   = call i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8> undef)
+  %V16  = call i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8> undef)
+  %V32  = call i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8> undef)
+  %V64  = call i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8> undef)
+  %V128 = call i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8> undef)
+  ret i32 undef
+}
+
+define i32 @reduce_i1(i32 %arg) {
+; SSE2-LABEL: 'reduce_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'reduce_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'reduce_i1'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'reduce_i1'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 85 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'reduce_i1'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 65 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'reduce_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'reduce_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 357 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 838 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 841 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512DQ-LABEL: 'reduce_i1'
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 148 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 156 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 165 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %V1   = call i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1> undef)
+  %V2   = call i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1> undef)
+  %V4   = call i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1> undef)
+  %V8   = call i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1> undef)
+  %V16  = call i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1> undef)
+  %V32  = call i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1> undef)
+  %V64  = call i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1> undef)
+  %V128 = call i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1> undef)
+  ret i32 undef
+}
+
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v1i64(<1 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v4i64(<4 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v8i64(<8 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v16i64(<16 x i64>)
+
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v2i32(<2 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v8i32(<8 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v16i32(<16 x i32>)
+declare i32 @llvm.experimental.vector.reduce.xor.i32.v32i32(<32 x i32>)
+
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v4i16(<4 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v8i16(<8 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v16i16(<16 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v32i16(<32 x i16>)
+declare i16 @llvm.experimental.vector.reduce.xor.i16.v64i16(<64 x i16>)
+
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v8i8(<8 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v16i8(<16 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v32i8(<32 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v64i8(<64 x i8>)
+declare i8 @llvm.experimental.vector.reduce.xor.i8.v128i8(<128 x i8>)
+
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v1i1(<1 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v2i1(<2 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v4i1(<4 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v8i1(<8 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v16i1(<16 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v32i1(<32 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v64i1(<64 x i1>)
+declare i1 @llvm.experimental.vector.reduce.xor.i1.v128i1(<128 x i1>)
diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll
index 306a46d21ceef7e01aa502ed8481a5b5cdf2e75a..04e40d72246944b136af64e5df051fbe1914fc61 100644
--- a/test/Analysis/CostModel/X86/reduction.ll
+++ b/test/Analysis/CostModel/X86/reduction.ll
@@ -614,7 +614,7 @@ define fastcc i16 @no_pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx = add <8 x i16> %bin.rdx4, %rdx.shuf
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf7 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <8 x i16> %bin.rdx, %rdx.shuf7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x i16> %bin.rdx8, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'no_pairwise_reduction8i16'
@@ -1113,7 +1113,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r
 ;
 ; SSSE3-LABEL: 'pairwise_reduction8i16'
diff --git a/test/Analysis/CostModel/X86/rem.ll b/test/Analysis/CostModel/X86/rem.ll
index fd7e83d74ffbad58c2611325695e12ff79222577..62de12d57ed942db93c7fcc4eaad9f45acbf7d16 100644
--- a/test/Analysis/CostModel/X86/rem.ll
+++ b/test/Analysis/CostModel/X86/rem.ll
@@ -136,24 +136,176 @@ define i32 @urem() {
 }
 
 define i32 @srem_const() {
-; CHECK-LABEL: 'srem_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE2-LABEL: 'srem_const'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'srem_const'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE42-LABEL: 'srem_const'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'srem_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'srem_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'srem_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'srem_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'srem_const'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'srem_const'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, <i64 6, i64 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'srem_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7
@@ -161,17 +313,17 @@ define i32 @srem_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -198,24 +350,100 @@ define i32 @srem_const() {
 }
 
 define i32 @urem_const() {
-; CHECK-LABEL: 'urem_const'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SSE-LABEL: 'urem_const'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX1-LABEL: 'urem_const'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX2-LABEL: 'urem_const'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'urem_const'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'urem_const'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = urem <2 x i64> undef, <i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'urem_const'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = urem i64 undef, 7
@@ -223,17 +451,17 @@ define i32 @urem_const() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, <i64 4, i64 5, i64 6, i64 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, <i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
@@ -274,9 +502,9 @@ define i32 @srem_uniformconst() {
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSSE3-LABEL: 'srem_uniformconst'
@@ -293,9 +521,9 @@ define i32 @srem_uniformconst() {
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'srem_uniformconst'
@@ -312,9 +540,9 @@ define i32 @srem_uniformconst() {
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'srem_uniformconst'
@@ -331,9 +559,9 @@ define i32 @srem_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'srem_uniformconst'
@@ -350,9 +578,9 @@ define i32 @srem_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'srem_uniformconst'
@@ -369,9 +597,9 @@ define i32 @srem_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'srem_uniformconst'
@@ -388,9 +616,9 @@ define i32 @srem_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'srem_uniformconst'
@@ -407,9 +635,9 @@ define i32 @srem_uniformconst() {
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; SLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'srem_uniformconst'
@@ -426,9 +654,9 @@ define i32 @srem_uniformconst() {
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; GLM-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; GLM-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; GLM-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; GLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'srem_uniformconst'
@@ -445,9 +673,9 @@ define i32 @srem_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = srem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = srem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = srem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = srem i64 undef, 7
@@ -488,9 +716,9 @@ define i32 @urem_uniformconst() {
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; SSE-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX1-LABEL: 'urem_uniformconst'
@@ -507,9 +735,9 @@ define i32 @urem_uniformconst() {
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX2-LABEL: 'urem_uniformconst'
@@ -526,9 +754,9 @@ define i32 @urem_uniformconst() {
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'urem_uniformconst'
@@ -545,9 +773,9 @@ define i32 @urem_uniformconst() {
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512BW-LABEL: 'urem_uniformconst'
@@ -564,9 +792,9 @@ define i32 @urem_uniformconst() {
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'urem_uniformconst'
@@ -583,9 +811,9 @@ define i32 @urem_uniformconst() {
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V32i16 = urem <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = urem i8 undef, 7
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 640 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1280 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = urem <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V32i8 = urem <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V64i8 = urem <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %I64 = urem i64 undef, 7
diff --git a/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5bb2e1a756d619c4d8fe75eb7cd9c805c54a596d
--- /dev/null
+++ b/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+;
+; Verify the cost model for extract_subector style shuffles.
+;
+
+define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) {
+; CHECK-LABEL: 'test_vXf64'
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V256_01 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+  %V256_23 = shufflevector <4 x double> %src256, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  %V512_01 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %V512_23 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret void
+}
+
+define void @test_vXfi64(<4 x i64> %src256, <8 x i64> %src512) {
+; CHECK-LABEL: 'test_vXfi64'
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXfi64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
+  %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
+  %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %V512_2345 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll b/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
new file mode 100644
index 0000000000000000000000000000000000000000..94e56643472b1f782fc7f5bb17a6257ce5a137da
--- /dev/null
+++ b/test/Analysis/CostModel/X86/shuffle-insert_subvector.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+;
+; Verify the cost model for insert_subector style shuffles.
+;
+
+define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
+; SSE-LABEL: 'test_vXf64'
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXf64'
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf64'
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+
+  %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+  %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+  %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+  %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret void
+}
+
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+; SSE-LABEL: 'test_vXi64'
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXi64'
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXi64'
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi64'
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Unknown cost for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+
+  %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+  %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+  %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+  %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+  %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret void
+}
diff --git a/test/Analysis/CostModel/X86/shuffle-transpose.ll b/test/Analysis/CostModel/X86/shuffle-transpose.ll
new file mode 100644
index 0000000000000000000000000000000000000000..25a887604fa978b0031c4f5f7d042aa7fc2a133d
--- /dev/null
+++ b/test/Analysis/CostModel/X86/shuffle-transpose.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+ssse3 | FileCheck %s -check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+sse4.2 | FileCheck %s -check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VBMI
+;
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2
+
+;
+; Verify the cost model for transpose shuffles.
+;
+
+define void @test_vXf64(<2 x double> %a128, <2 x double> %b128, <4 x double> %a256, <4 x double> %b256, <8 x double> %a512, <8 x double> %b512) {
+; SSE-LABEL: 'test_vXf64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXf64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <2 x double> %a128, <2 x double> %b128, <2 x i32> <i32 0, i32 2>
+  %V256 = shufflevector <4 x double> %a256, <4 x double> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V512 = shufflevector <8 x double> %a512, <8 x double> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret void
+}
+
+define void @test_vXi64(<2 x i64> %a128, <2 x i64> %b128, <4 x i64> %a256, <4 x i64> %b256, <8 x i64> %a512, <8 x i64> %b512) {
+; SSE-LABEL: 'test_vXi64'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'test_vXi64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXi64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi64'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <2 x i64> %a128, <2 x i64> %b128, <2 x i32> <i32 0, i32 2>
+  %V256 = shufflevector <4 x i64> %a256, <4 x i64> %b256, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V512 = shufflevector <8 x i64> %a512, <8 x i64> %b512, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret void
+}
+
+define void @test_vXf32(<2 x float> %a64, <2 x float> %b64, <4 x float> %a128, <4 x float> %b128, <8 x float> %a256, <8 x float> %b256, <16 x float> %a512, <16 x float> %b512) {
+; SSE-LABEL: 'test_vXf32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXf32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXf32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXf32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXf32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V64 = shufflevector <2 x float> %a64, <2 x float> %b64, <2 x i32> <i32 0, i32 2>
+  %V128 = shufflevector <4 x float> %a128, <4 x float> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V256 = shufflevector <8 x float> %a256, <8 x float> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %V512 = shufflevector <16 x float> %a512, <16 x float> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret void
+}
+
+define void @test_vXi32(<2 x i32> %a64, <2 x i32> %b64, <4 x i32> %a128, <4 x i32> %b128, <8 x i32> %a256, <8 x i32> %b256, <16 x i32> %a512, <16 x i32> %b512) {
+; SSE-LABEL: 'test_vXi32'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 56 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXi32'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXi32'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'test_vXi32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi32'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V64 = shufflevector <2 x i32> %a64, <2 x i32> %b64, <2 x i32> <i32 0, i32 2>
+  %V128 = shufflevector <4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %V256 = shufflevector <8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %V512 = shufflevector <16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  ret void
+}
+
+define void @test_vXi16(<8 x i16> %a128, <8 x i16> %b128, <16 x i16> %a256, <16 x i16> %b256, <32 x i16> %a512, <32 x i16> %b512) {
+; SSE2-LABEL: 'test_vXi16'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 224 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXi16'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXi16'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXi16'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXi16'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi16'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi16'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi16'
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi16'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <8 x i16> %a128, <8 x i16> %b128, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %V256 = shufflevector <16 x i16> %a256, <16 x i16> %b256, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %V512 = shufflevector <32 x i16> %a512, <32 x i16> %b512, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+  ret void
+}
+
+define void @test_vXi8(<16 x i8> %a128, <16 x i8> %b128, <32 x i8> %a256, <32 x i8> %b256, <64 x i8> %a512, <64 x i8> %b512) {
+; SSE2-LABEL: 'test_vXi8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 364 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSSE3-LABEL: 'test_vXi8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE42-LABEL: 'test_vXi8'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX1-LABEL: 'test_vXi8'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'test_vXi8'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512F-LABEL: 'test_vXi8'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512BW-LABEL: 'test_vXi8'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512VBMI-LABEL: 'test_vXi8'
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; AVX512VBMI-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; BTVER2-LABEL: 'test_vXi8'
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 90 for instruction: %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  %V128 = shufflevector <16 x i8> %a128, <16 x i8> %b128, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+  %V256 = shufflevector <32 x i8> %a256, <32 x i8> %b256, <32 x i32> <i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62>
+  %V512 = shufflevector <64 x i8> %a512, <64 x i8> %b512, <64 x i32> <i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126>
+  ret void
+}
+
diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll
index 13f2bd2019d33f3161490053f7c65752f81984d6..864ea2e5559e5ebca13f0e6152b035d64d5af245 100644
--- a/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -4,9 +4,9 @@
 %shifttype = type <2 x i16>
 define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
 entry:
-  ; SSE2: shift2i16
+  ; SSE2-LABEL: shift2i16
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i16
+  ; SSE2-CODEGEN-LABEL: shift2i16
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype %a , %b
@@ -16,9 +16,9 @@ entry:
 %shifttype4i16 = type <4 x i16>
 define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
 entry:
-  ; SSE2: shift4i16
+  ; SSE2-LABEL: shift4i16
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i16
+  ; SSE2-CODEGEN-LABEL: shift4i16
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype4i16 %a , %b
@@ -28,9 +28,9 @@ entry:
 %shifttype8i16 = type <8 x i16>
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
-  ; SSE2: shift8i16
+  ; SSE2-LABEL: shift8i16
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i16
+  ; SSE2-CODEGEN-LABEL: shift8i16
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype8i16 %a , %b
@@ -40,9 +40,9 @@ entry:
 %shifttype16i16 = type <16 x i16>
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
-  ; SSE2: shift16i16
+  ; SSE2-LABEL: shift16i16
   ; SSE2: cost of 64 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i16
+  ; SSE2-CODEGEN-LABEL: shift16i16
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype16i16 %a , %b
@@ -52,9 +52,9 @@ entry:
 %shifttype32i16 = type <32 x i16>
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
-  ; SSE2: shift32i16
+  ; SSE2-LABEL: shift32i16
   ; SSE2: cost of 128 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i16
+  ; SSE2-CODEGEN-LABEL: shift32i16
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype32i16 %a , %b
@@ -64,9 +64,9 @@ entry:
 %shifttype2i32 = type <2 x i32>
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
-  ; SSE2: shift2i32
+  ; SSE2-LABEL: shift2i32
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i32
+  ; SSE2-CODEGEN-LABEL: shift2i32
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype2i32 %a , %b
@@ -76,9 +76,9 @@ entry:
 %shifttype4i32 = type <4 x i32>
 define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
 entry:
-  ; SSE2: shift4i32
+  ; SSE2-LABEL: shift4i32
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i32
+  ; SSE2-CODEGEN-LABEL: shift4i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype4i32 %a , %b
@@ -88,9 +88,9 @@ entry:
 %shifttype8i32 = type <8 x i32>
 define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
 entry:
-  ; SSE2: shift8i32
+  ; SSE2-LABEL: shift8i32
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i32
+  ; SSE2-CODEGEN-LABEL: shift8i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype8i32 %a , %b
@@ -100,9 +100,9 @@ entry:
 %shifttype16i32 = type <16 x i32>
 define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
 entry:
-  ; SSE2: shift16i32
+  ; SSE2-LABEL: shift16i32
   ; SSE2: cost of 64 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i32
+  ; SSE2-CODEGEN-LABEL: shift16i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype16i32 %a , %b
@@ -112,9 +112,9 @@ entry:
 %shifttype32i32 = type <32 x i32>
 define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
 entry:
-  ; SSE2: shift32i32
+  ; SSE2-LABEL: shift32i32
   ; SSE2: cost of 128 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i32
+  ; SSE2-CODEGEN-LABEL: shift32i32
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype32i32 %a , %b
@@ -124,9 +124,9 @@ entry:
 %shifttype2i64 = type <2 x i64>
 define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
 entry:
-  ; SSE2: shift2i64
+  ; SSE2-LABEL: shift2i64
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i64
+  ; SSE2-CODEGEN-LABEL: shift2i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype2i64 %a , %b
@@ -136,9 +136,9 @@ entry:
 %shifttype4i64 = type <4 x i64>
 define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
 entry:
-  ; SSE2: shift4i64
+  ; SSE2-LABEL: shift4i64
   ; SSE2: cost of 24 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i64
+  ; SSE2-CODEGEN-LABEL: shift4i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype4i64 %a , %b
@@ -148,9 +148,9 @@ entry:
 %shifttype8i64 = type <8 x i64>
 define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
 entry:
-  ; SSE2: shift8i64
+  ; SSE2-LABEL: shift8i64
   ; SSE2: cost of 48 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i64
+  ; SSE2-CODEGEN-LABEL: shift8i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype8i64 %a , %b
@@ -160,9 +160,9 @@ entry:
 %shifttype16i64 = type <16 x i64>
 define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
 entry:
-  ; SSE2: shift16i64
+  ; SSE2-LABEL: shift16i64
   ; SSE2: cost of 96 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i64
+  ; SSE2-CODEGEN-LABEL: shift16i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype16i64 %a , %b
@@ -172,9 +172,9 @@ entry:
 %shifttype32i64 = type <32 x i64>
 define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
 entry:
-  ; SSE2: shift32i64
+  ; SSE2-LABEL: shift32i64
   ; SSE2: cost of 192 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i64
+  ; SSE2-CODEGEN-LABEL: shift32i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype32i64 %a , %b
@@ -184,9 +184,9 @@ entry:
 %shifttype2i8 = type <2 x i8>
 define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
 entry:
-  ; SSE2: shift2i8
+  ; SSE2-LABEL: shift2i8
   ; SSE2: cost of 12 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i8
+  ; SSE2-CODEGEN-LABEL: shift2i8
   ; SSE2-CODEGEN: psrlq
 
   %0 = ashr %shifttype2i8 %a , %b
@@ -196,9 +196,9 @@ entry:
 %shifttype4i8 = type <4 x i8>
 define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
 entry:
-  ; SSE2: shift4i8
+  ; SSE2-LABEL: shift4i8
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i8
+  ; SSE2-CODEGEN-LABEL: shift4i8
   ; SSE2-CODEGEN: psrad
 
   %0 = ashr %shifttype4i8 %a , %b
@@ -208,9 +208,9 @@ entry:
 %shifttype8i8 = type <8 x i8>
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
-  ; SSE2: shift8i8
+  ; SSE2-LABEL: shift8i8
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i8
+  ; SSE2-CODEGEN-LABEL: shift8i8
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype8i8 %a , %b
@@ -220,9 +220,9 @@ entry:
 %shifttype16i8 = type <16 x i8>
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
-  ; SSE2: shift16i8
+  ; SSE2-LABEL: shift16i8
   ; SSE2: cost of 54 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i8
+  ; SSE2-CODEGEN-LABEL: shift16i8
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype16i8 %a , %b
@@ -232,9 +232,9 @@ entry:
 %shifttype32i8 = type <32 x i8>
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
-  ; SSE2: shift32i8
+  ; SSE2-LABEL: shift32i8
   ; SSE2: cost of 108 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i8
+  ; SSE2-CODEGEN-LABEL: shift32i8
   ; SSE2-CODEGEN: psraw
 
   %0 = ashr %shifttype32i8 %a , %b
@@ -246,9 +246,9 @@ entry:
 %shifttypec = type <2 x i16>
 define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
 entry:
-  ; SSE2: shift2i16const
+  ; SSE2-LABEL: shift2i16const
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN-LABEL: shift2i16const
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec %a , <i16 3, i16 3>
@@ -258,9 +258,9 @@ entry:
 %shifttypec4i16 = type <4 x i16>
 define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
 entry:
-  ; SSE2: shift4i16const
+  ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN-LABEL: shift4i16const
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
@@ -270,9 +270,9 @@ entry:
 %shifttypec8i16 = type <8 x i16>
 define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
 entry:
-  ; SSE2: shift8i16const
+  ; SSE2-LABEL: shift8i16const
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN-LABEL: shift8i16const
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -284,9 +284,9 @@ entry:
 define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                          %shifttypec16i16 %b) {
 entry:
-  ; SSE2: shift16i16const
+  ; SSE2-LABEL: shift16i16const
   ; SSE2: cost of 2 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN-LABEL: shift16i16const
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -300,9 +300,9 @@ entry:
 define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
 entry:
-  ; SSE2: shift32i16const
+  ; SSE2-LABEL: shift32i16const
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN-LABEL: shift32i16const
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -319,9 +319,9 @@ entry:
 %shifttypec2i32 = type <2 x i32>
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
-  ; SSE2: shift2i32c
+  ; SSE2-LABEL: shift2i32c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN-LABEL: shift2i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec2i32 %a , <i32 3, i32 3>
@@ -331,9 +331,9 @@ entry:
 %shifttypec4i32 = type <4 x i32>
 define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
 entry:
-  ; SSE2: shift4i32c
+  ; SSE2-LABEL: shift4i32c
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN-LABEL: shift4i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
@@ -343,9 +343,9 @@ entry:
 %shifttypec8i32 = type <8 x i32>
 define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
 entry:
-  ; SSE2: shift8i32c
+  ; SSE2-LABEL: shift8i32c
   ; SSE2: cost of 2 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN-LABEL: shift8i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -356,9 +356,9 @@ entry:
 %shifttypec16i32 = type <16 x i32>
 define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
 entry:
-  ; SSE2: shift16i32c
+  ; SSE2-LABEL: shift16i32c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN-LABEL: shift16i32c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -371,10 +371,10 @@ entry:
 %shifttypec32i32 = type <32 x i32>
 define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
 entry:
-  ; SSE2: shift32i32c
+  ; SSE2-LABEL: shift32i32c
   ; getTypeConversion fails here and promotes this to a i64.
   ; SSE2: cost of 8 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN-LABEL: shift32i32c
   ; SSE2-CODEGEN: psrad $3
   %0 = ashr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                    i32 3, i32 3, i32 3, i32 3,
@@ -390,9 +390,9 @@ entry:
 %shifttypec2i64 = type <2 x i64>
 define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
 entry:
-  ; SSE2: shift2i64c
+  ; SSE2-LABEL: shift2i64c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN-LABEL: shift2i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec2i64 %a , <i64 3, i64 3>
@@ -402,9 +402,9 @@ entry:
 %shifttypec4i64 = type <4 x i64>
 define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
 entry:
-  ; SSE2: shift4i64c
+  ; SSE2-LABEL: shift4i64c
   ; SSE2: cost of 8 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN-LABEL: shift4i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
@@ -414,9 +414,9 @@ entry:
 %shifttypec8i64 = type <8 x i64>
 define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
 entry:
-  ; SSE2: shift8i64c
+  ; SSE2-LABEL: shift8i64c
   ; SSE2: cost of 16 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN-LABEL: shift8i64c
   ; SSE2-CODEGEN: psrad $3
 
  %0 = ashr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -427,9 +427,9 @@ entry:
 %shifttypec16i64 = type <16 x i64>
 define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
 entry:
-  ; SSE2: shift16i64c
+  ; SSE2-LABEL: shift16i64c
   ; SSE2: cost of 32 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN-LABEL: shift16i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -442,9 +442,9 @@ entry:
 %shifttypec32i64 = type <32 x i64>
 define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
 entry:
-  ; SSE2: shift32i64c
+  ; SSE2-LABEL: shift32i64c
   ; SSE2: cost of 64 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN-LABEL: shift32i64c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
@@ -461,9 +461,9 @@ entry:
 %shifttypec2i8 = type <2 x i8>
 define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
 entry:
-  ; SSE2: shift2i8c
+  ; SSE2-LABEL: shift2i8c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN-LABEL: shift2i8c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
@@ -473,9 +473,9 @@ entry:
 %shifttypec4i8 = type <4 x i8>
 define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
 entry:
-  ; SSE2: shift4i8c
+  ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN-LABEL: shift4i8c
   ; SSE2-CODEGEN: psrad $3
 
   %0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
@@ -485,9 +485,9 @@ entry:
 %shifttypec8i8 = type <8 x i8>
 define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
 entry:
-  ; SSE2: shift8i8c
+  ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} ashr
-  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN-LABEL: shift8i8c
   ; SSE2-CODEGEN: psraw $3
 
   %0 = ashr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -498,9 +498,9 @@ entry:
 %shifttypec16i8 = type <16 x i8>
 define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
 entry:
-  ; SSE2: shift16i8c
+  ; SSE2-LABEL: shift16i8c
   ; SSE2: cost of 4 {{.*}} ashr
-  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN-LABEL: shift16i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = ashr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -513,9 +513,9 @@ entry:
 %shifttypec32i8 = type <32 x i8>
 define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
 entry:
-  ; SSE2: shift32i8c
+  ; SSE2-LABEL: shift32i8c
   ; SSE2: cost of 8 {{.*}} ashr
-  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN-LABEL: shift32i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = ashr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index e5fff9b5e4da9c671f6f500a65b77deb9e3e4d35..3e30614e18537d7f7b334214bf532b8dd415dee3 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -4,9 +4,9 @@
 %shifttype = type <2 x i16>
 define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
 entry:
-  ; SSE2: shift2i16
+  ; SSE2-LABEL: shift2i16
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i16
+  ; SSE2-CODEGEN-LABEL: shift2i16
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype %a , %b
@@ -16,9 +16,9 @@ entry:
 %shifttype4i16 = type <4 x i16>
 define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
 entry:
-  ; SSE2: shift4i16
+  ; SSE2-LABEL: shift4i16
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i16
+  ; SSE2-CODEGEN-LABEL: shift4i16
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype4i16 %a , %b
@@ -28,9 +28,9 @@ entry:
 %shifttype8i16 = type <8 x i16>
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
-  ; SSE2: shift8i16
+  ; SSE2-LABEL: shift8i16
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i16
+  ; SSE2-CODEGEN-LABEL: shift8i16
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype8i16 %a , %b
@@ -40,9 +40,9 @@ entry:
 %shifttype16i16 = type <16 x i16>
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
-  ; SSE2: shift16i16
+  ; SSE2-LABEL: shift16i16
   ; SSE2: cost of 64 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i16
+  ; SSE2-CODEGEN-LABEL: shift16i16
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype16i16 %a , %b
@@ -52,9 +52,9 @@ entry:
 %shifttype32i16 = type <32 x i16>
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
-  ; SSE2: shift32i16
+  ; SSE2-LABEL: shift32i16
   ; SSE2: cost of 128 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i16
+  ; SSE2-CODEGEN-LABEL: shift32i16
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype32i16 %a , %b
@@ -64,9 +64,9 @@ entry:
 %shifttype2i32 = type <2 x i32>
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
-  ; SSE2: shift2i32
+  ; SSE2-LABEL: shift2i32
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i32
+  ; SSE2-CODEGEN-LABEL: shift2i32
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i32 %a , %b
@@ -76,9 +76,9 @@ entry:
 %shifttype4i32 = type <4 x i32>
 define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
 entry:
-  ; SSE2: shift4i32
+  ; SSE2-LABEL: shift4i32
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i32
+  ; SSE2-CODEGEN-LABEL: shift4i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype4i32 %a , %b
@@ -88,9 +88,9 @@ entry:
 %shifttype8i32 = type <8 x i32>
 define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
 entry:
-  ; SSE2: shift8i32
+  ; SSE2-LABEL: shift8i32
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i32
+  ; SSE2-CODEGEN-LABEL: shift8i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype8i32 %a , %b
@@ -100,9 +100,9 @@ entry:
 %shifttype16i32 = type <16 x i32>
 define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
 entry:
-  ; SSE2: shift16i32
+  ; SSE2-LABEL: shift16i32
   ; SSE2: cost of 64 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i32
+  ; SSE2-CODEGEN-LABEL: shift16i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype16i32 %a , %b
@@ -112,9 +112,9 @@ entry:
 %shifttype32i32 = type <32 x i32>
 define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
 entry:
-  ; SSE2: shift32i32
+  ; SSE2-LABEL: shift32i32
   ; SSE2: cost of 128 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i32
+  ; SSE2-CODEGEN-LABEL: shift32i32
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype32i32 %a , %b
@@ -124,9 +124,9 @@ entry:
 %shifttype2i64 = type <2 x i64>
 define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
 entry:
-  ; SSE2: shift2i64
+  ; SSE2-LABEL: shift2i64
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i64
+  ; SSE2-CODEGEN-LABEL: shift2i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i64 %a , %b
@@ -136,9 +136,9 @@ entry:
 %shifttype4i64 = type <4 x i64>
 define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
 entry:
-  ; SSE2: shift4i64
+  ; SSE2-LABEL: shift4i64
   ; SSE2: cost of 8 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i64
+  ; SSE2-CODEGEN-LABEL: shift4i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype4i64 %a , %b
@@ -148,9 +148,9 @@ entry:
 %shifttype8i64 = type <8 x i64>
 define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
 entry:
-  ; SSE2: shift8i64
+  ; SSE2-LABEL: shift8i64
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i64
+  ; SSE2-CODEGEN-LABEL: shift8i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype8i64 %a , %b
@@ -160,9 +160,9 @@ entry:
 %shifttype16i64 = type <16 x i64>
 define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
 entry:
-  ; SSE2: shift16i64
+  ; SSE2-LABEL: shift16i64
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i64
+  ; SSE2-CODEGEN-LABEL: shift16i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype16i64 %a , %b
@@ -172,9 +172,9 @@ entry:
 %shifttype32i64 = type <32 x i64>
 define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
 entry:
-  ; SSE2: shift32i64
+  ; SSE2-LABEL: shift32i64
   ; SSE2: cost of 64 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i64
+  ; SSE2-CODEGEN-LABEL: shift32i64
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype32i64 %a , %b
@@ -184,9 +184,9 @@ entry:
 %shifttype2i8 = type <2 x i8>
 define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
 entry:
-  ; SSE2: shift2i8
+  ; SSE2-LABEL: shift2i8
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i8
+  ; SSE2-CODEGEN-LABEL: shift2i8
   ; SSE2-CODEGEN: psrlq
 
   %0 = lshr %shifttype2i8 %a , %b
@@ -196,9 +196,9 @@ entry:
 %shifttype4i8 = type <4 x i8>
 define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
 entry:
-  ; SSE2: shift4i8
+  ; SSE2-LABEL: shift4i8
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i8
+  ; SSE2-CODEGEN-LABEL: shift4i8
   ; SSE2-CODEGEN: psrld
 
   %0 = lshr %shifttype4i8 %a , %b
@@ -208,9 +208,9 @@ entry:
 %shifttype8i8 = type <8 x i8>
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
-  ; SSE2: shift8i8
+  ; SSE2-LABEL: shift8i8
   ; SSE2: cost of 32 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i8
+  ; SSE2-CODEGEN-LABEL: shift8i8
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype8i8 %a , %b
@@ -220,9 +220,9 @@ entry:
 %shifttype16i8 = type <16 x i8>
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
-  ; SSE2: shift16i8
+  ; SSE2-LABEL: shift16i8
   ; SSE2: cost of 26 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i8
+  ; SSE2-CODEGEN-LABEL: shift16i8
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype16i8 %a , %b
@@ -232,9 +232,9 @@ entry:
 %shifttype32i8 = type <32 x i8>
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
-  ; SSE2: shift32i8
+  ; SSE2-LABEL: shift32i8
   ; SSE2: cost of 52 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i8
+  ; SSE2-CODEGEN-LABEL: shift32i8
   ; SSE2-CODEGEN: psrlw
 
   %0 = lshr %shifttype32i8 %a , %b
@@ -246,9 +246,9 @@ entry:
 %shifttypec = type <2 x i16>
 define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
 entry:
-  ; SSE2: shift2i16const
+  ; SSE2-LABEL: shift2i16const
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN-LABEL: shift2i16const
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec %a , <i16 3, i16 3>
@@ -258,9 +258,9 @@ entry:
 %shifttypec4i16 = type <4 x i16>
 define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
 entry:
-  ; SSE2: shift4i16const
+  ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN-LABEL: shift4i16const
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
@@ -270,9 +270,9 @@ entry:
 %shifttypec8i16 = type <8 x i16>
 define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
 entry:
-  ; SSE2: shift8i16const
+  ; SSE2-LABEL: shift8i16const
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN-LABEL: shift8i16const
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -284,9 +284,9 @@ entry:
 define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                          %shifttypec16i16 %b) {
 entry:
-  ; SSE2: shift16i16const
+  ; SSE2-LABEL: shift16i16const
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN-LABEL: shift16i16const
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -300,9 +300,9 @@ entry:
 define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
 entry:
-  ; SSE2: shift32i16const
+  ; SSE2-LABEL: shift32i16const
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN-LABEL: shift32i16const
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -319,9 +319,9 @@ entry:
 %shifttypec2i32 = type <2 x i32>
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
-  ; SSE2: shift2i32c
+  ; SSE2-LABEL: shift2i32c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN-LABEL: shift2i32c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
@@ -331,9 +331,9 @@ entry:
 %shifttypec4i32 = type <4 x i32>
 define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
 entry:
-  ; SSE2: shift4i32c
+  ; SSE2-LABEL: shift4i32c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN-LABEL: shift4i32c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
@@ -343,9 +343,9 @@ entry:
 %shifttypec8i32 = type <8 x i32>
 define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
 entry:
-  ; SSE2: shift8i32c
+  ; SSE2-LABEL: shift8i32c
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN-LABEL: shift8i32c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -356,9 +356,9 @@ entry:
 %shifttypec16i32 = type <16 x i32>
 define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
 entry:
-  ; SSE2: shift16i32c
+  ; SSE2-LABEL: shift16i32c
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN-LABEL: shift16i32c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -371,9 +371,9 @@ entry:
 %shifttypec32i32 = type <32 x i32>
 define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
 entry:
-  ; SSE2: shift32i32c
+  ; SSE2-LABEL: shift32i32c
   ; SSE2: cost of 8 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN-LABEL: shift32i32c
   ; SSE2-CODEGEN: psrld $3
   %0 = lshr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                    i32 3, i32 3, i32 3, i32 3,
@@ -389,9 +389,9 @@ entry:
 %shifttypec2i64 = type <2 x i64>
 define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
 entry:
-  ; SSE2: shift2i64c
+  ; SSE2-LABEL: shift2i64c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN-LABEL: shift2i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec2i64 %a , <i64 3, i64 3>
@@ -401,9 +401,9 @@ entry:
 %shifttypec4i64 = type <4 x i64>
 define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
 entry:
-  ; SSE2: shift4i64c
+  ; SSE2-LABEL: shift4i64c
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN-LABEL: shift4i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
@@ -413,9 +413,9 @@ entry:
 %shifttypec8i64 = type <8 x i64>
 define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
 entry:
-  ; SSE2: shift8i64c
+  ; SSE2-LABEL: shift8i64c
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN-LABEL: shift8i64c
   ; SSE2-CODEGEN: psrlq $3
 
  %0 = lshr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -426,9 +426,9 @@ entry:
 %shifttypec16i64 = type <16 x i64>
 define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
 entry:
-  ; SSE2: shift16i64c
+  ; SSE2-LABEL: shift16i64c
   ; SSE2: cost of 8 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN-LABEL: shift16i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -441,9 +441,9 @@ entry:
 %shifttypec32i64 = type <32 x i64>
 define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
 entry:
-  ; SSE2: shift32i64c
+  ; SSE2-LABEL: shift32i64c
   ; SSE2: cost of 16 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN-LABEL: shift32i64c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
@@ -460,9 +460,9 @@ entry:
 %shifttypec2i8 = type <2 x i8>
 define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
 entry:
-  ; SSE2: shift2i8c
+  ; SSE2-LABEL: shift2i8c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN-LABEL: shift2i8c
   ; SSE2-CODEGEN: psrlq $3
 
   %0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
@@ -472,9 +472,9 @@ entry:
 %shifttypec4i8 = type <4 x i8>
 define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
 entry:
-  ; SSE2: shift4i8c
+  ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN-LABEL: shift4i8c
   ; SSE2-CODEGEN: psrld $3
 
   %0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
@@ -484,9 +484,9 @@ entry:
 %shifttypec8i8 = type <8 x i8>
 define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
 entry:
-  ; SSE2: shift8i8c
+  ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} lshr
-  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN-LABEL: shift8i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -497,9 +497,9 @@ entry:
 %shifttypec16i8 = type <16 x i8>
 define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
 entry:
-  ; SSE2: shift16i8c
+  ; SSE2-LABEL: shift16i8c
   ; SSE2: cost of 2 {{.*}} lshr
-  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN-LABEL: shift16i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -512,9 +512,9 @@ entry:
 %shifttypec32i8 = type <32 x i8>
 define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
 entry:
-  ; SSE2: shift32i8c
+  ; SSE2-LABEL: shift32i8c
   ; SSE2: cost of 4 {{.*}} lshr
-  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN-LABEL: shift32i8c
   ; SSE2-CODEGEN: psrlw $3
 
   %0 = lshr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index 5f48b46684d42ac387e6baedad3723828965915b..7db82b9fa5c2cc426ed33c6a42258e067e944802 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -4,9 +4,9 @@
 %shifttype = type <2 x i16>
 define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
 entry:
-  ; SSE2: shift2i16
+  ; SSE2-LABEL: shift2i16
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i16
+  ; SSE2-CODEGEN-LABEL: shift2i16
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype %a , %b
@@ -16,9 +16,9 @@ entry:
 %shifttype4i16 = type <4 x i16>
 define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
 entry:
-  ; SSE2: shift4i16
+  ; SSE2-LABEL: shift4i16
   ; SSE2: cost of 10 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i16
+  ; SSE2-CODEGEN-LABEL: shift4i16
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype4i16 %a , %b
@@ -28,9 +28,9 @@ entry:
 %shifttype8i16 = type <8 x i16>
 define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
 entry:
-  ; SSE2: shift8i16
+  ; SSE2-LABEL: shift8i16
   ; SSE2: cost of 32 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i16
+  ; SSE2-CODEGEN-LABEL: shift8i16
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype8i16 %a , %b
@@ -40,9 +40,9 @@ entry:
 %shifttype16i16 = type <16 x i16>
 define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
 entry:
-  ; SSE2: shift16i16
+  ; SSE2-LABEL: shift16i16
   ; SSE2: cost of 64 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i16
+  ; SSE2-CODEGEN-LABEL: shift16i16
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype16i16 %a , %b
@@ -52,9 +52,9 @@ entry:
 %shifttype32i16 = type <32 x i16>
 define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
 entry:
-  ; SSE2: shift32i16
+  ; SSE2-LABEL: shift32i16
   ; SSE2: cost of 128 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i16
+  ; SSE2-CODEGEN-LABEL: shift32i16
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype32i16 %a , %b
@@ -64,9 +64,9 @@ entry:
 %shifttype2i32 = type <2 x i32>
 define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
 entry:
-  ; SSE2: shift2i32
+  ; SSE2-LABEL: shift2i32
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i32
+  ; SSE2-CODEGEN-LABEL: shift2i32
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i32 %a , %b
@@ -76,9 +76,9 @@ entry:
 %shifttype4i32 = type <4 x i32>
 define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
 entry:
-  ; SSE2: shift4i32
+  ; SSE2-LABEL: shift4i32
   ; SSE2: cost of 10 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i32
+  ; SSE2-CODEGEN-LABEL: shift4i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype4i32 %a , %b
@@ -88,9 +88,9 @@ entry:
 %shifttype8i32 = type <8 x i32>
 define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
 entry:
-  ; SSE2: shift8i32
+  ; SSE2-LABEL: shift8i32
   ; SSE2: cost of 20 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i32
+  ; SSE2-CODEGEN-LABEL: shift8i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype8i32 %a , %b
@@ -100,9 +100,9 @@ entry:
 %shifttype16i32 = type <16 x i32>
 define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
 entry:
-  ; SSE2: shift16i32
+  ; SSE2-LABEL: shift16i32
   ; SSE2: cost of 40 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i32
+  ; SSE2-CODEGEN-LABEL: shift16i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype16i32 %a , %b
@@ -112,9 +112,9 @@ entry:
 %shifttype32i32 = type <32 x i32>
 define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
 entry:
-  ; SSE2: shift32i32
+  ; SSE2-LABEL: shift32i32
   ; SSE2: cost of 80 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i32
+  ; SSE2-CODEGEN-LABEL: shift32i32
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype32i32 %a , %b
@@ -124,9 +124,9 @@ entry:
 %shifttype2i64 = type <2 x i64>
 define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
 entry:
-  ; SSE2: shift2i64
+  ; SSE2-LABEL: shift2i64
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i64
+  ; SSE2-CODEGEN-LABEL: shift2i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i64 %a , %b
@@ -136,9 +136,9 @@ entry:
 %shifttype4i64 = type <4 x i64>
 define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
 entry:
-  ; SSE2: shift4i64
+  ; SSE2-LABEL: shift4i64
   ; SSE2: cost of 8 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i64
+  ; SSE2-CODEGEN-LABEL: shift4i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype4i64 %a , %b
@@ -148,9 +148,9 @@ entry:
 %shifttype8i64 = type <8 x i64>
 define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
 entry:
-  ; SSE2: shift8i64
+  ; SSE2-LABEL: shift8i64
   ; SSE2: cost of 16 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i64
+  ; SSE2-CODEGEN-LABEL: shift8i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype8i64 %a , %b
@@ -160,9 +160,9 @@ entry:
 %shifttype16i64 = type <16 x i64>
 define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
 entry:
-  ; SSE2: shift16i64
+  ; SSE2-LABEL: shift16i64
   ; SSE2: cost of 32 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i64
+  ; SSE2-CODEGEN-LABEL: shift16i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype16i64 %a , %b
@@ -172,9 +172,9 @@ entry:
 %shifttype32i64 = type <32 x i64>
 define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
 entry:
-  ; SSE2: shift32i64
+  ; SSE2-LABEL: shift32i64
   ; SSE2: cost of 64 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i64
+  ; SSE2-CODEGEN-LABEL: shift32i64
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype32i64 %a , %b
@@ -184,9 +184,9 @@ entry:
 %shifttype2i8 = type <2 x i8>
 define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
 entry:
-  ; SSE2: shift2i8
+  ; SSE2-LABEL: shift2i8
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i8
+  ; SSE2-CODEGEN-LABEL: shift2i8
   ; SSE2-CODEGEN: psllq
 
   %0 = shl %shifttype2i8 %a , %b
@@ -196,9 +196,9 @@ entry:
 %shifttype4i8 = type <4 x i8>
 define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
 entry:
-  ; SSE2: shift4i8
+  ; SSE2-LABEL: shift4i8
   ; SSE2: cost of 10 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i8
+  ; SSE2-CODEGEN-LABEL: shift4i8
   ; SSE2-CODEGEN: pmuludq
 
   %0 = shl %shifttype4i8 %a , %b
@@ -208,9 +208,9 @@ entry:
 %shifttype8i8 = type <8 x i8>
 define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
 entry:
-  ; SSE2: shift8i8
+  ; SSE2-LABEL: shift8i8
   ; SSE2: cost of 32 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i8
+  ; SSE2-CODEGEN-LABEL: shift8i8
   ; SSE2-CODEGEN: pmullw
 
   %0 = shl %shifttype8i8 %a , %b
@@ -220,9 +220,9 @@ entry:
 %shifttype16i8 = type <16 x i8>
 define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
 entry:
-  ; SSE2: shift16i8
+  ; SSE2-LABEL: shift16i8
   ; SSE2: cost of 26 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i8
+  ; SSE2-CODEGEN-LABEL: shift16i8
   ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype16i8 %a , %b
@@ -232,9 +232,9 @@ entry:
 %shifttype32i8 = type <32 x i8>
 define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
 entry:
-  ; SSE2: shift32i8
+  ; SSE2-LABEL: shift32i8
   ; SSE2: cost of 52 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i8
+  ; SSE2-CODEGEN-LABEL: shift32i8
   ; SSE2-CODEGEN: psllw
 
   %0 = shl %shifttype32i8 %a , %b
@@ -246,9 +246,9 @@ entry:
 %shifttypec = type <2 x i16>
 define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
 entry:
-  ; SSE2: shift2i16const
+  ; SSE2-LABEL: shift2i16const
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN-LABEL: shift2i16const
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec %a , <i16 3, i16 3>
@@ -258,9 +258,9 @@ entry:
 %shifttypec4i16 = type <4 x i16>
 define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
 entry:
-  ; SSE2: shift4i16const
+  ; SSE2-LABEL: shift4i16const
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN-LABEL: shift4i16const
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
@@ -270,9 +270,9 @@ entry:
 %shifttypec8i16 = type <8 x i16>
 define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
 entry:
-  ; SSE2: shift8i16const
+  ; SSE2-LABEL: shift8i16const
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN-LABEL: shift8i16const
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -284,9 +284,9 @@ entry:
 define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
                                          %shifttypec16i16 %b) {
 entry:
-  ; SSE2: shift16i16const
+  ; SSE2-LABEL: shift16i16const
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN-LABEL: shift16i16const
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -300,9 +300,9 @@ entry:
 define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
                                         %shifttypec32i16 %b) {
 entry:
-  ; SSE2: shift32i16const
+  ; SSE2-LABEL: shift32i16const
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN-LABEL: shift32i16const
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
@@ -319,9 +319,9 @@ entry:
 %shifttypec2i32 = type <2 x i32>
 define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
 entry:
-  ; SSE2: shift2i32c
+  ; SSE2-LABEL: shift2i32c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN-LABEL: shift2i32c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
@@ -331,9 +331,9 @@ entry:
 %shifttypec4i32 = type <4 x i32>
 define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
 entry:
-  ; SSE2: shift4i32c
+  ; SSE2-LABEL: shift4i32c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN-LABEL: shift4i32c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
@@ -343,9 +343,9 @@ entry:
 %shifttypec8i32 = type <8 x i32>
 define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
 entry:
-  ; SSE2: shift8i32c
+  ; SSE2-LABEL: shift8i32c
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN-LABEL: shift8i32c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -356,9 +356,9 @@ entry:
 %shifttypec16i32 = type <16 x i32>
 define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
 entry:
-  ; SSE2: shift16i32c
+  ; SSE2-LABEL: shift16i32c
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN-LABEL: shift16i32c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
@@ -371,9 +371,9 @@ entry:
 %shifttypec32i32 = type <32 x i32>
 define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
 entry:
-  ; SSE2: shift32i32c
+  ; SSE2-LABEL: shift32i32c
   ; SSE2: cost of 8 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN-LABEL: shift32i32c
   ; SSE2-CODEGEN: pslld $3
   %0 = shl %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
                                    i32 3, i32 3, i32 3, i32 3,
@@ -389,9 +389,9 @@ entry:
 %shifttypec2i64 = type <2 x i64>
 define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
 entry:
-  ; SSE2: shift2i64c
+  ; SSE2-LABEL: shift2i64c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN-LABEL: shift2i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec2i64 %a , <i64 3, i64 3>
@@ -401,9 +401,9 @@ entry:
 %shifttypec4i64 = type <4 x i64>
 define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
 entry:
-  ; SSE2: shift4i64c
+  ; SSE2-LABEL: shift4i64c
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN-LABEL: shift4i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
@@ -413,9 +413,9 @@ entry:
 %shifttypec8i64 = type <8 x i64>
 define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
 entry:
-  ; SSE2: shift8i64c
+  ; SSE2-LABEL: shift8i64c
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN-LABEL: shift8i64c
   ; SSE2-CODEGEN: psllq $3
 
  %0 = shl %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -426,9 +426,9 @@ entry:
 %shifttypec16i64 = type <16 x i64>
 define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
 entry:
-  ; SSE2: shift16i64c
+  ; SSE2-LABEL: shift16i64c
   ; SSE2: cost of 8 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN-LABEL: shift16i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
@@ -441,9 +441,9 @@ entry:
 %shifttypec32i64 = type <32 x i64>
 define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
 entry:
-  ; SSE2: shift32i64c
+  ; SSE2-LABEL: shift32i64c
   ; SSE2: cost of 16 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN-LABEL: shift32i64c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
@@ -460,9 +460,9 @@ entry:
 %shifttypec2i8 = type <2 x i8>
 define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
 entry:
-  ; SSE2: shift2i8c
+  ; SSE2-LABEL: shift2i8c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN-LABEL: shift2i8c
   ; SSE2-CODEGEN: psllq $3
 
   %0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
@@ -472,9 +472,9 @@ entry:
 %shifttypec4i8 = type <4 x i8>
 define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
 entry:
-  ; SSE2: shift4i8c
+  ; SSE2-LABEL: shift4i8c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN-LABEL: shift4i8c
   ; SSE2-CODEGEN: pslld $3
 
   %0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
@@ -484,9 +484,9 @@ entry:
 %shifttypec8i8 = type <8 x i8>
 define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
 entry:
-  ; SSE2: shift8i8c
+  ; SSE2-LABEL: shift8i8c
   ; SSE2: cost of 1 {{.*}} shl
-  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN-LABEL: shift8i8c
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -497,9 +497,9 @@ entry:
 %shifttypec16i8 = type <16 x i8>
 define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
 entry:
-  ; SSE2: shift16i8c
+  ; SSE2-LABEL: shift16i8c
   ; SSE2: cost of 2 {{.*}} shl
-  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN-LABEL: shift16i8c
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
@@ -512,9 +512,9 @@ entry:
 %shifttypec32i8 = type <32 x i8>
 define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
 entry:
-  ; SSE2: shift32i8c
+  ; SSE2-LABEL: shift32i8c
   ; SSE2: cost of 4 {{.*}} shl
-  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN-LABEL: shift32i8c
   ; SSE2-CODEGEN: psllw $3
 
   %0 = shl %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll
index 9b8bd082923f4329a186873a3a8b001ce374751b..c76ac14a685aeb592bd2da913d6b08df42bb368e 100644
--- a/test/Analysis/CostModel/X86/uitofp.ll
+++ b/test/Analysis/CostModel/X86/uitofp.ll
@@ -13,7 +13,7 @@
 define i32 @uitofp_i8_double() {
 ; SSE-LABEL: 'uitofp_i8_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i8_f64 = uitofp i8 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -49,7 +49,7 @@ define i32 @uitofp_i8_double() {
 define i32 @uitofp_i16_double() {
 ; SSE-LABEL: 'uitofp_i16_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i16_f64 = uitofp i16 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -85,7 +85,7 @@ define i32 @uitofp_i16_double() {
 define i32 @uitofp_i32_double() {
 ; SSE-LABEL: 'uitofp_i32_double'
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i32_f64 = uitofp i32 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
@@ -120,24 +120,24 @@ define i32 @uitofp_i32_double() {
 
 define i32 @uitofp_i64_double() {
 ; SSE-LABEL: 'uitofp_i64_double'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; SSE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; SSE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; SSE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'uitofp_i64_double'
-; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; AVX-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; AVX-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512F-LABEL: 'uitofp_i64_double'
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512DQ-LABEL: 'uitofp_i64_double'
@@ -148,10 +148,10 @@ define i32 @uitofp_i64_double() {
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; BTVER2-LABEL: 'uitofp_i64_double'
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; BTVER2-NEXT:  Cost Model: Found an estimated cost of 41 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %cvt_i64_f64 = uitofp i64 undef to double
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; BTVER2-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %cvt_i64_f64 = uitofp i64 undef to double
diff --git a/test/Analysis/CostModel/X86/vdiv-cost.ll b/test/Analysis/CostModel/X86/vdiv-cost.ll
index 7bb935cbcec338c5faffa57d16c101b14ed2c103..d87d21c487d840462d0fa157bc42f25c1ac0d60e 100644
--- a/test/Analysis/CostModel/X86/vdiv-cost.ll
+++ b/test/Analysis/CostModel/X86/vdiv-cost.ll
@@ -100,7 +100,7 @@ define <16 x i16> @test6(<16 x i16> %a) {
 
 define <16 x i8> @test7(<16 x i8> %a) {
 ; CHECK-LABEL: 'test7'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %div
 ;
   %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
@@ -162,18 +162,58 @@ define <8 x i32> @test9(<8 x i32> %a) {
 }
 
 define <8 x i32> @test10(<8 x i32> %a) {
-; CHECK-LABEL: 'test10'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+; SSE2-LABEL: 'test10'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; SSSE3-LABEL: 'test10'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; SSE42-LABEL: 'test10'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; AVX1-LABEL: 'test10'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; AVX2-LABEL: 'test10'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
+;
+; AVX512-LABEL: 'test10'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div
 ;
   %div = sdiv <8 x i32> %a, <i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <8 x i32> %div
 }
 
 define <16 x i32> @test11(<16 x i32> %a) {
-; CHECK-LABEL: 'test11'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+; SSE2-LABEL: 'test11'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; SSSE3-LABEL: 'test11'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 76 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; SSE42-LABEL: 'test11'
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; AVX1-LABEL: 'test11'
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; AVX2-LABEL: 'test11'
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
+;
+; AVX512-LABEL: 'test11'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %div
 ;
   %div = sdiv <16 x i32> %a, <i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7, i32 8, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
   ret <16 x i32> %div
diff --git a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
index 10f9c767904cdac47197c933dcc12151e1ad0ced..0d0fe65694c800503c88f990aa35a2ae7db997a9 100644
--- a/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ b/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -39,7 +39,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 ; CHECK-NEXT:      Group
 ; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
-; CHECK:         Variant Store to invariant address was not found in loop.
+; CHECK:         Multiple stores to invariant address were not found in loop.
 ; CHECK-NEXT:    SCEV assumptions:
 ; CHECK-NEXT:    {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:    {0,+,1}<%for.body> Added Flags: <nusw>
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
index ad9b1295a6dfa6feb671ee9d0334c6fce19f39b0..f24211d1e0dfe1aae30949aa33c9fff51c8e9887 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check1.ll
@@ -1,26 +1,27 @@
 ; RUN: opt < %s -loop-accesses -analyze | FileCheck -check-prefix=OLDPM %s
 ; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck -check-prefix=NEWPM %s
 
-; Test to confirm LAA will find store to invariant address.
-; Inner loop has a store to invariant address.
+; Test to confirm LAA will find multiple stores to an invariant address in the
+; inner loop.
 ;
 ;  for(; i < itr; i++) {
 ;    for(; j < itr; j++) {
 ;      var1[i] = var2[j] + var1[i];
+;      var1[i]++;
 ;    }
 ;  }
 
 ; The LAA with the new PM is a loop pass so we go from inner to outer loops.
 
 ; OLDPM: for.cond1.preheader:
-; OLDPM:   Variant Store to invariant address was not found in loop.
+; OLDPM:   Multiple stores to invariant address were not found in loop.
 ; OLDPM: for.body3:
-; OLDPM:   Variant Store to invariant address was found in loop.
+; OLDPM:   Multiple stores to invariant address were found in loop.
 
 ; NEWPM: for.body3:
-; NEWPM:   Variant Store to invariant address was found in loop.
+; NEWPM:   Multiple stores to invariant address were found in loop.
 ; NEWPM: for.cond1.preheader:
-; NEWPM:   Variant Store to invariant address was not found in loop.
+; NEWPM:   Multiple stores to invariant address were not found in loop.
 
 define i32 @foo(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
 entry:
@@ -45,6 +46,9 @@ for.body3:                                        ; preds = %for.body3, %for.bod
   %2 = load i32, i32* %arrayidx5, align 4
   %add = add nsw i32 %2, %1
   store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  store i32 %4, i32* %arrayidx5, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %itr
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
index e40c9e733cde9d8e89540614956b565d048ffef6..07bcdcc5c669cf224334fdeec852e031ba36e7b2 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check2.ll
@@ -10,8 +10,8 @@
 ;    }
 ;  }
 
-; CHECK: Variant Store to invariant address was not found in loop.
-; CHECK-NOT: Variant Store to invariant address was found in loop.
+; CHECK: Multiple stores to invariant address were not found in loop.
+; CHECK-NOT: Multiple stores to invariant address were found in loop.
 
 
 define i32 @foo(i32* nocapture readonly %var1, i32* nocapture %var2, i32 %itr) #0 {
diff --git a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
index eaadcfecaa3149d0cd48ee874abfbc047b4c626d..8d7452471f5c9d93e1a4d0117da8d2bba8d7c9db 100644
--- a/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
+++ b/test/Analysis/LoopAccessAnalysis/store-to-invariant-check3.ll
@@ -1,8 +1,8 @@
 ; RUN: opt < %s -loop-accesses -analyze | FileCheck %s
 ; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck %s
 
-; Test to confirm LAA will find store to invariant address.
-; Inner loop has a store to invariant address.
+; Inner loop has a store to invariant address, but LAA does not need to identify
+; the store to invariant address, since it is a single store.
 ;
 ;  for(; i < itr; i++) {
 ;    for(; j < itr; j++) {
@@ -10,7 +10,7 @@
 ;    }
 ;  }
 
-; CHECK: Variant Store to invariant address was found in loop.
+; CHECK: Multiple stores to invariant address were not found in loop.
 
 define void @foo(i32* nocapture %var1, i32* nocapture %var2, i32 %itr) #0 {
 entry:
diff --git a/test/Analysis/ProfileSummary/basic.ll b/test/Analysis/ProfileSummary/basic.ll
index e417e459f04ca04b052d672c18a84247f2eec7a1..966a1117c47d149c48ca99910011792ae9b0e05b 100644
--- a/test/Analysis/ProfileSummary/basic.ll
+++ b/test/Analysis/ProfileSummary/basic.ll
@@ -1,19 +1,31 @@
 ; RUN: opt < %s -disable-output -passes=print-profile-summary -S 2>&1 | FileCheck %s
+; RUN: opt < %s -disable-output -profile-summary-hot-count=500 -passes=print-profile-summary -S 2>&1 | FileCheck %s -check-prefixes=OVERRIDE-HOT
+; RUN: opt < %s -disable-output -profile-summary-cold-count=0 -passes=print-profile-summary -S 2>&1 | FileCheck %s -check-prefixes=OVERRIDE-COLD
+; RUN: opt < %s -disable-output -profile-summary-cold-count=200 -profile-summary-hot-count=1000 -passes=print-profile-summary -S 2>&1 | FileCheck %s -check-prefixes=OVERRIDE-BOTH
 
 define void @f1() !prof !20 {
 ; CHECK-LABEL: f1 :hot
+; OVERRIDE-HOT-LABEL: f1
+; OVERRIDE-COLD-LABEL: f1 :hot
+; OVERRIDE-BOTH-LABEL: f1
 
   ret void
 }
 
 define void @f2() !prof !21 {
 ; CHECK-LABEL: f2 :cold
+; OVERRIDE-HOT-LABEL: f2 :cold
+; OVERRIDE-COLD-LABEL: f2
+; OVERRIDE-BOTH-LABEL: f2
 
   ret void
 }
 
 define void @f3() !prof !22 {
 ; CHECK-LABEL: f3
+; OVERRIDE-HOT-LABEL: f3
+; OVERRIDE-COLD-LABEL: f3
+; OVERRIDE-BOTH-LABEL: f3
 
   ret void
 }
diff --git a/test/Analysis/ScalarEvolution/binomial-explision.ll b/test/Analysis/ScalarEvolution/binomial-explision.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ff27bfcbd764e1fc9e817c9503d322f25dbe11a7
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/binomial-explision.ll
@@ -0,0 +1,47 @@
+; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; Check that we don't have unreasonably huge SCEVs and in particular only a
+; reasonable amount of AddRecs in the notation of %tmp19. If we "simplify" SCEVs
+; too aggressively, we may end up with huge nested expressions.
+define void @test(i32 %x, i64 %y, i1 %cond) {
+
+; CHECK: %tmp19 = mul i32 %tmp17, %tmp18
+; CHECK: ((((((
+; CHECK-NOT: (((((
+; CHECK: %tmp20 = add i32 %tmp19, %x
+
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb3, %bb
+  %tmp = phi i64 [ %y, %bb ], [ %tmp22, %bb3 ]
+  %tmp2 = phi i32 [ %x, %bb ], [ %tmp4, %bb3 ]
+  br label %bb5
+
+bb3:                                              ; preds = %bb5
+  %tmp4 = add i32 %tmp2, %x
+  br label %bb1
+
+bb5:                                              ; preds = %bb5, %bb1
+  %tmp6 = phi i32 [ %tmp23, %bb5 ], [ %tmp2, %bb1 ]
+  %tmp7 = sub i32 -119, %tmp6
+  %tmp8 = mul i32 %tmp7, %x
+  %tmp9 = sub i32 -120, %tmp6
+  %tmp10 = mul i32 %tmp8, %tmp9
+  %tmp11 = mul i32 %x, %tmp10
+  %tmp12 = sub i32 -121, %tmp6
+  %tmp13 = mul i32 %tmp10, %tmp12
+  %tmp14 = mul i32 %tmp11, %tmp13
+  %tmp15 = sub i32 -122, %tmp6
+  %tmp16 = mul i32 %tmp13, %tmp15
+  %tmp17 = mul i32 %tmp14, %tmp16
+  %tmp18 = mul i32 %tmp16, %x
+  %tmp19 = mul i32 %tmp17, %tmp18
+  %tmp20 = add i32 %tmp19, %x
+  %tmp21 = sext i32 %tmp20 to i64
+  %tmp22 = add i64 %y, %tmp21
+  %tmp23 = add i32 %tmp6, 7
+  br i1 %cond, label %bb5, label %bb3
+}
diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index 664b1bd271eb585339a4bbcaa0e4a622e13ec4e7..edff26e6d685f98a9af7671898428db56c6be760 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll
@@ -1,5 +1,5 @@
-; RUN: llvm-as < %s | llvm-dis | FileCheck %s
-; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as < %s | llvm-dis | FileCheck -strict-whitespace %s
+; RUN: opt -S < %s | FileCheck -strict-whitespace %s
 ; RUN: verify-uselistorder %s
 
 @addr   = external global i64
@@ -11,67 +11,67 @@ declare float @foo(float)
 
 define float @none(float %x, float %y) {
 entry:
-; CHECK:  %vec = load  <3 x float>,  <3 x float>* @vec
-  %vec    = load  <3 x float>,  <3 x float>* @vec
+; CHECK:  %vec = load <3 x float>, <3 x float>* @vec
+  %vec    = load <3 x float>, <3 x float>* @vec
 ; CHECK:  %select = load i1, i1* @select
   %select = load i1, i1* @select
-; CHECK:  %arr    = load [3 x float], [3 x float]* @arr
+; CHECK:  %arr = load [3 x float], [3 x float]* @arr
   %arr    = load [3 x float], [3 x float]* @arr
 
-; CHECK:  %a = fadd  float %x, %y
-  %a = fadd  float %x, %y
-; CHECK:  %a_vec = fadd  <3 x float> %vec, %vec
-  %a_vec = fadd  <3 x float> %vec, %vec
-; CHECK:  %b = fsub  float %x, %y
-  %b = fsub  float %x, %y
-; CHECK:  %b_vec = fsub  <3 x float> %vec, %vec
-  %b_vec = fsub  <3 x float> %vec, %vec
-; CHECK:  %c = fmul  float %x, %y
-  %c = fmul  float %x, %y
-; CHECK:  %c_vec = fmul  <3 x float> %vec, %vec
-  %c_vec = fmul  <3 x float> %vec, %vec
-; CHECK:  %d = fdiv  float %x, %y
-  %d = fdiv  float %x, %y
-; CHECK:  %d_vec = fdiv  <3 x float> %vec, %vec
-  %d_vec = fdiv  <3 x float> %vec, %vec
-; CHECK:  %e = frem  float %x, %y
-  %e = frem  float %x, %y
-; CHECK:  %e_vec = frem  <3 x float> %vec, %vec
-  %e_vec = frem  <3 x float> %vec, %vec
-; CHECK:  ret  float %e
-  ret  float %e
+; CHECK:  %a = fadd float %x, %y
+  %a = fadd float %x, %y
+; CHECK:  %a_vec = fadd <3 x float> %vec, %vec
+  %a_vec = fadd <3 x float> %vec, %vec
+; CHECK:  %b = fsub float %x, %y
+  %b = fsub float %x, %y
+; CHECK:  %b_vec = fsub <3 x float> %vec, %vec
+  %b_vec = fsub <3 x float> %vec, %vec
+; CHECK:  %c = fmul float %x, %y
+  %c = fmul float %x, %y
+; CHECK:  %c_vec = fmul <3 x float> %vec, %vec
+  %c_vec = fmul <3 x float> %vec, %vec
+; CHECK:  %d = fdiv float %x, %y
+  %d = fdiv float %x, %y
+; CHECK:  %d_vec = fdiv <3 x float> %vec, %vec
+  %d_vec = fdiv <3 x float> %vec, %vec
+; CHECK:  %e = frem float %x, %y
+  %e = frem float %x, %y
+; CHECK:  %e_vec = frem <3 x float> %vec, %vec
+  %e_vec = frem <3 x float> %vec, %vec
+; CHECK:  ret float %e
+  ret float %e
 }
 
 ; CHECK: no_nan
 define float @no_nan(float %x, float %y) {
 entry:
 ; CHECK:  %vec = load <3 x float>, <3 x float>* @vec
-  %vec    = load  <3 x float>,  <3 x float>* @vec
+  %vec    = load <3 x float>, <3 x float>* @vec
 ; CHECK:  %select = load i1, i1* @select
   %select = load i1, i1* @select
-; CHECK:  %arr = load  [3 x float],  [3 x float]* @arr
-  %arr    = load  [3 x float],  [3 x float]* @arr
+; CHECK:  %arr = load [3 x float], [3 x float]* @arr
+  %arr    = load [3 x float], [3 x float]* @arr
 
-; CHECK:  %a = fadd nnan  float %x, %y
-  %a = fadd nnan  float %x, %y
-; CHECK:  %a_vec = fadd nnan  <3 x float> %vec, %vec
-  %a_vec = fadd nnan  <3 x float> %vec, %vec
-; CHECK:  %b = fsub nnan  float %x, %y
-  %b = fsub nnan  float %x, %y
-; CHECK:  %b_vec = fsub nnan  <3 x float> %vec, %vec
-  %b_vec = fsub nnan  <3 x float> %vec, %vec
-; CHECK:  %c = fmul nnan  float %x, %y
-  %c = fmul nnan  float %x, %y
-; CHECK:  %c_vec = fmul nnan  <3 x float> %vec, %vec
+; CHECK:  %a = fadd nnan float %x, %y
+  %a = fadd nnan float %x, %y
+; CHECK:  %a_vec = fadd nnan <3 x float> %vec, %vec
+  %a_vec = fadd nnan <3 x float> %vec, %vec
+; CHECK:  %b = fsub nnan float %x, %y
+  %b = fsub nnan float %x, %y
+; CHECK:  %b_vec = fsub nnan <3 x float> %vec, %vec
+  %b_vec = fsub nnan <3 x float> %vec, %vec
+; CHECK:  %c = fmul nnan float %x, %y
+  %c = fmul nnan float %x, %y
+; CHECK:  %c_vec = fmul nnan <3 x float> %vec, %vec
   %c_vec = fmul nnan <3 x float> %vec, %vec
-; CHECK:  %d = fdiv nnan  float %x, %y
+; CHECK:  %d = fdiv nnan float %x, %y
   %d = fdiv nnan float %x, %y
-; CHECK:  %d_vec = fdiv nnan  <3 x float> %vec, %vec
+; CHECK:  %d_vec = fdiv nnan <3 x float> %vec, %vec
   %d_vec = fdiv nnan <3 x float> %vec, %vec
-; CHECK:  %e = frem nnan  float %x, %y
-  %e = frem nnan  float %x, %y
-; CHECK:  %e_vec = frem nnan  <3 x float> %vec, %vec
-  %e_vec = frem nnan  <3 x float> %vec, %vec
+; CHECK:  %e = frem nnan float %x, %y
+  %e = frem nnan float %x, %y
+; CHECK:  %e_vec = frem nnan <3 x float> %vec, %vec
+  %e_vec = frem nnan <3 x float> %vec, %vec
 ; CHECK:  ret float %e
   ret float %e
 }
@@ -120,28 +120,28 @@ entry:
 ; CHECK:  %arr = load [3 x float], [3 x float]* @arr
   %arr    = load [3 x float], [3 x float]* @arr
 
-; CHECK:  %a = fadd nnan ninf  float %x, %y
-  %a = fadd ninf nnan  float %x, %y
-; CHECK:  %a_vec = fadd nnan  <3 x float> %vec, %vec
-  %a_vec = fadd nnan  <3 x float> %vec, %vec
-; CHECK:  %b = fsub nnan  float %x, %y
-  %b = fsub nnan  float %x, %y
-; CHECK:  %b_vec = fsub nnan ninf  <3 x float> %vec, %vec
-  %b_vec = fsub ninf nnan  <3 x float> %vec, %vec
-; CHECK:  %c = fmul nnan  float %x, %y
-  %c = fmul nnan  float %x, %y
-; CHECK:  %c_vec = fmul nnan  <3 x float> %vec, %vec
+; CHECK:  %a = fadd nnan ninf float %x, %y
+  %a = fadd ninf nnan float %x, %y
+; CHECK:  %a_vec = fadd nnan <3 x float> %vec, %vec
+  %a_vec = fadd nnan <3 x float> %vec, %vec
+; CHECK:  %b = fsub nnan float %x, %y
+  %b = fsub nnan float %x, %y
+; CHECK:  %b_vec = fsub nnan ninf <3 x float> %vec, %vec
+  %b_vec = fsub ninf nnan <3 x float> %vec, %vec
+; CHECK:  %c = fmul nnan float %x, %y
+  %c = fmul nnan float %x, %y
+; CHECK:  %c_vec = fmul nnan <3 x float> %vec, %vec
   %c_vec = fmul nnan <3 x float> %vec, %vec
-; CHECK:  %d = fdiv nnan ninf  float %x, %y
+; CHECK:  %d = fdiv nnan ninf float %x, %y
   %d = fdiv ninf nnan float %x, %y
-; CHECK:  %d_vec = fdiv nnan  <3 x float> %vec, %vec
+; CHECK:  %d_vec = fdiv nnan <3 x float> %vec, %vec
   %d_vec = fdiv nnan <3 x float> %vec, %vec
-; CHECK:  %e = frem nnan  float %x, %y
-  %e = frem nnan  float %x, %y
-; CHECK:  %e_vec = frem nnan ninf  <3 x float> %vec, %vec
-  %e_vec = frem ninf nnan  <3 x float> %vec, %vec
-; CHECK:  ret  float %e
-  ret  float %e
+; CHECK:  %e = frem nnan float %x, %y
+  %e = frem nnan float %x, %y
+; CHECK:  %e_vec = frem nnan ninf <3 x float> %vec, %vec
+  %e_vec = frem ninf nnan <3 x float> %vec, %vec
+; CHECK:  ret float %e
+  ret float %e
 }
 
 ; CHECK: mixed_flags
@@ -151,7 +151,7 @@ entry:
   %vec    = load <3 x float>, <3 x float>* @vec
 ; CHECK:  %select = load i1, i1* @select
   %select = load i1, i1* @select
-; CHECK:  %arr    = load [3 x float], [3 x float]* @arr
+; CHECK:  %arr = load [3 x float], [3 x float]* @arr
   %arr    = load [3 x float], [3 x float]* @arr
 
 ; CHECK:  %a = fadd nnan ninf afn float %x, %y
@@ -174,6 +174,6 @@ entry:
   %e = frem nnan nsz float %x, %y
 ; CHECK:  %e_vec = frem nnan <3 x float> %vec, %vec
   %e_vec = frem nnan <3 x float> %vec, %vec
-; CHECK:  ret  float %e
-  ret  float %e
+; CHECK:  ret float %e
+  ret float %e
 }
diff --git a/test/Assembler/thinlto-summary.ll b/test/Assembler/thinlto-summary.ll
index 01bf3a8c810168ab1b968d9abac189e2544e9c4e..64af835ae2b101bde9c5ea62673670588060d3cc 100644
--- a/test/Assembler/thinlto-summary.ll
+++ b/test/Assembler/thinlto-summary.ll
@@ -81,8 +81,8 @@
 ; CHECK: ^13 = gv: (guid: 12, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0))))
 ; CHECK: ^14 = gv: (guid: 13, summaries: (variable: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1))))
 ; CHECK: ^15 = gv: (guid: 14, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 1, live: 1, dsoLocal: 0), insts: 1)))
-; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0))))
-; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1), calls: ((callee: ^15)))))
+; CHECK: ^16 = gv: (guid: 15, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 1, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 0))))
+; CHECK: ^17 = gv: (guid: 16, summaries: (function: (module: ^1, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 1, funcFlags: (readNone: 0, readOnly: 1, noRecurse: 0, returnDoesNotAlias: 1, noInline: 0), calls: ((callee: ^15)))))
 ; CHECK: ^18 = gv: (guid: 17, summaries: (alias: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 1), aliasee: ^14)))
 ; CHECK: ^19 = gv: (guid: 18, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 4, typeIdInfo: (typeTests: (^24, ^26)))))
 ; CHECK: ^20 = gv: (guid: 19, summaries: (function: (module: ^0, flags: (linkage: external, notEligibleToImport: 0, live: 0, dsoLocal: 0), insts: 8, typeIdInfo: (typeTestAssumeVCalls: (vFuncId: (^27, offset: 16))))))
diff --git a/test/Bindings/llvm-c/echo.ll b/test/Bindings/llvm-c/echo.ll
index 580293b3d04edbe24fc9337e690a24140a517244..118f822e432f4b6e25f1cef86283c78e5d69dfcb 100644
--- a/test/Bindings/llvm-c/echo.ll
+++ b/test/Bindings/llvm-c/echo.ll
@@ -170,6 +170,22 @@ define void @with_debuginfo() !dbg !4 {
   ret void, !dbg !7
 }
 
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
+
+define void @test_intrinsics() {
+entry:
+  %sp = call i8* @llvm.stacksave()
+  %x = alloca i32
+  %0 = bitcast i32* %x to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0)
+  call void @llvm.stackrestore(i8* %sp)
+  ret void
+}
+
 !llvm.dbg.cu = !{!0, !2}
 !llvm.module.flags = !{!3}
 
diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll
index 5922a8b3c4d0b1f00d7632c9db2b9c9ab2cd8a63..7f59eeabd9c42e97d9ddddaa7634e7c10b54b063 100644
--- a/test/Bitcode/thinlto-function-summary.ll
+++ b/test/Bitcode/thinlto-function-summary.ll
@@ -20,7 +20,7 @@
 ; BC-NEXT: <PERMODULE {{.*}} op0=1 op1=0
 ; BC-NEXT: <PERMODULE {{.*}} op0=2 op1=0
 ; BC-NEXT: <PERMODULE {{.*}} op0=3 op1=7
-; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=16
+; BC-NEXT: <PERMODULE {{.*}} op0=4 op1=0 op2=1 op3=16
 ; BC-NEXT: <ALIAS {{.*}} op0=5 op1=0 op2=3
 ; BC-NEXT: </GLOBALVAL_SUMMARY_BLOCK
 ; BC: <STRTAB_BLOCK
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index e1defd719584dd8687920066ae11e2b86f614e4f..da3aa3c1009037480081dc85923b64c9e2352c9e 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -141,7 +141,7 @@ define fp128 @test_quad_dump() {
   ret fp128 0xL00000000000000004000000000000000
 }
 
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(p0) = G_EXTRACT_VECTOR_ELT %0:_(<2 x p0>), %3:_(s32) (in function: vector_of_pointers_extractelement)
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %2:_(p0) = G_EXTRACT_VECTOR_ELT %0:_(<2 x p0>), %3:_(s64) (in function: vector_of_pointers_extractelement)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_extractelement
 ; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_extractelement:
 @var = global <2 x i16*> zeroinitializer
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index a021eeda353be9a8d02de7f903a4a033d9308b21..2997c5350ebc1cce4dfee1d22f3685d8dc1184a6 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1530,12 +1530,23 @@ define i32 @test_extractelement(<2 x i32> %vec, i32 %idx) {
 ; CHECK-LABEL: name: test_extractelement
 ; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = COPY $d0
 ; CHECK: [[IDX:%[0-9]+]]:_(s32) = COPY $w0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDX]](s32)
+; CHECK: [[IDXEXT:%[0-9]+]]:_(s64) = G_SEXT [[IDX]]
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDXEXT]](s64)
 ; CHECK: $w0 = COPY [[RES]](s32)
   %res = extractelement <2 x i32> %vec, i32 %idx
   ret i32 %res
 }
 
+define i32 @test_extractelement_const_idx(<2 x i32> %vec) {
+; CHECK-LABEL: name: test_extractelement
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = COPY $d0
+; CHECK: [[IDX:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDX]](s64)
+; CHECK: $w0 = COPY [[RES]](s32)
+  %res = extractelement <2 x i32> %vec, i32 1
+  ret i32 %res
+}
+
 define i32 @test_singleelementvector(i32 %elt){
 ; CHECK-LABEL: name: test_singleelementvector
 ; CHECK: [[ELT:%[0-9]+]]:_(s32) = COPY $w0
diff --git a/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll b/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll
index e603af678de672bf3d4c30539faa3247a0394e9a..caf0a2eebca5755b410c6f0995b52ae7cfed67ef 100644
--- a/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll
+++ b/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll
@@ -18,7 +18,7 @@ target triple = "aarch64-unknown-linux-gnu"
 %struct.NTCopy = type { i32 }
 
 ; CHECK-LABEL: name: _Z3foo6NTCopy
-; CHECK: DBG_VALUE debug-use %{{[0-9]+}}(p0), 0, !23, !DIExpression(), debug-location !24
+; CHECK: DBG_VALUE %{{[0-9]+}}(p0), 0, !23, !DIExpression(), debug-location !24
 ; Function Attrs: noinline nounwind optnone
 define dso_local i32 @_Z3foo6NTCopy(%struct.NTCopy* %o) #0 !dbg !7 {
 entry:
diff --git a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
index 256eb37f6d419b04465aec88c81038a49b07d1d5..2945d65d3e27b62d4808c4f2420aa941baee29fa 100644
--- a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
+++ b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
@@ -15,7 +15,7 @@ entry:
 }
 
 ; CHECK-LABEL: name: debug_declare_vla
-; CHECK: DBG_VALUE debug-use %{{[0-9]+}}(p0), 0, !14, !DIExpression(), debug-location !15
+; CHECK: DBG_VALUE %{{[0-9]+}}(p0), 0, !14, !DIExpression(), debug-location !15
 define void @debug_declare_vla(i32 %in) #0 !dbg !13 {
 entry:
   %vla.addr = alloca i32, i32 %in
@@ -27,10 +27,10 @@ entry:
 ; CHECK: [[IN:%[0-9]+]]:_(s32) = COPY $w0
 define void @debug_value(i32 %in) #0 !dbg !16 {
   %addr = alloca i32
-; CHECK: DBG_VALUE debug-use [[IN]](s32), debug-use $noreg, !17, !DIExpression(), debug-location !18
+; CHECK: DBG_VALUE [[IN]](s32), $noreg, !17, !DIExpression(), debug-location !18
   call void @llvm.dbg.value(metadata i32 %in, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
   store i32 %in, i32* %addr
-; CHECK: DBG_VALUE debug-use %1(p0), debug-use $noreg, !17, !DIExpression(DW_OP_deref), debug-location !18
+; CHECK: DBG_VALUE %1(p0), $noreg, !17, !DIExpression(DW_OP_deref), debug-location !18
   call void @llvm.dbg.value(metadata i32* %addr, i64 0, metadata !17, metadata !DIExpression(DW_OP_deref)), !dbg !18
 ; CHECK: DBG_VALUE 123, 0, !17, !DIExpression(), debug-location !18
   call void @llvm.dbg.value(metadata i32 123, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..63c5eecd8b9a758c60af0385b94a276c76a5bfdc
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll
@@ -0,0 +1,53 @@
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -debug-only=irtranslator \
+; RUN:     -stop-after=irtranslator %s -o - 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK: Checking DILocation from   %retval = alloca i32, align 4 was copied to G_FRAME_INDEX
+; CHECK: Checking DILocation from   %rv = alloca i32, align 4 was copied to G_FRAME_INDEX
+; CHECK: Checking DILocation from   store i32 0, i32* %retval, align 4 was copied to G_CONSTANT
+; CHECK: Checking DILocation from   store i32 0, i32* %retval, align 4 was copied to G_STORE
+; CHECK: Checking DILocation from   store i32 0, i32* %rv, align 4, !dbg !12 was copied to G_STORE debug-location !12; t.cpp:2:5
+; CHECK: Checking DILocation from   %0 = load i32, i32* %rv, align 4, !dbg !13 was copied to G_LOAD debug-location !13; t.cpp:3:8
+; CHECK: Checking DILocation from   ret i32 %0, !dbg !14 was copied to COPY debug-location !14; t.cpp:3:1
+; CHECK: Checking DILocation from   ret i32 %0, !dbg !14 was copied to RET_ReallyLR implicit $w0, debug-location !14; t.cpp:3:1
+
+source_filename = "t.cpp"
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-linux-gnu"
+
+; Function Attrs: noinline norecurse nounwind optnone
+define dso_local i32 @main() !dbg !7 {
+entry:
+  %retval = alloca i32, align 4
+  %rv = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  call void @llvm.dbg.declare(metadata i32* %rv, metadata !11, metadata !DIExpression()), !dbg !12
+  store i32 0, i32* %rv, align 4, !dbg !12
+  %0 = load i32, i32* %rv, align 4, !dbg !13
+  ret i32 %0, !dbg !14
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk) (llvm/trunk 344296)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "t.cpp", directory: "/Volumes/Data/llvm.org/svn/build")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk) (llvm/trunk 344296)"}
+!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !DILocalVariable(name: "rv", scope: !7, file: !1, line: 2, type: !10)
+!12 = !DILocation(line: 2, column: 5, scope: !7)
+!13 = !DILocation(line: 3, column: 8, scope: !7)
+!14 = !DILocation(line: 3, column: 1, scope: !7)
+
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index 4b2d54bcd0d3815a63cb1d9361ed88e437d0b94f..fe6079c0db407e686c273d2e737d75c69dbb33e4 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -1,37 +1,9 @@
-# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_add_big() {
-  entry:
-    ret void
-  }
-  define void @test_scalar_add_big_nonpow2() {
-  entry:
-    ret void
-  }
-  define void @test_scalar_add_small() {
-  entry:
-    ret void
-  }
-  define void @test_vector_add() {
-  entry:
-    ret void
-  }
-  define void @test_vector_add_nonpow2() {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_add_big
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_add_big
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -47,63 +19,48 @@ body: |
     %1:_(s64) = COPY $x1
     %2:_(s64) = COPY $x2
     %3:_(s64) = COPY $x3
-    %4:_(s128) = G_MERGE_VALUES %0, %1
-    %5:_(s128) = G_MERGE_VALUES %2, %3
+    %4:_(s128) = G_MERGE_VALUES %0(s64), %1(s64)
+    %5:_(s128) = G_MERGE_VALUES %2(s64), %3(s64)
     %6:_(s128) = G_ADD %4, %5
-    %7:_(s64), %8:_(s64) = G_UNMERGE_VALUES %6
-    $x0 = COPY %7
-    $x1 = COPY %8
-...
+    %7:_(s64), %8:_(s64) = G_UNMERGE_VALUES %6(s128)
+    $x0 = COPY %7(s64)
+    $x1 = COPY %8(s64)
 
+...
 ---
 name:            test_scalar_add_big_nonpow2
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_scalar_add_big_nonpow2
-    ; CHECK-NOT: G_MERGE_VALUES
-    ; CHECK-NOT: G_UNMERGE_VALUES
-    ; CHECK-DAG: [[CARRY0_32:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-    ; CHECK-DAG: [[CARRY0:%[0-9]+]]:_(s1) = G_TRUNC [[CARRY0_32]]
-    ; CHECK: [[RES_LO:%[0-9]+]]:_(s64), [[CARRY1:%[0-9]+]]:_(s1) = G_UADDE %0, %1, [[CARRY0]]
-    ; CHECK: [[RES_MI:%[0-9]+]]:_(s64), [[CARRY2:%[0-9]+]]:_(s1) = G_UADDE %1, %2, [[CARRY1]]
-    ; CHECK: [[RES_HI:%[0-9]+]]:_(s64), {{%.*}}(s1) = G_UADDE %2, %3, [[CARRY2]]
-    ; CHECK-NOT: G_MERGE_VALUES
-    ; CHECK-NOT: G_UNMERGE_VALUES
-    ; CHECK: $x0 = COPY [[RES_LO]]
-    ; CHECK: $x1 = COPY [[RES_MI]]
-    ; CHECK: $x2 = COPY [[RES_HI]]
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s32)
+    ; CHECK: [[UADDE:%[0-9]+]]:_(s64), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[COPY]], [[COPY1]], [[TRUNC]]
+    ; CHECK: [[UADDE2:%[0-9]+]]:_(s64), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[COPY1]], [[COPY2]], [[UADDE1]]
+    ; CHECK: [[UADDE4:%[0-9]+]]:_(s64), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[COPY2]], [[COPY3]], [[UADDE3]]
+    ; CHECK: $x0 = COPY [[UADDE]](s64)
+    ; CHECK: $x1 = COPY [[UADDE2]](s64)
+    ; CHECK: $x2 = COPY [[UADDE4]](s64)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %4:_(s192) = G_MERGE_VALUES %0(s64), %1(s64), %2(s64)
+    %5:_(s192) = G_MERGE_VALUES %1(s64), %2(s64), %3(s64)
+    %6:_(s192) = G_ADD %4, %5
+    %7:_(s64), %8:_(s64), %9:_(s64) = G_UNMERGE_VALUES %6(s192)
+    $x0 = COPY %7(s64)
+    $x1 = COPY %8(s64)
+    $x2 = COPY %9(s64)
 
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s64) = COPY $x2
-    %3(s64) = COPY $x3
-    %4(s192) = G_MERGE_VALUES %0, %1, %2
-    %5(s192) = G_MERGE_VALUES %1, %2, %3
-    %6(s192) = G_ADD %4, %5
-    %7(s64), %8(s64), %9(s64) = G_UNMERGE_VALUES %6
-    $x0 = COPY %7
-    $x1 = COPY %8
-    $x2 = COPY %9
 ...
-
 ---
 name:            test_scalar_add_small
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_add_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -114,19 +71,17 @@ body: |
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
-    %2:_(s8) = G_TRUNC %0
-    %3:_(s8) = G_TRUNC %1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
     %4:_(s8) = G_ADD %2, %3
-    %5:_(s64) = G_ANYEXT %4
-    $x0 = COPY %5
-...
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
 
+...
 ---
 name:            test_vector_add
-body: |
+body:             |
   bb.0.entry:
-    liveins: $q0, $q1, $q2, $q3
-
     ; CHECK-LABEL: name: test_vector_add
     ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
     ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
@@ -140,50 +95,39 @@ body: |
     %1:_(<2 x s64>) = COPY $q1
     %2:_(<2 x s64>) = COPY $q2
     %3:_(<2 x s64>) = COPY $q3
-    %4:_(<4 x s64>) = G_MERGE_VALUES %0, %1
-    %5:_(<4 x s64>) = G_MERGE_VALUES %2, %3
+    %4:_(<4 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>)
+    %5:_(<4 x s64>) = G_MERGE_VALUES %2(<2 x s64>), %3(<2 x s64>)
     %6:_(<4 x s64>) = G_ADD %4, %5
-    %7:_(<2 x s64>), %8:_(<2 x s64>) = G_UNMERGE_VALUES %6
-    $q0 = COPY %7
-    $q1 = COPY %8
+    %7:_(<2 x s64>), %8:_(<2 x s64>) = G_UNMERGE_VALUES %6(<4 x s64>)
+    $q0 = COPY %7(<2 x s64>)
+    $q1 = COPY %8(<2 x s64>)
+
 ...
 ---
 name:            test_vector_add_nonpow2
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $q0, $q1, $q2, $q3
     ; CHECK-LABEL: name: test_vector_add_nonpow2
-    ; CHECK-NOT: G_EXTRACT
-    ; CHECK-NOT: G_SEQUENCE
-    ; CHECK: [[RES_LO:%[0-9]+]]:_(<2 x s64>) = G_ADD %0, %1
-    ; CHECK: [[RES_MI:%[0-9]+]]:_(<2 x s64>) = G_ADD %1, %2
-    ; CHECK: [[RES_HI:%[0-9]+]]:_(<2 x s64>) = G_ADD %2, %3
-    ; CHECK-NOT: G_EXTRACT
-    ; CHECK-NOT: G_SEQUENCE
-    ; CHECK: $q0 = COPY [[RES_LO]]
-    ; CHECK: $q1 = COPY [[RES_MI]]
-    ; CHECK: $q2 = COPY [[RES_HI]]
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(<2 x s64>) = COPY $q3
+    ; CHECK: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY]], [[COPY1]]
+    ; CHECK: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY1]], [[COPY2]]
+    ; CHECK: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[COPY2]], [[COPY3]]
+    ; CHECK: $q0 = COPY [[ADD]](<2 x s64>)
+    ; CHECK: $q1 = COPY [[ADD1]](<2 x s64>)
+    ; CHECK: $q2 = COPY [[ADD2]](<2 x s64>)
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s64>) = COPY $q1
+    %2:_(<2 x s64>) = COPY $q2
+    %3:_(<2 x s64>) = COPY $q3
+    %4:_(<6 x s64>) = G_MERGE_VALUES %0(<2 x s64>), %1(<2 x s64>), %2(<2 x s64>)
+    %5:_(<6 x s64>) = G_MERGE_VALUES %1(<2 x s64>), %2(<2 x s64>), %3(<2 x s64>)
+    %6:_(<6 x s64>) = G_ADD %4, %5
+    %7:_(<2 x s64>), %8:_(<2 x s64>), %9:_(<2 x s64>) = G_UNMERGE_VALUES %6(<6 x s64>)
+    $q0 = COPY %7(<2 x s64>)
+    $q1 = COPY %8(<2 x s64>)
+    $q2 = COPY %9(<2 x s64>)
 
-    %0(<2 x s64>) = COPY $q0
-    %1(<2 x s64>) = COPY $q1
-    %2(<2 x s64>) = COPY $q2
-    %3(<2 x s64>) = COPY $q3
-    %4(<6 x s64>) = G_MERGE_VALUES %0, %1, %2
-    %5(<6 x s64>) = G_MERGE_VALUES %1, %2, %3
-    %6(<6 x s64>) = G_ADD %4, %5
-    %7(<2 x s64>), %8(<2 x s64>), %9(<2 x s64>) = G_UNMERGE_VALUES %6
-    $q0 = COPY %7
-    $q1 = COPY %8
-    $q2 = COPY %9
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
index fdcf79e55367b1dcdf3c529568ab6a6591fbf6e8..af683e302f45c128c9bae644990487c9cc602de3 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
@@ -1,29 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_and_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_and_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_and_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -34,13 +14,14 @@ body: |
     ; CHECK: $w0 = COPY [[COPY2]](s32)
     ; CHECK: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
     ; CHECK: $x0 = COPY [[COPY3]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_AND %2, %3
-    %6(s32) = G_ANYEXT %4
-    $w0 = COPY %6
-    %5(s64) = G_ANYEXT %2
-    $x0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_AND %2, %3
+    %6:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %6(s32)
+    %5:_(s64) = G_ANYEXT %2(s8)
+    $x0 = COPY %5(s64)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
index ca5646a1c7b2d2583c9885fa64c37207f1220e52..ef86df6a5c15188cbf0b59dce9fb8523086b26ea 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
@@ -1,36 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_icmp() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_icmp
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-  - { id: 10, class: _ }
-  - { id: 11, class: _ }
-  - { id: 12, class: _ }
-  - { id: 13, class: _ }
-  - { id: 14, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_icmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
@@ -50,22 +23,19 @@ body: |
     ; CHECK: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[INTTOPTR]](p0), [[INTTOPTR]]
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ICMP2]](s32)
     ; CHECK: $w0 = COPY [[COPY4]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x0
-
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-
-    %4(s1) = G_ICMP intpred(sge), %0, %1
-    %11(s32) = G_ANYEXT %4
-    $w0 = COPY %11
-
-    %8(s1) = G_ICMP intpred(ult), %2, %3
-    %12(s32) = G_ANYEXT %8
-    $w0 = COPY %12
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s1) = G_ICMP intpred(sge), %0(s64), %1
+    %11:_(s32) = G_ANYEXT %4(s1)
+    $w0 = COPY %11(s32)
+    %8:_(s1) = G_ICMP intpred(ult), %2(s8), %3
+    %12:_(s32) = G_ANYEXT %8(s1)
+    $w0 = COPY %12(s32)
+    %9:_(p0) = G_INTTOPTR %0(s64)
+    %10:_(s1) = G_ICMP intpred(eq), %9(p0), %9
+    %14:_(s32) = G_ANYEXT %10(s1)
+    $w0 = COPY %14(s32)
 
-    %9(p0) = G_INTTOPTR %0(s64)
-    %10(s1) = G_ICMP intpred(eq), %9(p0), %9(p0)
-    %14(s32) = G_ANYEXT %10
-    $w0 = COPY %14
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
index a21b83bb5ca47c6ed8f1d5f905162eb7038c2e10..4753e17ca1cc4eaf34da05e79df6eae2976dc2fb 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
@@ -1,27 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_div() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_div
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_div
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -45,19 +27,15 @@ body: |
     ; CHECK: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]]
     ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UDIV]](s32)
     ; CHECK: $w0 = COPY [[COPY3]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-
-
-    %4(s8) = G_SDIV %2, %3
-    %6:_(s32) = G_ANYEXT %4
-    $w0 = COPY %6
-
-
-    %5(s8) = G_UDIV %2, %3
-    %7:_(s32) = G_ANYEXT %5
-    $w0 = COPY %7
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_SDIV %2, %3
+    %6:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %6(s32)
+    %5:_(s8) = G_UDIV %2, %3
+    %7:_(s32) = G_ANYEXT %5(s8)
+    $w0 = COPY %7(s32)
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
index cf4f687408f3229990e14e1e5429486f0dac5407..b1be33cbeb5c24939a241e407ba36e9f776554aa 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
@@ -1,40 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_ext() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_ext
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-  - { id: 10, class: _ }
-  - { id: 11, class: _ }
-  - { id: 12, class: _ }
-  - { id: 13, class: _ }
-  - { id: 14, class: _ }
-  - { id: 15, class: _ }
-  - { id: 16, class: _ }
-  - { id: 17, class: _ }
-  - { id: 18, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_ext
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
@@ -82,9 +51,9 @@ body: |
     ; CHECK: $w0 = COPY [[ASHR2]](s32)
     ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; CHECK: [[TRUNC10:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
-    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[TRUNC3]]4(s32)
-    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC3]]1, [[TRUNC3]]2
-    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[TRUNC3]]3(s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C6]](s32)
+    ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[TRUNC10]], [[COPY5]]
+    ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND3]](s32)
     ; CHECK: $w0 = COPY [[COPY6]](s32)
     ; CHECK: [[TRUNC11:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
     ; CHECK: $w0 = COPY [[TRUNC11]](s32)
@@ -92,52 +61,60 @@ body: |
     ; CHECK: $w0 = COPY [[TRUNC12]](s32)
     ; CHECK: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[TRUNC12]](s32)
     ; CHECK: $x0 = COPY [[FPEXT]](s64)
-    %0(s64) = COPY $x0
-
-    %1(s1) = G_TRUNC %0
-    %19:_(s32) = G_ANYEXT %1
-    $w0 = COPY %19
-    %2(s8) = G_TRUNC %0
-    %20:_(s32) = G_ANYEXT %2
-    $w0 = COPY %20
-    %3(s16) = G_TRUNC %0
-    %21:_(s32) = G_ANYEXT %3
-    $w0 = COPY %21
-    %4(s32) = G_TRUNC %0
-    $w0 = COPY %4
-
-    %5(s64) = G_ANYEXT %1
-    $x0 = COPY %5
-    %6(s64) = G_ZEXT %2
-    $x0 = COPY %6
-    %7(s64) = G_ANYEXT %3
-    $x0 = COPY %7
-    %8(s64) = G_SEXT %4
-    $x0 = COPY %8
-
-    %9(s32) = G_SEXT %1
-    $w0 = COPY %9
-    %10(s32) = G_ZEXT %2
-    $w0 = COPY %10
-    %11(s32) = G_ANYEXT %3
-    $w0 = COPY %11
-
-    %12(s32) = G_ZEXT %1
-    $w0 = COPY %12
-    %13(s32) = G_ANYEXT %2
-    $w0 = COPY %13
-    %14(s32) = G_SEXT %3
-    $w0 = COPY %14
-
-    %15(s8) = G_ZEXT %1
-    %22:_(s32) = G_ANYEXT %15
-    $w0 = COPY %22
-    %16(s16) = G_ANYEXT %2
-    %23:_(s32) = G_ANYEXT %16
-    $w0 = COPY %23
+    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: $w0 = COPY [[C7]](s32)
+    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: $w0 = COPY [[C8]](s32)
+    ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK: $w0 = COPY [[DEF]](s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %19:_(s32) = G_ANYEXT %1(s1)
+    $w0 = COPY %19(s32)
+    %2:_(s8) = G_TRUNC %0(s64)
+    %20:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %20(s32)
+    %3:_(s16) = G_TRUNC %0(s64)
+    %21:_(s32) = G_ANYEXT %3(s16)
+    $w0 = COPY %21(s32)
+    %4:_(s32) = G_TRUNC %0(s64)
+    $w0 = COPY %4(s32)
+    %5:_(s64) = G_ANYEXT %1(s1)
+    $x0 = COPY %5(s64)
+    %6:_(s64) = G_ZEXT %2(s8)
+    $x0 = COPY %6(s64)
+    %7:_(s64) = G_ANYEXT %3(s16)
+    $x0 = COPY %7(s64)
+    %8:_(s64) = G_SEXT %4(s32)
+    $x0 = COPY %8(s64)
+    %9:_(s32) = G_SEXT %1(s1)
+    $w0 = COPY %9(s32)
+    %10:_(s32) = G_ZEXT %2(s8)
+    $w0 = COPY %10(s32)
+    %11:_(s32) = G_ANYEXT %3(s16)
+    $w0 = COPY %11(s32)
+    %12:_(s32) = G_ZEXT %1(s1)
+    $w0 = COPY %12(s32)
+    %13:_(s32) = G_ANYEXT %2(s8)
+    $w0 = COPY %13(s32)
+    %14:_(s32) = G_SEXT %3(s16)
+    $w0 = COPY %14(s32)
+    %15:_(s8) = G_ZEXT %1(s1)
+    %22:_(s32) = G_ANYEXT %15(s8)
+    $w0 = COPY %22(s32)
+    %16:_(s16) = G_ANYEXT %2(s8)
+    %23:_(s32) = G_ANYEXT %16(s16)
+    $w0 = COPY %23(s32)
+    %17:_(s32) = G_TRUNC %0(s64)
+    $w0 = COPY %17(s32)
+    %18:_(s64) = G_FPEXT %17(s32)
+    $x0 = COPY %18(s64)
+    %24:_(s16) = G_IMPLICIT_DEF
+    %25:_(s32) = G_ZEXT %24(s16)
+    $w0 = COPY %25(s32)
+    %26:_(s32) = G_SEXT %24(s16)
+    $w0 = COPY %26(s32)
+    %27:_(s32) = G_ANYEXT %24(s16)
+    $w0 = COPY %27(s32)
 
-    %17(s32) = G_TRUNC  %0
-    $w0 = COPY %17
-    %18(s64) = G_FPEXT %17
-    $x0 = COPY %18
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir b/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir
index 816484108d25c70945f370d91a2e5994982b044e..a26704497c3539b9b9a2783abcb1cba264b343e7 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-extload.mir
@@ -1,24 +1,15 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_extload(i8* %addr) {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 ---
 name:            test_extload
 body: |
   bb.0.entry:
     liveins: $x0
     ; CHECK-LABEL: name: test_extload
-    ; CHECK: [[T0:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK: [[T1:%[0-9]+]]:_(s32) = G_LOAD [[T0]](p0) :: (load 1 from %ir.addr)
-    ; CHECK: $w0 = COPY [[T1]](s32)
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: $w0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $x0
-    %1:_(s32) = G_LOAD %0 :: (load 1 from %ir.addr)
+    %1:_(s32) = G_LOAD %0 :: (load 1)
     $w0 = COPY %1
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
index 7a688e7eb9367d2c8a829ee13b6f2a55caab77eb..2176bb021f7062a3df4e05fe62e22fe302ed453a 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
@@ -1,29 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_icmp() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_icmp
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_icmp
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
@@ -33,15 +13,13 @@ body: |
     ; CHECK: $w0 = COPY [[FCMP]](s32)
     ; CHECK: [[FCMP1:%[0-9]+]]:_(s32) = G_FCMP floatpred(uno), [[TRUNC]](s32), [[TRUNC1]]
     ; CHECK: $w0 = COPY [[FCMP1]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x0
-
-    %2(s32) = G_TRUNC %0
-    %3(s32) = G_TRUNC %1
-
-    %4(s32) = G_FCMP floatpred(oge), %0, %1
-    $w0 = COPY %4
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x0
+    %2:_(s32) = G_TRUNC %0(s64)
+    %3:_(s32) = G_TRUNC %1(s64)
+    %4:_(s32) = G_FCMP floatpred(oge), %0(s64), %1
+    $w0 = COPY %4(s32)
+    %5:_(s32) = G_FCMP floatpred(uno), %2(s32), %3
+    $w0 = COPY %5(s32)
 
-    %5(s32) = G_FCMP floatpred(uno), %2, %3
-    $w0 = COPY %5
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
index f7d77c72a38492e5f89166deb43daef28207ab7f..373a1db41ed895e83bf379f206d08fa84c4343b8 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
@@ -1,26 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_gep_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_gep_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_gep_small
     ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -30,9 +13,10 @@ body: |
     ; CHECK: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]]
     ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[ASHR]](s64)
     ; CHECK: $x0 = COPY [[GEP]](p0)
-    %0(p0) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %1
-    %3(p0) = G_GEP %0, %2(s8)
-    $x0 = COPY %3
+    %0:_(p0) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %1(s64)
+    %3:_(p0) = G_GEP %0, %2(s8)
+    $x0 = COPY %3(p0)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..7f42f6e6c336942dbcd1924f889992a5a2d54b1a
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-fewerElts.mir
@@ -0,0 +1,39 @@
+# RUN: llc -march=aarch64 -o - -run-pass=legalizer -global-isel-abort=0 -debug-only=legalizer 2>&1 %s | FileCheck %s
+# REQUIRES: asserts
+
+# CHECK: Legalize Machine IR for: load_v4s32
+# CHECK-NEXT: %{{[0-9]+}}:_(<4 x s32>) = G_LOAD %{{[0-9]+}}:_(p0)
+# CHECK-NEXT: Reduce number of elements
+---
+name:            load_v4s32
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0
+
+    %0:_(p0) = COPY $x0
+    %1:_(<4 x s32>) = G_LOAD %0(p0) :: (load 16, align 4)
+    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
+    $w0 = COPY %5(s32)
+
+...
+
+# Make sure we are able to scalarize v2s64.
+# CHECK: Legalize Machine IR for: load_v2s64
+# CHECK-NEXT: %{{[0-9]+}}:_(<2 x s64>) = G_LOAD %{{[0-9]+}}:_(p0)
+# CHECK-NEXT: Reduce number of elements
+---
+name:            load_v2s64
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0
+
+    %0:_(p0) = COPY $x0
+    %1:_(<2 x s64>) = G_LOAD %0(p0) :: (load 16)
+    %2:_(s64), %3:_(s64) = G_UNMERGE_VALUES %1(<2 x s64>)
+    $x0 = COPY %3(s64)
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir
new file mode 100644
index 0000000000000000000000000000000000000000..33a6c23eb36786a07d3b1f56d7f558257b1a092b
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store-s128-unaligned.mir
@@ -0,0 +1,31 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -o - -run-pass=legalizer %s | FileCheck %s
+---
+name:            loadstore128_align4
+exposesReturnsTwice: false
+legalized:       false
+tracksRegLiveness: true
+body:             |
+  bb.1:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: loadstore128_align4
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8, align 4)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8, align 4)
+    ; CHECK: G_STORE [[LOAD]](s64), [[COPY1]](p0) :: (store 8, align 4)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64)
+    ; CHECK: G_STORE [[LOAD1]](s64), [[GEP1]](p0) :: (store 8, align 4)
+    ; CHECK: RET_ReallyLR
+    %0:_(p0) = COPY $x0
+    %1:_(p0) = COPY $x1
+    %2:_(s128) = G_LOAD %0(p0) :: (load 16, align 4)
+    G_STORE %2(s128), %1(p0) :: (store 16, align 4)
+    RET_ReallyLR
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 9a56303715992f364b1eafa12258276a3df82574..7a41cb0cd797dc1e156f84b217fd73e06d5268bc 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -1,130 +1,105 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_load(i8* %addr) {
-  entry:
-    ret void
-  }
-  define void @test_store(i8* %addr) {
-  entry:
-    ret void
-  }
-...
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 
 ---
 name:            test_load
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
 body: |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-    ; CHECK-LABEL: name: test_load
-    %0(p0) = COPY $x0
-
-    %1(s1) = G_LOAD %0 :: (load 1 from %ir.addr)
-    %9:_(s32) = G_ANYEXT %1
-    $w0 = COPY %9
-
-    ; CHECK: %2:_(s8) = G_LOAD %0(p0) :: (load 1 from %ir.addr)
-    %2(s8) = G_LOAD %0 :: (load 1 from %ir.addr)
-    %10:_(s32) = G_ANYEXT %2
-    $w0 = COPY %10
-
-    ; CHECK: %3:_(s16) = G_LOAD %0(p0) :: (load 2 from %ir.addr)
-    %3(s16) = G_LOAD %0 :: (load 2 from %ir.addr)
-    %11:_(s32) = G_ANYEXT %3
-    $w0 = COPY %11
+    liveins: $x0
 
-    ; CHECK: %4:_(s32) = G_LOAD %0(p0) :: (load 4 from %ir.addr)
-    %4(s32) = G_LOAD %0 :: (load 4 from %ir.addr)
-    $w0 = COPY %4
-
-    ; CHECK: %5:_(s64) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %5(s64) = G_LOAD %0 :: (load 8 from %ir.addr)
-    $x0 = COPY %5
-
-    %6(p0) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %12:_(s64) = G_PTRTOINT %6
-    $x0 = COPY %12
-
-    ; CHECK: %7:_(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %7(<2 x s32>) = G_LOAD %0(p0) :: (load 8 from %ir.addr)
-    %13:_(s64) = G_BITCAST %7
-    $x0 = COPY %13
-
-    ; CHECK: [[LOAD0:%[0-9]+]]:_(s64) = G_LOAD %0(p0) :: (load 8 from %ir.addr, align 16)
-    ; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP %0, [[OFFSET1]](s64)
-    ; CHECK: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p0) :: (load 8 from %ir.addr + 8)
-    ; CHECK: %8:_(s128) = G_MERGE_VALUES [[LOAD0]](s64), [[LOAD1]](s64)
-    %8(s128) = G_LOAD %0(p0) :: (load 16 from %ir.addr)
-    %14:_(s64) = G_TRUNC %8
-    $x0 = COPY %14
+    ; CHECK-LABEL: name: test_load
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD1]](s8)
+    ; CHECK: $w0 = COPY [[ANYEXT1]](s32)
+    ; CHECK: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load 2)
+    ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD2]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT2]](s32)
+    ; CHECK: $w0 = COPY [[ANYEXT1]](s32)
+    ; CHECK: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8)
+    ; CHECK: $x0 = COPY [[LOAD3]](s64)
+    ; CHECK: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[COPY]](p0) :: (load 8)
+    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[LOAD4]](p0)
+    ; CHECK: $x0 = COPY [[PTRTOINT]](s64)
+    ; CHECK: [[LOAD5:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load 8)
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[LOAD5]](<2 x s32>)
+    ; CHECK: $x0 = COPY [[BITCAST]](s64)
+    ; CHECK: [[LOAD6:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8, align 16)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+    ; CHECK: [[LOAD7:%[0-9]+]]:_(s64) = G_LOAD [[GEP]](p0) :: (load 8)
+    ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD6]](s64), [[LOAD7]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[MV]](s128)
+    ; CHECK: $x0 = COPY [[TRUNC]](s64)
+    %0:_(p0) = COPY $x0
+    %1:_(s1) = G_LOAD %0(p0) :: (load 1)
+    %2:_(s32) = G_ANYEXT %1(s1)
+    $w0 = COPY %2(s32)
+    %3:_(s8) = G_LOAD %0(p0) :: (load 1)
+    %4:_(s32) = G_ANYEXT %3(s8)
+    $w0 = COPY %4(s32)
+    %5:_(s16) = G_LOAD %0(p0) :: (load 2)
+    %6:_(s32) = G_ANYEXT %5(s16)
+    $w0 = COPY %6(s32)
+    %7:_(s32) = G_LOAD %0(p0) :: (load 4)
+    $w0 = COPY %4(s32)
+    %8:_(s64) = G_LOAD %0(p0) :: (load 8)
+    $x0 = COPY %8(s64)
+    %9:_(p0) = G_LOAD %0(p0) :: (load 8)
+    %10:_(s64) = G_PTRTOINT %9(p0)
+    $x0 = COPY %10(s64)
+    %11:_(<2 x s32>) = G_LOAD %0(p0) :: (load 8)
+    %12:_(s64) = G_BITCAST %11(<2 x s32>)
+    $x0 = COPY %12(s64)
+    %13:_(s128) = G_LOAD %0(p0) :: (load 16)
+    %14:_(s64) = G_TRUNC %13(s128)
+    $x0 = COPY %14(s64)
 ...
 
 ---
 name:            test_store
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
 body: |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-    ; CHECK-LABEL: name: test_store
-
-    %0(p0) = COPY $x0
-    %1(s32) = COPY $w1
-
-    ; CHECK: [[C1:%.*]]:_(s32) = G_CONSTANT i32 1
-    ; CHECK: [[B:%.*]]:_(s32) = COPY %1(s32)
-    ; CHECK: [[COPY_C1:%.*]]:_(s32) = COPY [[C1]]
-    ; CHECK: [[AND:%.*]]:_(s32) = G_AND [[B]], [[COPY_C1]]
-    ; CHECK: [[BIT8:%.*]]:_(s8) = G_TRUNC [[AND]]
-
-
-    ; CHECK: G_STORE [[BIT8]](s8), %0(p0) :: (store 1 into %ir.addr)
-    %2(s1) = G_TRUNC %1
-    G_STORE %2, %0 :: (store 1 into %ir.addr)
-
-    ; CHECK: G_STORE %3(s8), %0(p0) :: (store 1 into %ir.addr)
-    %3(s8) = G_TRUNC %1
-    G_STORE %3, %0 :: (store 1 into %ir.addr)
+    liveins: $x0, $w1
 
-    ; CHECK: G_STORE %4(s16), %0(p0) :: (store 2 into %ir.addr)
-    %4(s16) = G_TRUNC %1
-    G_STORE %4, %0 :: (store 2 into %ir.addr)
-
-    ; CHECK: G_STORE %1(s32), %0(p0) :: (store 4 into %ir.addr)
-    G_STORE %1, %0 :: (store 4 into %ir.addr)
-
-    ; CHECK: G_STORE %5(s64), %0(p0) :: (store 8 into %ir.addr)
-    %5(s64) = G_PTRTOINT %0(p0)
-    G_STORE %5, %0 :: (store 8 into %ir.addr)
-
-    ; CHECK: G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr)
-    G_STORE %0(p0), %0(p0) :: (store 8 into %ir.addr)
-
-    ; CHECK: G_STORE %5(s64), %0(p0) :: (store 8 into %ir.addr, align 16)
-    ; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP %0, [[OFFSET1]](s64)
-    ; CHECK: G_STORE %6(s64), [[GEP1]](p0) :: (store 8 into %ir.addr + 8)
-    %6(s64) = G_PTRTOINT %0(p0)
-    %7(s128) = G_MERGE_VALUES %5, %6
-    G_STORE %7, %0 :: (store 16 into %ir.addr)
+    ; CHECK-LABEL: name: test_store
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[COPY3]]
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[AND]](s32)
+    ; CHECK: G_STORE [[TRUNC]](s8), [[COPY]](p0) :: (store 1)
+    ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32)
+    ; CHECK: G_STORE [[TRUNC1]](s8), [[COPY]](p0) :: (store 1)
+    ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+    ; CHECK: G_STORE [[TRUNC2]](s16), [[COPY]](p0) :: (store 2)
+    ; CHECK: G_STORE [[COPY1]](s32), [[COPY]](p0) :: (store 4)
+    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0)
+    ; CHECK: G_STORE [[PTRTOINT]](s64), [[COPY]](p0) :: (store 8)
+    ; CHECK: G_STORE [[COPY]](p0), [[COPY]](p0) :: (store 8)
+    ; CHECK: [[PTRTOINT1:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0)
+    ; CHECK: G_STORE [[PTRTOINT1]](s64), [[COPY]](p0) :: (store 8, align 16)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64)
+    ; CHECK: G_STORE [[PTRTOINT1]](s64), [[GEP]](p0) :: (store 8)
+    %0:_(p0) = COPY $x0
+    %1:_(s32) = COPY $w1
+    %2:_(s1) = G_TRUNC %1(s32)
+    G_STORE %2(s1), %0(p0) :: (store 1)
+    %3:_(s8) = G_TRUNC %1(s32)
+    G_STORE %3(s8), %0(p0) :: (store 1)
+    %4:_(s16) = G_TRUNC %1(s32)
+    G_STORE %4(s16), %0(p0) :: (store 2)
+    G_STORE %1(s32), %0(p0) :: (store 4)
+    %5:_(s64) = G_PTRTOINT %0(p0)
+    G_STORE %5(s64), %0(p0) :: (store 8)
+    G_STORE %0(p0), %0(p0) :: (store 8)
+    %6:_(s64) = G_PTRTOINT %0(p0)
+    %7:_(s128) = G_MERGE_VALUES %6(s64), %6
+    G_STORE %7(s128), %0(p0) :: (store 16)
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
index e6e6ab7825f577830b372386fba5e6925b02b3cd..3260eb6ca6fd2d017b3b7072fd469498e09a43d1 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
@@ -1,34 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_mul_small() {
-  entry:
-    ret void
-  }
-  define void @test_smul_overflow() {
-    ret void
-  }
-  define void @test_umul_overflow() {
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_mul_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_mul_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -37,22 +12,19 @@ body: |
     ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[TRUNC]], [[TRUNC1]]
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[MUL]](s32)
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_MUL %2, %3
-    %5(s64) = G_ANYEXT %4
-    $x0 = COPY %5
-...
-
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_MUL %2, %3
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
 
+...
 ---
 name:            test_smul_overflow
-body: |
+body:             |
   bb.0:
-    liveins: $x0, $x1, $w2, $w3
-
     ; CHECK-LABEL: name: test_smul_overflow
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -67,19 +39,15 @@ body: |
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
     %2:_(s64), %3:_(s1) = G_SMULO %0, %1
-    $x0 = COPY %2
-    %4:_(s32) = G_ANYEXT %3
-    $w0 = COPY %4
+    $x0 = COPY %2(s64)
+    %4:_(s32) = G_ANYEXT %3(s1)
+    $w0 = COPY %4(s32)
 
 ...
-
-
 ---
 name:            test_umul_overflow
-body: |
+body:             |
   bb.0:
-    liveins: $x0, $x1, $w2, $w3
-
     ; CHECK-LABEL: name: test_umul_overflow
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -93,8 +61,8 @@ body: |
     %0:_(s64) = COPY $x0
     %1:_(s64) = COPY $x1
     %2:_(s64), %3:_(s1) = G_UMULO %0, %1
-    $x0 = COPY %2
-    %4:_(s32) = G_ANYEXT %3
-    $w0 = COPY %4
+    $x0 = COPY %2(s64)
+    %4:_(s32) = G_ANYEXT %3(s1)
+    $w0 = COPY %4(s32)
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir b/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir
index 7c4bbfcc63f874a572d2882b7bdad41c5231d3e9..d8f2542d907dc594f8b6723533f16e9d6009aa3c 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-phi.mir
@@ -121,6 +121,7 @@ body:             |
 
   bb.3:
     %9(s1) = G_PHI %5(s1), %bb.1, %8(s1), %bb.2
+    %11:_(s1) = G_PHI %5(s1), %bb.1, %8(s1), %bb.2
     %10(s32) = G_ZEXT %9(s1)
     $w0 = COPY %10(s32)
     RET_ReallyLR implicit $w0
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
index 0b328b6345e0a47c55ae8d794c6600b5fcc1b1c0..3b301798bffa5bf756508d0c63c6e7a5c3b96fe7 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
@@ -1,40 +1,35 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_pow() {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_pow
-body: |
+body:             |
   bb.0.entry:
-    liveins: $d0, $d1, $s2, $s3
-
     ; CHECK-LABEL: name: test_pow
-    ; CHECK: hasCalls: true
-
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $s2
+    ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $s3
+    ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $d0 = COPY [[COPY]](s64)
+    ; CHECK: $d1 = COPY [[COPY1]](s64)
+    ; CHECK: BL &pow, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit-def $d0
+    ; CHECK: [[COPY4:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $x0 = COPY [[COPY4]](s64)
+    ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $s0 = COPY [[COPY2]](s32)
+    ; CHECK: $s1 = COPY [[COPY3]](s32)
+    ; CHECK: BL &powf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $s1, implicit-def $s0
+    ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    ; CHECK: $w0 = COPY [[COPY5]](s32)
     %0:_(s64) = COPY $d0
     %1:_(s64) = COPY $d1
     %2:_(s32) = COPY $s2
     %3:_(s32) = COPY $s3
-
-    ; CHECK: $d0 = COPY %0
-    ; CHECK: $d1 = COPY %1
-    ; CHECK: BL &pow, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $d1, implicit-def $d0
-    ; CHECK: %4:_(s64) = COPY $d0
     %4:_(s64) = G_FPOW %0, %1
-    $x0 = COPY %4
-
-    ; CHECK: $s0 = COPY %2
-    ; CHECK: $s1 = COPY %3
-    ; CHECK: BL &powf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $s1, implicit-def $s0
-    ; CHECK: %5:_(s32) = COPY $s0
+    $x0 = COPY %4(s64)
     %5:_(s32) = G_FPOW %2, %3
-    $w0 = COPY %5
+    $w0 = COPY %5(s32)
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
index 35e71d6155623179c75aa386aefda3f64ac57df2..69d1b6d761d5acd9ba2696e1d337f4010c8cd0ef 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
@@ -1,37 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_urem_64() {
-  entry:
-    ret void
-  }
-  define void @test_srem_32() {
-  entry:
-    ret void
-  }
-  define void @test_srem_8() {
-  entry:
-    ret void
-  }
-  define void @test_frem() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_urem_64
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_urem_64
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -39,25 +11,16 @@ body: |
     ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[UDIV]], [[COPY1]]
     ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[MUL]]
     ; CHECK: $x0 = COPY [[SUB]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s64) = G_UREM %0, %1
-    $x0 = COPY %2
-
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = G_UREM %0, %1
+    $x0 = COPY %2(s64)
 
 ...
 ---
 name:            test_srem_32
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_srem_32
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -67,27 +30,18 @@ body: |
     ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SDIV]], [[TRUNC1]]
     ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[MUL]]
     ; CHECK: $w0 = COPY [[SUB]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %3(s32) = G_TRUNC %0
-    %4(s32) = G_TRUNC %1
-    %5(s32) = G_SREM %3, %4
-    $w0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s32) = G_TRUNC %0(s64)
+    %3:_(s32) = G_TRUNC %1(s64)
+    %4:_(s32) = G_SREM %2, %3
+    $w0 = COPY %4(s32)
 
 ...
 ---
 name:            test_srem_8
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
-
     ; CHECK-LABEL: name: test_srem_8
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -108,27 +62,19 @@ body: |
     ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC3]], [[COPY3]]
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
     ; CHECK: $w0 = COPY [[COPY4]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %6(s8) = G_TRUNC %0
-    %7(s8) = G_TRUNC %1
-    %8(s8) = G_SREM %6, %7
-    %9:_(s32) = G_ANYEXT %8
-    $w0 = COPY %9
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_SREM %2, %3
+    %5:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %5(s32)
+
 ...
 ---
 name:            test_frem
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_frem
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -148,12 +94,13 @@ body: |
     ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $s0
     ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
     ; CHECK: $w0 = COPY [[COPY3]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s64) = G_FREM %0, %1
-    $x0 = COPY %2
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s64) = G_FREM %0, %1
+    $x0 = COPY %2(s64)
+    %3:_(s32) = G_TRUNC %0(s64)
+    %4:_(s32) = G_TRUNC %1(s64)
+    %5:_(s32) = G_FREM %3, %4
+    $w0 = COPY %5(s32)
 
-    %3(s32) = G_TRUNC %0
-    %4(s32) = G_TRUNC %1
-    %5(s32) = G_FREM %3, %4
-    $w0 = COPY %5
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir
new file mode 100644
index 0000000000000000000000000000000000000000..e84dae37a5e93c39beba4fea1395ad6925d29f8d
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sext-copy.mir
@@ -0,0 +1,21 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
+---
+name:            test_sext_copy
+body: |
+  bb.0.entry:
+    liveins: $w0, $w1
+    ; CHECK-LABEL: name: test_sext_copy
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w1
+    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK: $w0 = COPY [[COPY]](s32)
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+    ; CHECK: $x0 = COPY [[SEXT]](s64)
+    %0:_(s32) = COPY $w1
+    $w0 = COPY %0(s32)
+    $w0 = COPY %0(s32)
+    %1:_(s32) = COPY $w0
+    %2:_(s64) = G_SEXT %1(s32)
+    $x0 = COPY %2(s64)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir
index cfd1550303fd7ea877dab45c8db86e2225819bb9..7f568f8d6175d53dc793af4c904c6b6d8a627085 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sextload.mir
@@ -1,24 +1,15 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_zextload(i8* %addr) {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 ---
-name:            test_zextload
+name:            test_sextload
 body: |
   bb.0.entry:
     liveins: $x0
-    ; CHECK-LABEL: name: test_zextload
-    ; CHECK: [[T0:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK: [[T1:%[0-9]+]]:_(s32) = G_SEXTLOAD [[T0]](p0) :: (load 1 from %ir.addr)
-    ; CHECK: $w0 = COPY [[T1]](s32)
+    ; CHECK-LABEL: name: test_sextload
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: $w0 = COPY [[SEXTLOAD]](s32)
     %0:_(p0) = COPY $x0
-    %1:_(s32) = G_SEXTLOAD %0 :: (load 1 from %ir.addr)
+    %1:_(s32) = G_SEXTLOAD %0 :: (load 1)
     $w0 = COPY %1
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
index 781b5d8cde81753ec79fe759f901783e48127a88..ad1f431c160033d2b5e240479d86b38bef1b7fa0 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
@@ -1,28 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_shift() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_shift
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
     ; CHECK-LABEL: name: test_shift
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -52,21 +33,18 @@ body: |
     ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC4]], [[AND3]]
     ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32)
     ; CHECK: $w0 = COPY [[COPY4]](s32)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-
-    %4(s8) = G_ASHR %2, %3
-    %7:_(s32) = G_ANYEXT %4
-    $w0 = COPY %7
-
-
-    %5(s8) = G_LSHR %2, %3
-    %8:_(s32) = G_ANYEXT %5
-    $w0 = COPY %8
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_ASHR %2, %3
+    %7:_(s32) = G_ANYEXT %4(s8)
+    $w0 = COPY %7(s32)
+    %5:_(s8) = G_LSHR %2, %3
+    %8:_(s32) = G_ANYEXT %5(s8)
+    $w0 = COPY %8(s32)
+    %6:_(s8) = G_SHL %2, %3
+    %9:_(s32) = G_ANYEXT %6(s8)
+    $w0 = COPY %9(s32)
 
-    %6(s8) = G_SHL %2, %3
-    %9:_(s32) = G_ANYEXT %6
-    $w0 = COPY %9
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
index 3da689d4265c5038297d3e94085a27a83bc5d4fb..51cda7d793e3e9dc53d509a446e6c2a3249c94ae 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
@@ -1,49 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_simple() {
-  entry:
-    ret void
-  next:
-    ret void
-  }
-  define void @bitcast128() {
-    ret void
-  }
-  define void @testExtOfCopyOfTrunc() {
-    ret void
-  }
-  define void @testExtOf2CopyOfTrunc() {
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_simple
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-  - { id: 6, class: _ }
-  - { id: 7, class: _ }
-  - { id: 8, class: _ }
-  - { id: 9, class: _ }
-  - { id: 10, class: _ }
-  - { id: 11, class: _ }
-  - { id: 12, class: _ }
-  - { id: 13, class: _ }
-  - { id: 14, class: _ }
-  - { id: 15, class: _ }
-  - { id: 16, class: _ }
-body: |
+body:             |
   ; CHECK-LABEL: name: test_simple
-  ; CHECK: bb.0.{{[a-zA-Z0-9]+}}:
+  ; CHECK: bb.0.entry:
   ; CHECK:   successors: %bb.1(0x80000000)
   ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY $x0
   ; CHECK:   [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s64)
@@ -52,7 +13,7 @@ body: |
   ; CHECK:   [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[INTTOPTR]](p0)
   ; CHECK:   $x0 = COPY [[PTRTOINT]](s64)
   ; CHECK:   G_BRCOND [[TRUNC]](s1), %bb.1
-  ; CHECK: bb.1.{{[a-zA-Z0-9]+}}:
+  ; CHECK: bb.1:
   ; CHECK:   [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
   ; CHECK:   [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
   ; CHECK:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[TRUNC]](s1), [[TRUNC2]], [[TRUNC3]]
@@ -83,127 +44,101 @@ body: |
   ; CHECK:   [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST5]](<2 x s16>)
   ; CHECK:   $w0 = COPY [[BITCAST6]](s32)
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-    %0(s64) = COPY $x0
-
-    %1(s1) = G_TRUNC %0
-    %2(s8) = G_TRUNC %0
-    %3(s16) = G_TRUNC %0
-    %4(s32) = G_TRUNC %0
-
-    %5(p0) = G_INTTOPTR %0
-    %6(s64) = G_PTRTOINT %5
-    $x0 = COPY %6
-
-    G_BRCOND %1, %bb.1
-
-  bb.1.next:
-
-    %7(s1) = G_SELECT %1, %1, %1
-    %21:_(s32) = G_ANYEXT %7
-    $w0 = COPY %21
+    successors: %bb.1(0x80000000)
 
-    %8(s8) = G_SELECT %1, %2, %2
-    %20:_(s32) = G_ANYEXT %8
-    $w0 = COPY %20
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s16) = G_TRUNC %0(s64)
+    %4:_(s32) = G_TRUNC %0(s64)
+    %5:_(p0) = G_INTTOPTR %0(s64)
+    %6:_(s64) = G_PTRTOINT %5(p0)
+    $x0 = COPY %6(s64)
+    G_BRCOND %1(s1), %bb.1
 
-    %9(s16) = G_SELECT %1, %3, %3
-    %19:_(s32) = G_ANYEXT %9
-    $w0 = COPY %19
-
-    %10(s32) = G_SELECT %1, %4, %4
-    %11(s64) = G_SELECT %1, %0, %0
-    $x0 = COPY %11
+  bb.1:
+    %7:_(s1) = G_SELECT %1(s1), %1, %1
+    %17:_(s32) = G_ANYEXT %7(s1)
+    $w0 = COPY %17(s32)
+    %8:_(s8) = G_SELECT %1(s1), %2, %2
+    %18:_(s32) = G_ANYEXT %8(s8)
+    $w0 = COPY %18(s32)
+    %9:_(s16) = G_SELECT %1(s1), %3, %3
+    %19:_(s32) = G_ANYEXT %9(s16)
+    $w0 = COPY %19(s32)
+    %10:_(s32) = G_SELECT %1(s1), %4, %4
+    %11:_(s64) = G_SELECT %1(s1), %0, %0
+    $x0 = COPY %11(s64)
+    %12:_(<2 x s32>) = G_BITCAST %0(s64)
+    %13:_(s64) = G_BITCAST %12(<2 x s32>)
+    $x0 = COPY %13(s64)
+    %14:_(s32) = G_BITCAST %10(s32)
+    $w0 = COPY %14(s32)
+    %15:_(<4 x s8>) = G_BITCAST %0(s64)
+    %20:_(s32) = G_BITCAST %15(<4 x s8>)
+    $w0 = COPY %20(s32)
+    %16:_(<2 x s16>) = G_BITCAST %0(s64)
+    %21:_(s32) = G_BITCAST %16(<2 x s16>)
+    $w0 = COPY %21(s32)
 
-    %12(<2 x s32>) = G_BITCAST %0
-    %13(s64) = G_BITCAST %12
-    $x0 = COPY %13
-    %14(s32) = G_BITCAST %10
-    $w0 = COPY %14
-    %15(<4 x s8>) = G_BITCAST %0
-    %17:_(s32) = G_BITCAST %15
-    $w0 = COPY %17
-    %16(<2 x s16>) = G_BITCAST %0
-    %18:_(s32) = G_BITCAST %16
-    $w0 = COPY %18
 ...
-
 ---
 name:            bitcast128
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _}
-  - { id: 1, class: _}
-  - { id: 2, class: _}
-  - { id: 3, class: _}
 body:             |
-  bb.1:
+  bb.0:
     liveins: $x0, $x1
-    ; This is legal and shouldn't be changed.
+
     ; CHECK-LABEL: name: bitcast128
-    ; CHECK: liveins: $x0, $x1
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
     ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64)
     ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[MV]](s128)
     ; CHECK: $q0 = COPY [[BITCAST]](<2 x s64>)
     ; CHECK: RET_ReallyLR implicit $q0
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %3(s128) = G_MERGE_VALUES %0(s64), %1(s64)
-    %2(<2 x s64>) = G_BITCAST %3(s128)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %3:_(s128) = G_MERGE_VALUES %0(s64), %1(s64)
+    %2:_(<2 x s64>) = G_BITCAST %3(s128)
     $q0 = COPY %2(<2 x s64>)
     RET_ReallyLR implicit $q0
 
 ...
 ---
 name:            testExtOfCopyOfTrunc
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _}
-  - { id: 1, class: _}
-  - { id: 2, class: _}
-  - { id: 3, class: _}
 body:             |
-  bb.1:
+  bb.0:
     liveins: $x0
+
     ; CHECK-LABEL: name: testExtOfCopyOfTrunc
-    ; CHECK: liveins: $x0
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
     ; CHECK: $x0 = COPY [[COPY1]](s64)
     ; CHECK: RET_ReallyLR implicit $x0
-    %0(s64) = COPY $x0
-    %1(s1) = G_TRUNC %0
-    %2(s1) = COPY %1
-    %3(s64) = G_ANYEXT %2
-    $x0 = COPY %3
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %2:_(s1) = COPY %1(s1)
+    %3:_(s64) = G_ANYEXT %2(s1)
+    $x0 = COPY %3(s64)
     RET_ReallyLR implicit $x0
 
 ...
 ---
 name:            testExtOf2CopyOfTrunc
-tracksRegLiveness: true
-registers:
-  - { id: 0, class: _}
-  - { id: 1, class: _}
-  - { id: 2, class: _}
-  - { id: 3, class: _}
 body:             |
-  bb.1:
+  bb.0:
     liveins: $x0
+
     ; CHECK-LABEL: name: testExtOf2CopyOfTrunc
-    ; CHECK: liveins: $x0
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
     ; CHECK: $x0 = COPY [[COPY1]](s64)
     ; CHECK: RET_ReallyLR implicit $x0
-    %0(s64) = COPY $x0
-    %1(s1) = G_TRUNC %0
-    %2(s1) = COPY %1
-    %4:_(s1) = COPY %2
-    %3(s64) = G_ANYEXT %4
-    $x0 = COPY %3
+    %0:_(s64) = COPY $x0
+    %1:_(s1) = G_TRUNC %0(s64)
+    %2:_(s1) = COPY %1(s1)
+    %4:_(s1) = COPY %2(s1)
+    %3:_(s64) = G_ANYEXT %4(s1)
+    $x0 = COPY %3(s64)
     RET_ReallyLR implicit $x0
 
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
index 5f50ce047cef97fea2f662c162ee3b31669a6cab..32796e0948cc5cc07391eb92fa1d8895eeb9076d 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
@@ -1,28 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_sub_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_sub_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_sub_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -31,11 +12,12 @@ body: |
     ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[TRUNC1]]
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SUB]](s32)
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_SUB %2, %3
-    %5(s64) = G_ANYEXT %4
-    $x0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_SUB %2, %3
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir b/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
index 022fd13d1789ccc871026369abbd322b9200b584..e46c9ad79c6748ab4d5afe26506b980b3b5d43d6 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-undef.mir
@@ -1,9 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64-linux-gnu -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_implicit_def
-registers:
 body: |
   bb.0.entry:
     liveins:
@@ -12,7 +10,9 @@ body: |
     ; CHECK: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
     ; CHECK: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
     ; CHECK: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[DEF]](s64), [[DEF1]](s64)
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s64) = G_TRUNC [[MV]](s128)
+    ; CHECK: $x0 = COPY [[TRUNC]](s64)
     %0:_(s128) = G_IMPLICIT_DEF
-    %1:_(s64) = G_TRUNC %0
-    $x0 = COPY %1
+    %1:_(s64) = G_TRUNC %0(s128)
+    $x0 = COPY %1(s64)
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
index 6958d30d36549086c0aeab839e1e761ab8f98d06..3305c4baef480a4f7b5cea6539d2aa722e0a9dc2 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
@@ -1,28 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -run-pass=legalizer %s -o - | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_scalar_xor_small() {
-  entry:
-    ret void
-  }
-...
-
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
 ---
 name:            test_scalar_xor_small
-registers:
-  - { id: 0, class: _ }
-  - { id: 1, class: _ }
-  - { id: 2, class: _ }
-  - { id: 3, class: _ }
-  - { id: 4, class: _ }
-  - { id: 5, class: _ }
-body: |
+body:             |
   bb.0.entry:
-    liveins: $x0, $x1, $x2, $x3
-
     ; CHECK-LABEL: name: test_scalar_xor_small
     ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
     ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
@@ -31,11 +12,12 @@ body: |
     ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]]
     ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32)
     ; CHECK: $x0 = COPY [[ANYEXT]](s64)
-    %0(s64) = COPY $x0
-    %1(s64) = COPY $x1
-    %2(s8) = G_TRUNC %0
-    %3(s8) = G_TRUNC %1
-    %4(s8) = G_XOR %2, %3
-    %5(s64) = G_ANYEXT %4
-    $x0 = COPY %5
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = COPY $x1
+    %2:_(s8) = G_TRUNC %0(s64)
+    %3:_(s8) = G_TRUNC %1(s64)
+    %4:_(s8) = G_XOR %2, %3
+    %5:_(s64) = G_ANYEXT %4(s8)
+    $x0 = COPY %5(s64)
+
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir b/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir
index 66c3f2577360ef01ba52fd23357e7c2f9cfdd3bf..ad3603d1d136e6e5e4648f1e15e49b9bad3558f2 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-zextload.mir
@@ -1,24 +1,15 @@
-# RUN: llc -O0 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-  target triple = "aarch64--"
-  define void @test_sextload(i8* %addr) {
-  entry:
-    ret void
-  }
-...
-
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s
 ---
-name:            test_sextload
+name:            test_zextload
 body: |
   bb.0.entry:
     liveins: $x0
-    ; CHECK-LABEL: name: test_sextload
-    ; CHECK: [[T0:%[0-9]+]]:_(p0) = COPY $x0
-    ; CHECK: [[T1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[T0]](p0) :: (load 1 from %ir.addr)
-    ; CHECK: $w0 = COPY [[T1]](s32)
+    ; CHECK-LABEL: name: test_zextload
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+    ; CHECK: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load 1)
+    ; CHECK: $w0 = COPY [[ZEXTLOAD]](s32)
     %0:_(p0) = COPY $x0
-    %1:_(s32) = G_ZEXTLOAD %0 :: (load 1 from %ir.addr)
+    %1:_(s32) = G_ZEXTLOAD %0 :: (load 1)
     $w0 = COPY %1
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index f776ca6df318ecc4a9031396ee6047febc54a1a0..ca059cf15443d26c666dba2c0bd07ec3b8c48b5a 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -295,7 +295,7 @@
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
 #
 # DEBUG-NEXT: G_EXTRACT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices
-# DEBUG:      .. type index coverage check SKIPPED: no rules defined
+# DEBUG:      .. type index coverage check SKIPPED: user-defined predicate detected
 #
 # DEBUG-NEXT: G_SHUFFLE_VECTOR (opcode {{[0-9]+}}): 3 type indices
 # DEBUG:      .. type index coverage check SKIPPED: no rules defined
diff --git a/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir b/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir
index 8ca81a3bd4037c55b65a81502fcc5c36e1f10cfb..667a769046614f8219f59cbab03d1678bdf77b85 100644
--- a/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir
+++ b/test/CodeGen/AArch64/GlobalISel/machine-cse-mid-pipeline.mir
@@ -130,9 +130,8 @@ regBankSelected: false
 selected:        false
 body:             |
   ; CHECK-LABEL: name: generic_to_concrete_copy
-  ; CHECK:      %[[S1:[0-9]+]]:_(s32) = G_ADD %{{[0-9]+}}, %{{[0-9]+}}
-  ; CHECK-NEXT: %[[S2:[0-9]+]]:gpr32 = COPY %[[S1]](s32)
-  ; CHECK-NEXT: %{{[0-9]+}}:gpr32 = ADDWrr %[[S2]], %[[S2]]
+  ; CHECK:      %[[S1:[0-9]+]]:gpr32(s32) = G_ADD %{{[0-9]+}}, %{{[0-9]+}}
+  ; CHECK-NEXT: %{{[0-9]+}}:gpr32 = ADDWrr %[[S1]](s32), %[[S1]](s32)
   bb.0:
     %0:_(s32) = COPY $w0
     %1:_(s32) = COPY $w1
@@ -149,9 +148,8 @@ regBankSelected: false
 selected:        false
 body:             |
   ; CHECK-LABEL: name: concrete_to_generic_copy
-  ; CHECK:      %[[S1:[0-9]+]]:gpr32 = ADDWrr %{{[0-9]+}}, %{{[0-9]+}}
-  ; CHECK-NEXT: %[[S2:[0-9]+]]:_(s32) = COPY %[[S1]]
-  ; CHECK-NEXT: %{{[0-9]+}}:_(s32) = G_ADD %[[S2]], %[[S2]]
+  ; CHECK:      %[[S1:[0-9]+]]:gpr32(s32) = ADDWrr %{{[0-9]+}}, %{{[0-9]+}}
+  ; CHECK-NEXT: %{{[0-9]+}}:_(s32) = G_ADD %[[S1]], %[[S1]]
   bb.0:
     %0:gpr32 = COPY $w0
     %1:gpr32 = COPY $w1
@@ -278,3 +276,31 @@ body:             |
     $w0 = COPY %23(s32)
     RET_ReallyLR implicit $w0
 ...
+---
+name:            variadic_defs_unmerge_vector_constraints_mix
+legalized:       true
+regBankSelected: false
+selected:        false
+body:             |
+  ; CHECK-LABEL: name: variadic_defs_unmerge_vector_constraints_mix
+  ; CHECK:      [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK-NEXT: [[UV0:%[0-9]+]]:gpr(s32), [[UV1:%[0-9]+]]:gpr(s32), [[UV2:%[0-9]+]]:gpr32(s32), [[UV3:%[0-9]+]]:gpr32(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+  ; CHECK-NEXT: [[ADD0:%[0-9]+]]:_(s32) = G_ADD [[UV0]], [[UV1]]
+  ; CHECK-NEXT: [[ADD1:%[0-9]+]]:gpr32(s32) = ADDWrr [[UV2]](s32), [[UV3]](s32)
+  ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD0]], [[ADD1]]
+  ; CHECK-NEXT: $w0 = COPY [[ADD2]](s32)
+  ; CHECK-NEXT: RET_ReallyLR implicit $w0
+  bb.0:
+    %0 :_(<4 x s32>) = COPY $q0
+    %1 :_(s32), %2 : _ (s32), %3 :_(s32), %4 :  _  (s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %5 :_(s32), %6 :gpr(s32), %7 :_(s32), %8 :  _  (s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %9 :_(s32), %10: _ (s32), %11:_(s32), %12:  _  (s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %13:_(s32), %14: _ (s32), %15:_(s32), %16:gpr32(s32) = G_UNMERGE_VALUES %0(<4 x s32>)
+    %21:gpr(s32) = COPY %1(s32)
+    %17:_(s32) = G_ADD %21, %6
+    %18:gpr32 = COPY %11(s32)
+    %19:gpr32(s32) = ADDWrr %18, %16
+    %20:_(s32) = G_ADD %17, %19
+    $w0 = COPY %20(s32)
+    RET_ReallyLR implicit $w0
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
index 1528a80977100255e92b34992650242dc1dff648..c64e2f78ab3bfc7421e4789595d22f5de5c50a66 100644
--- a/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
+++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
@@ -36,8 +36,8 @@ body: |
   bb.0:
     liveins: $w0
     %0:_(s32) = COPY $w0
-    ; CHECK: DBG_VALUE debug-use %0(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
-    DBG_VALUE debug-use %0(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
+    ; CHECK: DBG_VALUE %0(s32), $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %0(s32), $noreg, !7, !DIExpression(), debug-location !9
 
     ; CHECK: DBG_VALUE $noreg, 0, !7, !DIExpression(), debug-location !9
     DBG_VALUE $noreg, 0, !7, !DIExpression(), debug-location !9
diff --git a/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
index f75d5629478d1e005fe9afeabcec710534ec686c..72fbfad89c23df10df12e6208a9bc7665e8547ec 100644
--- a/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
+++ b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
@@ -46,11 +46,11 @@ body: |
     ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
     ; CHECK: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[COPY]], [[COPY]]
     ; CHECK: $w0 = COPY [[ADDWrr]]
-    ; CHECK: DBG_VALUE debug-use [[ADDWrr]], debug-use $noreg, !7, !DIExpression(), debug-location !9
+    ; CHECK: DBG_VALUE [[ADDWrr]], $noreg, !7, !DIExpression(), debug-location !9
     %0:gpr(s32) = COPY $w0
     %1:gpr(s32) = G_ADD %0, %0
     $w0 = COPY %1(s32)
-    DBG_VALUE debug-use %1(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %1(s32), $noreg, !7, !DIExpression(), debug-location !9
 ...
 
 ---
@@ -62,7 +62,7 @@ body: |
     liveins: $w0
     ; CHECK-LABEL: name: test_dbg_value_dead
     ; CHECK-NOT: COPY
-    ; CHECK: DBG_VALUE debug-use $noreg, debug-use $noreg, !7, !DIExpression(), debug-location !9
+    ; CHECK: DBG_VALUE $noreg, $noreg, !7, !DIExpression(), debug-location !9
     %0:gpr(s32) = COPY $w0
-    DBG_VALUE debug-use %0(s32), debug-use $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %0(s32), $noreg, !7, !DIExpression(), debug-location !9
 ...
diff --git a/test/CodeGen/AArch64/O3-pipeline.ll b/test/CodeGen/AArch64/O3-pipeline.ll
index 33bc05f91d52853eede46a645d02006a5ba1f40d..dc2316987d33f1c9efbf9e7b776b321ad6ea789a 100644
--- a/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/test/CodeGen/AArch64/O3-pipeline.ll
@@ -151,6 +151,7 @@
 ; CHECK-NEXT:       Branch Probability Basic Block Placement
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       AArch64 Branch Targets
+; CHECK-NEXT:       AArch64 Compress Jump Tables
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
 ; CHECK-NEXT:       StackMap Liveness Analysis
 ; CHECK-NEXT:       Live DEBUG_VALUE analysis
diff --git a/test/CodeGen/AArch64/aarch64-be-bv.ll b/test/CodeGen/AArch64/aarch64-be-bv.ll
index 54b7c8ff414ba62f1871f362718173742868a0f2..0e1797fa1798e1594dae88323f4a7cdc127dc2d8 100644
--- a/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -746,7 +746,7 @@ define void @modimm_t10_call() {
   ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
   ; CHECK-NEXT:    bl      f_v4i16
   call i16 @f_v4i16(<4 x i16> <i16 -1, i16 0, i16 -1, i16 0>)
-  ; CHECK:         movi    d[[REG1:[0-9]+]], #0xffffffffffffffff
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffffffffffff
   ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
   ; CHECK-NEXT:    bl      f_v2i32
   call i32 @f_v2i32(<2 x i32> <i32 -1, i32 -1>)
diff --git a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
index 0e5b59f95126d65a1cd0a94a793367f9220c5a96..32cd3c6833349104c44b5175654cc07c1991d235 100644
--- a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
+++ b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
@@ -6,7 +6,7 @@ declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>)
 ; CHECK-LABEL: test
 define <4 x i16> @test() {
 entry:
-; CHECK: movi	d{{[0-9]+}}, #0000000000000000
+; CHECK: movi	v{{[0-9]+}}.2d, #0000000000000000
   %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer)
   ret <4 x i16> %0
 }
diff --git a/test/CodeGen/AArch64/addr-of-ret-addr.ll b/test/CodeGen/AArch64/addr-of-ret-addr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..247b2825e15509f7f6e49fd5b0e104888800d8d8
--- /dev/null
+++ b/test/CodeGen/AArch64/addr-of-ret-addr.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -disable-fp-elim -mtriple=arm64-windows | FileCheck %s
+
+; Test generated from C code:
+; #include <stdarg.h>
+; void *foo() {
+;   return _AddressOfReturnAddress();
+; }
+; int bar(int x(va_list, void*), ...) {
+;   va_list y;
+;   va_start(y, x);
+;   return x(y, _AddressOfReturnAddress()) + 1;
+; }
+
+declare void @llvm.va_start(i8*)
+declare i8* @llvm.addressofreturnaddress()
+
+define dso_local i8* @"foo"() {
+entry:
+  %0 = call i8* @llvm.addressofreturnaddress()
+  ret i8* %0
+
+; CHECK-LABEL: foo
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: add x0, x29, #8
+; CHECK: ldp x29, x30, [sp], #16
+}
+
+define dso_local i32 @"bar"(i32 (i8*, i8*)* %x, ...) {
+entry:
+  %x.addr = alloca i32 (i8*, i8*)*, align 8
+  %y = alloca i8*, align 8
+  store i32 (i8*, i8*)* %x, i32 (i8*, i8*)** %x.addr, align 8
+  %y1 = bitcast i8** %y to i8*
+  call void @llvm.va_start(i8* %y1)
+  %0 = load i32 (i8*, i8*)*, i32 (i8*, i8*)** %x.addr, align 8
+  %1 = call i8* @llvm.addressofreturnaddress()
+  %2 = load i8*, i8** %y, align 8
+  %call = call i32 %0(i8* %2, i8* %1)
+  %add = add nsw i32 %call, 1
+  ret i32 %add
+
+; CHECK-LABEL: bar
+; CHECK: sub sp, sp, #96
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; CHECK: str x1, [x29, #24]
+; CHECK: add x1, x29, #8
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: add sp, sp, #96
+}
diff --git a/test/CodeGen/AArch64/and-mask-removal.ll b/test/CodeGen/AArch64/and-mask-removal.ll
index c02bc881cd33d59fff68839521a10d044bc10266..4424b0e41124d450f61388b2315cf216b5bf79bb 100644
--- a/test/CodeGen/AArch64/and-mask-removal.ll
+++ b/test/CodeGen/AArch64/and-mask-removal.ll
@@ -179,7 +179,9 @@ ret_false:
 ret_true:
   ret i1 true
 ; CHECK-LABEL: test16_2
-; CHECK: and
+; CHECK: mov	[[CST:w[0-9]+]], #16882
+; CHECK: add	[[ADD:w[0-9]+]], w0, [[CST]]
+; CHECK: cmp	{{.*}}, [[ADD]], uxth
 ; CHECK: ret
 }
 
@@ -207,7 +209,9 @@ ret_false:
 ret_true:
   ret i1 true
 ; CHECK-LABEL: test16_4
-; CHECK: and
+; CHECK: mov	[[CST:w[0-9]+]], #29985
+; CHECK: add	[[ADD:w[0-9]+]], w0, [[CST]]
+; CHECK: cmp	{{.*}}, [[ADD]], uxth
 ; CHECK: ret
 }
 
@@ -249,7 +253,9 @@ ret_false:
 ret_true:
   ret i1 true
 ; CHECK-LABEL: test16_7
-; CHECK: and
+; CHECK: mov	[[CST:w[0-9]+]], #9272
+; CHECK: add	[[ADD:w[0-9]+]], w0, [[CST]]
+; CHECK: cmp	{{.*}}, [[ADD]], uxth
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index bfc03c6b97579e2d5473b2eab767d8a21a17d28c..af99734e6a6e4350f0f501de2ed2eab577db8063 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -128,7 +128,7 @@ entry:
 ; CHECK-LABEL: test3
 ; CHECK: str [[REG_1:d[0-9]+]], [sp, #8]
 ; FAST-LABEL: test3
-; FAST: sub sp, sp, #48
+; FAST: sub sp, sp, #{{[0-9]+}}
 ; FAST: mov x[[ADDR:[0-9]+]], sp
 ; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8]
   %0 = load <2 x i32>, <2 x i32>* %in, align 8
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
index 4288aa1df444ec89a1924102584bf91a897d1a44..dc64123b33c0e2f3c0ced2014b08987df91bae17 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
@@ -4,7 +4,8 @@ define i32 @icmp_eq_imm(i32 %a) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_imm
 ; CHECK:       cmp w0, #31
-; CHECK-NEXT:  cset w0, eq
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i32 %a, 31
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -14,7 +15,8 @@ define i32 @icmp_eq_neg_imm(i32 %a) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_neg_imm
 ; CHECK:       cmn w0, #7
-; CHECK-NEXT:  cset w0, eq
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i32 %a, -7
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -24,7 +26,8 @@ define i32 @icmp_eq_i32(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_i32
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, eq
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -34,7 +37,8 @@ define i32 @icmp_ne(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ne
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, ne
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], ne
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ne i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -64,7 +68,8 @@ define i32 @icmp_ugt(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ugt
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, hi
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], hi
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ugt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -74,7 +79,8 @@ define i32 @icmp_uge(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_uge
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, hs
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], hs
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp uge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -84,7 +90,8 @@ define i32 @icmp_ult(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ult
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, lo
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lo
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ult i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -94,7 +101,8 @@ define i32 @icmp_ule(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_ule
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, ls
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], ls
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ule i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -104,7 +112,8 @@ define i32 @icmp_sgt(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_sgt
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, gt
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -114,7 +123,8 @@ define i32 @icmp_sge(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_sge
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, ge
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], ge
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sge i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -124,7 +134,8 @@ define i32 @icmp_slt(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_slt
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, lt
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp slt i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -134,7 +145,8 @@ define i32 @icmp_sle(i32 %a, i32 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_sle
 ; CHECK:       cmp w0, w1
-; CHECK-NEXT:  cset w0, le
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], le
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sle i32 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -144,7 +156,8 @@ define i32 @icmp_i64(i64 %a, i64 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_i64
 ; CHECK:       cmp  x0, x1
-; CHECK-NEXT:  cset w{{[0-9]+}}, le
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], le
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sle i64 %a, %b
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -153,9 +166,10 @@ entry:
 define zeroext i1 @icmp_eq_i16(i16 %a, i16 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_i16
-; CHECK:       sxth w0, w0
-; CHECK:       cmp w0, w1, sxth
-; CHECK-NEXT:  cset w0, eq
+; CHECK:       sxth [[REG0:w[0-9]+]], w0
+; CHECK:       cmp [[REG0]], w1, sxth
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i16 %a, %b
   ret i1 %cmp
 }
@@ -163,9 +177,10 @@ entry:
 define zeroext i1 @icmp_eq_i8(i8 %a, i8 %b) nounwind ssp {
 entry:
 ; CHECK-LABEL: icmp_eq_i8
-; CHECK:       sxtb w0, w0
-; CHECK-NEXT:  cmp w0, w1, sxtb
-; CHECK-NEXT:  cset w0, eq
+; CHECK:       sxtb [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], w1, sxtb
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], eq
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp eq i8 %a, %b
   ret i1 %cmp
 }
@@ -173,9 +188,10 @@ entry:
 define i32 @icmp_i16_unsigned(i16 %a, i16 %b) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i16_unsigned
-; CHECK:       uxth w0, w0
-; CHECK-NEXT:  cmp w0, w1, uxth
-; CHECK-NEXT:  cset w0, lo
+; CHECK:       uxth [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], w1, uxth
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lo
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ult i16 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -184,9 +200,10 @@ entry:
 define i32 @icmp_i8_signed(i8 %a, i8 %b) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i8_signed
-; CHECK:       sxtb w0, w0
-; CHECK-NEXT:  cmp w0, w1, sxtb
-; CHECK-NEXT:  cset w0, gt
+; CHECK:       sxtb [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], w1, sxtb
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i8 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -198,7 +215,8 @@ entry:
 ; CHECK:       sbfx [[REG1:w[0-9]+]], w0, #0, #1
 ; CHECK-NEXT:  sbfx [[REG2:w[0-9]+]], w1, #0, #1
 ; CHECK-NEXT:  cmp  [[REG1]], [[REG2]]
-; CHECK-NEXT:  cset w0, gt
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i1 %a, %b
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -207,10 +225,10 @@ entry:
 define i32 @icmp_i16_signed_const(i16 %a) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i16_signed_const
-; CHECK:       sxth w0, w0
-; CHECK-NEXT:  cmn w0, #233
-; CHECK-NEXT:  cset w0, lt
-; CHECK-NEXT:  and w0, w0, #0x1
+; CHECK:       sxth [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmn [[REG0]], #233
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp slt i16 %a, -233
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -219,10 +237,10 @@ entry:
 define i32 @icmp_i8_signed_const(i8 %a) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i8_signed_const
-; CHECK:       sxtb w0, w0
-; CHECK-NEXT:  cmp w0, #124
-; CHECK-NEXT:  cset w0, gt
-; CHECK-NEXT:  and w0, w0, #0x1
+; CHECK:       sxtb [[REG0:w[0-9]+]], w0
+; CHECK-NEXT:  cmp [[REG0]], #124
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], gt
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp sgt i8 %a, 124
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
@@ -231,10 +249,10 @@ entry:
 define i32 @icmp_i1_unsigned_const(i1 %a) nounwind {
 entry:
 ; CHECK-LABEL: icmp_i1_unsigned_const
-; CHECK:       and w0, w0, #0x1
-; CHECK-NEXT:  cmp w0, #0
-; CHECK-NEXT:  cset w0, lo
-; CHECK-NEXT:  and w0, w0, #0x1
+; CHECK:       and [[REG0:w[0-9]+]], w0, #0x1
+; CHECK-NEXT:  cmp [[REG0]], #0
+; CHECK-NEXT:  cset [[REG:w[0-9]+]], lo
+; CHECK-NEXT:  and w0, [[REG]], #0x1
   %cmp = icmp ult i1 %a, 0
   %conv2 = zext i1 %cmp to i32
   ret i32 %conv2
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
index 0fcd4fe752f2a7e6f0e6d949f6fd11ca5c497527..c1b7d790878d676b8bc82aa79d887a5de942b950 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
@@ -7,8 +7,8 @@ define void @t1() {
 ; ARM64-LABEL: t1
 ; ARM64: adrp x8, _message@PAGE
 ; ARM64: add x0, x8, _message@PAGEOFF
-; ARM64: mov w9, wzr
-; ARM64: uxtb w1, w9
+; ARM64: mov [[REG:w[0-9]+]], wzr
+; ARM64: uxtb w1, [[REG]]
 ; ARM64: mov x2, #80
 ; ARM64: bl _memset
   call void @llvm.memset.p0i8.i64(i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i8 0, i64 80, i1 false)
@@ -48,15 +48,15 @@ declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1)
 define void @t4() {
 ; ARM64-LABEL: t4
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldr x10, [x9]
-; ARM64: str x10, [x8]
-; ARM64: ldr x10, [x9, #8]
-; ARM64: str x10, [x8, #8]
-; ARM64: ldrb w11, [x9, #16]
-; ARM64: strb w11, [x8, #16]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF
+; ARM64: ldr x10, {{\[}}[[REG2]]{{\]}}
+; ARM64: str x10, {{\[}}[[REG0]]{{\]}}
+; ARM64: ldr x10, {{\[}}[[REG2]], #8]
+; ARM64: str x10, {{\[}}[[REG0]], #8]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #16]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #16]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 16 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 17, i1 false)
   ret void
@@ -65,15 +65,15 @@ define void @t4() {
 define void @t5() {
 ; ARM64-LABEL: t5
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldr x10, [x9]
-; ARM64: str x10, [x8]
-; ARM64: ldr x10, [x9, #8]
-; ARM64: str x10, [x8, #8]
-; ARM64: ldrb w11, [x9, #16]
-; ARM64: strb w11, [x8, #16]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG3:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG1:x[0-9]+]], [[REG3]], _message@PAGEOFF
+; ARM64: ldr x10, {{\[}}[[REG1]]]
+; ARM64: str x10, {{\[}}[[REG0]]]
+; ARM64: ldr x10, {{\[}}[[REG1]], #8]
+; ARM64: str x10, {{\[}}[[REG0]], #8]
+; ARM64: ldrb [[REG4:w[0-9]+]], {{\[}}[[REG1]], #16]
+; ARM64: strb [[REG4]], {{\[}}[[REG0]], #16]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 8 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 17, i1 false)
   ret void
@@ -82,15 +82,15 @@ define void @t5() {
 define void @t6() {
 ; ARM64-LABEL: t6
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldr w10, [x9]
-; ARM64: str w10, [x8]
-; ARM64: ldr w10, [x9, #4]
-; ARM64: str w10, [x8, #4]
-; ARM64: ldrb w10, [x9, #8]
-; ARM64: strb w10, [x8, #8]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF
+; ARM64: ldr w10, {{\[}}[[REG2]]]
+; ARM64: str w10, {{\[}}[[REG0]]]
+; ARM64: ldr w10, {{\[}}[[REG2]], #4]
+; ARM64: str w10, {{\[}}[[REG0]], #4]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #8]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #8]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 4 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 9, i1 false)
   ret void
@@ -99,17 +99,17 @@ define void @t6() {
 define void @t7() {
 ; ARM64-LABEL: t7
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldrh w10, [x9]
-; ARM64: strh w10, [x8]
-; ARM64: ldrh w10, [x9, #2]
-; ARM64: strh w10, [x8, #2]
-; ARM64: ldrh w10, [x9, #4]
-; ARM64: strh w10, [x8, #4]
-; ARM64: ldrb w10, [x9, #6]
-; ARM64: strb w10, [x8, #6]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1]], _message@PAGEOFF
+; ARM64: ldrh w10, {{\[}}[[REG2]]]
+; ARM64: strh w10, {{\[}}[[REG0]]]
+; ARM64: ldrh w10, {{\[}}[[REG2]], #2]
+; ARM64: strh w10, {{\[}}[[REG0]], #2]
+; ARM64: ldrh w10, {{\[}}[[REG2]], #4]
+; ARM64: strh w10, {{\[}}[[REG0]], #4]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #6]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #6]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 2 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 7, i1 false)
   ret void
@@ -118,17 +118,17 @@ define void @t7() {
 define void @t8() {
 ; ARM64-LABEL: t8
 ; ARM64: adrp x8, _temp@GOTPAGE
-; ARM64: ldr x8, [x8, _temp@GOTPAGEOFF]
-; ARM64: adrp x9, _message@PAGE
-; ARM64: add x9, x9, _message@PAGEOFF
-; ARM64: ldrb w10, [x9]
-; ARM64: strb w10, [x8]
-; ARM64: ldrb w10, [x9, #1]
-; ARM64: strb w10, [x8, #1]
-; ARM64: ldrb w10, [x9, #2]
-; ARM64: strb w10, [x8, #2]
-; ARM64: ldrb w10, [x9, #3]
-; ARM64: strb w10, [x8, #3]
+; ARM64: ldr [[REG0:x[0-9]+]], [x8, _temp@GOTPAGEOFF]
+; ARM64: adrp [[REG1:x[0-9]+]], _message@PAGE
+; ARM64: add [[REG2:x[0-9]+]], [[REG1:x[0-9]+]], _message@PAGEOFF
+; ARM64: ldrb w10, {{\[}}[[REG2]]]
+; ARM64: strb w10, {{\[}}[[REG0]]]
+; ARM64: ldrb w10, {{\[}}[[REG2]], #1]
+; ARM64: strb w10, {{\[}}[[REG0]], #1]
+; ARM64: ldrb w10, {{\[}}[[REG2]], #2]
+; ARM64: strb w10, {{\[}}[[REG0]], #2]
+; ARM64: ldrb [[REG3:w[0-9]+]], {{\[}}[[REG2]], #3]
+; ARM64: strb [[REG3]], {{\[}}[[REG0]], #3]
 ; ARM64: ret
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 getelementptr inbounds ([80 x i8], [80 x i8]* @temp, i32 0, i32 0), i8* align 1 getelementptr inbounds ([80 x i8], [80 x i8]* @message, i32 0, i32 0), i64 4, i1 false)
   ret void
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
index 9a67fff00ac34253b93848b54c6b4c94de633119..81c9933a863741f61ffddb3173ad60e3c970b49a 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
@@ -35,7 +35,7 @@ entry:
 define signext i16 @ret_i16(i16 signext %a) nounwind {
 entry:
 ; CHECK: @ret_i16
-; CHECK: sxth	w0, w0
+; CHECK: sxth	w0, {{w[0-9]+}}
   %a.addr = alloca i16, align 1
   store i16 %a, i16* %a.addr, align 1
   %0 = load i16, i16* %a.addr, align 1
@@ -45,7 +45,7 @@ entry:
 define signext i8 @ret_i8(i8 signext %a) nounwind {
 entry:
 ; CHECK: @ret_i8
-; CHECK: sxtb	w0, w0
+; CHECK: sxtb	w0, {{w[0-9]+}}
   %a.addr = alloca i8, align 1
   store i8 %a, i8* %a.addr, align 1
   %0 = load i8, i8* %a.addr, align 1
@@ -55,7 +55,8 @@ entry:
 define signext i1 @ret_i1(i1 signext %a) nounwind {
 entry:
 ; CHECK: @ret_i1
-; CHECK: and w0, w0, #0x1
+; CHECK: and [[REG:w[0-9]+]], {{w[0-9]+}}, #0x1
+; CHECK: sbfx w0, [[REG]], #0, #1
   %a.addr = alloca i1, align 1
   store i1 %a, i1* %a.addr, align 1
   %0 = load i1, i1* %a.addr, align 1
diff --git a/test/CodeGen/AArch64/arm64-fast-isel.ll b/test/CodeGen/AArch64/arm64-fast-isel.ll
index 508e36750eec162c9579263389a90bafd72fd64f..daccc86c709dc6108aad1e5d3ff1e41921c49d6a 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel.ll
@@ -30,11 +30,11 @@ define void @t1(i64 %a) nounwind {
 define zeroext i1 @i1(i1 %a) nounwind {
 entry:
 ; CHECK: @i1
-; CHECK: and w0, w0, #0x1
-; CHECK: strb w0, [sp, #15]
-; CHECK: ldrb w0, [sp, #15]
-; CHECK: and w0, w0, #0x1
-; CHECK: and w0, w0, #0x1
+; CHECK: and [[REG:w[0-9]+]], w0, #0x1
+; CHECK: strb [[REG]], [sp, #15]
+; CHECK: ldrb [[REG1:w[0-9]+]], [sp, #15]
+; CHECK: and [[REG2:w[0-9]+]], [[REG1]], #0x1
+; CHECK: and w0, [[REG2]], #0x1
 ; CHECK: add sp, sp, #16
 ; CHECK: ret
   %a.addr = alloca i1, align 1
diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll
index dd8add70cdb7cead2c9f696277564e16427aa13b..5488c21fa298f0fe6f7c1c66c2e2e0b045532741 100644
--- a/test/CodeGen/AArch64/arm64-ld-from-st.ll
+++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll
@@ -13,7 +13,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str64Ldr32_0
-; CHECK: and x0, x1, #0xffffffff
+; CHECK: mov w0, w1
 define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i32*
@@ -37,7 +37,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str64Ldr16_0
-; CHECK: and x0, x1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i16*
@@ -85,7 +85,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str64Ldr8_0
-; CHECK: and x0, x1, #0xff
+; CHECK: mov w0, w1
 define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i8*
@@ -193,7 +193,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str32Ldr16_0
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i16*
@@ -217,7 +217,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str32Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i8*
@@ -265,7 +265,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str16Ldr16
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i16*
@@ -277,7 +277,7 @@ entry:
 }
 
 ; CHECK-LABEL: Str16Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i8*
@@ -314,7 +314,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str64Ldr32_0
-; CHECK: and x0, x1, #0xffffffff
+; CHECK: mov w0, w1
 define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i32*
@@ -338,7 +338,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str64Ldr16_0
-; CHECK: and x0, x1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i16*
@@ -386,7 +386,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str64Ldr8_0
-; CHECK: and x0, x1, #0xff
+; CHECK: mov w0, w1
 define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
 entry:
   %0 = bitcast i64* %P to i8*
@@ -494,7 +494,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str32Ldr16_0
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i16*
@@ -518,7 +518,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str32Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
 entry:
   %0 = bitcast i32* %P to i8*
@@ -566,7 +566,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str16Ldr16
-; CHECK: and w0, w1, #0xffff
+; CHECK: mov w0, w1
 define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i16*
@@ -578,7 +578,7 @@ entry:
 }
 
 ; CHECK-LABEL: Unscaled_Str16Ldr8_0
-; CHECK: and w0, w1, #0xff
+; CHECK: mov w0, w1
 define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
 entry:
   %0 = bitcast i16* %P to i8*
diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll
index 8946d8db331a95bbfeaf324938d14adc6ff1e35f..7a9f3b2fa97f90e8eb09091e679f3dc63ed575a8 100644
--- a/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -242,14 +242,12 @@ define void @memset_8_stack() {
   ret void
 }
 
-; FIXME This could be better: x9 is a superset of w8's bit-pattern.
 define void @memset_12_stack() {
 ; CHECK-LABEL: memset_12_stack:
-; CHECK:       mov w8, #-1431655766
-; CHECK-NEXT:  mov x9, #-6148914691236517206
+; CHECK:       mov x8, #-6148914691236517206
 ; CHECK-NEXT:  mov x0, sp
+; CHECK-NEXT:  str x8, [sp]
 ; CHECK-NEXT:  str w8, [sp, #8]
-; CHECK-NEXT:  str x9, [sp]
 ; CHECK-NEXT:  bl something
   %buf = alloca [12 x i8], align 1
   %cast = bitcast [12 x i8]* %buf to i8*
@@ -272,14 +270,12 @@ define void @memset_16_stack() {
   ret void
 }
 
-; FIXME This could be better: x9 is a superset of w8's bit-pattern.
 define void @memset_20_stack() {
 ; CHECK-LABEL: memset_20_stack:
-; CHECK:       mov w8, #-1431655766
-; CHECK-NEXT:  mov x9, #-6148914691236517206
+; CHECK:       mov x8, #-6148914691236517206
 ; CHECK-NEXT:  add x0, sp, #8
+; CHECK-NEXT:  stp x8, x8, [sp, #8]
 ; CHECK-NEXT:  str w8, [sp, #24]
-; CHECK-NEXT:  stp x9, x9, [sp, #8]
 ; CHECK-NEXT:  bl something
   %buf = alloca [20 x i8], align 1
   %cast = bitcast [20 x i8]* %buf to i8*
@@ -288,15 +284,13 @@ define void @memset_20_stack() {
   ret void
 }
 
-; FIXME This could be better: x9 is a superset of w8's bit-pattern.
 define void @memset_26_stack() {
 ; CHECK-LABEL: memset_26_stack:
-; CHECK:       mov w8, #43690
-; CHECK-NEXT:  mov x9, #-6148914691236517206
+; CHECK:       mov x8, #-6148914691236517206
 ; CHECK-NEXT:  mov x0, sp
+; CHECK-NEXT:  stp x8, x8, [sp, #8]
+; CHECK-NEXT:  str x8, [sp]
 ; CHECK-NEXT:  strh w8, [sp, #24]
-; CHECK-NEXT:  stp x9, x9, [sp, #8]
-; CHECK-NEXT:  str x9, [sp]
 ; CHECK-NEXT:  bl something
   %buf = alloca [26 x i8], align 1
   %cast = bitcast [26 x i8]* %buf to i8*
diff --git a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
index 7cc5a43d53cea37c8c86d5b31b62078d2f28b57a..bb3c36adee55fb6da96dc473e403b78b25103fdb 100644
--- a/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
@@ -975,7 +975,7 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
 define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
 	%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -995,7 +995,7 @@ define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
 define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
 	%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1015,7 +1015,7 @@ define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
 define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
 	%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 2a9e545165e95cb7502ac68325923998f2b251ae..0b6132b1be64bd3e229895de38c59e45876410db 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1401,7 +1401,7 @@ entry:
 
 define <4 x i16> @concat_vector_v4i16_const() {
 ; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
  %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %r
 }
@@ -1422,7 +1422,7 @@ define <4 x i32> @concat_vector_v4i32_const() {
 
 define <8 x i8> @concat_vector_v8i8_const() {
 ; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
  %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
  ret <8 x i8> %r
 }
diff --git a/test/CodeGen/AArch64/arm64-spill-remarks.ll b/test/CodeGen/AArch64/arm64-spill-remarks.ll
index 53a16ed748b22b164fb52adab00d056cf16b6311..2d187a744458ca316f4dc2aef6de8c5328731b2d 100644
--- a/test/CodeGen/AArch64/arm64-spill-remarks.ll
+++ b/test/CodeGen/AArch64/arm64-spill-remarks.ll
@@ -38,7 +38,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 3, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 3, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -51,7 +51,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         30000
 ; YAML: Args:
@@ -64,7 +64,7 @@
 ; YAML: --- !Missed
 ; YAML: Pass:            regalloc
 ; YAML: Name:            LoopSpillReload
-; YAML: DebugLoc:        { File: /tmp/kk.c, Line: 1, Column: 20 }
+; YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 1, Column: 20 }
 ; YAML: Function:        fpr128
 ; YAML: Hotness:         300
 ; YAML: Args:
@@ -79,7 +79,7 @@
 ; THRESHOLD_YAML: --- !Missed
 ; THRESHOLD_YAML: Pass:            regalloc
 ; THRESHOLD_YAML: Name:            LoopSpillReload
-; THRESHOLD_YAML: DebugLoc:        { File: /tmp/kk.c, Line: 2, Column: 20 }
+; THRESHOLD_YAML: DebugLoc:        { File: '/tmp/kk.c', Line: 2, Column: 20 }
 ; THRESHOLD_YAML: Function:        fpr128
 ; THRESHOLD_YAML: Hotness:         30000
 ; THRESHOLD_YAML: Args:
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 68892eeacf37f6587dda1426633056bd5f8924dd..8debd21ee6e60a4a843bb0097a260cb11c5db9d3 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -19,7 +19,7 @@ define void @func30(%T0_30 %v0, %T1_30* %p1) {
 ; sensible instead.
 define <1 x i32> @autogen_SD7918() {
 ; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: ret
   %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
   %ZE = zext <1 x i1> %I29 to <1 x i32>
diff --git a/test/CodeGen/AArch64/arm64-vpopcnt.ll b/test/CodeGen/AArch64/arm64-vpopcnt.ll
index 4fb73ca4805dfce63346b35426baf347d63479cd..6fe1176eaa871d4377c85ca05a13ee78450942d7 100644
--- a/test/CodeGen/AArch64/arm64-vpopcnt.ll
+++ b/test/CodeGen/AArch64/arm64-vpopcnt.ll
@@ -1,65 +1,102 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-apple- -mcpu=cyclone | FileCheck %s
 
 ; The non-byte ones used to fail with "Cannot select"
 
-; CHECK-LABEL: ctpopv8i8
-; CHECK: cnt.8b
 define <8 x i8> @ctpopv8i8(<8 x i8> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    ret
   %cnt = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %x)
   ret <8 x i8> %cnt
 }
 
 declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) nounwind readnone
 
-; CHECK-LABEL: ctpopv4i16
-; CHECK: cnt.8b
 define <4 x i16> @ctpopv4i16(<4 x i16> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    ret
   %cnt = tail call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %x)
   ret <4 x i16> %cnt
 }
 
 declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
 
-; CHECK-LABEL: ctpopv2i32
-; CHECK: cnt.8b
 define <2 x i32> @ctpopv2i32(<2 x i32> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    ret
   %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %x)
   ret <2 x i32> %cnt
 }
 
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) nounwind readnone
 
+define <1 x i64> @ctpopv1i64(<1 x i64> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlp v0.4h, v0.8b
+; CHECK-NEXT:    uaddlp v0.2s, v0.4h
+; CHECK-NEXT:    uaddlp v0.1d, v0.2s
+; CHECK-NEXT:    ret
+  %cnt = tail call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %x)
+  ret <1 x i64> %cnt
+}
+
+declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
 
-; CHECK-LABEL: ctpopv16i8
-; CHECK: cnt.16b
 define <16 x i8> @ctpopv16i8(<16 x i8> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    ret
   %cnt = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %x)
   ret <16 x i8> %cnt
 }
 
 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) nounwind readnone
 
-; CHECK-LABEL: ctpopv8i16
-; CHECK: cnt.8b
 define <8 x i16> @ctpopv8i16(<8 x i16> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    ret
   %cnt = tail call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %x)
   ret <8 x i16> %cnt
 }
 
 declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
 
-; CHECK-LABEL: ctpopv4i32
-; CHECK: cnt.8b
 define <4 x i32> @ctpopv4i32(<4 x i32> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    ret
   %cnt = tail call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
   ret <4 x i32> %cnt
 }
 
 declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) nounwind readnone
 
-; CHECK-LABEL: ctpopv2i64
-; CHECK: cnt.8b
 define <2 x i64> @ctpopv2i64(<2 x i64> %x) nounwind readnone {
+; CHECK-LABEL: ctpopv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlp v0.8h, v0.16b
+; CHECK-NEXT:    uaddlp v0.4s, v0.8h
+; CHECK-NEXT:    uaddlp v0.2d, v0.4s
+; CHECK-NEXT:    ret
   %cnt = tail call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
   ret <2 x i64> %cnt
 }
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index b4f57675ace3efc44b8d54ed22223b8384eb1a16..fdd7cad78536b0aeb30f7c69c09032e2185ac740 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -2,7 +2,7 @@
 
 
 ; CHECK: test1
-; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
+; CHECK: movi.16b v[[REG0:[0-9]+]], #0
 define <8 x i1> @test1() {
 entry:
   %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 0335d0a6a0732a41574af963938151fb472a0def..784b4c486fe2e7d9072ed808d63759a170f59127 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -162,28 +162,28 @@ entry:
 define <8 x i8> @tv8i8() {
 entry:
 ; ALL-LABEL: tv8i8:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
 }
 
 define <4 x i16> @tv4i16() {
 entry:
 ; ALL-LABEL: tv4i16:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
 }
 
 define <2 x i32> @tv2i32() {
 entry:
 ; ALL-LABEL: tv2i32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <2 x i32> <i32 0, i32 0>
 }
 
 define <2 x float> @tv2f32() {
 entry:
 ; ALL-LABEL: tv2f32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <2 x float> <float 0.0, float 0.0>
 }
 
diff --git a/test/CodeGen/AArch64/bitcast-promote-widen.ll b/test/CodeGen/AArch64/bitcast-promote-widen.ll
new file mode 100644
index 0000000000000000000000000000000000000000..74f9e9c85669cbfd98308a84bc81b5d6145d70e0
--- /dev/null
+++ b/test/CodeGen/AArch64/bitcast-promote-widen.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
+
+; Test cases of bitcasts where one type needs to be widened and one needs to be promoted.
+
+define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) {
+; CHECK-LABEL: bitcast_v2i16_v2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    fmov s1, w8
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov v1.s[1], w8
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+  %y = bitcast <2 x half> %x to <2 x i16>
+  ret <2 x i16> %y
+}
+
+define <2 x half> @bitcast_v2f16_v2i16(<2 x i16> %x) {
+; CHECK-LABEL: bitcast_v2f16_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v0.4h
+; CHECK-NEXT:    ret
+  %y = bitcast <2 x i16> %x to <2 x half>
+  ret <2 x half> %y
+}
diff --git a/test/CodeGen/AArch64/bitcast.ll b/test/CodeGen/AArch64/bitcast.ll
index e88ea9ec021393b8feafab687cae22523704406b..d60bd4ab3fc5f2ea585f7830a01f2f658676d91b 100644
--- a/test/CodeGen/AArch64/bitcast.ll
+++ b/test/CodeGen/AArch64/bitcast.ll
@@ -4,7 +4,7 @@
 
 define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-LABEL: foo1:
-; CHECK:       movi	d0, #0000000000000000
+; CHECK:       movi	v0.2d, #0000000000000000
 ; CHECK-NEXT:  ret
 
   %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
@@ -16,7 +16,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
 
 define <4 x i16> @foo2(<2 x i32> %a) {
 ; CHECK-LABEL: foo2:
-; CHECK:       movi	d0, #0000000000000000
+; CHECK:       movi	v0.2d, #0000000000000000
 ; CHECK-NEXT:  ret
 
   %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll
index 1c2e5528f10c74fcc7a7b9db26568ee53803e15b..9689a3b9b588ac7b21322b864f9e5a10dd87be8c 100644
--- a/test/CodeGen/AArch64/chkstk.ll
+++ b/test/CodeGen/AArch64/chkstk.ll
@@ -1,8 +1,12 @@
 ; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \
 ; RUN:  | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
+; RUN: llc -mtriple=aarch64-windows -print-machineinstrs=prologepilog %s -o - 2>&1 \
+; RUN:  | FileCheck -check-prefix CHECK-REGSTATE %s
 
 ; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \
 ; RUN:  | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
+; RUN: llc -mtriple=aarch64-windows -print-machineinstrs=prologepilog -code-model=large %s -o - 2>&1 \
+; RUN:  | FileCheck -check-prefix CHECK-REGSTATE-LARGE %s
 
 define void @check_watermark() {
 entry:
@@ -12,14 +16,18 @@ entry:
 
 ; CHECK-DEFAULT-CODE-MODEL: check_watermark:
 ; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp
-; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-DEFAULT-CODE-MODEL-DAG: mov x15, #256
 ; CHECK-DEFAULT-CODE-MODEL:     bl __chkstk
 ; CHECK-DEFAULT-CODE-MODEL:     sub sp, sp, x15, lsl #4
 
+; CHECK-REGSTATE: frame-setup BL &__chkstk, implicit-def $lr, implicit $sp, implicit $x15, implicit-def dead $x16, implicit-def dead $x17, implicit-def dead $nzcv
+
 ; CHECK-LARGE-CODE-MODEL: check_watermark:
 ; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp
-; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-LARGE-CODE-MODEL-DAG: mov x15, #256
 ; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk
 ; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk
 ; CHECK-LARGE-CODE-MODEL:     blr x16
 ; CHECK-LARGE-CODE-MODEL:     sub sp, sp, x15, lsl #4
+
+; CHECK-REGSTATE-LARGE: frame-setup BLR killed $x16, implicit-def $lr, implicit $sp, implicit-def $x15, implicit-def dead $x16, implicit-def dead $x17, implicit-def dead $nzcv
diff --git a/test/CodeGen/AArch64/ext-narrow-index.ll b/test/CodeGen/AArch64/ext-narrow-index.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f7f143ff49e31ab5254c459868ac8a1cf2a09cbe
--- /dev/null
+++ b/test/CodeGen/AArch64/ext-narrow-index.ll
@@ -0,0 +1,345 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; Tests of shufflevector where the index operand is half the width of the vector
+; operands. We should get one ext instruction and not two.
+
+; i8 tests
+define <8 x i8> @i8_off0(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off1(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #1
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off8:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off15(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off15:
+; CHECK: ext v0.16b, v0.16b, v1.16b, #15
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_off22(<16 x i8> %arg1, <16 x i8> %arg2) {
+; CHECK-LABEL: i8_off22:
+; CHECK: ext v0.16b, v1.16b, v1.16b, #6
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
+  ret <8 x i8> %shuffle
+}
+
+; i16 tests
+define <4 x i16> @i16_off0(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_off1(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_off7(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off7:
+; CHECK: ext v0.16b, v0.16b, v1.16b, #14
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 7, i32 8, i32 9, i32 10>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_off8(<8 x i16> %arg1, <8 x i16> %arg2) {
+; CHECK-LABEL: i16_off8:
+; CHECK: mov v0.16b, v1.16b
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> %arg2, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ret <4 x i16> %shuffle
+}
+
+; i32 tests
+define <2 x i32> @i32_off0(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 0, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_off1(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 1, i32 2>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_off3(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off3:
+; CHECK: ext v0.16b, v0.16b, v1.16b, #12
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 3, i32 4>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_off4(<4 x i32> %arg1, <4 x i32> %arg2) {
+; CHECK-LABEL: i32_off4:
+; CHECK: mov v0.16b, v1.16b
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> %arg2, <2 x i32> <i32 4, i32 5>
+  ret <2 x i32> %shuffle
+}
+
+; i64 tests
+define <1 x i64> @i64_off0(<2 x i64> %arg1, <2 x i64> %arg2) {
+; CHECK-LABEL: i64_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> <i32 0>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_off1(<2 x i64> %arg1, <2 x i64> %arg2) {
+; CHECK-LABEL: i64_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_off2(<2 x i64> %arg1, <2 x i64> %arg2) {
+; CHECK-LABEL: i64_off2:
+; CHECK: mov v0.16b, v1.16b
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> %arg2, <1 x i32> <i32 2>
+  ret <1 x i64> %shuffle
+}
+
+; i8 tests with second operand zero
+define <8 x i8> @i8_zero_off0(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off1(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #1
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off8:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off15(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off15:
+; CHECK: movi [[REG:v[0-9]+]].2d, #0
+; CHECK: ext v0.16b, v0.16b, [[REG]].16b, #15
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
+  ret <8 x i8> %shuffle
+}
+
+define <8 x i8> @i8_zero_off22(<16 x i8> %arg1) {
+; CHECK-LABEL: i8_zero_off22:
+; CHECK: movi v0.2d, #0
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
+  ret <8 x i8> %shuffle
+}
+
+; i16 tests with second operand zero
+define <4 x i16> @i16_zero_off0(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_zero_off1(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #2
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_zero_off7(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off7:
+; CHECK: movi [[REG:v[0-9]+]].2d, #0
+; CHECK: ext v0.16b, v0.16b, [[REG]].16b, #14
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 7, i32 8, i32 9, i32 10>
+  ret <4 x i16> %shuffle
+}
+
+define <4 x i16> @i16_zero_off8(<8 x i16> %arg1) {
+; CHECK-LABEL: i16_zero_off8:
+; CHECK: movi v0.2d, #0
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <8 x i16> %arg1, <8 x i16> zeroinitializer, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  ret <4 x i16> %shuffle
+}
+
+; i32 tests with second operand zero
+define <2 x i32> @i32_zero_off0(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_zero_off1(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_zero_off3(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off3:
+; CHECK: movi [[REG:v[0-9]+]].2d, #0
+; CHECK: ext v0.16b, v0.16b, [[REG]].16b, #12
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 3, i32 4>
+  ret <2 x i32> %shuffle
+}
+
+define <2 x i32> @i32_zero_off4(<4 x i32> %arg1) {
+; CHECK-LABEL: i32_zero_off4:
+; CHECK: movi v0.2d, #0
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <4 x i32> %arg1, <4 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
+  ret <2 x i32> %shuffle
+}
+
+; i64 tests with second operand zero
+define <1 x i64> @i64_zero_off0(<2 x i64> %arg1) {
+; CHECK-LABEL: i64_zero_off0:
+; CHECK-NOT: mov
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 0>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_zero_off1(<2 x i64> %arg1) {
+; CHECK-LABEL: i64_zero_off1:
+; CHECK-NOT: mov
+; CHECK: ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 1>
+  ret <1 x i64> %shuffle
+}
+
+define <1 x i64> @i64_zero_off2(<2 x i64> %arg1) {
+; CHECK-LABEL: i64_zero_off2:
+; CHECK: fmov d0, xzr
+; CHECK-NOT: ext
+; CHECK: ret
+entry:
+  %shuffle = shufflevector <2 x i64> %arg1, <2 x i64> zeroinitializer, <1 x i32> <i32 2>
+  ret <1 x i64> %shuffle
+}
diff --git a/test/CodeGen/AArch64/extract-bits.ll b/test/CodeGen/AArch64/extract-bits.ll
index a60883b958e3159086018878dffa9eeeaf8d8e05..5dbb71939bb4f780812b98d121e81f7bf23bef6f 100644
--- a/test/CodeGen/AArch64/extract-bits.ll
+++ b/test/CodeGen/AArch64/extract-bits.ll
@@ -34,6 +34,22 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
   ret i32 %masked
 }
 
+define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; CHECK-LABEL: bextr32_a0_arithmetic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    lsl w9, w9, w2
+; CHECK-NEXT:    asr w8, w0, w1
+; CHECK-NEXT:    sub w9, w9, #1 // =1
+; CHECK-NEXT:    and w0, w9, w8
+; CHECK-NEXT:    ret
+  %shifted = ashr i32 %val, %numskipbits
+  %onebit = shl i32 1, %numlowbits
+  %mask = add nsw i32 %onebit, -1
+  %masked = and i32 %mask, %shifted
+  ret i32 %masked
+}
+
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr32_a1_indexzext:
 ; CHECK:       // %bb.0:
@@ -124,6 +140,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
   ret i64 %masked
 }
 
+define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; CHECK-LABEL: bextr64_a0_arithmetic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr w9, wzr, #0x1
+; CHECK-NEXT:    lsl x9, x9, x2
+; CHECK-NEXT:    asr x8, x0, x1
+; CHECK-NEXT:    sub x9, x9, #1 // =1
+; CHECK-NEXT:    and x0, x9, x8
+; CHECK-NEXT:    ret
+  %shifted = ashr i64 %val, %numskipbits
+  %onebit = shl i64 1, %numlowbits
+  %mask = add nsw i64 %onebit, -1
+  %masked = and i64 %mask, %shifted
+  ret i64 %masked
+}
+
 define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; CHECK-LABEL: bextr64_a1_indexzext:
 ; CHECK:       // %bb.0:
@@ -838,3 +870,93 @@ define i64 @c4_i64_bad(i64 %arg) {
   %tmp1 = and i64 %tmp0, 16382
   ret i64 %tmp1
 }
+
+; ---------------------------------------------------------------------------- ;
+; Constant, storing the result afterwards.
+; ---------------------------------------------------------------------------- ;
+
+; i32
+
+; The most canonical variant
+define void @c5_i32(i32 %arg, i32* %ptr) {
+; CHECK-LABEL: c5_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #19, #10
+; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i32(i32 %arg, i32* %ptr) {
+; CHECK-LABEL: c6_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #19, #12
+; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 4095
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i32(i32 %arg, i32* %ptr) {
+; CHECK-LABEL: c7_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx w8, w0, #19, #10
+; CHECK-NEXT:    lsl w8, w8, #2
+; CHECK-NEXT:    str w8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  %tmp2 = shl i32 %tmp1, 2
+  store i32 %tmp2, i32* %ptr
+  ret void
+}
+
+; i64
+
+; The most canonical variant
+define void @c5_i64(i64 %arg, i64* %ptr) {
+; CHECK-LABEL: c5_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x8, x0, #51, #10
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i64(i64 %arg, i64* %ptr) {
+; CHECK-LABEL: c6_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x8, x0, #51, #12
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 4095
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i64(i64 %arg, i64* %ptr) {
+; CHECK-LABEL: c7_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ubfx x8, x0, #51, #10
+; CHECK-NEXT:    lsl x8, x8, #2
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    ret
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  %tmp2 = shl i64 %tmp1, 2
+  store i64 %tmp2, i64* %ptr
+  ret void
+}
diff --git a/test/CodeGen/AArch64/extract-insert.ll b/test/CodeGen/AArch64/extract-insert.ll
new file mode 100644
index 0000000000000000000000000000000000000000..077e5f3d042df333edae4a067de1cd4439c7a49e
--- /dev/null
+++ b/test/CodeGen/AArch64/extract-insert.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64_be-- < %s | FileCheck %s --check-prefix=BE
+; RUN: llc -mtriple=aarch64--    < %s | FileCheck %s --check-prefix=LE
+
+define i32 @trunc_i64_to_i32_le(i64 %x) {
+; BE-LABEL: trunc_i64_to_i32_le:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    rev64 v0.4s, v0.4s
+; BE-NEXT:    fmov w0, s0
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i32_le:
+; LE:       // %bb.0:
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <4 x i32>
+  %ext = extractelement <4 x i32> %bc, i32 0
+  ret i32 %ext
+}
+
+define i32 @trunc_i64_to_i32_be(i64 %x) {
+; BE-LABEL: trunc_i64_to_i32_be:
+; BE:       // %bb.0:
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i32_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    mov w0, v0.s[1]
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <4 x i32>
+  %ext = extractelement <4 x i32> %bc, i32 1
+  ret i32 %ext
+}
+
+define i16 @trunc_i64_to_i16_le(i64 %x) {
+; BE-LABEL: trunc_i64_to_i16_le:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov d0, x0
+; BE-NEXT:    rev64 v0.8h, v0.8h
+; BE-NEXT:    umov w0, v0.h[0]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i16_le:
+; LE:       // %bb.0:
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %ext = extractelement <8 x i16> %bc, i32 0
+  ret i16 %ext
+}
+
+define i16 @trunc_i64_to_i16_be(i64 %x) {
+; BE-LABEL: trunc_i64_to_i16_be:
+; BE:       // %bb.0:
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i16_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    umov w0, v0.h[3]
+; LE-NEXT:    ret
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %ext = extractelement <8 x i16> %bc, i32 3
+  ret i16 %ext
+}
+
+define i8 @trunc_i32_to_i8_le(i32 %x) {
+; BE-LABEL: trunc_i32_to_i8_le:
+; BE:       // %bb.0:
+; BE-NEXT:    fmov s0, w0
+; BE-NEXT:    rev32 v0.16b, v0.16b
+; BE-NEXT:    umov w0, v0.b[0]
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i32_to_i8_le:
+; LE:       // %bb.0:
+; LE-NEXT:    ret
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %bc = bitcast <4 x i32> %ins to <16 x i8>
+  %ext = extractelement <16 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
+define i8 @trunc_i32_to_i8_be(i32 %x) {
+; BE-LABEL: trunc_i32_to_i8_be:
+; BE:       // %bb.0:
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i32_to_i8_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov s0, w0
+; LE-NEXT:    umov w0, v0.b[3]
+; LE-NEXT:    ret
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %bc = bitcast <4 x i32> %ins to <16 x i8>
+  %ext = extractelement <16 x i8> %bc, i32 3
+  ret i8 %ext
+}
+
+; Weird type (non-power-of-2 vector) is ok.
+
+define i8 @trunc_i64_to_i8_be(i64 %x) {
+; BE-LABEL: trunc_i64_to_i8_be:
+; BE:       // %bb.0:
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; BE-NEXT:    ret
+;
+; LE-LABEL: trunc_i64_to_i8_be:
+; LE:       // %bb.0:
+; LE-NEXT:    fmov d0, x0
+; LE-NEXT:    umov w0, v0.b[7]
+; LE-NEXT:    ret
+  %ins = insertelement <3 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <3 x i64> %ins to <24 x i8>
+  %ext = extractelement <24 x i8> %bc, i32 7
+  ret i8 %ext
+}
+
diff --git a/test/CodeGen/AArch64/fadd-combines.ll b/test/CodeGen/AArch64/fadd-combines.ll
index be027a7b558b2fffd465507a4318cfb57d9f7036..7332101a481e610c037cf34bc77cfd237cd4900d 100644
--- a/test/CodeGen/AArch64/fadd-combines.ll
+++ b/test/CodeGen/AArch64/fadd-combines.ll
@@ -51,8 +51,8 @@ define double @test4(double %a, double %b, double %c) {
   ret double %add2
 }
 
-define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: test5:
+define <4 x float> @fmulnegtwo_vec(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
@@ -62,6 +62,39 @@ define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
   ret <4 x float> %add
 }
 
+define <4 x float> @fmulnegtwo_vec_commute(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec_commute:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %b, <float -2.0, float -2.0, float -2.0, float -2.0>
+  %add = fadd <4 x float> %mul, %a
+  ret <4 x float> %add
+}
+
+define <4 x float> @fmulnegtwo_vec_undefs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec_undefs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %b, <float undef, float -2.0, float undef, float -2.0>
+  %add = fadd <4 x float> %a, %mul
+  ret <4 x float> %add
+}
+
+define <4 x float> @fmulnegtwo_vec_commute_undefs(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: fmulnegtwo_vec_commute_undefs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fadd v1.4s, v1.4s, v1.4s
+; CHECK-NEXT:    fsub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %mul = fmul <4 x float> %b, <float -2.0, float undef, float -2.0, float -2.0>
+  %add = fadd <4 x float> %mul, %a
+  ret <4 x float> %add
+}
+
 define <4 x float> @test6(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: test6:
 ; CHECK:       // %bb.0:
@@ -99,10 +132,10 @@ define double @test7(double %a, double %b) nounwind {
 define float @fadd_const_multiuse_fmf(float %x) {
 ; CHECK-LABEL: fadd_const_multiuse_fmf:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI7_0
-; CHECK-NEXT:    adrp x9, .LCPI7_1
-; CHECK-NEXT:    ldr s1, [x8, :lo12:.LCPI7_0]
-; CHECK-NEXT:    ldr s2, [x9, :lo12:.LCPI7_1]
+; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    adrp x9, .LCPI10_1
+; CHECK-NEXT:    ldr s1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    ldr s2, [x9, :lo12:.LCPI10_1]
 ; CHECK-NEXT:    fadd s1, s0, s1
 ; CHECK-NEXT:    fadd s0, s0, s2
 ; CHECK-NEXT:    fadd s0, s1, s0
@@ -120,10 +153,10 @@ define float @fadd_const_multiuse_fmf(float %x) {
 define float @fadd_const_multiuse_attr(float %x) #0 {
 ; CHECK-LABEL: fadd_const_multiuse_attr:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x9, .LCPI8_1
-; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    ldr s1, [x9, :lo12:.LCPI8_1]
-; CHECK-NEXT:    ldr s2, [x8, :lo12:.LCPI8_0]
+; CHECK-NEXT:    adrp x9, .LCPI11_1
+; CHECK-NEXT:    adrp x8, .LCPI11_0
+; CHECK-NEXT:    ldr s1, [x9, :lo12:.LCPI11_1]
+; CHECK-NEXT:    ldr s2, [x8, :lo12:.LCPI11_0]
 ; CHECK-NEXT:    fadd s1, s0, s1
 ; CHECK-NEXT:    fadd s1, s2, s1
 ; CHECK-NEXT:    fadd s0, s0, s1
diff --git a/test/CodeGen/AArch64/fast-isel-address-extends.ll b/test/CodeGen/AArch64/fast-isel-address-extends.ll
index 6a17ec502a02d9ce463d86b7a4fd6cc9da791da5..8b0ffa8c10dabef56148e1ee641c9338b9053a3e 100644
--- a/test/CodeGen/AArch64/fast-isel-address-extends.ll
+++ b/test/CodeGen/AArch64/fast-isel-address-extends.ll
@@ -6,8 +6,10 @@ target triple = "arm64-apple-ios8.0.0"
 ; This test was trying to fold the sext %tmp142 in to the address arithmetic in %sunkaddr1.
 ; This was incorrect as %.mux isn't available in the last bb.
 
-; CHECK: sxtw [[REG:x[0-9]+]]
-; CHECK: strh wzr, {{\[}}[[REG]], {{.*}}, lsl #1]
+; CHECK: sxtw [[REG0:x[0-9]+]]
+; CHECK: str [[REG0]], [sp, [[OFFSET:#[0-9]+]]]
+; CHECK: ldr [[REG1:x[0-9]+]], [sp, [[OFFSET]]]
+; CHECK: strh wzr, [{{.*}}, [[REG1]], lsl #1]
 
 ; Function Attrs: nounwind optsize ssp
 define void @EdgeLoop(i32 %dir, i32 %edge, i32 %width, i16* %tmp89, i32 %tmp136, i16 %tmp144) #0 {
diff --git a/test/CodeGen/AArch64/fast-isel-atomic.ll b/test/CodeGen/AArch64/fast-isel-atomic.ll
index 452129e49515c46ab044be33d1aecdb9457b3ac7..240e82805726f08d408178c96aa0e677fb373c0e 100644
--- a/test/CodeGen/AArch64/fast-isel-atomic.ll
+++ b/test/CodeGen/AArch64/fast-isel-atomic.ll
@@ -91,8 +91,8 @@ define void @atomic_store_release_8(i8* %p, i8 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_8_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #1
-; CHECK-NEXT:  stlrb w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #1
+; CHECK-NEXT:  stlrb w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_8_off(i8* %p, i8 %val) #0 {
   %tmp0 = getelementptr i8, i8* %p, i32 1
@@ -111,8 +111,8 @@ define void @atomic_store_release_16(i16* %p, i16 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_16_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #2
-; CHECK-NEXT:  stlrh w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #2
+; CHECK-NEXT:  stlrh w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_16_off(i16* %p, i16 %val) #0 {
   %tmp0 = getelementptr i16, i16* %p, i32 1
@@ -131,8 +131,8 @@ define void @atomic_store_release_32(i32* %p, i32 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_32_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #4
-; CHECK-NEXT:  stlr w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #4
+; CHECK-NEXT:  stlr w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_32_off(i32* %p, i32 %val) #0 {
   %tmp0 = getelementptr i32, i32* %p, i32 1
@@ -151,8 +151,8 @@ define void @atomic_store_release_64(i64* %p, i64 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_release_64_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #8
-; CHECK-NEXT:  stlr x1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #8
+; CHECK-NEXT:  stlr x1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_release_64_off(i64* %p, i64 %val) #0 {
   %tmp0 = getelementptr i64, i64* %p, i32 1
@@ -172,8 +172,8 @@ define void @atomic_store_seq_cst_8(i8* %p, i8 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_8_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #1
-; CHECK-NEXT:  stlrb w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #1
+; CHECK-NEXT:  stlrb w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_8_off(i8* %p, i8 %val) #0 {
   %tmp0 = getelementptr i8, i8* %p, i32 1
@@ -192,8 +192,8 @@ define void @atomic_store_seq_cst_16(i16* %p, i16 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_16_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #2
-; CHECK-NEXT:  stlrh w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #2
+; CHECK-NEXT:  stlrh w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_16_off(i16* %p, i16 %val) #0 {
   %tmp0 = getelementptr i16, i16* %p, i32 1
@@ -212,8 +212,8 @@ define void @atomic_store_seq_cst_32(i32* %p, i32 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_32_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #4
-; CHECK-NEXT:  stlr w1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #4
+; CHECK-NEXT:  stlr w1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_32_off(i32* %p, i32 %val) #0 {
   %tmp0 = getelementptr i32, i32* %p, i32 1
@@ -232,8 +232,8 @@ define void @atomic_store_seq_cst_64(i64* %p, i64 %val) #0 {
 
 ; CHECK-LABEL: atomic_store_seq_cst_64_off:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT:  add x0, x0, #8
-; CHECK-NEXT:  stlr x1, [x0]
+; CHECK-NEXT:  add [[REG0:x[0-9]+]], x0, #8
+; CHECK-NEXT:  stlr x1, {{\[}}[[REG0]]]
 ; CHECK-NEXT:  ret
 define void @atomic_store_seq_cst_64_off(i64* %p, i64 %val) #0 {
   %tmp0 = getelementptr i64, i64* %p, i32 1
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
index 55fbf63319ee3da06f698cf1727caaa4dc5cd287..0cafd883f6947f5aeebe01eee90a9385c7dbd6bf 100644
--- a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
+++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
@@ -2,9 +2,9 @@
 
 define void @test(i64 %a, i64 %b, i2* %c) {
 ; CHECK-LABEL: test
-; CHECK:       and [[REG1:w[0-9]+]], w8, #0x3
+; CHECK:       and [[REG1:w[0-9]+]], {{w[0-9]+}}, #0x3
 ; CHECK-NEXT:  strb [[REG1]], {{\[}}x2{{\]}}
-; CHECK-NEXT:  tbz w9, #0,
+; CHECK-NEXT:  tbz {{w[0-9]+}}, #0,
  %1 = trunc i64 %a to i2
  %2 = trunc i64 %b to i1
 ; Force fast-isel to fall back to SDAG.
diff --git a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
index d5b64c5363e1bfeff98d1c1cca758546410fb350..42112065943be401ff2822c6bf8f1c01b700ef0c 100644
--- a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
+++ b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
@@ -24,7 +24,7 @@ bb2:
 define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: icmp_constfold_v2i32:
 ; CHECK:      ; %bb.0:
-; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT:  movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT:  movi.2s [[MASK:v[0-9]+]], #1
 ; CHECK-NEXT:  and.8b v0, v[[CMP]], [[MASK]]
@@ -56,7 +56,7 @@ bb2:
 define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: icmp_constfold_v4i32:
 ; CHECK:      ; %bb.0:
-; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT:  movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT:  movi.4h [[MASK:v[0-9]+]], #1
 ; CHECK-NEXT:  and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]]
diff --git a/test/CodeGen/AArch64/fast-isel-dbg.ll b/test/CodeGen/AArch64/fast-isel-dbg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4d26b9142af0cba44ea0579f37b58e9ae06c1dd7
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-dbg.ll
@@ -0,0 +1,26 @@
+; RUN: llc -o - %s -fast-isel -stop-before=expand-isel-pseudos | FileCheck %s
+; Make sure fast-isel produces DBG_VALUE instructions even if no debug printer
+; is scheduled because of -stop-before.
+target triple="aarch64--"
+
+; CHECK-LABEL: name: func
+; CHECK: DBG_VALUE
+define void @func(i32 %a) !dbg !4 {
+  call void @llvm.dbg.declare(metadata i32 %a, metadata !5, metadata !DIExpression()), !dbg !7
+  ret void
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+attributes #0 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "fast-isel-dbg.ll", directory: "/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "func", scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!5 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 17, type: !6)
+!6 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!7 = !DILocation(line: 17, scope: !4)
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll
index 719d3f46950ff1b8b9918f7dec5f43a25f801cdc..ab13eb631d4eb594152b2e3819521cfa74599f07 100644
--- a/test/CodeGen/AArch64/fold-constants.ll
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -2,7 +2,7 @@
 
 define i64 @dotests_616() {
 ; CHECK-LABEL: dotests_616
-; CHECK:       movi d0, #0000000000000000
+; CHECK:       movi v0.2d, #0000000000000000
 ; CHECK-NEXT:  fmov x0, d0
 ; CHECK-NEXT:  ret
 entry:
diff --git a/test/CodeGen/AArch64/jump-table-compress.mir b/test/CodeGen/AArch64/jump-table-compress.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b4217ea61681362eb96c54f638f0f7801e129fad
--- /dev/null
+++ b/test/CodeGen/AArch64/jump-table-compress.mir
@@ -0,0 +1,111 @@
+# RUN: llc -mtriple=aarch64-linux-gnu %s -run-pass=aarch64-jump-tables -o - | FileCheck %s
+--- |
+  define i32 @test_jumptable(i32 %in) {
+    unreachable
+  }
+
+...
+---
+name:            test_jumptable
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+jumpTable:
+  kind:            block-address
+  entries:
+    - id:              0
+      blocks:          [ '%bb.2', '%bb.3' ]
+    - id:              1
+      blocks:          [ '%bb.4', '%bb.5' ]
+    - id:              2
+      blocks:          [ '%bb.7' ]
+    - id:              3
+      blocks:          [ '%bb.9' ]
+    - id:              4
+      blocks:          [ '%bb.9' ]
+    - id:              5
+      blocks:          [ '%bb.11' ]
+body:             |
+  bb.0 (%ir-block.0):
+
+  bb.1 (%ir-block.0):
+    ; CHECK-LABEL: body:
+    ; CHECK-LABEL: bb.1
+    ; CHECK: JumpTableDest8
+    liveins: $x8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.0
+    BR killed $x10
+
+  bb.2:
+    ; Last destination is 4 * 255 = 1020 bytes after first. Byte is OK.
+    dead $xzr = SPACE 1020, undef $xzr
+
+  bb.3:
+    ; CHECK-LABEL: bb.3
+    ; CHECK: JumpTableDest16
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.1
+    BR killed $x10
+
+  bb.4:
+    ; Last destination is 4 * 256 = 1024 bytes after first. Half needed.
+    dead $xzr = SPACE 1024, undef $xzr
+
+  bb.5:
+    ; CHECK-LABEL: bb.5
+    ; CHECK: JumpTableDest8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.2
+    BR killed $x10
+
+  bb.6:
+    ; First destination is (2^20 - 4) after reference. Just reachable by ADR so can use compressed table.
+    dead $xzr = SPACE 1048556, undef $xzr
+
+  bb.7:
+    ; CHECK-LABEL: bb.7
+    ; CHECK: JumpTableDest32
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.3
+    BR killed $x10
+
+  bb.8:
+    ; First destination is 2^20 after reference. Compressed table cannot reach it.
+    dead $xzr = SPACE 1048560, undef $xzr
+
+  bb.9:
+    ; First destination is 2^20 before reference. Just within reach of ADR.
+    dead $xzr = SPACE 1048576, undef $xzr
+
+  bb.10:
+    ; CHECK-LABEL: bb.10
+    ; CHECK: JumpTableDest8
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.4
+    BR killed $x10
+
+  bb.11:
+    ; First destination is 2^20 before reference. Just within reach of ADR.
+    dead $xzr = SPACE 1048580, undef $xzr
+
+  bb.12:
+    ; CHECK-LABEL: bb.12
+    ; CHECK: JumpTableDest32
+    early-clobber $x10, dead early-clobber $x11 = JumpTableDest32 undef killed $x9, undef killed $x8, %jump-table.5
+    BR killed $x10
+...
diff --git a/test/CodeGen/AArch64/jump-table-exynos.ll b/test/CodeGen/AArch64/jump-table-exynos.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e018410792e22837560f755b0508c66cadc55e6c
--- /dev/null
+++ b/test/CodeGen/AArch64/jump-table-exynos.ll
@@ -0,0 +1,67 @@
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mattr=+force-32bit-jump-tables -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m1 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m2 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=exynos-m3 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+
+; Exynos doesn't want jump tables to be compressed for now.
+
+define i32 @test_jumptable(i32 %in)  {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable:
+; CHECK-NOT: ldrb
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+}
+
+define i32 @test_jumptable_minsize(i32 %in) minsize {
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable_minsize:
+; CHECK:     adrp [[JTPAGE:x[0-9]+]], .LJTI1_0
+; CHECK:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI1_0
+; CHECK:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK:     br [[DEST]]
+
+
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+}
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 098b90f94b963d6773072cc5237bd936748b1ff8..4e70e92beaf8d96c2f8ec4ab612e7a5d9f14ab14 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-enable-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
-; RUN: llc -code-model=tiny -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-TINY %s
+; RUN: llc -no-integrated-as -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s
+; RUN: llc -no-integrated-as -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -no-integrated-as -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -aarch64-enable-atomic-cfg-tidy=0 -o - %s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -no-integrated-as -code-model=tiny -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 | FileCheck --check-prefix=CHECK-TINY %s
 
 define i32 @test_jumptable(i32 %in) {
 ; CHECK: test_jumptable
@@ -12,27 +12,45 @@ define i32 @test_jumptable(i32 %in) {
     i32 2, label %lbl3
     i32 4, label %lbl4
   ]
-; CHECK: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
-; CHECK: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
-; CHECK: br [[DEST]]
-
-; CHECK-LARGE: movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0
-; CHECK-LARGE: movk x[[JTADDR]], #:abs_g3:.LJTI0_0
-; CHECK-LARGE: ldr [[DEST:x[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}, lsl #3]
-; CHECK-LARGE: br [[DEST]]
-
-; CHECK-PIC: adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
-; CHECK-PIC: add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
-; CHECK-PIC: ldrsw [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #2]
-; CHECK-PIC: add [[TABLE:x[0-9]+]], [[DEST]], x[[JT]]
-; CHECK-PIC: br [[TABLE]]
-
-; CHECK-TINY: adr x[[JT:[0-9]+]], .LJTI0_0
-; CHECK-TINY: ldr [[DEST:x[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #3]
-; CHECK-TINY: br [[DEST]]
+; CHECK-LABEL: test_jumptable:
+; CHECK:     adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
+; CHECK:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK:     br [[DEST]]
+
+; CHECK-LARGE:     movz x[[JTADDR:[0-9]+]], #:abs_g0_nc:.LJTI0_0
+; CHECK-LARGE:     movk x[[JTADDR]], #:abs_g1_nc:.LJTI0_0
+; CHECK-LARGE:     movk x[[JTADDR]], #:abs_g2_nc:.LJTI0_0
+; CHECK-LARGE:     movk x[[JTADDR]], #:abs_g3:.LJTI0_0
+; CHECK-LARGE:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-LARGE:     ldrb w[[OFFSET:[0-9]+]], [x[[JTADDR]], {{x[0-9]+}}]
+; CHECK-LARGE:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-LARGE:     br [[DEST]]
+
+; CHECK-PIC-LABEL: test_jumptable:
+; CHECK-PIC:     adrp [[JTPAGE:x[0-9]+]], .LJTI0_0
+; CHECK-PIC:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI0_0
+; CHECK-PIC:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-PIC:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-PIC:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-PIC:     br [[DEST]]
+
+; CHECK-IOS:     adrp [[JTPAGE:x[0-9]+]], LJTI0_0@PAGE
+; CHECK-IOS:     add x[[JT:[0-9]+]], [[JTPAGE]], LJTI0_0@PAGEOFF
+; CHECK-IOS:     adr [[PCBASE:x[0-9]+]], [[JTBASE:LBB[0-9]+_[0-9]+]]
+; CHECK-IOS:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-IOS:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-IOS: br [[DEST]]
+
+; CHECK-TINY-LABEL: test_jumptable:
+; CHECK-TINY:     adr x[[JT:[0-9]+]], .LJTI0_0
+; CHECK-TINY:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK-TINY:     ldrb w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}]
+; CHECK-TINY:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK-TINY:     br [[DEST]]
+
 
 def:
   ret i32 0
@@ -54,18 +72,86 @@ lbl4:
 ; CHECK: .rodata
 
 ; CHECK: .LJTI0_0:
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
-; CHECK-NEXT: .xword
+; CHECK-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+
+define i32 @test_jumptable16(i32 %in) {
+
+  switch i32 %in, label %def [
+    i32 0, label %lbl1
+    i32 1, label %lbl2
+    i32 2, label %lbl3
+    i32 4, label %lbl4
+  ]
+; CHECK-LABEL: test_jumptable16:
+; CHECK:     adrp [[JTPAGE:x[0-9]+]], .LJTI1_0
+; CHECK:     add x[[JT:[0-9]+]], [[JTPAGE]], {{#?}}:lo12:.LJTI1_0
+; CHECK:     adr [[PCBASE:x[0-9]+]], [[JTBASE:.LBB[0-9]+_[0-9]+]]
+; CHECK:     ldrh w[[OFFSET:[0-9]+]], [x[[JT]], {{x[0-9]+}}, lsl #1]
+; CHECK:     add [[DEST:x[0-9]+]], [[PCBASE]], x[[OFFSET]], lsl #2
+; CHECK:     br [[DEST]]
+
+def:
+  ret i32 0
+
+lbl1:
+  ret i32 1
+
+lbl2:
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  call void asm sideeffect "1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16", ""()
+  ret i32 2
+
+lbl3:
+  ret i32 4
+
+lbl4:
+  ret i32 8
+
+}
+
+; CHECK:      .rodata
+; CHECK:      .p2align 1
+; CHECK: .LJTI1_0:
+; CHECK-NEXT: .hword ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-NEXT: .hword (.LBB{{.*}}-[[JTBASE]])>>2
 
 ; CHECK-PIC-NOT: .data_region
 ; CHECK-PIC-NOT: .LJTI0_0
 ; CHECK-PIC: .LJTI0_0:
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
-; CHECK-PIC-NEXT: .word .LBB{{.*}}-.LJTI0_0
+; CHECK-PIC-NEXT: .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-PIC-NEXT: .byte (.LBB{{.*}}-[[JTBASE]])>>2
 ; CHECK-PIC-NOT: .end_data_region
+
+; CHECK-IOS: .section __TEXT,__const
+; CHECK-IOS-NOT: .data_region
+; CHECK-IOS: LJTI0_0:
+; CHECK-IOS-NEXT:     .byte ([[JTBASE]]-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NEXT:     .byte (LBB{{.*}}-[[JTBASE]])>>2
+; CHECK-IOS-NOT: .end_data_region
diff --git a/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll b/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll
index d8ae73293d9bda5dd3936166a4408539f765a669..f4680354d7e423fda1241f58741cf7bba860bb12 100644
--- a/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll
+++ b/test/CodeGen/AArch64/lack-of-signed-truncation-check.ll
@@ -35,8 +35,7 @@ define i1 @shifts_necmp_i16_i8(i16 %x) nounwind {
 define i1 @shifts_necmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 16 ; 32-16
@@ -48,8 +47,7 @@ define i1 @shifts_necmp_i32_i16(i32 %x) nounwind {
 define i1 @shifts_necmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 24 ; 32-8
@@ -61,8 +59,7 @@ define i1 @shifts_necmp_i32_i8(i32 %x) nounwind {
 define i1 @shifts_necmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 32 ; 64-32
@@ -74,8 +71,7 @@ define i1 @shifts_necmp_i64_i32(i64 %x) nounwind {
 define i1 @shifts_necmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 48 ; 64-16
@@ -87,8 +83,7 @@ define i1 @shifts_necmp_i64_i16(i64 %x) nounwind {
 define i1 @shifts_necmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_necmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 56 ; 64-8
@@ -117,8 +112,7 @@ define i1 @add_ultcmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -32768 ; ~0U << (16-1)
@@ -129,8 +123,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
@@ -141,8 +134,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1)
@@ -153,8 +145,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -32768 ; ~0U << (16-1)
@@ -165,8 +156,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -128 ; ~0U << (8-1)
@@ -208,8 +198,7 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 32768 ; 1U << (16-1)
@@ -220,8 +209,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 128 ; 1U << (8-1)
@@ -232,8 +220,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1)
@@ -244,8 +231,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 32768 ; 1U << (16-1)
@@ -256,8 +242,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 128 ; 1U << (8-1)
diff --git a/test/CodeGen/AArch64/load-store-forwarding.ll b/test/CodeGen/AArch64/load-store-forwarding.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e6124270169b72199e081deeccf47bcc1c28249b
--- /dev/null
+++ b/test/CodeGen/AArch64/load-store-forwarding.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck %s --check-prefix CHECK-BE
+; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s --check-prefix CHECK-LE
+
+define i8 @test1(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test1:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test1:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 0
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test2(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test2:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test2:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ubfx w0, w0, #8, #8
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 1
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test3(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test3:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ldrb w0, [x1, #2]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test3:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    ubfx w0, w0, #16, #8
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 2
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
+
+define i8 @test4(i32 %a, i8* %pa) {
+; CHECK-BE-LABEL: test4:
+; CHECK-BE:       // %bb.0:
+; CHECK-BE-NEXT:    str w0, [x1]
+; CHECK-BE-NEXT:    ret
+;
+; CHECK-LE-LABEL: test4:
+; CHECK-LE:       // %bb.0:
+; CHECK-LE-NEXT:    str w0, [x1]
+; CHECK-LE-NEXT:    lsr w0, w0, #24
+; CHECK-LE-NEXT:    ret
+  %p32 = bitcast i8* %pa to i32*
+  %p8 = getelementptr i8, i8* %pa, i32 3
+  store i32 %a, i32* %p32
+  %res = load i8, i8* %p8
+  ret i8 %res
+}
diff --git a/test/CodeGen/AArch64/machine-cp-clobbers.mir b/test/CodeGen/AArch64/machine-cp-clobbers.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b5c0331d2ef193a16628ea06d3d71fda20c7e7b1
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-cp-clobbers.mir
@@ -0,0 +1,51 @@
+# RUN: llc -march=aarch64 -o - %s -run-pass=machine-cp | FileCheck %s
+
+---
+name: dont_propagate_past_lower_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_lower_subreg_kill
+    ; CHECK: HINT 0, implicit-def $q0
+    ; CHECK: HINT 0, implicit-def $d1
+    ; CHECK: HINT 0, implicit killed $d1
+    ; CHECK: $q1 = COPY killed $q0
+    ; CHECK: $q2 = COPY $q1
+    ; CHECK: HINT 0, implicit $q2
+    HINT 0, implicit-def $q0
+    $q1 = COPY killed $q0
+    $q0 = COPY killed $q1
+
+    HINT 0, implicit-def $d1
+    HINT 0, implicit killed $d1
+
+    $q1 = COPY killed $q0
+    $q2 = COPY $q1
+    HINT 0, implicit $q2
+
+...
+
+---
+name: dont_propagate_past_upper_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_upper_subreg_kill
+    ; CHECK: HINT 0, implicit-def $z0
+    ; CHECK: HINT 0, implicit-def $z1_hi
+    ; CHECK: HINT 0, implicit killed $z1_hi
+    ; CHECK: $z1 = COPY killed $z0
+    ; CHECK: $z2 = COPY $z1
+    ; CHECK: HINT 0, implicit $z2
+    HINT 0, implicit-def $z0
+    $z1 = COPY killed $z0
+    $z0 = COPY killed $z1
+
+    HINT 0, implicit-def $z1_hi
+    HINT 0, implicit killed $z1_hi
+
+    $z1 = COPY killed $z0
+    $z2 = COPY $z1
+    HINT 0, implicit $z2
+
+...
diff --git a/test/CodeGen/AArch64/machine-outliner-remarks.ll b/test/CodeGen/AArch64/machine-outliner-remarks.ll
index e721b8a648a4dc3ed8075830f754a491f3316be5..29872d9518afc3b5257e32e0208d82380d7a3451 100644
--- a/test/CodeGen/AArch64/machine-outliner-remarks.ll
+++ b/test/CodeGen/AArch64/machine-outliner-remarks.ll
@@ -9,10 +9,13 @@
 ; CHECK-SAME: <UNKNOWN LOCATION>)
 ; RUN: llc %s -enable-machine-outliner -mtriple=aarch64-unknown-unknown -o /dev/null -pass-remarks-missed=machine-outliner -pass-remarks-output=%t.yaml
 ; RUN: cat %t.yaml | FileCheck %s -check-prefix=YAML
+
+; For the YAML case, the function we pick depends on the order of the candidate
+; list.
 ; YAML: --- !Missed
 ; YAML-NEXT: Pass:            machine-outliner
 ; YAML-NEXT: Name:            NotOutliningCheaper
-; YAML-NEXT: Function:        dog
+; YAML-NEXT: Function:
 ; YAML-NEXT: Args:            
 ; YAML-NEXT:   - String:          'Did not outline '
 ; YAML-NEXT:   - Length:          '2'
diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll
index 9d922c27f884bf15a60c69bc029d3bcc4a87b0d8..19be14d8d39136628953dd96cb650a19b649b40a 100644
--- a/test/CodeGen/AArch64/machine-outliner.ll
+++ b/test/CodeGen/AArch64/machine-outliner.ll
@@ -1,6 +1,16 @@
 ; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-apple-darwin < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-apple-darwin -mcpu=cortex-a53 -enable-misched=false < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -enable-machine-outliner -enable-linkonceodr-outlining -mtriple=aarch64-apple-darwin < %s | FileCheck %s -check-prefix=ODR
+; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple=aarch64-apple-darwin -stop-after=machine-outliner < %s | FileCheck %s -check-prefix=TARGET_FEATURES
+
+; Make sure that we inherit target features from functions and make sure we have
+; the right function attributes.
+; TARGET_FEATURES: define internal void @OUTLINED_FUNCTION_{{[0-9]+}}()
+; TARGET_FEATURES-SAME: #[[ATTR_NUM:[0-9]+]]
+; TARGET_FEATURES-DAG: attributes #[[ATTR_NUM]] = {
+; TARGET_FEATURES-SAME: minsize
+; TARGET_FEATURES-SAME: optsize
+; TARGET_FEATURES-SAME: "target-features"="+sse"
 
 define linkonce_odr void @fish() #0 {
   ; CHECK-LABEL: _fish:
@@ -95,4 +105,4 @@ define void @dog() #0 {
 ; CHECK-NEXT: str     w8, [sp, #8]
 ; CHECK-NEXT: ret
 
-attributes #0 = { noredzone "target-cpu"="cyclone" }
+attributes #0 = { noredzone "target-cpu"="cyclone" "target-features"="+sse" }
diff --git a/test/CodeGen/AArch64/machine_cse.ll b/test/CodeGen/AArch64/machine_cse.ll
index e9fa68041d9029adaf14e74ca5df988191b1d0e3..51252a2a8428f0f8198efdf4f97681c6f0a1892c 100644
--- a/test/CodeGen/AArch64/machine_cse.ll
+++ b/test/CodeGen/AArch64/machine_cse.ll
@@ -47,3 +47,27 @@ return:
   store i32 %a, i32 *%arg
   ret void
 }
+
+define void @combine_vector_zeros(<8 x i8>* %p, <16 x i8>* %q) {
+; CHECK-LABEL: combine_vector_zeros:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <8 x i8> zeroinitializer, <8 x i8>* %p
+  store <16 x i8> zeroinitializer, <16 x i8>* %q
+  ret void
+}
+
+define void @combine_vector_ones(<2 x i32>* %p, <4 x i32>* %q) {
+; CHECK-LABEL: combine_vector_ones:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0xffffffffffffffff
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <2 x i32> <i32 -1, i32 -1>, <2 x i32>* %p
+  store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %q
+  ret void
+}
diff --git a/test/CodeGen/AArch64/min-jump-table.ll b/test/CodeGen/AArch64/min-jump-table.ll
index b22e683ebfede0a6b87fbb35b2357ebff003a095..7d6d26259af00f0c9ffce4780338d8cac39e1fb4 100644
--- a/test/CodeGen/AArch64/min-jump-table.ll
+++ b/test/CodeGen/AArch64/min-jump-table.ll
@@ -14,8 +14,8 @@ entry:
 ; CHECK0-NEXT: Jump Tables:
 ; CHECK0-NEXT: %jump-table.0:
 ; CHECK0-NOT: %jump-table.1:
-; CHECK4-NOT: Jump Tables:
-; CHECK8-NOT: Jump Tables:
+; CHECK4-NOT: {{^}}Jump Tables:
+; CHECK8-NOT: {{^}}Jump Tables:
 
 bb1: tail call void @ext(i32 0) br label %return
 bb2: tail call void @ext(i32 2) br label %return
@@ -38,7 +38,7 @@ entry:
 ; CHECK4-NEXT: Jump Tables:
 ; CHECK4-NEXT: %jump-table.0:
 ; CHECK4-NOT: %jump-table.1:
-; CHECK8-NOT: Jump Tables:
+; CHECK8-NOT: {{^}}Jump Tables:
 
 bb1: tail call void @ext(i32 0) br label %return
 bb2: tail call void @ext(i32 2) br label %return
diff --git a/test/CodeGen/AArch64/multi-vector-store-size.ll b/test/CodeGen/AArch64/multi-vector-store-size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8764eb447a30079cfca729ae124d39780e251286
--- /dev/null
+++ b/test/CodeGen/AArch64/multi-vector-store-size.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s
+
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+
+define void @addstx(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST2Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST3Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST4Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addst1x(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, float* %res)
+; CHECK: ST1Twov4s {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, float* %res)
+; CHECK: ST1Threev4s {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, float* %res)
+; CHECK: ST1Fourv4s {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
+
+define void @addstxlane(float* %res, <4 x float>* %a,  <4 x float>* %b, <4 x float>* %c, <4 x float>* %d) {
+  %al = load <4 x float>, <4 x float>* %a
+  %bl = load <4 x float>, <4 x float>* %b
+  %cl = load <4 x float>, <4 x float>* %c
+  %dl = load <4 x float>, <4 x float>* %d
+
+  %ar = fadd <4 x float> %al, %bl
+  %br = fadd <4 x float> %bl, %cl
+  %cr = fadd <4 x float> %cl, %dl
+  %dr = fadd <4 x float> %dl, %al
+
+; The sizes below are conservative.  AArch64TargetLowering
+; conservatively assumes the entire vector is stored.
+  tail call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, i64 1, float* %res)
+; CHECK: ST2i32 {{.*}} :: (store 32 {{.*}})
+  tail call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, float* %res)
+; CHECK: ST3i32 {{.*}} :: (store 48 {{.*}})
+  tail call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, float* %res)
+; CHECK: ST4i32 {{.*}} :: (store 64 {{.*}})
+
+  ret void
+}
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 8bb7cc8c143031cc4ab35f06e0bfd6f133d07297..9d7d0abbf6c7189baae2267328b85c24b5a3b5c5 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1223,7 +1223,7 @@ define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
 ; CHECK-LABEL: cmlsz8xi8:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
 	%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
    %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -1245,7 +1245,7 @@ define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
 ; CHECK-LABEL: cmlsz4xi16:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
 	%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
    %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1267,7 +1267,7 @@ define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
 ; CHECK-LABEL: cmlsz2xi32:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
 	%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
    %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/test/CodeGen/AArch64/neon-fp16fml.ll b/test/CodeGen/AArch64/neon-fp16fml.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dcae645ea54c1c97aeee0faad20a8b6b9b9fa486
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-fp16fml.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+fp16fml < %s | FileCheck %s
+
+declare <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float>, <4 x half>, <4 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+declare <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>)
+
+define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlal_low_u32:
+; CHECK: fmlal   v0.2s, v1.2h, v2.2h
+  %vfmlal_low2.i = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlal_low2.i
+}
+
+define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlsl_low_u32:
+; CHECK: fmlsl   v0.2s, v1.2h, v2.2h
+  %vfmlsl_low2.i = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlsl_low2.i
+}
+
+define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlal_high_u32:
+; CHECK: fmlal2   v0.2s, v1.2h, v2.2h
+  %vfmlal_high2.i = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlal_high2.i
+}
+
+define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlsl_high_u32:
+; CHECK: fmlsl2   v0.2s, v1.2h, v2.2h
+  %vfmlsl_high2.i = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c) #2
+  ret <2 x float> %vfmlsl_high2.i
+}
+
+define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlalq_low_u32:
+; CHECK: fmlal   v0.4s, v1.4h, v2.4h
+  %vfmlalq_low4.i = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlalq_low4.i
+}
+
+define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlslq_low_u32:
+; CHECK: fmlsl   v0.4s, v1.4h, v2.4h
+  %vfmlslq_low4.i = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlslq_low4.i
+}
+
+define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlalq_high_u32:
+; CHECK: fmlal2   v0.4s, v1.4h, v2.4h
+  %vfmlalq_high4.i = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlalq_high4.i
+}
+
+define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c) #0 {
+entry:
+; CHECK-LABEL: test_vfmlslq_high_u32:
+; CHECK: fmlsl2   v0.4s, v1.4h, v2.4h
+  %vfmlslq_high4.i = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c) #2
+  ret <4 x float> %vfmlslq_high4.i
+}
diff --git a/test/CodeGen/AArch64/phi-dbg.ll b/test/CodeGen/AArch64/phi-dbg.ll
index a2c97f311080f4e97cffe2ff3d57285a51887b41..4f7c005f8026f1db43e49f36f7133c9a2ed6f3d7 100644
--- a/test/CodeGen/AArch64/phi-dbg.ll
+++ b/test/CodeGen/AArch64/phi-dbg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 %s -mtriple=aarch64 -o - | FileCheck %s
+; RUN: llc -O0 %s -mtriple=aarch64 -stop-after=phi-node-elimination -o - | FileCheck %s
 
 ; Test that a DEBUG_VALUE node is create for variable c after the phi has been
 ; converted to a ldr.    The DEBUG_VALUE must be *after* the ldr and not before it.
@@ -15,25 +15,34 @@
 ; }
 ;
 ; Function Attrs: nounwind
-define i32 @func(i32) #0 !dbg !8 {
-  call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !12, metadata !13), !dbg !14
+; CHECK: !14 = !DILocalVariable(name: "c"
+; CHECK-LABEL: name: func
+define i32 @func(i32 %a0) #0 !dbg !8 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %a0, i64 0, metadata !12, metadata !13), !dbg !14
   call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !15, metadata !13), !dbg !16
-  %2 = icmp slt i32 %0, 0, !dbg !17
-  br i1 %2, label %3, label %4, !dbg !19
+  %v2 = icmp slt i32 %a0, 0, !dbg !17
+  br i1 %v2, label %bb2, label %bb3, !dbg !19
 
-; <label>:3:                                      ; preds = %1
+bb2:
   call void @llvm.dbg.value(metadata i32 12, i64 0, metadata !15, metadata !13), !dbg !16
-  br label %4, !dbg !20
+  br label %bb3, !dbg !20
 
-; <label>:4:                                      ; preds = %3, %1
-  %.0 = phi i32 [ 12, %3 ], [ 1, %1 ]
-; CHECK: ldr     w[[REG:[0-9]+]], [sp, #8]
-; CHECK-NEXT: .Ltmp
+; CHECK: bb.2.bb2:
+; CHECK:  [[REG0:%[0-9]+]]:gpr32 = MOVi32imm 12
+; CHECK:  [[PHIREG:%[0-9]+]]:gpr32 = COPY [[REG0]]
+
+bb3:
+; CHECK: bb.3.bb3:
+; CHECK:   [[PHIDEST:%[0-9]+]]:gpr32 = COPY [[PHIREG]]
+; CHECK-NEXT:   DBG_VALUE [[PHIDEST]]
+  %.0 = phi i32 [ 12, %bb2 ], [ 1, %entry ]
   call void @llvm.dbg.value(metadata i32 %.0, i64 0, metadata !15, metadata !13), !dbg !16
-; CHECK-NEXT:  //DEBUG_VALUE: func:c <- $w[[REG]]
-  %5 = add nsw i32 %.0, %0, !dbg !22
-  call void @llvm.dbg.value(metadata i32 %5, i64 0, metadata !15, metadata !13), !dbg !16
-  ret i32 %5, !dbg !23
+; CHECK: [[ADD:%[0-9]+]]:gpr32 = nsw ADDWrr [[PHIDEST]]
+; CHECK-NEXT: DBG_VALUE [[ADD]]
+  %v5 = add nsw i32 %.0, %a0, !dbg !22
+  call void @llvm.dbg.value(metadata i32 %v5, i64 0, metadata !15, metadata !13), !dbg !16
+  ret i32 %v5, !dbg !23
 }
 
 ; Function Attrs: nounwind readnone
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index 24038cda50784cc7a304214d7be1225c064993af..50da7d139f177b7943133afa374bac42ee1ae59d 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -26,9 +26,9 @@ define i64 @test_chains() {
   store i8 %inc.4, i8* %locvar
 
 ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
-; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #1
 ; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]]
-; CHECK: and w0, w[[STRVAL]], #0xff
+; CHECK: and x0, x[[STRVAL]], #0xff
 
   %ret.1 = load i8, i8* %locvar
   %ret.2 = zext i8 %ret.1 to i64
diff --git a/test/CodeGen/AArch64/sat-add.ll b/test/CodeGen/AArch64/sat-add.ll
index d90828599887683317098b44ab27a279660fe279..4d865a2b14b74da94085de0eca6f2983e3b073d7 100644
--- a/test/CodeGen/AArch64/sat-add.ll
+++ b/test/CodeGen/AArch64/sat-add.ll
@@ -52,11 +52,10 @@ define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) {
 define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_min:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w9, #65493
-; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    mov w8, #65493
+; CHECK-NEXT:    cmp w8, w0, uxth
 ; CHECK-NEXT:    mov w8, #-43
-; CHECK-NEXT:    csel w8, w0, w8, lo
+; CHECK-NEXT:    csel w8, w0, w8, hi
 ; CHECK-NEXT:    add w0, w8, #42 // =42
 ; CHECK-NEXT:    ret
   %c = icmp ult i16 %x, -43
@@ -82,11 +81,10 @@ define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
 define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
 ; CHECK-LABEL: unsigned_sat_constant_i16_using_cmp_notval:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w0, #0xffff
-; CHECK-NEXT:    mov w10, #65493
-; CHECK-NEXT:    add w9, w0, #42 // =42
-; CHECK-NEXT:    cmp w8, w10
-; CHECK-NEXT:    csinv w0, w9, wzr, ls
+; CHECK-NEXT:    mov w9, #65493
+; CHECK-NEXT:    add w8, w0, #42 // =42
+; CHECK-NEXT:    cmp w9, w0, uxth
+; CHECK-NEXT:    csinv w0, w8, wzr, hs
 ; CHECK-NEXT:    ret
   %a = add i16 %x, 42
   %c = icmp ugt i16 %x, -43
diff --git a/test/CodeGen/AArch64/selectiondag-order.ll b/test/CodeGen/AArch64/selectiondag-order.ll
index 9427906160fd02a9d86c7e180974f522fa159652..fb40653723fec6687bbabbbf11d74c589e031f69 100644
--- a/test/CodeGen/AArch64/selectiondag-order.ll
+++ b/test/CodeGen/AArch64/selectiondag-order.ll
@@ -21,7 +21,7 @@ end:                                        ; preds = %body
 }
 
 ; AARCH64-CHECK: simulate:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB0_1:
@@ -47,7 +47,7 @@ end:                                        ; preds = %body
 }
 
 ; AARCH64-CHECK: simulateWithDebugIntrinsic
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB1_1:
@@ -73,7 +73,7 @@ end:                                        ; preds = %body
 }
 
 ; AARCH64-CHECK: simulateWithDbgDeclare:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB2_1:
diff --git a/test/CodeGen/AArch64/sign-return-address.ll b/test/CodeGen/AArch64/sign-return-address.ll
index a0c73058a30115edfd2518daa090926c6fb795ef..c057c815acfd4b6ae5c7ef0d0e755a1a993d9c61 100644
--- a/test/CodeGen/AArch64/sign-return-address.ll
+++ b/test/CodeGen/AArch64/sign-return-address.ll
@@ -84,3 +84,26 @@ define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" {
   tail call fastcc i64 @bar(i64 %x)
   ret void
 }
+
+; CHECK-LABEL: @leaf_sign_all_a_key
+; CHECK: paciasp
+; CHECK: autiasp
+define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" {
+  ret i32 %x
+}
+
+; CHECK-LABEL: @leaf_sign_all_b_key
+; CHECK: pacibsp
+; CHECK: autibsp
+define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" {
+  ret i32 %x
+}
+
+; CHECK-LABEL: @leaf_sign_all_v83_b_key
+; CHECK: pacibsp
+; CHECK-NOT: ret
+; CHECK: retab
+; CHECK-NOT: ret
+define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" {
+  ret i32 %x
+}
diff --git a/test/CodeGen/AArch64/signed-truncation-check.ll b/test/CodeGen/AArch64/signed-truncation-check.ll
index f475dbc2f74b28d7a5e846ced19f867d0cf2c1a7..edd61b10d0023b4dcfbf7c7b4d1dae98676ec01f 100644
--- a/test/CodeGen/AArch64/signed-truncation-check.ll
+++ b/test/CodeGen/AArch64/signed-truncation-check.ll
@@ -35,8 +35,7 @@ define i1 @shifts_eqcmp_i16_i8(i16 %x) nounwind {
 define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 16 ; 32-16
@@ -48,8 +47,7 @@ define i1 @shifts_eqcmp_i32_i16(i32 %x) nounwind {
 define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i32 %x, 24 ; 32-8
@@ -61,8 +59,7 @@ define i1 @shifts_eqcmp_i32_i8(i32 %x) nounwind {
 define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 32 ; 64-32
@@ -74,8 +71,7 @@ define i1 @shifts_eqcmp_i64_i32(i64 %x) nounwind {
 define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 48 ; 64-16
@@ -87,8 +83,7 @@ define i1 @shifts_eqcmp_i64_i16(i64 %x) nounwind {
 define i1 @shifts_eqcmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: shifts_eqcmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = shl i64 %x, 56 ; 64-8
@@ -117,8 +112,7 @@ define i1 @add_ugecmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -32768 ; ~0U << (16-1)
@@ -129,8 +123,7 @@ define i1 @add_ugecmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, -128 ; ~0U << (8-1)
@@ -141,8 +134,7 @@ define i1 @add_ugecmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -2147483648 ; ~0U << (32-1)
@@ -153,8 +145,7 @@ define i1 @add_ugecmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -32768 ; ~0U << (16-1)
@@ -165,8 +156,7 @@ define i1 @add_ugecmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ugecmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ugecmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, -128 ; ~0U << (8-1)
@@ -208,8 +198,7 @@ define i1 @add_ultcmp_i16_i8(i16 %x) nounwind {
 define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 32768 ; 1U << (16-1)
@@ -220,8 +209,7 @@ define i1 @add_ultcmp_i32_i16(i32 %x) nounwind {
 define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i32_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb w8, w0
-; CHECK-NEXT:    cmp w8, w0
+; CHECK-NEXT:    cmp w0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i32 %x, 128 ; 1U << (8-1)
@@ -232,8 +220,7 @@ define i1 @add_ultcmp_i32_i8(i32 %x) nounwind {
 define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtw x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtw
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 2147483648 ; 1U << (32-1)
@@ -244,8 +231,7 @@ define i1 @add_ultcmp_i64_i32(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxth x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxth
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 32768 ; 1U << (16-1)
@@ -256,8 +242,7 @@ define i1 @add_ultcmp_i64_i16(i64 %x) nounwind {
 define i1 @add_ultcmp_i64_i8(i64 %x) nounwind {
 ; CHECK-LABEL: add_ultcmp_i64_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sxtb x8, w0
-; CHECK-NEXT:    cmp x8, x0
+; CHECK-NEXT:    cmp x0, w0, sxtb
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %tmp0 = add i64 %x, 128 ; 1U << (8-1)
diff --git a/test/CodeGen/AArch64/sponentry.ll b/test/CodeGen/AArch64/sponentry.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5b3638a1d862a82766a5ad41be1c924fa144b2aa
--- /dev/null
+++ b/test/CodeGen/AArch64/sponentry.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=aarch64-windows-msvc -disable-fp-elim %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel -disable-fp-elim %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s --check-prefix=NOFP
+; RUN: llc -mtriple=aarch64-windows-msvc -fast-isel %s -o - | FileCheck %s --check-prefix=NOFP
+
+@env2 = common dso_local global [24 x i64]* null, align 8
+
+define dso_local void @bar() {
+  %1 = call i8* @llvm.sponentry()
+  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
+  %4 = bitcast i64* %3 to i8*
+  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
+  ret void
+}
+
+; CHECK: bar:
+; CHECK: mov     x29, sp
+; CHECK: add     x1, x29, #16
+; CEHCK: bl      _setjmpex
+
+; NOFP: str     x30, [sp, #-16]!
+; NOFP: add     x1, sp, #16
+
+define dso_local void @foo([24 x i64]*) {
+  %2 = alloca [24 x i64]*, align 8
+  %3 = alloca i32, align 4
+  %4 = alloca [100 x i32], align 4
+  store [24 x i64]* %0, [24 x i64]** %2, align 8
+  %5 = call i8* @llvm.sponentry()
+  %6 = load [24 x i64]*, [24 x i64]** %2, align 8
+  %7 = getelementptr inbounds [24 x i64], [24 x i64]* %6, i32 0, i32 0
+  %8 = bitcast i64* %7 to i8*
+  %9 = call i32 @_setjmpex(i8* %8, i8* %5)
+  store i32 %9, i32* %3, align 4
+  ret void
+}
+
+; CHECK: foo:
+; CHECK: sub     sp, sp, #448
+; CHECK: add     x29, sp, #432
+; CHECK: add     x1, x29, #16
+; CEHCK: bl      _setjmpex
+
+; NOFP: sub     sp, sp, #432
+; NOFP: add     x1, sp, #432
+
+define dso_local void @var_args(i8*, ...) {
+  %2 = alloca i8*, align 8
+  %3 = alloca i8*, align 8
+  store i8* %0, i8** %2, align 8
+  %4 = bitcast i8** %3 to i8*
+  call void @llvm.va_start(i8* %4)
+  %5 = load i8*, i8** %3, align 8
+  %6 = getelementptr inbounds i8, i8* %5, i64 8
+  store i8* %6, i8** %3, align 8
+  %7 = bitcast i8* %5 to i32*
+  %8 = load i32, i32* %7, align 8
+  %9 = bitcast i8** %3 to i8*
+  call void @llvm.va_end(i8* %9)
+  %10 = call i8* @llvm.sponentry()
+  %11 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %12 = getelementptr inbounds [24 x i64], [24 x i64]* %11, i32 0, i32 0
+  %13 = bitcast i64* %12 to i8*
+  %14 = call i32 @_setjmpex(i8* %13, i8* %10) #3
+  ret void
+}
+
+; CHECK: var_args:
+; CHECK: sub     sp, sp, #96
+; CHECK: add     x29, sp, #16
+; CHECK: add     x1, x29, #80
+; CEHCK: bl      _setjmpex
+
+; NOFP: sub     sp, sp, #96
+; NOFP: add     x1, sp, #96
+
+define dso_local void @manyargs(i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i64 %x8, i64 %x9, i64 %x10) {
+  %1 = call i8* @llvm.sponentry()
+  %2 = load [24 x i64]*, [24 x i64]** @env2, align 8
+  %3 = getelementptr inbounds [24 x i64], [24 x i64]* %2, i32 0, i32 0
+  %4 = bitcast i64* %3 to i8*
+  %5 = call i32 @_setjmpex(i8* %4, i8* %1) #2
+  ret void
+}
+
+; CHECK: manyargs:
+; CHECK: stp     x29, x30, [sp, #-16]!
+; CHECK: add     x1, x29, #16
+
+; NOFP: str     x30, [sp, #-16]!
+; NOFP: add     x1, sp, #16
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.sponentry()
+
+; Function Attrs: returns_twice
+declare dso_local i32 @_setjmpex(i8*, i8*)
+
+; Function Attrs: nounwind
+declare void @llvm.va_start(i8*) #1
+
+; Function Attrs: nounwind
+declare void @llvm.va_end(i8*) #1
diff --git a/test/CodeGen/AArch64/swap-compare-operands.ll b/test/CodeGen/AArch64/swap-compare-operands.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7c19b911166e372a6bba293c457e8173cf8ed598
--- /dev/null
+++ b/test/CodeGen/AArch64/swap-compare-operands.ll
@@ -0,0 +1,632 @@
+; RUN: llc < %s -mtriple=arm64 | FileCheck %s
+
+define i1 @testSwapCmpWithLSL64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSL64_1:
+; CHECK:      cmp     x1, x0, lsl #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i64 %a, 1
+  %cmp = icmp slt i64 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSL64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSL64_63:
+; CHECK:      cmp     x1, x0, lsl #63
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i64 %a, 63
+  %cmp = icmp slt i64 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSL32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSL32_1:
+; CHECK:      cmp     w1, w0, lsl #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i32 %a, 1
+  %cmp = icmp slt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSL32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSL32_31:
+; CHECK:      cmp     w1, w0, lsl #31
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %shl = shl i32 %a, 31
+  %cmp = icmp slt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSR64_1:
+; CHECK:      cmp     x1, x0, lsr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i64 %a, 1
+  %cmp = icmp slt i64 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithLSR64_63:
+; CHECK:      cmp     x1, x0, lsr #63
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i64 %a, 63
+  %cmp = icmp slt i64 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSR32_1:
+; CHECK:      cmp     w1, w0, lsr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i32 %a, 1
+  %cmp = icmp slt i32 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithLSR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithLSR32_31:
+; CHECK:      cmp     w1, w0, lsr #31
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %lshr = lshr i32 %a, 31
+  %cmp = icmp slt i32 %lshr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithASR64_1:
+; CHECK:      cmp     x1, x0, asr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i64 %a, 1
+  %cmp = icmp slt i64 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithASR64_63:
+; CHECK:      cmp     x1, x0, asr #63
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i64 %a, 63
+  %cmp = icmp slt i64 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithASR32_1:
+; CHECK:      cmp     w1, w0, asr #1
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i32 %a, 1
+  %cmp = icmp slt i32 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithASR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithASR32_31:
+; CHECK:      cmp     w1, w0, asr #31
+; CHECK-NEXT: cset    w0, gt
+entry:
+  %ashr = ashr i32 %a, 31
+  %cmp = icmp slt i32 %ashr, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend32_64(i32 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64
+; CHECK:      cmp    x1, w0, uxtw #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = zext i32 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend16_64(i16 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64
+; CHECK:      cmp    x1, w0, uxth #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = zext i16 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend8_64(i8 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64
+; CHECK:      cmp    x1, w0, uxtb #4
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a64 = zext i8 %a to i64
+  %shl.2 = shl i64 %a64, 4
+  %cmp = icmp ugt i64 %shl.2, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend16_32(i16 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64
+; CHECK:      cmp    w1, w0, uxth #3
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a32 = zext i16 %a to i32
+  %shl = shl i32 %a32, 3
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedZeroExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend8_64
+; CHECK:      cmp    w1, w0, uxtb #4
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a32 = zext i8 %a to i32
+  %shl = shl i32 %a32, 4
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithTooLargeShiftedZeroExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithTooLargeShiftedZeroExtend8_64
+; CHECK:      and    [[REG:w[0-9]+]], w0, #0xff
+; CHECK:      cmp    w1, [[REG]], lsl #5
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = zext i8 %a to i32
+  %shl = shl i32 %a32, 5
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithZeroExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithZeroExtend8_64
+; CHECK:      cmp    w1, w0, uxtb
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = zext i8 %a to i32
+  %cmp = icmp ugt i32 %a32, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend32_64(i32 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend32_64
+; CHECK:      cmp    x1, w0, sxtw #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = sext i32 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend16_64(i16 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedZeroExtend16_64
+; CHECK:      cmp    x1, w0, sxth #2
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a64 = sext i16 %a to i64
+  %shl.0 = shl i64 %a64, 2
+  %cmp = icmp ugt i64 %shl.0, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend8_64(i8 %a, i64 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64
+; CHECK:      cmp    x1, w0, sxtb #4
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a64 = sext i8 %a to i64
+  %shl.2 = shl i64 %a64, 4
+  %cmp = icmp ugt i64 %shl.2, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend16_32(i16 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64
+; CHECK:      cmp    w1, w0, sxth #3
+; CHECK-NEXT: cset    w0, lo
+entry:
+  %a32 = sext i16 %a to i32
+  %shl = shl i32 %a32, 3
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithShiftedSignExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithShiftedSignExtend8_64
+; CHECK:      cmp    w1, w0, sxtb #4
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = sext i8 %a to i32
+  %shl = shl i32 %a32, 4
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithTooLargeShiftedSignExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithTooLargeShiftedSignExtend8_64
+; CHECK:      sxtb   [[REG:w[0-9]+]], w0
+; CHECK-NEXT: cmp    w1, [[REG]], lsl #5
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = sext i8 %a to i32
+  %shl = shl i32 %a32, 5
+  %cmp = icmp ugt i32 %shl, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmpWithSignExtend8_32(i8 %a, i32 %b) {
+; CHECK-LABEL testSwapCmpWithSignExtend8_64
+; CHECK:      cmp    w1, w0, sxtb
+; CHECK-NEXT: cset   w0, lo
+entry:
+  %a32 = sext i8 %a to i32
+  %cmp = icmp ugt i32 %a32, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSL64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSL64_1:
+; CHECK:      cmn    x1, x0, lsl #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i64 %a, 1
+  %na = sub i64 0, %shl
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 62 bits shift as 63 has another optimization kicking in.
+define i1 @testSwapCmnWithLSL64_62(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSL64_62:
+; CHECK:      cmn    x1, x0, lsl #62
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i64 %a, 62
+  %na = sub i64 0, %shl
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 63 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSL64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSL64_63:
+; CHECK:      cmp    x1, x0, lsl #63
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i64 %a, 63
+  %na = sub i64 0, %shl
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSL32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSL32_1:
+; CHECK:      cmn    w1, w0, lsl #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i32 %a, 1
+  %na = sub i32 0, %shl
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 30 bits shift as 30 has another optimization kicking in.
+define i1 @testSwapCmnWithLSL32_30(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSL32_30:
+; CHECK:      cmn    w1, w0, lsl #30
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i32 %a, 30
+  %na = sub i32 0, %shl
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 31 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSL32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSL32_31:
+; CHECK:      cmp    w1, w0, lsl #31
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %shl = shl i32 %a, 31
+  %na = sub i32 0, %shl
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSR64_1:
+; CHECK:      cmn    x1, x0, lsr #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i64 %a, 1
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 62 bits shift as 63 has another optimization kicking in.
+define i1 @testSwapCmnWithLSR64_62(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSR64_62:
+; CHECK:      cmn    x1, x0, lsr #62
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i64 %a, 62
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 63 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithLSR64_63:
+; CHECK:      cmp    x1, x0, asr #63
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i64 %a, 63
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithLSR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSR32_1:
+; CHECK:      cmn    w1, w0, lsr #1
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i32 %a, 1
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 30 bits shift as 31 has another optimization kicking in.
+define i1 @testSwapCmnWithLSR32_30(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSR32_30:
+; CHECK:      cmn    w1, w0, lsr #30
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i32 %a, 30
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 31 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithLSR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithLSR32_31:
+; CHECK:      cmp    w1, w0, asr #31
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = lshr i32 %a, 31
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithASR64_1(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithASR64_1:
+; CHECK:      cmn    x1, x0, asr #3
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i64 %a, 3
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 62 bits shift as 63 has another optimization kicking in.
+define i1 @testSwapCmnWithASR64_62(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithASR64_62:
+; CHECK:      cmn    x1, x0, asr #62
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i64 %a, 62
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 63 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithASR64_63(i64 %a, i64 %b) {
+; CHECK-LABEL testSwapCmnWithASR64_63:
+; CHECK:      cmp    x1, x0, lsr #63
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i64 %a, 63
+  %na = sub i64 0, %lshr
+  %cmp = icmp ne i64 %na, %b
+  ret i1 %cmp
+}
+
+define i1 @testSwapCmnWithASR32_1(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithASR32_1:
+; CHECK:      cmn    w1, w0, asr #1
+; CHECK-NEXT: cset   w0, eq
+entry:
+  %lshr = ashr i32 %a, 1
+  %na = sub i32 0, %lshr
+  %cmp = icmp eq i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: testing with a 30 bits shift as 31 has another optimization kicking in.
+define i1 @testSwapCmnWithASR32_30(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithASR32_30:
+; CHECK:      cmn    w1, w0, asr #30
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i32 %a, 30
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+; Note: the 31 bits shift triggers a different optimization path, which leads
+; to a similar result in terms of performances. We try to catch here any change
+; so that this test can be adapted should the optimization be done with the
+; operand swap.
+define i1 @testSwapCmnWithASR32_31(i32 %a, i32 %b) {
+; CHECK-LABEL testSwapCmnWithASR32_31:
+; CHECK:      cmp    w1, w0, lsr #31
+; CHECK-NEXT: cset   w0, ne
+entry:
+  %lshr = ashr i32 %a, 31
+  %na = sub i32 0, %lshr
+  %cmp = icmp ne i32 %na, %b
+  ret i1 %cmp
+}
+
+define i64 @testSwapCmpToCmnWithZeroExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) {
+; CHECK-LABEL testSwapCmpToCmnWithZeroExtend:
+t0:
+  %conv0 = zext i32 %a32 to i64
+  %shl0 = shl i64 %conv0, 1
+  %na0 = sub i64 0, %shl0
+  %cmp0 = icmp ne i64 %na0, %b64
+; CHECK: cmn    x3, w0, uxtw #1
+  br i1 %cmp0, label %t1, label %end
+
+t1:
+  %conv1 = zext i16 %a16 to i64
+  %shl1 = shl i64 %conv1, 4
+  %na1 = sub i64 0, %shl1
+  %cmp1 = icmp ne i64 %na1, %b64
+; CHECK: cmn    x3, w1, uxth #4
+  br i1 %cmp1, label %t2, label %end
+
+t2:
+  %conv2 = zext i8 %a8 to i64
+  %shl2 = shl i64 %conv2, 3
+  %na2 = sub i64 0, %shl2
+  %cmp2 = icmp ne i64 %na2, %b64
+; CHECK: cmn    x3, w2, uxtb #3
+  br i1 %cmp2, label %t3, label %end
+
+t3:
+  %conv3 = zext i16 %a16 to i32
+  %shl3 = shl i32 %conv3, 2
+  %na3 = sub i32 0, %shl3
+  %cmp3 = icmp ne i32 %na3, %b32
+; CHECK: cmn    w4, w1, uxth #2
+  br i1 %cmp3, label %t4, label %end
+
+t4:
+  %conv4 = zext i8 %a8 to i32
+  %shl4 = shl i32 %conv4, 1
+  %na4 = sub i32 0, %shl4
+  %cmp4 = icmp ne i32 %na4, %b32
+; CHECK: cmn    w4, w2, uxtb #1
+  br i1 %cmp4, label %t5, label %end
+
+t5:
+  %conv5 = zext i8 %a8 to i32
+  %shl5 = shl i32 %conv5, 5
+  %na5 = sub i32 0, %shl5
+  %cmp5 = icmp ne i32 %na5, %b32
+; CHECK: and    [[REG:w[0-9]+]], w2, #0xff
+; CHECK: cmn    w4, [[REG]], lsl #5
+  br i1 %cmp5, label %t6, label %end
+
+t6:
+  %conv6 = zext i8 %a8 to i32
+  %na6 = sub i32 0, %conv6
+  %cmp6 = icmp ne i32 %na6, %b32
+; CHECK: cmn    w4, w2, uxtb
+  br i1 %cmp6, label %t7, label %end
+
+t7:
+  ret i64 0
+
+end:
+  ret i64 1
+}
+define i64 @testSwapCmpToCmnWithSignExtend(i32 %a32, i16 %a16, i8 %a8, i64 %b64, i32 %b32) {
+; CHECK-LABEL testSwapCmpToCmnWithSignExtend:
+t0:
+  %conv0 = sext i32 %a32 to i64
+  %shl0 = shl i64 %conv0, 1
+  %na0 = sub i64 0, %shl0
+  %cmp0 = icmp ne i64 %na0, %b64
+; CHECK: cmn     x3, w0, sxtw #1
+  br i1 %cmp0, label %t1, label %end
+
+t1:
+  %conv1 = sext i16 %a16 to i64
+  %shl1 = shl i64 %conv1, 4
+  %na1 = sub i64 0, %shl1
+  %cmp1 = icmp ne i64 %na1, %b64
+; CHECK: cmn     x3, w1, sxth #4
+  br i1 %cmp1, label %t2, label %end
+
+t2:
+  %conv2 = sext i8 %a8 to i64
+  %shl2 = shl i64 %conv2, 3
+  %na2 = sub i64 0, %shl2
+  %cmp2 = icmp ne i64 %na2, %b64
+; CHECK: cmn     x3, w2, sxtb #3
+  br i1 %cmp2, label %t3, label %end
+
+t3:
+  %conv3 = sext i16 %a16 to i32
+  %shl3 = shl i32 %conv3, 2
+  %na3 = sub i32 0, %shl3
+  %cmp3 = icmp ne i32 %na3, %b32
+; CHECK: cmn     w4, w1, sxth #2
+  br i1 %cmp3, label %t4, label %end
+
+t4:
+  %conv4 = sext i8 %a8 to i32
+  %shl4 = shl i32 %conv4, 1
+  %na4 = sub i32 0, %shl4
+  %cmp4 = icmp ne i32 %na4, %b32
+; CHECK: cmn     w4, w2, sxtb #1
+  br i1 %cmp4, label %t5, label %end
+
+t5:
+  %conv5 = sext i8 %a8 to i32
+  %shl5 = shl i32 %conv5, 5
+  %na5 = sub i32 0, %shl5
+  %cmp5 = icmp ne i32 %na5, %b32
+; CHECK: sxtb    [[REG:w[0-9]+]], w2
+; CHECK: cmn     w4, [[REG]], lsl #5
+  br i1 %cmp5, label %t6, label %end
+
+t6:
+  %conv6 = sext i8 %a8 to i32
+  %na6 = sub i32 0, %conv6
+  %cmp6 = icmp ne i32 %na6, %b32
+; CHECK: cmn     w4, w2, sxtb
+  br i1 %cmp6, label %t7, label %end
+
+t7:
+  ret i64 0
+
+end:
+  ret i64 1
+}
diff --git a/test/CodeGen/AArch64/vararg-tallcall.ll b/test/CodeGen/AArch64/vararg-tallcall.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2818222680335e9424be967f63028b76e76d994f
--- /dev/null
+++ b/test/CodeGen/AArch64/vararg-tallcall.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=aarch64-windows-msvc %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
+
+target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+
+%class.X = type { i8 }
+%struct.B = type { i32 (...)** }
+
+$"??_9B@@$BA@AA" = comdat any
+
+; Function Attrs: noinline optnone
+define linkonce_odr void @"??_9B@@$BA@AA"(%struct.B* %this, ...) #1 comdat align 2  {
+entry:
+  %this.addr = alloca %struct.B*, align 8
+  store %struct.B* %this, %struct.B** %this.addr, align 8
+  %this1 = load %struct.B*, %struct.B** %this.addr, align 8
+  call void asm sideeffect "", "~{d0}"()
+  %0 = bitcast %struct.B* %this1 to void (%struct.B*, ...)***
+  %vtable = load void (%struct.B*, ...)**, void (%struct.B*, ...)*** %0, align 8
+  %vfn = getelementptr inbounds void (%struct.B*, ...)*, void (%struct.B*, ...)** %vtable, i64 0
+  %1 = load void (%struct.B*, ...)*, void (%struct.B*, ...)** %vfn, align 8
+  musttail call void (%struct.B*, ...) %1(%struct.B* %this1, ...)
+  ret void
+                                                  ; No predecessors!
+  ret void
+}
+
+attributes #1 = { noinline optnone "thunk" }
+
+; CHECK: mov     v16.16b, v0.16b
+; CHECK: ldr     x8, [x0]
+; CHECK: ldr     x8, [x8]
+; CHECK: mov     v0.16b, v16.16b
+; CHECK: br      x8
diff --git a/test/CodeGen/AArch64/win64_vararg.ll b/test/CodeGen/AArch64/win64_vararg.ll
index 9cc9f50adb77225747e7469bd1aaeb0b724e984c..38da60b81a5542659c30895988221f435972c902 100644
--- a/test/CodeGen/AArch64/win64_vararg.ll
+++ b/test/CodeGen/AArch64/win64_vararg.ll
@@ -104,7 +104,7 @@ declare i64* @__local_stdio_printf_options() local_unnamed_addr #4
 
 ; CHECK-LABEL: fp
 ; CHECK: str     x21, [sp, #-96]!
-; CHECK: stp     x20, x19, [sp, #16]
+; CHECK: stp     x19, x20, [sp, #16]
 ; CHECK: stp     x29, x30, [sp, #32]
 ; CHECK: add     x29, sp, #32
 ; CHECK: add     x8, x29, #24
@@ -124,10 +124,10 @@ declare i64* @__local_stdio_printf_options() local_unnamed_addr #4
 ; CHECK: mov     x3, x19
 ; CHECK: mov     x4, xzr
 ; CHECK: bl      __stdio_common_vsprintf
-; CHECK: ldp     x29, x30, [sp, #32]
-; CHECK: ldp     x20, x19, [sp, #16]
 ; CHECK: cmp     w0, #0
 ; CHECK: csinv   w0, w0, wzr, ge
+; CHECK: ldp     x29, x30, [sp, #32]
+; CHECK: ldp     x19, x20, [sp, #16]
 ; CHECK: ldr     x21, [sp], #96
 ; CHECK: ret
 define i32 @fp(i8*, i64, i8*, ...) local_unnamed_addr #6 {
@@ -151,8 +151,8 @@ attributes #6 = { "no-frame-pointer-elim"="true" }
 
 ; CHECK-LABEL: vla
 ; CHECK: str     x23, [sp, #-112]!
-; CHECK: stp     x22, x21, [sp, #16]
-; CHECK: stp     x20, x19, [sp, #32]
+; CHECK: stp     x21, x22, [sp, #16]
+; CHECK: stp     x19, x20, [sp, #32]
 ; CHECK: stp     x29, x30, [sp, #48]
 ; CHECK: add     x29, sp, #48
 ; CHECK: add     x8, x29, #16
@@ -183,8 +183,8 @@ attributes #6 = { "no-frame-pointer-elim"="true" }
 ; CHECK: mov     sp, [[REG2]]
 ; CHECK: sub     sp, x29, #48
 ; CHECK: ldp     x29, x30, [sp, #48]
-; CHECK: ldp     x20, x19, [sp, #32]
-; CHECK: ldp     x22, x21, [sp, #16]
+; CHECK: ldp     x19, x20, [sp, #32]
+; CHECK: ldp     x21, x22, [sp, #16]
 ; CHECK: ldr     x23, [sp], #112
 ; CHECK: ret
 define void @vla(i32, i8*, ...) local_unnamed_addr {
@@ -211,32 +211,34 @@ declare i8* @llvm.stacksave()
 declare void @llvm.stackrestore(i8*)
 
 ; CHECK-LABEL: snprintf
-; CHECK: sub     sp,  sp, #96
-; CHECK: stp     x21, x20, [sp, #16]
-; CHECK: stp     x19, x30, [sp, #32]
-; CHECK: add     x8, sp, #56
-; CHECK: mov     x19, x2
-; CHECK: mov     x20, x1
-; CHECK: mov     x21, x0
-; CHECK: stp     x6, x7, [sp, #80]
-; CHECK: stp     x4, x5, [sp, #64]
-; CHECK: str     x3, [sp, #56]
-; CHECK: str     x8, [sp, #8]
-; CHECK: bl      __local_stdio_printf_options
-; CHECK: ldr     x8, [x0]
-; CHECK: add     x5, sp, #56
-; CHECK: mov     x1, x21
-; CHECK: mov     x2, x20
-; CHECK: orr     x0, x8, #0x2
-; CHECK: mov     x3, x19
-; CHECK: mov     x4, xzr
-; CHECK: bl      __stdio_common_vsprintf
-; CHECK: ldp     x19, x30, [sp, #32]
-; CHECK: ldp     x21, x20, [sp, #16]
-; CHECK: cmp     w0, #0
-; CHECK: csinv   w0, w0, wzr, ge
-; CHECK: add     sp, sp, #96
-; CHECK: ret
+; CHECK-DAG: sub     sp,  sp, #96
+; CHECK-DAG: str     x21, [sp, #16]
+; CHECK-DAG: stp     x19, x20, [sp, #24]
+; CHECK-DAG: str     x30, [sp, #40]
+; CHECK-DAG: add     x8, sp, #56
+; CHECK-DAG: mov     x19, x2
+; CHECK-DAG: mov     x20, x1
+; CHECK-DAG: mov     x21, x0
+; CHECK-DAG: stp     x6, x7, [sp, #80]
+; CHECK-DAG: stp     x4, x5, [sp, #64]
+; CHECK-DAG: str     x3, [sp, #56]
+; CHECK-DAG: str     x8, [sp, #8]
+; CHECK-DAG: bl      __local_stdio_printf_options
+; CHECK-DAG: ldr     x8, [x0]
+; CHECK-DAG: add     x5, sp, #56
+; CHECK-DAG: mov     x1, x21
+; CHECK-DAG: mov     x2, x20
+; CHECK-DAG: orr     x0, x8, #0x2
+; CHECK-DAG: mov     x3, x19
+; CHECK-DAG: mov     x4, xzr
+; CHECK-DAG: bl      __stdio_common_vsprintf
+; CHECK-DAG: ldr     x30, [sp, #40]
+; CHECK-DAG: ldp     x19, x20, [sp, #24]
+; CHECK-DAG: ldr     x21, [sp, #16]
+; CHECK-DAG: cmp     w0, #0
+; CHECK-DAG: csinv   w0, w0, wzr, ge
+; CHECK-DAG: add     sp, sp, #96
+; CHECK-DAG: ret
 define i32 @snprintf(i8*, i64, i8*, ...) local_unnamed_addr #5 {
   %4 = alloca i8*, align 8
   %5 = bitcast i8** %4 to i8*
diff --git a/test/CodeGen/AArch64/windows-trap.ll b/test/CodeGen/AArch64/windows-trap.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5cf0ece48e97d1c8ec304470264f54495ed4bd04
--- /dev/null
+++ b/test/CodeGen/AArch64/windows-trap.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64-win32 %s -o - | FileCheck %s
+
+declare void @callee() noreturn
+
+; Make sure the call isn't the last instruction in the function; if it is,
+; unwinding may break.
+;
+; (The instruction after the call doesn't have to be anything in particular,
+; but trapping has the nice side-effect of catching bugs.)
+
+define void @test_unreachable() {
+; CHECK-LABEL: test_unreachable:
+; CHECK: bl      callee
+; CHECK-NEXT: brk #0x1
+  call void @callee() noreturn
+  unreachable
+}
diff --git a/test/CodeGen/AArch64/wineh-frame0.mir b/test/CodeGen/AArch64/wineh-frame0.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b59627d7f3114dcfc7c6f49702ad976b7bd28a23
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame0.mir
@@ -0,0 +1,60 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_regp_x, save_regp
+
+# CHECK:        early-clobber $sp = frame-setup STPXpre killed $x27, killed $x28, $sp, -10
+# CHECK-NEXT:   frame-setup SEH_SaveRegP_X 27, 28, -80
+# CHECK-NEXT:   frame-setup STPXi killed $x25, killed $x26, $sp, 2
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 25, 26, 16
+# CHECK-NEXT:   frame-setup STPXi killed $x23, killed $x24, $sp, 4
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 23, 24, 32
+# CHECK-NEXT:   frame-setup STPXi killed $x21, killed $x22, $sp, 6
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 21, 22, 48
+# CHECK-NEXT:   frame-setup STPXi killed $x19, killed $x20, $sp, 8
+# CHECK-NEXT:   frame-setup SEH_SaveRegP 19, 20, 64
+# CHECK-NEXT:   frame-setup SEH_PrologEnd
+# CHECK:        frame-destroy SEH_EpilogStart
+# CHECK-NEXT:   $x19, $x20 = frame-destroy LDPXi $sp, 8
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 19, 20, 64
+# CHECK-NEXT:   $x21, $x22 = frame-destroy LDPXi $sp, 6
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 21, 22, 48
+# CHECK-NEXT:   $x23, $x24 = frame-destroy LDPXi $sp, 4
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 23, 24, 32
+# CHECK-NEXT:   $x25, $x26 = frame-destroy LDPXi $sp, 2
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP 25, 26, 16
+# CHECK-NEXT:   early-clobber $sp, $x27, $x28 = frame-destroy LDPXpost $sp, 10
+# CHECK-NEXT:   frame-destroy SEH_SaveRegP_X 27, 28, -80
+# CHECK-NEXT:   frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:   RET_ReallyLR implicit $x0
+
+...
+---
+name:            test
+alignment:       2
+tracksRegLiveness: true
+hasWinCFI: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  stackSize:       80
+  maxAlignment:    8
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+stack:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+    $x19 = ADDXrr $x0, killed $x1
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $x28
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame1.mir b/test/CodeGen/AArch64/wineh-frame1.mir
new file mode 100644
index 0000000000000000000000000000000000000000..deff40160b2e88184895581299e35ab206f6fa51
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame1.mir
@@ -0,0 +1,94 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_fregp_x, save_fregp
+
+# CHECK:         early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14
+# CHECK-NEXT:    frame-setup SEH_SaveFRegP_X 10, 11, -112
+# CHECK-NEXT:    frame-setup STPDi killed $d8, killed $d9, $sp, 2
+# CHECK-NEXT:    frame-setup SEH_SaveFRegP 8, 9, 16
+# CHECK-NEXT:    frame-setup STPXi killed $x27, killed $x28, $sp, 4
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 27, 28, 32
+# CHECK-NEXT:    frame-setup STPXi killed $x25, killed $x26, $sp, 6
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 25, 26, 48
+# CHECK-NEXT:    frame-setup STPXi killed $x23, killed $x24, $sp, 8
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 23, 24, 64
+# CHECK-NEXT:    frame-setup STPXi killed $x21, killed $x22, $sp, 10
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 21, 22, 80
+# CHECK-NEXT:    frame-setup STPXi killed $x19, killed $x20, $sp, 12
+# CHECK-NEXT:    frame-setup SEH_SaveRegP 19, 20, 96
+# CHECK-NEXT:    frame-setup SEH_PrologEnd
+# CHECK:         frame-destroy SEH_EpilogStart
+# CHECK-NEXT:    $x19, $x20 = frame-destroy LDPXi $sp, 12
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 19, 20, 96
+# CHECK-NEXT:    $x21, $x22 = frame-destroy LDPXi $sp, 10
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 21, 22, 80
+# CHECK-NEXT:    $x23, $x24 = frame-destroy LDPXi $sp, 8
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 23, 24, 64
+# CHECK-NEXT:    $x25, $x26 = frame-destroy LDPXi $sp, 6
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 25, 26, 48
+# CHECK-NEXT:    $x27, $x28 = frame-destroy LDPXi $sp, 4
+# CHECK-NEXT:    frame-destroy SEH_SaveRegP 27, 28, 32
+# CHECK-NEXT:    $d8, $d9 = frame-destroy LDPDi $sp, 2
+# CHECK-NEXT:    frame-destroy SEH_SaveFRegP 8, 9, 16
+# CHECK-NEXT:    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14
+# CHECK-NEXT:    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+# CHECK-NEXT:    frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:    RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame2.mir b/test/CodeGen/AArch64/wineh-frame2.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ae2aaf7f27d1bf70250f2e0b0ba03419e5ce9418
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame2.mir
@@ -0,0 +1,72 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_freg_x, save_frep, save_reg
+
+# CHECK:       early-clobber $sp = frame-setup STRDpre killed $d12, $sp, -48
+# CHECK-NEXT:  frame-setup SEH_SaveFReg_X 12, -48
+# CHECK-NEXT:  frame-setup STPDi killed $d10, killed $d11, $sp, 1
+# CHECK-NEXT:  frame-setup SEH_SaveFRegP 10, 11, 8
+# CHECK-NEXT:  frame-setup STPDi killed $d8, killed $d9, $sp, 3
+# CHECK-NEXT:  frame-setup SEH_SaveFRegP 8, 9, 24
+# CHECK-NEXT:  frame-setup STRXui killed $x19, $sp, 5
+# CHECK-NEXT:  frame-setup SEH_SaveReg 19, 40
+# CHECK-NEXT:  frame-setup SEH_PrologEnd
+# CHECK:       frame-destroy SEH_EpilogStart
+# CHECK-NEXT:  $x19 = frame-destroy LDRXui $sp, 5
+# CHECK-NEXT:  frame-destroy SEH_SaveReg 19, 40
+# CHECK-NEXT:  $d8, $d9 = frame-destroy LDPDi $sp, 3
+# CHECK-NEXT:  frame-destroy SEH_SaveFRegP 8, 9, 24
+# CHECK-NEXT:  $d10, $d11 = frame-destroy LDPDi $sp, 1
+# CHECK-NEXT:  frame-destroy SEH_SaveFRegP 10, 11, 8
+# CHECK-NEXT:  early-clobber $sp, $d12 = frame-destroy LDRDpost $sp, 48
+# CHECK-NEXT:  frame-destroy SEH_SaveFReg_X 12, -48
+# CHECK-NEXT:  frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:  RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $d12 = FADDDrr $d11, killed $d11
+    $x0 = COPY $d12
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame3.mir b/test/CodeGen/AArch64/wineh-frame3.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d6e927d4bd51f16d410af2d306670161f2e8c01e
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame3.mir
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_reg_x, save_reg
+
+# CHECK:      early-clobber $sp = frame-setup STRXpre killed $x22, $sp, -16
+# CHECK-NEXT: frame-setup SEH_SaveReg_X 22, -16
+# CHECK-NEXT: frame-setup STRXui killed $x19, $sp, 1
+# CHECK-NEXT: frame-setup SEH_SaveReg 19, 8
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $x19 = frame-destroy LDRXui $sp, 1
+# CHECK-NEXT: frame-destroy SEH_SaveReg 19, 8
+# CHECK-NEXT: early-clobber $sp, $x22 = frame-destroy LDRXpost $sp, 16
+# CHECK-NEXT: frame-destroy SEH_SaveReg_X 22, -16
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1
+    $x19 = ADDXrr $x0, killed $x1
+    $x22 = ADDXrr killed $x19, $x0
+    $x0 = COPY killed $x22
+    RET_ReallyLR implicit $x0
+...
diff --git a/test/CodeGen/AArch64/wineh-frame4.mir b/test/CodeGen/AArch64/wineh-frame4.mir
new file mode 100644
index 0000000000000000000000000000000000000000..63a8dc677792d8b0849e9ae01bc4438b472ad136
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame4.mir
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check save_freg_x, save_freg
+
+# CHECK:       early-clobber $sp = frame-setup STRDpre killed $d10, $sp, -16
+# CHECK-NEXT:  frame-setup SEH_SaveFReg_X 10, -16
+# CHECK-NEXT:  frame-setup STRDui killed $d8, $sp, 1 :: (store 8 into %stack.0)
+# CHECK-NEXT:  frame-setup SEH_SaveFReg 8, 8
+# CHECK-NEXT:  frame-setup SEH_PrologEnd
+# CHECK:       frame-destroy SEH_EpilogStart
+# CHECK-NEXT:  $d8 = frame-destroy LDRDui $sp, 1 :: (load 8 from %stack.0)
+# CHECK-NEXT:  frame-destroy SEH_SaveFReg 8, 8
+# CHECK-NEXT:  early-clobber $sp, $d10 = frame-destroy LDRDpost $sp, 16 :: (load 8 from %stack.1)
+# CHECK-NEXT:  frame-destroy SEH_SaveFReg_X 10, -16
+# CHECK-NEXT:  frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:  RET_ReallyLR implicit $x0
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $d0, $d1
+    $d8 = FADDDrr $d0, killed $d1
+    $d10 = FADDDrr killed $d8, $d0
+    $x0 = COPY killed $d10
+    RET_ReallyLR implicit $x0
+...
diff --git a/test/CodeGen/AArch64/wineh-frame5.mir b/test/CodeGen/AArch64/wineh-frame5.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2a4eed4ca92989328b8194b8d72def5b178e56dc
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame5.mir
@@ -0,0 +1,135 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Check multiple epilogues, save_reg, save_reg_x.
+
+# CHECK-LABEL:   bb.0.entry:
+# CHECK:         early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+# CHECK-NEXT:    frame-setup SEH_SaveReg_X 28, -32
+# CHECK-NEXT:    frame-setup STRXui killed $x19, $sp, 1
+# CHECK-NEXT:    frame-setup SEH_SaveReg 19, 8
+# CHECK-NEXT:    frame-setup STRXui killed $lr, $sp, 2
+# CHECK-NEXT:    frame-setup SEH_SaveReg 30, 16
+# CHECK-NEXT:    $sp = frame-setup SUBXri $sp, 496, 0
+# CHECK-NEXT:    frame-setup SEH_StackAlloc 496
+# CHECK-NEXT:    frame-setup SEH_PrologEnd
+
+# CHECK-LABEL:   bb.1.if.then:
+# CHECK:         frame-destroy SEH_EpilogStart
+# CHECK-NEXT:    $sp = frame-destroy ADDXri $sp, 496, 0
+# CHECK-NEXT:    frame-destroy SEH_StackAlloc 496
+# CHECK-NEXT:    $lr = frame-destroy LDRXui $sp, 2
+# CHECK-NEXT:    frame-destroy SEH_SaveReg 30, 16
+# CHECK-NEXT:    $x19 = frame-destroy LDRXui $sp, 1
+# CHECK-NEXT:    frame-destroy SEH_SaveReg 19, 8
+# CHECK-NEXT:    early-clobber $sp, $x28 = frame-destroy LDRXpost $sp, 32
+# CHECK-NEXT:    frame-destroy SEH_SaveReg_X 28, -32
+# CHECK-NEXT:    frame-destroy SEH_EpilogEnd
+# CHECK-NEXT:    TCRETURNdi @"?func2@@YAHXZ", 0, csr_aarch64_aapcs, implicit $sp
+
+
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %i) local_unnamed_addr #0 {
+  entry:
+    %B = alloca [123 x i32], align 4
+    %call = tail call i32 @"?func2@@YAHXZ"()
+    %cmp = icmp sgt i32 %i, 2
+    br i1 %cmp, label %if.then, label %if.else
+
+  if.then:                                          ; preds = %entry
+    %call1 = tail call i32 @"?func2@@YAHXZ"()
+    ret i32 %call1
+
+  if.else:                                          ; preds = %entry
+    %0 = bitcast [123 x i32]* %B to i8*
+    call void @llvm.lifetime.start.p0i8(i64 492, i8* nonnull %0) #3
+    %arraydecay7 = bitcast [123 x i32]* %B to i32*
+    %call2 = call i32 @"?func3@@YAHPEAH@Z"(i32* nonnull %arraydecay7)
+    call void @llvm.lifetime.end.p0i8(i64 492, i8* nonnull %0) #3
+    ret i32 %call2
+  }
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+  declare dso_local i32 @"?func2@@YAHXZ"() local_unnamed_addr #2
+
+  declare dso_local i32 @"?func3@@YAHPEAH@Z"(i32*) local_unnamed_addr #2
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { argmemonly nounwind }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  492
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: B, type: default, offset: 0, size: 492, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -492, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $w0
+
+    renamable $w19 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    dead $wzr = SUBSWri killed renamable $w19, 3, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit killed $nzcv
+    B %bb.1
+
+  bb.1.if.then:
+    TCRETURNdi @"?func2@@YAHXZ", 0, csr_aarch64_aapcs, implicit $sp
+
+  bb.2.if.else:
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.B, 0, 0
+    BL @"?func3@@YAHPEAH@Z", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame6.mir b/test/CodeGen/AArch64/wineh-frame6.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b86422e8097e911732bf1540e13087846f664b57
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame6.mir
@@ -0,0 +1,150 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Test that stack probe results in Nop unwind codes in the prologue.  Test
+# save_fplr, save_reg_x and stack_alloc with multiple updates
+
+# CHECK:      early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2
+# CHECK-NEXT: frame-setup SEH_SaveFPLR_X -16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: frame-setup SEH_SetFP
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 32, 0
+# CHECK-NEXT: frame-setup SEH_StackAlloc 32
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
+# CHECK-NEXT: frame-destroy SEH_SetFP
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+# CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit killed $w0
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline optnone
+  define dso_local i32 @"?func@@YAHHHHH@Z"(i32 %n, i32 %idx, i32 %b, i32 %c) #0 {
+  entry:
+    %c.addr = alloca i32, align 4
+    %b.addr = alloca i32, align 4
+    %idx.addr = alloca i32, align 4
+    %n.addr = alloca i32, align 4
+    %a = alloca i32*, align 8
+    store i32 %c, i32* %c.addr, align 4
+    store i32 %b, i32* %b.addr, align 4
+    store i32 %idx, i32* %idx.addr, align 4
+    store i32 %n, i32* %n.addr, align 4
+    %0 = load i32, i32* %n.addr, align 4
+    %conv = sext i32 %0 to i64
+    %1 = alloca i8, i64 %conv, align 16
+    %2 = bitcast i8* %1 to i32*
+    store i32* %2, i32** %a, align 8
+    %3 = load i32*, i32** %a, align 8
+    call void @"?init@@YAXPEAH@Z"(i32* %3)
+    ret i32 0
+  }
+
+  declare dso_local void @"?init@@YAXPEAH@Z"(i32*) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+...
+---
+name:            '?func@@YAHHHHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+  - { reg: '$w2', virtual-reg: '' }
+  - { reg: '$w3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  24
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: c.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: b.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: idx.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -12, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: n.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: a, type: default, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: variable-sized, offset: 0,
+      alignment: 1, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1, $w2, $w3
+
+    STRWui killed renamable $w3, %stack.0.c.addr, 0 :: (store 4 into %ir.c.addr)
+    STRWui killed renamable $w2, %stack.1.b.addr, 0 :: (store 4 into %ir.b.addr)
+    STRWui killed renamable $w1, %stack.2.idx.addr, 0 :: (store 4 into %ir.idx.addr)
+    STRWui killed renamable $w0, %stack.3.n.addr, 0 :: (store 4 into %ir.n.addr)
+    renamable $x8 = LDRSWui %stack.3.n.addr, 0 :: (dereferenceable load 4 from %ir.n.addr)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $x8 = nuw ADDXri killed renamable $x8, 15, 0
+    renamable $x8 = UBFMXri killed renamable $x8, 4, 63
+    $x15 = COPY renamable $x8
+    STRXui killed $x8, %stack.6, 0 :: (store 8 into %stack.6)
+    BL &__chkstk, csr_aarch64_stackprobe_windows, implicit-def dead $lr, implicit $sp, implicit killed $x15
+    renamable $x8 = COPY $sp
+    $x15 = LDRXui %stack.6, 0 :: (load 8 from %stack.6)
+    renamable $x8 = SUBSXrs killed renamable $x8, killed renamable $x15, 4, implicit-def dead $nzcv
+    $sp = COPY renamable $x8
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    STRXui killed renamable $x8, %stack.4.a, 0 :: (store 8 into %ir.a)
+    renamable $x0 = LDRXui %stack.4.a, 0 :: (dereferenceable load 8 from %ir.a)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @"?init@@YAXPEAH@Z", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w1 = COPY $wzr
+    $w0 = COPY killed renamable $w1
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame7.mir b/test/CodeGen/AArch64/wineh-frame7.mir
new file mode 100644
index 0000000000000000000000000000000000000000..3e3e79eda6365b25f6fd00d5ff5508a378121d82
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame7.mir
@@ -0,0 +1,189 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Test that stack probe results in Nop unwind codes in the prologue.  Test
+# save_fplr, save_reg_x and stack_alloc with multiple updates.
+
+# CHECK:      early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+# CHECK-NEXT: frame-setup SEH_SaveReg_X 28, -32
+# CHECK-NEXT: frame-setup STPXi killed $fp, killed $lr, $sp, 2
+# CHECK-NEXT: frame-setup SEH_SaveFPLR 16
+# CHECK-NEXT: $x15 = frame-setup MOVZXi 56009, 0
+# CHECK-NEXT: frame-setup SEH_Nop
+# CHECK-NEXT: $x15 = frame-setup MOVKXi $x15, 2, 16
+# CHECK-NEXT: frame-setup SEH_Nop
+# CHECK-NEXT: frame-setup BL &__chkstk, implicit-def $lr, implicit $sp, implicit $x15
+# CHECK-NEXT: frame-setup SEH_Nop
+# CHECK-NEXT: $sp = frame-setup SUBXrx64 killed $sp, killed $x15, 28
+# CHECK-NEXT: frame-setup SEH_StackAlloc 2993296
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 730, 12
+# CHECK-NEXT: frame-destroy SEH_StackAlloc 2990080
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 3216, 0
+# CHECK-NEXT: frame-destroy SEH_StackAlloc 3216
+# CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 2
+# CHECK-NEXT: frame-destroy SEH_SaveFPLR 16
+# CHECK-NEXT: early-clobber $sp, $x28 = frame-destroy LDRXpost $sp, 32
+# CHECK-NEXT: frame-destroy SEH_SaveReg_X 28, -32
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit killed $w0
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline optnone
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %i) #0 {
+  entry:
+    %retval = alloca i32, align 4
+    %i.addr = alloca i32, align 4
+    %A = alloca [748193 x i32], align 4
+    %a = alloca i32, align 4
+    %B = alloca [123 x i32], align 4
+    store i32 %i, i32* %i.addr, align 4
+    %0 = load i32, i32* %i.addr, align 4
+    %add = add nsw i32 %0, 2
+    store i32 %add, i32* %a, align 4
+    %call = call i32 @"?func2@@YAHXZ"()
+    %1 = load i32, i32* %i.addr, align 4
+    %cmp = icmp sgt i32 %1, 2
+    br i1 %cmp, label %if.then, label %if.else
+
+  if.then:                                          ; preds = %entry
+    %call1 = call i32 @"?func2@@YAHXZ"()
+    store i32 %call1, i32* %retval, align 4
+    br label %return
+
+  if.else:                                          ; preds = %entry
+    %arraydecay = getelementptr inbounds [123 x i32], [123 x i32]* %B, i32 0, i32 0
+    %call2 = call i32 @"?func3@@YAHPEAH@Z"(i32* %arraydecay)
+    store i32 %call2, i32* %retval, align 4
+    br label %return
+
+  return:                                           ; preds = %if.else, %if.then
+    %2 = load i32, i32* %retval, align 4
+    ret i32 %2
+  }
+
+  declare dso_local i32 @"?func2@@YAHXZ"() #1
+
+  declare dso_local i32 @"?func3@@YAHPEAH@Z"(i32*) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        true
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  2993276
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: i.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: A, type: default, offset: 0, size: 2992772, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992780, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: a, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992784, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: B, type: default, offset: 0, size: 492, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2993276, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $w0
+
+    renamable $x8 = ADDXri %stack.1.i.addr, 0, 0
+    renamable $w9 = MOVi32imm 2
+    STRWui killed renamable $w0, renamable $x8, 0 :: (store 4 into %ir.i.addr)
+    renamable $w0 = LDRWui renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    renamable $w0 = ADDWri killed renamable $w0, 2, 0
+    STRWui killed renamable $w0, %stack.3.a, 0 :: (store 4 into %ir.a)
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    STRXui killed $x8, %stack.5, 0 :: (store 8 into %stack.5)
+    STRWui killed $w9, %stack.6, 0 :: (store 4 into %stack.6)
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    renamable $w9 = LDRWui killed renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    $w10 = LDRWui %stack.6, 0 :: (load 4 from %stack.6)
+    $wzr = SUBSWrr killed renamable $w9, killed renamable $w10, implicit-def $nzcv
+    renamable $w9 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
+    TBNZW killed renamable $w9, 0, %bb.2
+    B %bb.3
+
+  bb.2.if.then:
+    successors: %bb.4(0x80000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+    B %bb.4
+
+  bb.3.if.else:
+    successors: %bb.4(0x80000000)
+
+    renamable $x8 = ADDXri %stack.4.B, 0, 0
+    ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+    $x0 = COPY killed renamable $x8
+    BL @"?func3@@YAHPEAH@Z", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit killed $x0, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+
+  bb.4.return:
+    $x8 = LDRXui %stack.5, 0 :: (load 8 from %stack.5)
+    renamable $w0 = LDRWui killed renamable $x8, 1 :: (load 4 from %ir.retval)
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh-frame8.mir b/test/CodeGen/AArch64/wineh-frame8.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6fc7416d6d625aea80ddaf00fcc076e60af877e8
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh-frame8.mir
@@ -0,0 +1,88 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-before=prologepilog \
+# RUN:   -stop-after=prologepilog | FileCheck %s
+# Test that the frame lowering emits correct SEH updates for the case without
+# a stack frame (e.g. no callee saved registers, no frame pointer, just locals)
+
+# CHECK:      $sp = frame-setup SUBXri $sp, 16, 0
+# CHECK-NEXT: frame-setup SEH_StackAlloc 16
+# CHECK-NEXT: frame-setup SEH_PrologEnd
+# CHECK:      frame-destroy SEH_EpilogStart
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
+# CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+# CHECK-NEXT: frame-destroy SEH_EpilogEnd
+# CHECK-NEXT: RET_ReallyLR implicit killed $w0
+
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline nounwind optnone uwtable
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %a) #0 {
+  entry:
+    %a.addr = alloca i32, align 4
+    %b = alloca i32, align 4
+    store i32 %a, i32* %a.addr, align 4
+    store i32 2, i32* %b, align 4
+    %0 = load i32, i32* %b, align 4
+    %1 = load i32, i32* %a.addr, align 4
+    %add = add nsw i32 %0, %1
+    ret i32 %add
+  }
+
+  attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        true
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  8
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: a.addr, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: b, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+constants:
+body:             |
+  bb.1.entry:
+    liveins: $w0
+
+    renamable $w8 = MOVi32imm 2
+    STRWui killed renamable $w0, %stack.0.a.addr, 0 :: (store 4 into %ir.a.addr)
+    STRWui killed renamable $w8, %stack.1.b, 0 :: (store 4 into %ir.b)
+    renamable $w8 = LDRWui %stack.1.b, 0 :: (load 4 from %ir.b)
+    renamable $w0 = LDRWui %stack.0.a.addr, 0 :: (load 4 from %ir.a.addr)
+    renamable $w0 = nsw ADDWrr killed renamable $w8, killed renamable $w0
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh1.mir b/test/CodeGen/AArch64/wineh1.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c89daf1ce225260e243f59c63a05a3169781447b
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh1.mir
@@ -0,0 +1,120 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog -filetype=obj  \
+# RUN:   | llvm-readobj -unwind | FileCheck %s
+# This test case checks the basic validity of the .xdata section.  It's
+# documented at:
+# https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+
+# We expect to see the following in the .xdata section:
+
+# CHECK: 	 ExceptionData {
+# CHECK-NEXT:      FunctionLength: 92
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 28
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc808              ; stp x19, x20, [sp, #64]
+# CHECK-NEXT:        0xd0c7              ; str x22, [sp, #56]
+# CHECK-NEXT:        0xd086              ; str x21, [sp, #48]
+# CHECK-NEXT:        0xc904              ; stp x23, x24, [sp, #32]
+# CHECK-NEXT:        0xc982              ; stp x25, x26, [sp, #16]
+# CHECK-NEXT:        0xce09              ; stp x27, x28, [sp, #-80]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 15
+# CHECK-NEXT:          EpilogueStartIndex: 13
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc808              ; ldp x19, x20, [sp, #64]
+# CHECK-NEXT:            0xd086              ; ldr x21, [sp, #48]
+# CHECK-NEXT:            0xe3                ; nop
+# CHECK-NEXT:            0xd0c7              ; ldr x22, [sp, #56]
+# CHECK-NEXT:            0xc904              ; ldp x23, x24, [sp, #32]
+# CHECK-NEXT:            0xc982              ; ldp x25, x26, [sp, #16]
+# CHECK-NEXT:            0xce09              ; ldp x27, x28, [sp], #80
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+tracksRegLiveness: true
+hasWinCFI: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  stackSize:       80
+  maxAlignment:    8
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+stack:
+  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x19' }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x20' }
+  - { id: 2, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x21' }
+  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x22' }
+  - { id: 4, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x23' }
+  - { id: 5, type: spill-slot, offset: -48, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x24' }
+  - { id: 6, type: spill-slot, offset: -56, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x25' }
+  - { id: 7, type: spill-slot, offset: -64, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x26' }
+  - { id: 8, type: spill-slot, offset: -72, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x27' }
+  - { id: 9, type: spill-slot, offset: -80, size: 8, alignment: 8, stack-id: 0,
+      callee-saved-register: '$x28' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+    early-clobber $sp = frame-setup STPXpre killed $x27, killed $x28, $sp, -10 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP_X 27, 28, -80
+    frame-setup STPXi killed $x25, killed $x26, $sp, 2 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 16
+    frame-setup STPXi killed $x23, killed $x24, $sp, 4 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 32
+    frame-setup STRXui killed $x21, $sp, 6 :: (store 8 into %stack.2)
+    frame-setup SEH_SaveReg 21, 48
+    frame-setup STRXui killed $x22, $sp, 7 :: (store 8 into %stack.3)
+    frame-setup SEH_SaveReg 22, 56
+    frame-setup STPXi killed $x19, killed $x20, $sp, 8 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 64
+    frame-setup SEH_PrologEnd
+    $x19 = ADDXrr $x0, killed $x1
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 64
+    $x21 = frame-destroy LDRXui $sp, 6 :: (load 8 from %stack.2)
+    frame-destroy SEH_SaveReg 21, 48
+    $x0 = COPY $x28
+    frame-destroy SEH_Nop
+    $x21 = frame-destroy LDRXui $sp, 6 :: (load 8 from %stack.2)
+    frame-destroy SEH_SaveReg 22, 56
+    $x23, $x24 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 32
+    $x25, $x26 = frame-destroy LDPXi $sp, 2 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 16
+    early-clobber $sp, $x27, $x28 = frame-destroy LDPXpost $sp, 10 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP_X 27, 28, -80
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh2.mir b/test/CodeGen/AArch64/wineh2.mir
new file mode 100644
index 0000000000000000000000000000000000000000..e2c31fd56ce9eafb800f5ffe4529bfdaf50d5816
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh2.mir
@@ -0,0 +1,185 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# Test that the pre/post increment save of a flating point register is correct.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:      FunctionLength: 136
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 40
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc80e              ; stp x19, x20, [sp, #112]
+# CHECK-NEXT:        0xc88c              ; stp x21, x22, [sp, #96]
+# CHECK-NEXT:        0xc90a              ; stp x23, x24, [sp, #80]
+# CHECK-NEXT:        0xc988              ; stp x25, x26, [sp, #64]
+# CHECK-NEXT:        0xca06              ; stp x27, x28, [sp, #48]
+# CHECK-NEXT:        0xdc45              ; str d9, [sp, #40]
+# CHECK-NEXT:        0xdc04              ; str d8, [sp, #32]
+# CHECK-NEXT:        0xd882              ; stp d10, d11, [sp, #16]
+# CHECK-NEXT:        0xde8f              ; str d12, [sp, #-128]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 25
+# CHECK-NEXT:          EpilogueStartIndex: 19
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80e              ; ldp x19, x20, [sp, #112]
+# CHECK-NEXT:            0xc88c              ; ldp x21, x22, [sp, #96]
+# CHECK-NEXT:            0xc90a              ; ldp x23, x24, [sp, #80]
+# CHECK-NEXT:            0xc988              ; ldp x25, x26, [sp, #64]
+# CHECK-NEXT:            0xca06              ; ldp x27, x28, [sp, #48]
+# CHECK-NEXT:            0xdc04              ; ldr d8, [sp, #32]
+# CHECK-NEXT:            0xdc45              ; ldr d9, [sp, #40]
+# CHECK-NEXT:            0xd882              ; ldp d10, d11, [sp, #16]
+# CHECK-NEXT:            0xde8f              ; ldr d12, [sp], #128
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       128
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 14, name: '', type: spill-slot, offset: -128, size: 8, alignment: 16,
+      stack-id: 0, callee-saved-register: '$d12', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d8, $d9, $d10, $d11, $d12, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
+
+    early-clobber $sp = frame-setup STRDpre killed $d12, $sp, -128 :: (store 8 into %stack.14)
+    frame-setup SEH_SaveFReg_X 12, -128
+    frame-setup STPDi killed $d10, killed $d11, $sp, 2 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP 10, 11, 16
+    frame-setup STRDui killed $d8, $sp, 4 :: (store 8 into %stack.10)
+    frame-setup SEH_SaveFReg 8, 32
+    frame-setup STRDui killed $d9, $sp, 5 :: (store 8 into %stack.11)
+    frame-setup SEH_SaveFReg 9, 40
+    frame-setup STPXi killed $x27, killed $x28, $sp, 6 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 48
+    frame-setup STPXi killed $x25, killed $x26, $sp, 8 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 64
+    frame-setup STPXi killed $x23, killed $x24, $sp, 10 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 80
+    frame-setup STPXi killed $x21, killed $x22, $sp, 12 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 96
+    frame-setup STPXi killed $x19, killed $x20, $sp, 14 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 112
+    frame-setup SEH_PrologEnd
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $d12 = FADDDrr killed $d10, killed $d11
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $d12
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 14 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 112
+    $x21, $x22 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 96
+    $x23, $x24 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 80
+    $x25, $x26 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 64
+    $x27, $x28 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 48
+    $d8 = frame-destroy LDRDui $sp, 4 :: (load 8 from %stack.10)
+    frame-destroy SEH_SaveFReg 8, 32
+    $d9 = frame-destroy LDRDui $sp, 5 :: (load 8 from %stack.11)
+    frame-destroy SEH_SaveFReg 9, 40
+    $d10, $d11 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP 10, 11, 16
+    early-clobber $sp, $d12 = frame-destroy LDRDpost $sp, 128 :: (load 8 from %stack.14)
+    frame-destroy SEH_SaveFReg_X 12, -128
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh3.mir b/test/CodeGen/AArch64/wineh3.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ffca6c157a3e7e45a574f5f3f9a4c54c3880faa1
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh3.mir
@@ -0,0 +1,171 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# Test that the register pairing of both general purpose and floating point
+# registers is correctly saved in the .xdata section, as well as the pre/post
+# increment of floating point register pairs.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:      FunctionLength: 124
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 32
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc80c              ; stp x19, x20, [sp, #96]
+# CHECK-NEXT:        0xc88a              ; stp x21, x22, [sp, #80]
+# CHECK-NEXT:        0xc908              ; stp x23, x24, [sp, #64]
+# CHECK-NEXT:        0xc986              ; stp x25, x26, [sp, #48]
+# CHECK-NEXT:        0xca04              ; stp x27, x28, [sp, #32]
+# CHECK-NEXT:        0xd802              ; stp d8, d9, [sp, #16]
+# CHECK-NEXT:        0xda8d              ; stp d10, d11, [sp, #-112]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 23
+# CHECK-NEXT:          EpilogueStartIndex: 15
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP_X 10, 11, -112
+    frame-setup STPDi killed $d8, killed $d9, $sp, 2 :: (store 8 into %stack.10), (store 8 into %stack.11)
+    frame-setup SEH_SaveFRegP 8, 9, 16
+    frame-setup STPXi killed $x27, killed $x28, $sp, 4 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 32
+    frame-setup STPXi killed $x25, killed $x26, $sp, 6 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 48
+    frame-setup STPXi killed $x23, killed $x24, $sp, 8 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 64
+    frame-setup STPXi killed $x21, killed $x22, $sp, 10 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 80
+    frame-setup STPXi killed $x19, killed $x20, $sp, 12 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 96
+    frame-setup SEH_PrologEnd
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = ADDXrr $x19, killed $x0
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh4.mir b/test/CodeGen/AArch64/wineh4.mir
new file mode 100644
index 0000000000000000000000000000000000000000..4d4cc892c2e82f7809a569c7a04842f103191e9a
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh4.mir
@@ -0,0 +1,228 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-branch-fold  -filetype=obj \
+# RUN: | llvm-readobj -unwind | FileCheck %s
+# Check that multiple epilgoues are correctly placed in .xdata.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:      FunctionLength: 164
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 2
+# CHECK-NEXT:      ByteCodeLength: 48
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xc80c              ; stp x19, x20, [sp, #96]
+# CHECK-NEXT:        0xc88a              ; stp x21, x22, [sp, #80]
+# CHECK-NEXT:        0xc908              ; stp x23, x24, [sp, #64]
+# CHECK-NEXT:        0xc986              ; stp x25, x26, [sp, #48]
+# CHECK-NEXT:        0xca04              ; stp x27, x28, [sp, #32]
+# CHECK-NEXT:        0xd802              ; stp d8, d9, [sp, #16]
+# CHECK-NEXT:        0xda8d              ; stp d10, d11, [sp, #-112]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 16
+# CHECK-NEXT:          EpilogueStartIndex: 15
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 33
+# CHECK-NEXT:          EpilogueStartIndex: 30
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            test
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       112
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP_X 10, 11, -112
+    frame-setup STPDi killed $d8, killed $d9, $sp, 2 :: (store 8 into %stack.10), (store 8 into %stack.11)
+    frame-setup SEH_SaveFRegP 8, 9, 16
+    frame-setup STPXi killed $x27, killed $x28, $sp, 4 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 32
+    frame-setup STPXi killed $x25, killed $x26, $sp, 6 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 48
+    frame-setup STPXi killed $x23, killed $x24, $sp, 8 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 64
+    frame-setup STPXi killed $x21, killed $x22, $sp, 10 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 80
+    frame-setup STPXi killed $x19, killed $x20, $sp, 12 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 96
+    frame-setup SEH_PrologEnd
+    frame-setup CFI_INSTRUCTION def_cfa_offset 112
+    frame-setup CFI_INSTRUCTION offset $w19, -8
+    frame-setup CFI_INSTRUCTION offset $w20, -16
+    frame-setup CFI_INSTRUCTION offset $w21, -24
+    frame-setup CFI_INSTRUCTION offset $w22, -32
+    frame-setup CFI_INSTRUCTION offset $w23, -40
+    frame-setup CFI_INSTRUCTION offset $w24, -48
+    frame-setup CFI_INSTRUCTION offset $w25, -56
+    frame-setup CFI_INSTRUCTION offset $w26, -64
+    frame-setup CFI_INSTRUCTION offset $w27, -72
+    frame-setup CFI_INSTRUCTION offset $w28, -80
+    frame-setup CFI_INSTRUCTION offset $b8, -88
+    frame-setup CFI_INSTRUCTION offset $b9, -96
+    frame-setup CFI_INSTRUCTION offset $b10, -104
+    frame-setup CFI_INSTRUCTION offset $b11, -112
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = SUBSXrr $x19, killed $x0, implicit-def $nzcv
+    Bcc 1, %bb.2, implicit killed $nzcv
+    B %bb.1
+
+  bb.1:
+    liveins: $x19, $x20
+
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+  bb.2:
+    liveins: $x28, $d11
+
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AArch64/wineh5.mir b/test/CodeGen/AArch64/wineh5.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c47bad5d290988dd6b2b088d74ccad6558d026a4
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh5.mir
@@ -0,0 +1,224 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
+
+# Check that that the large stack allocation is correctly represented in .xdata.
+
+# CHECK:        ExceptionData {
+# CHECK-NEXT:     FunctionLength: 156
+# CHECK-NEXT:     Version: 0
+# CHECK-NEXT:     ExceptionData: No
+# CHECK-NEXT:     EpiloguePacked: No
+# CHECK-NEXT:     EpilogueScopes: 1
+# CHECK-NEXT:     ByteCodeLength: 20
+# CHECK-NEXT:     Prologue [
+# CHECK-NEXT:       0xe002dac9          ; sub sp, #2993296
+# CHECK-NEXT:       0xe3                ; nop
+# CHECK-NEXT:       0xe3                ; nop
+# CHECK-NEXT:       0x42                ; stp x29, x30, [sp, #16]
+# CHECK-NEXT:       0xd53f              ; str x28, [sp, #256]!
+# CHECK-NEXT:       0xe4                ; end
+# CHECK-NEXT:     ]
+# CHECK-NEXT:     EpilogueScopes [
+# CHECK-NEXT:       EpilogueScope {
+# CHECK-NEXT:         StartOffset: 34
+# CHECK-NEXT:         EpilogueStartIndex: 10
+# CHECK-NEXT:         Opcodes [
+# CHECK-NEXT:           0xe002da00          ; add sp, #2990080
+# CHECK-NEXT:           0xc0c9              ; add sp, #3216
+# CHECK-NEXT:           0x42                ; ldp x29, x30, [sp, #16]
+# CHECK-NEXT:           0xd53f              ; ldr x28, [sp], #256
+# CHECK-NEXT:           0xe4                ; end
+# CHECK-NEXT:         ]
+# CHECK-NEXT:       }
+# CHECK-NEXT:     ]
+# CHECK-NEXT:   }
+
+
+--- |
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  ; Function Attrs: noinline optnone
+  define dso_local i32 @"?func@@YAHH@Z"(i32 %i) #0 {
+  entry:
+    %retval = alloca i32, align 4
+    %i.addr = alloca i32, align 4
+    %A = alloca [748193 x i32], align 4
+    %a = alloca i32, align 4
+    %B = alloca [123 x i32], align 4
+    store i32 %i, i32* %i.addr, align 4
+    %0 = load i32, i32* %i.addr, align 4
+    %add = add nsw i32 %0, 2
+    store i32 %add, i32* %a, align 4
+    %call = call i32 @"?func2@@YAHXZ"()
+    %1 = load i32, i32* %i.addr, align 4
+    %cmp = icmp sgt i32 %1, 2
+    br i1 %cmp, label %if.then, label %if.else
+
+  if.then:                                          ; preds = %entry
+    %call1 = call i32 @"?func2@@YAHXZ"()
+    store i32 %call1, i32* %retval, align 4
+    br label %return
+
+  if.else:                                          ; preds = %entry
+    %arraydecay = getelementptr inbounds [123 x i32], [123 x i32]* %B, i32 0, i32 0
+    %call2 = call i32 @"?func3@@YAHPEAH@Z"(i32* %arraydecay)
+    store i32 %call2, i32* %retval, align 4
+    br label %return
+
+  return:                                           ; preds = %if.else, %if.then
+    %2 = load i32, i32* %retval, align 4
+    ret i32 %2
+  }
+
+  declare dso_local i32 @"?func2@@YAHXZ"() #1
+
+  declare dso_local i32 @"?func3@@YAHPEAH@Z"(i32*) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+
+  attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #2 = { nounwind }
+
+  !llvm.module.flags = !{!0}
+
+  !0 = !{i32 1, !"wchar_size", i32 2}
+
+...
+---
+name:            '?func@@YAHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: true
+selected:        true
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       true
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       2993328
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  2993276
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: retval, type: default, offset: -36, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: i.addr, type: default, offset: -40, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: A, type: default, offset: -2992812, size: 2992772, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992780, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: a, type: default, offset: -2992816, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2992784, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: B, type: default, offset: -2993308, size: 492, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -2993276, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -2993320, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -2993324, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -32, size: 8, alignment: 16,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+    liveins: $w0, $x28, $fp, $lr
+
+    early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32 :: (store 8 into %stack.9)
+    frame-setup SEH_SaveReg_X 28, -256
+    frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.7), (store 8 into %stack.8)
+    frame-setup SEH_SaveFPLR 16
+    $x15 = frame-setup MOVi64imm 187081
+    frame-setup SEH_Nop
+    frame-setup BL &__chkstk, implicit-def $lr, implicit $sp, implicit $x15
+    frame-setup SEH_Nop
+    $sp = frame-setup SUBXrx64 killed $sp, killed $x15, 28
+    frame-setup SEH_StackAlloc 2993296
+    frame-setup SEH_PrologEnd
+    $x8 = ADDXri $sp, 730, 12
+    $x8 = ADDXri $x8, 3208, 0
+    renamable $w9 = MOVi32imm 2
+    STRWui killed renamable $w0, renamable $x8, 0 :: (store 4 into %ir.i.addr)
+    renamable $w0 = LDRWui renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    renamable $w0 = ADDWri killed renamable $w0, 2, 0
+    STRWui killed renamable $w0, $sp, 128 :: (store 4 into %ir.a)
+    STRXui killed $x8, $sp, 1 :: (store 8 into %stack.5)
+    STRWui killed $w9, $sp, 1 :: (store 4 into %stack.6)
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    renamable $w9 = LDRWui killed renamable $x8, 0 :: (load 4 from %ir.i.addr)
+    $w10 = LDRWui $sp, 1 :: (load 4 from %stack.6)
+    $wzr = SUBSWrr killed renamable $w9, killed renamable $w10, implicit-def $nzcv
+    renamable $w9 = CSINCWr $wzr, $wzr, 13, implicit $nzcv
+    TBNZW killed renamable $w9, 0, %bb.2
+    B %bb.3
+
+  bb.2.if.then:
+    successors: %bb.4(0x80000000)
+
+    BL @"?func2@@YAHXZ", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $w0
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+    B %bb.4
+
+  bb.3.if.else:
+    successors: %bb.4(0x80000000)
+
+    $x8 = ADDXri $sp, 20, 0
+    $x0 = COPY killed renamable $x8
+    BL @"?func3@@YAHPEAH@Z", csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit killed $x0, implicit-def $w0
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    STRWui killed renamable $w0, killed renamable $x8, 1 :: (store 4 into %ir.retval)
+
+  bb.4.return:
+    $x8 = LDRXui $sp, 1 :: (load 8 from %stack.5)
+    renamable $w0 = LDRWui killed renamable $x8, 1 :: (load 4 from %ir.retval)
+    frame-destroy SEH_EpilogStart
+    $sp = frame-destroy ADDXri $sp, 730, 12
+    frame-destroy SEH_StackAlloc 2990080
+    $sp = frame-destroy ADDXri $sp, 3216, 0
+    frame-destroy SEH_StackAlloc 3216
+    $fp, $lr = frame-destroy LDPXi $sp, 2 :: (load 8 from %stack.7), (load 8 from %stack.8)
+    frame-destroy SEH_SaveFPLR 16
+    early-clobber $sp, $x28 = frame-destroy LDRXpost $sp, 32 :: (load 8 from %stack.9)
+    frame-destroy SEH_SaveReg_X 28, -256
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh6.mir b/test/CodeGen/AArch64/wineh6.mir
new file mode 100644
index 0000000000000000000000000000000000000000..fd1f9ece3a698a971227840602fa659a24c56465
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh6.mir
@@ -0,0 +1,138 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:    -filetype=obj | llvm-readobj -unwind | FileCheck %s
+# Check save_fplr_x, set_fp, alloc_s
+
+# CHECK: 	ExceptionData {
+# CHECK-NEXT:      FunctionLength: 92
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 8
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0x02                ; sub sp, #32
+# CHECK-NEXT:        0xe1                ; mov fp, sp
+# CHECK-NEXT:        0x81                ; stp x29, x30, [sp, #-16]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 20
+# CHECK-NEXT:          EpilogueStartIndex: 4
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xe1                ; mov fp, sp
+# CHECK-NEXT:            0x81                ; ldp x29, x30, [sp], #16
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+...
+---
+name:            '?func@@YAHHHHH@Z'
+alignment:       3
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+  - { reg: '$w2', virtual-reg: '' }
+  - { reg: '$w3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       48
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  24
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: default, offset: -20, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: default, offset: -24, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -8, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 2, name: '', type: default, offset: -28, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -12, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 3, name: '', type: default, offset: -32, size: 4, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 4, name: '', type: default, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 5, name: '', type: variable-sized, offset: -40,
+      alignment: 1, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -24, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1, $w2, $w3, $lr
+
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store 8 into %stack.7), (store 8 into %stack.8)
+    frame-setup SEH_SaveFPLR_X -16
+    $fp = frame-setup ADDXri $sp, 0, 0
+    frame-setup SEH_SetFP
+    $sp = frame-setup SUBXri $sp, 32, 0
+    frame-setup SEH_StackAlloc 32
+    frame-setup SEH_PrologEnd
+    STURWi killed renamable $w3, $fp, -4
+    STURWi killed renamable $w2, $fp, -8
+    STURWi killed renamable $w1, $fp, -12
+    STURWi killed renamable $w0, $fp, -16
+    renamable $x8 = LDURSWi $fp, -16
+    renamable $x8 = ADDXri killed renamable $x8, 15, 0
+    renamable $x8 = UBFMXri killed renamable $x8, 4, 63
+    $x15 = COPY renamable $x8
+    STURXi killed $x8, $fp, -32 :: (store 8 into %stack.6)
+    BL &__chkstk, csr_aarch64_stackprobe_windows, implicit-def dead $lr, implicit $sp, implicit killed $x15
+    renamable $x8 = COPY $sp
+    $x15 = LDURXi $fp, -32 :: (load 8 from %stack.6)
+    renamable $x8 = SUBSXrs killed renamable $x8, killed renamable $x15, 4, implicit-def dead $nzcv
+    $sp = COPY renamable $x8
+    STURXi killed renamable $x8, $fp, -24
+    renamable $x0 = LDURXi $fp, -24
+    renamable $w1 = COPY $wzr
+    $w0 = COPY killed renamable $w1
+    frame-destroy SEH_EpilogStart
+    $sp = frame-destroy ADDXri $fp, 0, 0
+    frame-destroy SEH_SetFP
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
+    frame-destroy SEH_SaveFPLR_X -16
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit killed $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh7.mir b/test/CodeGen/AArch64/wineh7.mir
new file mode 100644
index 0000000000000000000000000000000000000000..547c622a704059542dd779542ee041fd877c6d36
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh7.mir
@@ -0,0 +1,134 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -filetype=obj  | llvm-readobj -unwind | FileCheck %s
+# Check AddFP
+
+# CHECK:	 ExceptionData {
+# CHECK-NEXT:      FunctionLength: 72
+# CHECK-NEXT:      Version: 0
+# CHECK-NEXT:      ExceptionData: No
+# CHECK-NEXT:      EpiloguePacked: No
+# CHECK-NEXT:      EpilogueScopes: 1
+# CHECK-NEXT:      ByteCodeLength: 16
+# CHECK-NEXT:      Prologue [
+# CHECK-NEXT:        0xe204              ; add fp, sp, #32
+# CHECK-NEXT:        0x44                ; stp x29, x30, [sp, #32]
+# CHECK-NEXT:        0xc802              ; stp x19, x20, [sp, #16]
+# CHECK-NEXT:        0xcc85              ; stp x21, x22, [sp, #-48]!
+# CHECK-NEXT:        0xe4                ; end
+# CHECK-NEXT:      ]
+# CHECK-NEXT:      EpilogueScopes [
+# CHECK-NEXT:        EpilogueScope {
+# CHECK-NEXT:          StartOffset: 13
+# CHECK-NEXT:          EpilogueStartIndex: 8
+# CHECK-NEXT:          Opcodes [
+# CHECK-NEXT:            0xe204              ; add fp, sp, #32
+# CHECK-NEXT:            0x44                ; ldp x29, x30, [sp, #32]
+# CHECK-NEXT:            0xc802              ; ldp x19, x20, [sp, #16]
+# CHECK-NEXT:            0xcc85              ; ldp x21, x22, [sp], #48
+# CHECK-NEXT:            0xe4                ; end
+# CHECK-NEXT:          ]
+# CHECK-NEXT:        }
+# CHECK-NEXT:      ]
+# CHECK-NEXT:    }
+# CHECK-NEXT:  }
+
+...
+---
+name:            '?func@@YAHHHHH@Z'
+alignment:       3
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+  - { reg: '$w2', virtual-reg: '' }
+  - { reg: '$w3', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       48
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: variable-sized, offset: -48,
+      alignment: 1, stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    liveins: $w0, $w1, $w2, $w3, $x21, $x22, $x19, $x20, $lr
+
+    early-clobber $sp = frame-setup STPXpre killed $x21, killed $x22, $sp, -6 :: (store 8 into %stack.5), (store 8 into %stack.6)
+    frame-setup SEH_SaveRegP_X 21, 22, -48
+    frame-setup STPXi killed $x19, killed $x20, $sp, 2 :: (store 8 into %stack.3), (store 8 into %stack.4)
+    frame-setup SEH_SaveRegP 19, 20, 16
+    frame-setup STPXi killed $fp, killed $lr, $sp, 4 :: (store 8 into %stack.1), (store 8 into %stack.2)
+    frame-setup SEH_SaveFPLR 32
+    $fp = frame-setup ADDXri $sp, 32, 0
+    frame-setup SEH_AddFP 32
+    frame-setup SEH_PrologEnd
+    renamable $w19 = COPY $w3
+    renamable $w0 = KILL $w0, implicit-def $x0
+    renamable $w20 = COPY $w2
+    renamable $w21 = COPY $w1
+    renamable $x8 = SBFMXri killed renamable $x0, 0, 31
+    renamable $x9 = ADDXri killed renamable $x8, 15, 0
+    renamable $x15 = UBFMXri killed renamable $x9, 4, 63
+    renamable $x8 = COPY $sp
+    renamable $x22 = SUBXrs killed renamable $x8, killed renamable $x15, 4
+    $sp = COPY renamable $x22
+    $x0 = COPY renamable $x22
+    renamable $w8 = LDRWroW killed renamable $x22, killed renamable $w21, 1, 1
+    renamable $w9 = ADDWrr killed renamable $w19, killed renamable $w20
+    renamable $w0 = ADDWrr killed renamable $w9, killed renamable $w8
+    frame-destroy SEH_EpilogStart
+    $sp = frame-destroy SUBXri $fp, 32, 0
+    frame-destroy SEH_AddFP 32
+    $fp, $lr = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.1), (load 8 from %stack.2)
+    frame-destroy SEH_SaveFPLR 32
+    $x19, $x20 = frame-destroy LDPXi $sp, 2 :: (load 8 from %stack.3), (load 8 from %stack.4)
+    frame-destroy SEH_SaveRegP 19, 20, 16
+    early-clobber $sp, $x21, $x22 = frame-destroy LDPXpost $sp, 6 :: (load 8 from %stack.5), (load 8 from %stack.6)
+    frame-destroy SEH_SaveRegP_X 21, 22, -48
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/test/CodeGen/AArch64/wineh_shrinkwrap.mir
new file mode 100644
index 0000000000000000000000000000000000000000..97204722bc10892d869516ef79c3e4f23a38602d
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh_shrinkwrap.mir
@@ -0,0 +1,146 @@
+# RUN: llc -O2 -o - %s -mtriple=aarch64-windows -start-before=shrink-wrap \
+# RUN:   -stop-after=prologepilog | FileCheck %s --check-prefix=WIN64
+# RUN: llc -O2 -o - %s -mtriple=aarch64-linux -start-before=shrink-wrap \
+# RUN:   -stop-after=prologepilog | FileCheck %s --check-prefix=LINUX
+
+# This tests checks that shrink wrapping bails out on Windows AMR64 due to the
+# use of Windows CFI.  We don't currently support fragments for WIndows EH on
+# ARM64.
+# The same test gets shrink wrapped on Linux ARM64.
+
+# WIN64-LABEL: bb.0.entry:
+# WIN64: early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+# WIN64-LABEL: bb.1:
+# WIN64-LABEL: bb.2.if.then:
+
+# LINUX-LABEL: bb.0.entry:
+# LINUX-LABEL: bb.1:
+# LINUX-LABEL: bb.2.if.then:
+# LINUX: early-clobber $sp = frame-setup STRXpre killed $x28, $sp, -32
+--- |
+  ; ModuleID = 'shrink.cpp'
+  target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-unknown-windows-msvc19.11.0"
+
+  define dso_local i32 @"?func@@YAHHH@Z"(i32 %a, i32 %b) local_unnamed_addr #0 {
+  entry:
+    %A = alloca [1000 x i32], align 4
+    %cmp = icmp sgt i32 %a, 1
+    br i1 %cmp, label %if.then, label %return
+
+  if.then:                                          ; preds = %entry
+    %0 = bitcast [1000 x i32]* %A to i8*
+    call void @llvm.lifetime.start.p0i8(i64 4000, i8* nonnull %0) #3
+    %arraydecay2 = bitcast [1000 x i32]* %A to i32*
+    call void @"?init@@YAXPEAH@Z"(i32* nonnull %arraydecay2)
+    %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* %A, i64 0, i64 100
+    %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
+    %add = add i32 %b, 1
+    %add1 = add i32 %add, %1
+    call void @llvm.lifetime.end.p0i8(i64 4000, i8* nonnull %0) #3
+    br label %return
+
+  return:                                           ; preds = %entry, %if.then
+    %retval.0 = phi i32 [ %add1, %if.then ], [ 0, %entry ]
+    ret i32 %retval.0
+  }
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+  declare dso_local void @"?init@@YAXPEAH@Z"(i32*) local_unnamed_addr #2
+
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { argmemonly nounwind }
+  attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 2}
+  !1 = !{!"clang version 8.0.0"}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C++ TBAA"}
+
+...
+---
+name:            '?func@@YAHHH@Z'
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+  - { reg: '$w1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  4000
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: A, type: default, offset: 0, size: 4000, alignment: 4,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4000, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $w0, $w1
+
+    dead $wzr = SUBSWri killed renamable $w0, 2, 0, implicit-def $nzcv
+    Bcc 10, %bb.2, implicit killed $nzcv
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    renamable $w0 = COPY $wzr
+    B %bb.3
+
+  bb.2.if.then:
+    successors: %bb.3(0x80000000)
+    liveins: $w1
+
+    renamable $w19 = COPY $w1
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.A, 0, 0
+    BL @"?init@@YAXPEAH@Z", csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w8 = LDRWui %stack.0.A, 100 :: (dereferenceable load 4 from %ir.arrayidx, !tbaa !2)
+    renamable $w8 = ADDWrr killed renamable $w19, killed renamable $w8
+    renamable $w0 = ADDWri killed renamable $w8, 1, 0
+
+  bb.3.return:
+    liveins: $w0
+
+    RET_ReallyLR implicit $w0
+
+...
diff --git a/test/CodeGen/AMDGPU/add_i1.ll b/test/CodeGen/AMDGPU/add_i1.ll
index 1f44940018c00f1d63e23e2d6bad33902af1773a..c5f7e3af5e3075b44bd9e71cf46ecea5b80d0d5f 100644
--- a/test/CodeGen/AMDGPU/add_i1.ll
+++ b/test/CodeGen/AMDGPU/add_i1.ll
@@ -19,3 +19,29 @@ define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)
   store i1 %add, i1 addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}add_i1_cf:
+; GCN: ; %endif
+; GCN: s_not_b64
+define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %0 = load volatile i1, i1 addrspace(1)* %a
+  br label %endif
+
+else:
+  %1 = load volatile i1, i1 addrspace(1)* %b
+  br label %endif
+
+endif:
+  %2 = phi i1 [%0, %if], [%1, %else]
+  %3 = add i1 %2, -1
+  store i1 %3, i1 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
index 08e7883023d52e99a7119c20f1939891a1881a2f..3e09618bc28787b515780735c4a04029a8265ab6 100644
--- a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn-- -aa-eval -amdgpu-aa -amdgpu-aa-wrapper -disable-basicaa  -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=r600-- -aa-eval -amdgpu-aa -amdgpu-aa-wrapper -disable-basicaa  -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
 
 ; CHECK: NoAlias:      i8 addrspace(1)* %p1, i8 addrspace(5)* %p
 
diff --git a/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4b4b268df1faa61638a9a5f81a301b0744cb930c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
+; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
+
+; Show that what the atomic optimization pass will do for raw buffers.
+
+; GCN-LABEL: add_i32_constant:
+; GCN-LABEL: BB0_1:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
+; GCN: buffer_atomic_add v[[value]]
+; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
+define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
+entry:
+  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %cond = and i1 %cond1, %cond2
+  br i1 %cond, label %if, label %else
+if:
+  %bitcast = bitcast i32 %old to float
+  call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
+  ret void
+else:
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], exec_lo, 0
+; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], exec_hi, v[[mbcnt_lo]]
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_add v[[value]]
+; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
+define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
+entry:
+  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
+  %cond = and i1 %cond1, %cond2
+  br i1 %cond, label %if, label %else
+if:
+  %bitcast = bitcast i32 %old to float
+  call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
+  ret void
+else:
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ea5f01fbda0f7d7d2168703a939ad20ff9bd3ead
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
+; GCN: global_store_dwordx2
+; GCN: global_store_dword v
+; GCN: global_store_short
+define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(i8 addrspace(1)* nocapture %data) {
+entry:
+  store i8 72, i8 addrspace(1)* %data, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 1
+  store i8 101, i8 addrspace(1)* %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 2
+  store i8 108, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 3
+  store i8 108, i8 addrspace(1)* %arrayidx3, align 1
+  %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 4
+  store i8 111, i8 addrspace(1)* %arrayidx4, align 1
+  %arrayidx5 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 5
+  store i8 44, i8 addrspace(1)* %arrayidx5, align 1
+  %arrayidx6 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 6
+  store i8 32, i8 addrspace(1)* %arrayidx6, align 1
+  %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 7
+  store i8 87, i8 addrspace(1)* %arrayidx7, align 1
+  %arrayidx8 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 8
+  store i8 111, i8 addrspace(1)* %arrayidx8, align 1
+  %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 9
+  store i8 114, i8 addrspace(1)* %arrayidx9, align 1
+  %arrayidx10 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 10
+  store i8 108, i8 addrspace(1)* %arrayidx10, align 1
+  %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 11
+  store i8 100, i8 addrspace(1)* %arrayidx11, align 1
+  %arrayidx12 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 12
+  store i8 33, i8 addrspace(1)* %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %data, i64 13
+  store i8 72, i8 addrspace(1)* %arrayidx13, align 1
+  ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/branch-relax-spill.ll b/test/CodeGen/AMDGPU/branch-relax-spill.ll
index db476c21636fc58645b9fda8ffa51581d545ac97..3d6906301d7e2441a8cde5f32366d7b16662964c 100644
--- a/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -3,7 +3,7 @@
 ; FIXME: This should be able to compile, but requires inserting an
 ; extra block to restore the scavenged register.
 
-; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
+; FAIL: LLVM ERROR: Error while trying to spill SGPR0_SGPR1 from class SReg_64: Cannot scavenge register without an emergency spill slot!
 
 define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index d4284a32c0ebf37553795487bcbeb25bfcc8166d..72c983d5d9788c881ce541254ba7e4e0a5fe29d4 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 
 ; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
@@ -61,10 +61,10 @@ bb3:
 ; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[LONGBB]]:
 ; GCN-NEXT: ;;#ASMSTART
@@ -105,10 +105,10 @@ bb3:
 ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[LONGBB]]:
 ; GCN: v_nop_e64
@@ -191,10 +191,11 @@ bb3:
 
 ; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONG_JUMP]]+4)-[[LOOPBB]]
-; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_JUMP]]+4)-[[LOOPBB]]
+; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[ENDBB]]:
 ; GCN-NEXT: s_endpgm
@@ -225,20 +226,20 @@ bb3:
 ; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}
 
 ; GCN-NEXT: [[BB2]]: ; %bb2
 ; GCN: v_mov_b32_e32 [[BB2_K:v[0-9]+]], 17
 ; GCN: buffer_store_dword [[BB2_K]]
 
 ; GCN-NEXT: [[LONG_JUMP1:BB[0-9]+_[0-9]+]]: ; %bb2
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], [[BB4:BB[0-9]_[0-9]+]]-([[LONG_JUMP1]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}}
 
 ; GCN: [[BB3]]: ; %bb3
 ; GCN: v_nop_e64
@@ -289,10 +290,11 @@ bb4:
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP]]
-; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP]]
+; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT .Lfunc_end{{[0-9]+}}:
 define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 entry:
@@ -318,10 +320,11 @@ loop:
 ; GCN-NEXT: s_cbranch_scc0 [[BB1:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB0:BB[0-9]+_[0-9]+]]: ; %bb0
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB0]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}}
 
 ; GCN-NEXT: [[BB1]]: ; %bb1
 ; GCN-NEXT: s_load_dword
@@ -330,10 +333,10 @@ loop:
 ; GCN-NEXT: s_cbranch_scc0 [[BB2:BB[0-9]_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}}
 
 ; GCN-NEXT: [[BB2]]: ; %bb2
 ; GCN-NEXT: ;;#ASMSTART
@@ -389,10 +392,10 @@ bb3:
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_add_u32 vcc_lo, vcc_lo, [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
-; GCN-NEXT: s_addc_u32 vcc_hi, vcc_hi, 0{{$}}
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4)
+; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}}
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[IF]]: ; %if
 ; GCN: buffer_store_dword
@@ -454,10 +457,10 @@ endif:
 
 ; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop
 ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1
-; GCN-NEXT: s_getpc_b64 vcc
-; GCN-NEXT: s_sub_u32 vcc_lo, vcc_lo, ([[LONGBB]]+4)-[[LOOP_BODY]]
-; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0
-; GCN-NEXT: s_setpc_b64 vcc
+; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
+; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP_BODY]]
+; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0
+; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 
 ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
@@ -494,8 +497,9 @@ ret:
 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 
-; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
-; GCN: s_setpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
+; GCN-NEXT: s_addc_u32
+; GCN-NEXT: s_setpc_b64
 
 ; GCN-NEXT: [[LONG_BR_0]]:
 ; GCN-DAG: v_cmp_lt_i32
diff --git a/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll b/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fd81c0438d6f88aa9016f369596530abcde25027
--- /dev/null
+++ b/test/CodeGen/AMDGPU/build-vector-insert-elt-infloop.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; There was an infinite loop in DAGCombiner from a target build_vector
+; combine and a generic insert_vector_elt combine.
+
+; GCN-LABEL: {{^}}combine_loop:
+; GCN: flat_load_ushort
+; GCN: flat_store_short
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+define amdgpu_kernel void @combine_loop(i16* %arg) #0 {
+bb:
+  br label %bb1
+
+bb1:
+  %tmp = phi <2 x i16> [ <i16 15360, i16 15360>, %bb ], [ %tmp5, %bb1 ]
+  %tmp2 = phi half [ 0xH0000, %bb ], [ %tmp8, %bb1 ]
+  %tmp3 = load volatile half, half* null, align 536870912
+  %tmp4 = bitcast half %tmp3 to i16
+  %tmp5 = insertelement <2 x i16> <i16 0, i16 undef>, i16 %tmp4, i32 1
+  %tmp6 = bitcast i16* %arg to half*
+  store volatile half %tmp2, half* %tmp6, align 2
+  %tmp7 = bitcast <2 x i16> %tmp to <2 x half>
+  %tmp8 = extractelement <2 x half> %tmp7, i32 0
+  br label %bb1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll
index 581df1c8527309f43f263da24c9af61b23609bc2..84d327b6f37980e9919da43330d60be12dd0a2f8 100644
--- a/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -61,11 +61,11 @@ declare void @external_void_func_v16i8(<16 x i8>) #0
 
 ; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
 
+; GCN: v_mov_b32_e32 v0, 1{{$}}
+; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4
-; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
-; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
 ; GCN-NEXT: s_endpgm
@@ -123,12 +123,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
+; GCN: v_mov_b32_e32 v0, 0x7b
+; HSA-DAG: s_mov_b32 s4, s33{{$}}
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4
-; GCN-NEXT: v_mov_b32_e32 v0, 0x7b
 
-; HSA-DAG: s_mov_b32 s4, s33{{$}}
 ; GCN-DAG: s_mov_b32 s32, s33{{$}}
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -144,11 +144,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sbyte v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s3
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -165,11 +165,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; HSA-DAG: s_mov_b32 s33, s9{{$}}
 
 ; GCN-DAG: buffer_load_ubyte v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -197,11 +197,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
 ; GCN-DAG: buffer_load_sshort v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -218,11 +218,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 
 
 ; GCN-DAG: buffer_load_ushort v0
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4
 
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_waitcnt vmcnt(0)
@@ -237,11 +237,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
+; GCN: v_mov_b32_e32 v0, 42
+; GCN: s_mov_b32 s4, s33
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
 ; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4
-; GCN: v_mov_b32_e32 v0, 42
-; GCN-DAG: s_mov_b32 s4, s33
 ; GCN-DAG: s_mov_b32 s32, s33
 
 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
@@ -481,10 +481,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
 ; HSA-DAG: s_mov_b32 s33, s9
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
+; GCN-NOT: v3
 ; GCN-DAG: v_mov_b32_e32 v0, 3
 ; GCN-DAG: v_mov_b32_e32 v1, 4
 ; GCN-DAG: v_mov_b32_e32 v2, 5
-; GCN-NOT: v3
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
diff --git a/test/CodeGen/AMDGPU/call-constexpr.ll b/test/CodeGen/AMDGPU/call-constexpr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e0a39680bdf08af0c002ef20c4000b30da0d60db
--- /dev/null
+++ b/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-fix-function-bitcasts < %s | FileCheck -check-prefix=OPT %s
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_noinline(
+; OPT: %val = call i32 @ret_i32_noinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_noinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline:
+; GCN-NOT: s_getpc_b64
+; GCN-NOT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@lo+4
+; GCN-NOT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_alwaysinline@rel32@hi+4
+; GCN-NOT: s_swappc_b64
+; OPT-LABEL: @test_bitcast_return_type_alwaysinline(
+; OPT: %val = call i32 @ret_i32_alwaysinline()
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 {
+  %val = call float bitcast (i32()* @ret_i32_alwaysinline to float()*)()
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_type:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_type(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT-NOT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_type() #0 {
+  %val = call i32 bitcast (i32(i32)* @ident_i32 to i32(float)*)(float 2.0)
+  %op = add i32 %val, 1
+  store volatile i32 %op, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_bitcast_argument_and_return_types(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = call i32 @ident_i32(i32 %1)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
+  %val = call float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_workitem_id_x:
+; GCN: s_waitcnt
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64
+define i32 @use_workitem_id_x(i32 %arg0) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %op = add i32 %id, %arg0
+  ret i32 %op
+}
+
+; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+4
+; GCN: v_mov_b32_e32 v0, 9
+; GCN: s_swappc_b64
+; GCN: v_add_f32_e32
+; OPT-LABEL: @use_workitem_id_x(
+; OPT: %val = call i32 @use_workitem_id_x(i32 9)
+; OPT: bitcast i32 %val to float
+define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #0 {
+  %val = call float bitcast (i32(i32)* @use_workitem_id_x to float(i32)*)(i32 9)
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_invoke:
+; GCN: s_getpc_b64
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
+; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+4
+; GCN: s_swappc_b64
+; OPT-LABEL: @test_invoke(
+; OPT: %1 = bitcast float 2.000000e+00 to i32
+; OPT: %val = invoke i32 @ident_i32(i32 %1)
+; OPT-NEXT: to label %continue unwind label %broken
+; OPT-LABEL: continue.split:
+; OPT: bitcast i32 %val to float
+@_ZTIi = external global i8*
+declare i32 @__gxx_personality_v0(...)
+define amdgpu_kernel void @test_invoke() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  %val = invoke float bitcast (i32(i32)* @ident_i32 to float(float)*)(float 2.0)
+          to label %continue unwind label %broken
+
+broken:
+  landingpad { i8*, i32 } catch i8** @_ZTIi
+  ret void
+
+continue:
+  %op = fadd float %val, 1.0
+  store volatile float %op, float addrspace(1)* undef
+  ret void
+}
+
+; Callees appears last in source file to test that we still lower their
+; arguments before we lower any calls to them.
+
+define i32 @ret_i32_noinline() #0 {
+  ret i32 4
+}
+
+define i32 @ret_i32_alwaysinline() #1 {
+  ret i32 4
+}
+
+define i32 @ident_i32(i32 %i) #0 {
+  ret i32 %i
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind noinline }
+attributes #1 = { alwaysinline nounwind }
+attributes #2 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/call-preserved-registers.ll b/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 6d1e2467d0842ebab36a1ed2070d7c4485281d8c..57bc6171d7a86e6d47c293bfcdab8f5882d8d258 100644
--- a/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -6,10 +6,10 @@ declare void @external_void_func_void() #0
 
 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_getpc_b64 s[34:35]
+; GCN: s_mov_b32 s4, s33
+; GCN-NEXT: s_getpc_b64 s[34:35]
 ; GCN-NEXT: s_add_u32 s34, s34,
 ; GCN-NEXT: s_addc_u32 s35, s35,
-; GCN-NEXT: s_mov_b32 s4, s33
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64 s[30:31], s[34:35]
 
@@ -129,13 +129,13 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
 ; GCN: s_mov_b32 s34, s9
-; GCN: ; def s33
-; GCN-NEXT: #ASMEND
-; GCN: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
-; GCN-NEXT: s_mov_b32 s4, s34
-; GCN-NEXT: s_mov_b32 s32, s34
+; GCN: s_mov_b32 s4, s34
+; GCN-DAG: s_mov_b32 s32, s34
+; GCN-DAG: ; def s33
+; GCN-DAG: #ASMEND
+; GCN-DAG: s_getpc_b64 s[6:7]
+; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
+; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; use s33
@@ -150,13 +150,13 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32:
 ; GCN: s_mov_b32 s33, s9
-; GCN: ; def v32
-; GCN-NEXT: #ASMEND
-; GCN: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
-; GCN-NEXT: s_mov_b32 s4, s33
-; GCN-NEXT: s_mov_b32 s32, s33
+; GCN: s_mov_b32 s4, s33
+; GCN-DAG: s_mov_b32 s32, s33
+; GCN-DAG: ; def v32
+; GCN-DAG: #ASMEND
+; GCN-DAG: s_getpc_b64 s[6:7]
+; GCN-DAG: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4
+; GCN-DAG: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ; use v32
@@ -183,10 +183,10 @@ define void @void_func_void_clobber_s33() #2 {
 
 ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_getpc_b64
+; GCN: s_mov_b32 s4, s33
+; GCN-NEXT: s_getpc_b64
 ; GCN-NEXT: s_add_u32
 ; GCN-NEXT: s_addc_u32
-; GCN-NEXT: s_mov_b32 s4, s33
 ; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
diff --git a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
index 6af0795b04cd03757d97904981544bdf58e3970a..79abb96cccf16221d0011a2512c309f0d5c853ba 100644
--- a/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
+++ b/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
@@ -558,7 +558,8 @@ define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
 
 ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x400
+
+; GCN-DAG: s_add_u32 s32, s32, 0x400
 
 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14
 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll
index e73f28604b5750ff81e0ab3060cd6a7ec51552ac..d98b56062cd53119e00f6bbd0fd66cccadc697f7 100644
--- a/test/CodeGen/AMDGPU/clamp.ll
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -74,7 +74,8 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a
 
 ; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
 ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
 define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,8 +91,17 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
+; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]]
+; GCN-NOT: [[MAX]]
+; GCN-NOT: [[MED]]
+
+; SI: buffer_store_dword [[MED]]
+; SI: buffer_store_dword [[MAX]]
+
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
+; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
 define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
@@ -406,8 +416,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
 ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index d19072a6c4eed35d01098538deaaeef499871854..41ecdd403d73665afa9389d6c49fea7db0633e96 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -21,18 +21,17 @@
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
 
+; Spill load
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
-
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill
-
-; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -57,11 +56,11 @@
 
 
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
@@ -103,7 +102,7 @@ endif:
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -111,9 +110,9 @@ endif:
 
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:28 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -122,7 +121,7 @@ endif:
 
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
 ; GCN: v_cmp_ne_u32_e32 vcc,
 ; GCN: s_and_b64 vcc, exec, vcc
@@ -134,11 +133,11 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:28 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
@@ -182,7 +181,7 @@ end:
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -237,13 +236,13 @@ end:
 
 ; GCN: BB{{[0-9]+}}_2: ; %if
 ; GCN: ds_read_b32
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ELSE]]: ; %else
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN-NEXT: s_branch [[FLOW]]
diff --git a/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4218cee9f1ecf6866eaf350d65c0ffd541886d72
--- /dev/null
+++ b/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX600 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx601 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hainan -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=oland -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=pitcairn -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=verde -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX601 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX700 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx701 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX701 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx702 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX702 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx703 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kabini -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=mullins -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX703 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx704 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX704 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX801 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=carrizo -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX801 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX802 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris10 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=polaris11 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX803 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=stoney -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX810 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX902 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX904 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3 < %s | FileCheck --check-prefixes=GFX906 %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX900 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx902 -mattr=+code-object-v3,-xnack < %s | FileCheck --check-prefixes=NO-XNACK-GFX902 %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3,+sram-ecc < %s | FileCheck --check-prefixes=SRAM-ECC-GFX904 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,-sram-ecc < %s | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
+
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+code-object-v3,+sram-ecc,+xnack < %s | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX904 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -mattr=+code-object-v3,+xnack < %s | FileCheck --check-prefixes=XNACK-GFX906 %s
+
+; GFX600: .amdgcn_target "amdgcn-amd-amdhsa--gfx600"
+; GFX601: .amdgcn_target "amdgcn-amd-amdhsa--gfx601"
+; GFX700: .amdgcn_target "amdgcn-amd-amdhsa--gfx700"
+; GFX701: .amdgcn_target "amdgcn-amd-amdhsa--gfx701"
+; GFX702: .amdgcn_target "amdgcn-amd-amdhsa--gfx702"
+; GFX703: .amdgcn_target "amdgcn-amd-amdhsa--gfx703"
+; GFX704: .amdgcn_target "amdgcn-amd-amdhsa--gfx704"
+; GFX801: .amdgcn_target "amdgcn-amd-amdhsa--gfx801+xnack"
+; GFX802: .amdgcn_target "amdgcn-amd-amdhsa--gfx802"
+; GFX803: .amdgcn_target "amdgcn-amd-amdhsa--gfx803"
+; GFX810: .amdgcn_target "amdgcn-amd-amdhsa--gfx810+xnack"
+; GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900"
+; GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902+xnack"
+; GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904"
+; GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+sram-ecc"
+
+; XNACK-GFX900: .amdgcn_target "amdgcn-amd-amdhsa--gfx900+xnack"
+; NO-XNACK-GFX902: .amdgcn_target "amdgcn-amd-amdhsa--gfx902"
+
+; SRAM-ECC-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+sram-ecc"
+; NO-SRAM-ECC-GFX906: "amdgcn-amd-amdhsa--gfx906"
+
+; SRAM-ECC-XNACK-GFX904: .amdgcn_target "amdgcn-amd-amdhsa--gfx904+xnack+sram-ecc"
+; XNACK-GFX906: .amdgcn_target "amdgcn-amd-amdhsa--gfx906+xnack+sram-ecc"
+
+define amdgpu_kernel void @directive_amdgcn_target() {
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
index 9d2d3690995ac56f965231346637b7f6c36de87f..b64e077a59caf0e9704b62019e865c5ed8d38fdb 100644
--- a/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
+++ b/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
@@ -46,6 +46,7 @@
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX902 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx904 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX904 %s
 ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx909 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s
 
 ; ARCH-R600: Arch: r600
 ; ARCH-GCN:  Arch: amdgcn
@@ -85,6 +86,8 @@
 ; GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
 ; GFX904:        EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
 ; GFX906:        EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; GFX906-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; GFX909:        EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
 ; ALL:         ]
 
 define amdgpu_kernel void @elf_header() {
diff --git a/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll b/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b33b29b59ec182eb4a60c1085599ea59336113ba
--- /dev/null
+++ b/test/CodeGen/AMDGPU/elf-header-flags-sram-ecc.ll
@@ -0,0 +1,38 @@
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=-sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX902 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx902 -mattr=+sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX902 %s
+
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=-sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=NO-SRAM-ECC-GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-GFX906 %s
+; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 -mattr=+sram-ecc,+xnack < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=SRAM-ECC-XNACK-GFX906 %s
+
+; NO-SRAM-ECC-GFX902:      Flags [
+; NO-SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
+; NO-SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
+; NO-SRAM-ECC-GFX902-NEXT: ]
+
+; SRAM-ECC-GFX902:      Flags [
+; SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
+; SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; SRAM-ECC-GFX902-NEXT:   EF_AMDGPU_XNACK              (0x100)
+; SRAM-ECC-GFX902-NEXT: ]
+
+; NO-SRAM-ECC-GFX906:      Flags [
+; NO-SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; NO-SRAM-ECC-GFX906-NEXT: ]
+
+; SRAM-ECC-GFX906:      Flags [
+; SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; SRAM-ECC-GFX906-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; SRAM-ECC-GFX906-NEXT: ]
+
+; SRAM-ECC-XNACK-GFX906:      Flags [
+; SRAM-ECC-XNACK-GFX906-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+; SRAM-ECC-XNACK-GFX906-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+; SRAM-ECC-XNACK-GFX906-NEXT:   EF_AMDGPU_XNACK              (0x100)
+; SRAM-ECC-XNACK-GFX906-NEXT: ]
+
+define amdgpu_kernel void @elf_header() {
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 4d4e065ba56d5ec8aa9c114f5526d3ef1f49b66c..e2741c25382eda73541d65efcb0542bd9e637efb 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -455,14 +455,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
-; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-; VI-DENORM-NOT: v_max_f32
-; VI-DENORM-NOT: v_mul_f32
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
 
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-NOT: v_max
+; GCN-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
@@ -476,15 +475,13 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
 }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-
-; GFX9-NOT: v_max
-; GFX9-NOT: v_mul
-
-
-; VI-DENORM-NOT: v_max
-; VI-DENORM-NOT: v_mul
 ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
+
+; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN-DENORM-NOT: v_max
+; GCN-DENORM-NOT: v_mul
 
 ; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
 define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
@@ -530,13 +527,19 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
 }
 
 ; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GFX9:  v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
+; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
 
-; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
-; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
 
+; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
+
+; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
 
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
@@ -552,11 +555,14 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
 }
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
-; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
-; VI-FLUSH:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
+; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
+
+; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
+
+; VI-FLUSH:    v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
+; VI-FLUSH:    v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
 
-; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
+; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
 
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
@@ -707,16 +713,21 @@ define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspa
 
 ; Need to quiet the nan with a separate instruction since it will be
 ; passed through the minnum.
+; FIXME: canonicalize doens't work correctly without ieee_mode
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
+; GFX9-NOT: v0
+; GFX9-NOT: v1
 ; GFX9: v_min_f32_e32 v0, v0, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT: ; return to shader
 
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
-; VI-DENORM: v_max_f32_e32 v0, v0, v0
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-NEXT: ; return
+
+; VI-DENORM-NOT: v0
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+; VI-DENORM-NEXT: ; return
 define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
@@ -727,8 +738,14 @@ define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %
 ; GFX9: v_min_f32_e32 v0, v0, v1
 ; GFX9-NEXT: s_setpc_b64
 
-; VI: v_min_f32_e32 v0, v0, v1
-; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
+; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
+; VI-FLUSH: v_min_f32_e32 v0, v0, v1
+
+; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
+; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
+; VI-DENORM: v_min_f32_e32 v0, v0, v1
+
 ; VI-NEXT: s_setpc_b64
 define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
   %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index a56a5866aadd404711bf5d3d4926060a0f484757..fe0e4409f16f235c6f6fc0379a1bcd7c23c91e63 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -4,11 +4,14 @@
 declare double @llvm.maxnum.f64(double, double) nounwind readnone
 
 ; SI-LABEL: {{^}}test_fmax3_f64:
-; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
-; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
+; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
 ; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
+; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
+; SI: v_max_f64 [[QUIET_B:v\[[0-9]+:[0-9]+\]]], [[REGB]], [[REGB]]
+; SI: v_max_f64 [[MAX0:v\[[0-9]+:[0-9]+\]]], [[QUIET_A]], [[QUIET_B]]
+; SI: v_max_f64 [[QUIET_C:v\[[0-9]+:[0-9]+\]]], [[REGC]], [[REGC]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[MAX0]], [[QUIET_C]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
 define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index 1f67ace72df787d8044ed6106ced9f228898b60c..5a92eac7f32f83c96a2a591d8d797c30bde380d6 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -48,8 +48,11 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float
 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
 
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]]
 
 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
 ; GCN: buffer_store_short [[RESULT]],
@@ -75,8 +78,11 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half ad
 ; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
 
-; VI: v_max_f16_e32
-; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
+; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
+; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
+; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]]
 
 ; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
 ; GCN: buffer_store_short [[RESULT]],
@@ -100,22 +106,25 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad
 ; SI-NEXT: v_max3_f32
 ; SI-NEXT: v_max3_f32
 
-; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v1
-; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_max_f16_e32 v0, v2, v0
-; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_max_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
-
-; GFX9: v_pk_max_f16
+; VI: s_waitcnt
+; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v1
+; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_max_f16_e32 v0, v2, v0
+; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_max_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16
 ; GFX9-NEXT: v_pk_max_f16
 ; GFX9-NEXT: v_pk_max_f16
-define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 entry:
-  %max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
-  %max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
-  %res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
+  %max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
+  %res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
   ret <2 x half> %res
 }
 
@@ -126,3 +135,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index e06d93f5dc6c25f72727c7cdeb6523b467fe15d3..e7f3f53685c68fbe3a4034324e55e014916648ef 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -97,7 +97,7 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
@@ -178,7 +178,7 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
@@ -283,8 +283,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
@@ -437,10 +437,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_max_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_max_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 6a1f7966c30492b84863064e1aa7b53d43a20ce6..1fd1556de74662a3326bd32462c32b508417d59f 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -1,13 +1,22 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 
 ; EG: MAX
@@ -26,12 +35,16 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]]
 ; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]]
 
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 
 ; EG: MAX
@@ -52,9 +65,14 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(float addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -72,9 +90,15 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -92,9 +116,14 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
@@ -112,9 +141,15 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
+
 ; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; EG: MAX
 define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
@@ -132,12 +167,24 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32:
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
-; GCN-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+; SI-SAFE: v_max_legacy_f32_e32
+
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_gt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE-NOT: v_cmp
+; VI-SAFE-NOT: v_cndmask
+
 ; GCN-NONAN: v_max_f32_e32
 ; GCN-NONAN: v_max_f32_e32
 ; GCN-NONAN: v_max_f32_e32
+
+; GCN-NOT: v_max
 define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
@@ -153,8 +200,8 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-NOT: v_max_
 ; GCN: v_cmp_gt_f32
 ; GCN-NEXT: v_cndmask_b32
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 58b5b5282b096a9a0dfd066a3a0eb7d873331925..7e16d1b883a013921b6e032894124f6f87cbeddf 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -1,14 +1,26 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_fmax_f32:
-; GCN: v_max_f32_e32
-define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmax_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_off:
+; GCN: v_max_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmax_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
 ; GCN-LABEL: {{^}}test_fmax_v2f32:
 ; GCN: v_max_f32_e32
 ; GCN: v_max_f32_e32
@@ -158,38 +170,34 @@ define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}fmax_var_immediate_f32:
+; GCN-LABEL: {{^}}fmax_var_immediate_f32_no_ieee:
 ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_var_immediate_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_immediate_var_f32:
+; GCN-LABEL: {{^}}fmax_immediate_var_f32_no_ieee:
 ; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float 2.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_immediate_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_var_literal_f32:
+; GCN-LABEL: {{^}}fmax_var_literal_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float %a, float 99.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_var_literal_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmax_literal_var_f32:
+; GCN-LABEL: {{^}}fmax_literal_var_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.maxnum.f32(float 99.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmax_literal_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
+  ret float %val
 }
 
 ; GCN-LABEL: {{^}}test_func_fmax_v3f32:
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index fa93fbcfb91712982c22c4dc47564ace65c8a53c..48d0eedba5b955479a469c934065236cc5386ed2 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -95,22 +95,26 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad
 ; SI-NEXT: v_min3_f32
 ; SI-NEXT: v_min3_f32
 
-; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v1
-; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_min_f16_e32 v0, v2, v0
-; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI: v_min_f16_e32 v0, v0, v3
-; VI: v_or_b32_e32 v0, v0, v1
-
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
-define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
+; VI: s_waitcnt
+; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v1
+; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_min_f16_e32 v0, v2, v0
+; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_min_f16_e32 v0, v0, v3
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT: v_pk_min_f16 v0, v2, v0
+; GFX9-NEXT: v_pk_min_f16 v0, v0, v3
+; GFX9-NEXT: s_setpc_b64
+define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
 entry:
-  %min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
-  %min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
-  %res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
+  %min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+  %min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
+  %res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
   ret <2 x half> %res
 }
 
@@ -121,3 +125,4 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { nounwind "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
index af1dabaa01ed1cd55a0c5011c82373c79d36b1ef..731204eeaf6966a8d0225a0e0e1920f905c2acb8 100644
--- a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
+++ b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
@@ -1,9 +1,19 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,SI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN,SI %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,VI %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN,VI %s
 
 ; GCN-LABEL: {{^}}min_fneg_select_regression_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
-; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
+; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
 define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ult float %a, 1.0
@@ -12,7 +22,14 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
 }
 
 ; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
-; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
 define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
   %fneg.a = fsub float -0.0, %a
@@ -22,9 +39,16 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0
 }
 
 ; GCN-LABEL: {{^}}max_fneg_select_regression_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
-define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ugt float %a, 1.0
   %min.a = select i1 %cmp.a, float %fneg.a, float -1.0
@@ -32,9 +56,16 @@ define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
 }
 
 ; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
-; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NOT: v_mul
+
+; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+
+; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
+; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
+; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+
 ; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
-define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
   %fneg.a = fsub float -0.0, %a
   %cmp.a = fcmp ugt float %a, -1.0
   %min.a = select i1 %cmp.a, float %fneg.a, float 1.0
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 20057307354f59f8982d5e9f377e6e8ab858e13b..19d4c316ec6f367f4f613d1b5b94ceacb7fd7c0a 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -98,7 +98,7 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
@@ -179,7 +179,7 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
@@ -284,8 +284,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v3
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v2
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v5
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v4
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
@@ -438,10 +438,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; VI-NNAN-NEXT:    v_min_f16_e32 v1, v1, v5
 ; VI-NNAN-NEXT:    v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NNAN-NEXT:    v_min_f16_e32 v0, v0, v4
-; VI-NNAN-NEXT:    v_or_b32_e32 v0, v0, v11
-; VI-NNAN-NEXT:    v_or_b32_e32 v1, v1, v10
-; VI-NNAN-NEXT:    v_or_b32_e32 v2, v2, v9
-; VI-NNAN-NEXT:    v_or_b32_e32 v3, v3, v8
+; VI-NNAN-NEXT:    v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NNAN-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NNAN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index e0acbaf59dbaaef8280c6327e17a4c7ad4f27a8b..ca80c4edbfb29e88c03934ebaf43c64a536419f1 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -1,5 +1,9 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s
+
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #1
@@ -10,8 +14,13 @@ declare i32 @llvm.r600.read.tidig.x() #1
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
 ; EG: MIN *
-; GCN-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
@@ -22,13 +31,17 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(
 }
 
 ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
-; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
 
-; GCN-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
-; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
+; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
+
+; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]]
 
+; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
@@ -36,13 +49,19 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out,
   ret void
 }
 
+; Nsz also needed
+; FIXME: Should separate tests
 ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
-; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 
 ; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0
 ; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0
 
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
+; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
 define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 {
   %a.nnan = fadd nnan float %a, 1.0
@@ -54,9 +73,14 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -73,9 +97,14 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE v_cmp_le_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -92,9 +121,14 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -111,9 +145,14 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -130,9 +169,14 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+
+; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
+
+; VI-SAFE v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
+; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
+
 ; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
@@ -149,10 +193,15 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32:
-; GCN: buffer_load_dwordx2
-; GCN: buffer_load_dwordx2
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+
+; VI-SAFE v_cmp_lt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE v_cmp_lt_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
@@ -171,13 +220,24 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32:
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
-; GCN-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE: v_min_legacy_f32_e32
+; SI-SAFE-NOT: v_min_
+
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-SAFE: v_cmp_nge_f32_e32
+; VI-SAFE: v_cndmask_b32_e32
+; VI-NOT: v_cmp
+; VI-NOT: v_cndmask
 
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
 ; GCN-NONAN: v_min_f32_e32
+; GCN-NONAN-NOT: v_min_
 define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
@@ -193,8 +253,8 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use:
-; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN-NOT: v_min
 ; GCN: v_cmp_le_f32
 ; GCN-NEXT: v_cndmask_b32
diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll
index 475615e52cb93ccb8069917e1389f5b56680f7ad..e37a1cead47d3992f11f3446da43bd215ad9669a 100644
--- a/test/CodeGen/AMDGPU/fminnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -7,15 +7,35 @@ declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
 declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
 declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
 
-; FUNC-LABEL: @test_fmin_f64
-; SI: v_min_f64
-define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+; FUNC-LABEL: {{^}}test_fmin_f64_ieee:
+; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
+; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
+; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
+; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
+define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind {
+  %val = call double @llvm.minnum.f64(double %a, double %b) #0
+  store double %val, double addrspace(1)* undef, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee:
+; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]]
+; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]]
+; SI-NOT: [[VAL0]]
+; SI-NOT: [[VAL1]]
+; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]]
+; SI-NOT: [[RESULT]]
+; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
+define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
+  %a = load volatile double, double addrspace(3)* undef
+  %b = load volatile double, double addrspace(3)* undef
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
-  store double %val, double addrspace(1)* %out, align 8
+  store volatile double %val, double addrspace(3)* undef
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v2f64
+; FUNC-LABEL: {{^}}test_fmin_v2f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
@@ -24,7 +44,7 @@ define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v4f64
+; FUNC-LABEL: {{^}}test_fmin_v4f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
@@ -35,7 +55,7 @@ define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v8f64
+; FUNC-LABEL: {{^}}test_fmin_v8f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
@@ -50,7 +70,7 @@ define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x
   ret void
 }
 
-; FUNC-LABEL: @test_fmin_v16f64
+; FUNC-LABEL: {{^}}test_fmin_v16f64:
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index a0642e211f13b587aa6fb95103ea029e1ff65245..a8574b288f57e174f5ff12a82ccd3247885a1f66 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -1,14 +1,45 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}test_fmin_f32:
-; GCN: v_min_f32_e32
-define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float %b)
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on:
+; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
+; GCN-NOT: [[RESULT]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_on:
+; GCN: s_waitcnt
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: s_setpc_b64
+define float @test_fmin_nnan_f32_ieee_mode_on(float %a, float %b) #0 {
+  %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_off:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_nnan_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
+; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_off:
+; GCN: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float %b) #1
+  ret float %val
+}
+
 ; GCN-LABEL: {{^}}test_fmin_v2f32:
 ; GCN: v_min_f32_e32
 ; GCN: v_min_f32_e32
@@ -147,38 +178,34 @@ define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}fmin_var_immediate_f32:
-; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float 2.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+; GCN-LABEL: {{^}}fmin_var_immediate_f32_no_ieee:
+; GCN: v_min_f32_e32 v0, 2.0, v0
+define amdgpu_ps float @fmin_var_immediate_f32_no_ieee(float %a) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float 2.0) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_immediate_var_f32:
+; GCN-LABEL: {{^}}fmin_immediate_var_f32_no_ieee:
 ; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
-define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float 2.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_immediate_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float 2.0, float %a) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_var_literal_f32:
+; GCN-LABEL: {{^}}fmin_var_literal_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float %a, float 99.0)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_var_literal_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float %a, float 99.0) #1
+  ret float %val
 }
 
-; GCN-LABEL: {{^}}fmin_literal_var_f32:
+; GCN-LABEL: {{^}}fmin_literal_var_f32_no_ieee:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
 ; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
-define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
-  %val = call float @llvm.minnum.f32(float 99.0, float %a)
-  store float %val, float addrspace(1)* %out, align 4
-  ret void
+define amdgpu_ps float @fmin_literal_var_f32_no_ieee(float inreg %a) #0 {
+  %val = call float @llvm.minnum.f32(float 99.0, float %a) #1
+  ret float %val
 }
 
 ; GCN-LABEL: {{^}}test_func_fmin_v3f32:
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 23e86351028198a4ae82a6ac41a83dd9fbd4ffe8..e57ebc9c061ed93b10519a9a2a509316fd20507c 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -396,12 +396,14 @@ define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %
 ; fminnum tests
 ; --------------------------------------------------------------------------------
 
-; GCN-LABEL: {{^}}v_fneg_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -415,11 +417,23 @@ define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float add
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -431,11 +445,22 @@ define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %a)
+  %min.fneg = fsub float -0.0, %min
+  ret float %min.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -447,11 +472,22 @@ define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -463,6 +499,16 @@ define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_max_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
+  %min = call float @llvm.minnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  ret float %fneg
+}
+
 ; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
@@ -479,11 +525,12 @@ define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -498,10 +545,11 @@ define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, floa
 ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
 
-; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -520,10 +568,11 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(float addrspace(1)* %out, fl
 ; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e22f983
-; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[K]]
+; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]
 
-; VI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
+; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
@@ -545,7 +594,8 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(float addrspace(1)* %out
 ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
 
-; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
+; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]
 
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -568,7 +618,8 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(half addrspace(1)* %out, hal
 ; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
 ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]
 
-; VI: v_max_f16_e64 [[RESULT:v[0-9]+]], -[[A]], 0.15915494
+; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]
 
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
@@ -588,7 +639,8 @@ define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(half addrspace(1)* %out,
 
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; VI: v_min_f64 v{{\[}}[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]{{\]}}, [[A]], 0.15915494
 ; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]
@@ -611,9 +663,11 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(double addrspace(1)* %out, d
 
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
-; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
-; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], 0.15915494
+; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
+; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494
 
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
@@ -638,13 +692,14 @@ define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 {
   ret float %fneg
 }
 
-; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -660,15 +715,16 @@ define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32:
-; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xbe22f983
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 
+; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
 
-; SI: v_max_f32_e64 [[MIN:v[0-9]+]], -[[A]], [[K]]
+; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]
 ; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]]
 
-; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[A]]
+; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
+; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]]
 ; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
@@ -687,14 +743,29 @@ define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(float addrspace
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float 0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %min
+  %mul = fmul float %fneg, %b
+  ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -710,16 +781,34 @@ define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 {
+  %min = call float @llvm.minnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %min
+  %use1 = fmul float %min, 4.0
+  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+  ret <2 x float> %ins1
+}
+
 ; --------------------------------------------------------------------------------
 ; fmaxnum tests
 ; --------------------------------------------------------------------------------
 
-; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -727,60 +816,104 @@ define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float add
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
   %b = load volatile float, float addrspace(1)* %b.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %a)
-  %min.fneg = fsub float -0.0, %min
-  store float %min.fneg, float addrspace(1)* %out.gep
+  %max = call float @llvm.maxnum.f32(float %a, float %a)
+  %max.fneg = fsub float -0.0, %max
+  store float %max.fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -v0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %a)
+  %max.fneg = fsub float -0.0, %max
+  ret float %max.fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float 4.0, float %a)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, -4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
-  %min = call float @llvm.maxnum.f32(float -4.0, float %a)
-  %fneg = fsub float -0.000000e+00, %min
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
   store float %fneg, float addrspace(1)* %out.gep
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 4.0
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
 ; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
@@ -797,11 +930,12 @@ define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
-; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -813,13 +947,24 @@ define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, floa
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN: v_min_f32_e64 v0, -v0, 0{{$}}
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float -0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  ret float %fneg
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -834,14 +979,29 @@ define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)*
   ret void
 }
 
-; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1
+; GCN-NEXT: ; return
+define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float 0.0, float %a)
+  %fneg = fsub float -0.000000e+00, %max
+  %mul = fmul float %fneg, %b
+  ret float %mul
+}
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
+; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
+; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
 ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]]
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
-define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -849,14 +1009,29 @@ define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
   %a = load volatile float, float addrspace(1)* %a.gep
   %b = load volatile float, float addrspace(1)* %b.gep
-  %min = call float @llvm.maxnum.f32(float %a, float %b)
-  %fneg = fsub float -0.000000e+00, %min
-  %use1 = fmul float %min, 4.0
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  %use1 = fmul float %max, 4.0
   store volatile float %fneg, float addrspace(1)* %out
   store volatile float %use1, float addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_min_f32_e64 v0, -v0, -v1
+; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; GCN-NEXT: ; return
+define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max
+  %use1 = fmul float %max, 4.0
+  %ins0 = insertelement <2 x float> undef, float %fneg, i32 0
+  %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1
+  ret <2 x float> %ins1
+}
+
 ; --------------------------------------------------------------------------------
 ; fma tests
 ; --------------------------------------------------------------------------------
diff --git a/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
index ec0faf7ab66321daafed73637852d7efd1fd94cb..f3a1168885f8a8f408fcf0b7cce1b3c27c0da372 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir
@@ -63,7 +63,7 @@ body:             |
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
-    ; GCN: DBG_VALUE debug-use %5:sreg_64_xexec, debug-use $noreg
+    ; GCN: DBG_VALUE %5:sreg_64_xexec, $noreg
     ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
     %0:sreg_32_xm0 = S_MOV_B32 12345
     %1:vgpr_32 = IMPLICIT_DEF
@@ -71,7 +71,7 @@ body:             |
     %3:vgpr_32 = IMPLICIT_DEF
 
     %4:vgpr_32, %5:sreg_64_xexec = V_ADD_I32_e64 %0, %1, implicit $exec
-    DBG_VALUE debug-use %5, debug-use $noreg
+    DBG_VALUE %5, $noreg
     S_ENDPGM implicit %4
 
 ...
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index dd38d1d23660ce2711ac1de8f09b22efb385f773..e937aaca66f21db299434bce43d2981ab1c1c695 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -23,6 +23,7 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx902 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX902 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s
 
 ; HSA: .hsa_code_object_version 2,1
 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
@@ -40,3 +41,4 @@
 ; HSA-GFX902: .hsa_code_object_isa 9,0,2,"AMD","AMDGPU"
 ; HSA-GFX904: .hsa_code_object_isa 9,0,4,"AMD","AMDGPU"
 ; HSA-GFX906: .hsa_code_object_isa 9,0,6,"AMD","AMDGPU"
+; HSA-GFX909: .hsa_code_object_isa 9,0,9,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
index 84a2d3d3a7b05ad0023de944268eef0e70483d87..ae78a1ecf32523c0d93dbac0f53a2d68c48ec5c2 100644
--- a/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -1,19 +1,25 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
-; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
-; only contain the lanes that were active during the last loop iteration.
-;
 ; SI: ; %for.body
-; SI:      v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI:      v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
-; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]
-; SI:      [[ENDIF]]:
-; SI-NOT:  [[VREG]]
-; SI:      ; %for.end
-; SI:      v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+; SI:      v_cmp_gt_u32_e64  [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI-DAG:  s_andn2_b64       [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
+; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
+
+; SI: ; %Flow1
+; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], exec
+
+; SI: ; %Flow
+; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
+; SI: ; %for.end
+; SI:      s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
+
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 entry:
   br label %for.body
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll b/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0aacbbfda182b3a37b02d6eb4b8394617d9ce239
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_dont_clobber_scc:
+
+; GCN: ; %entry
+; GCN:      s_cmp_eq_u32    s0, 0
+; GCN:      s_cbranch_scc1  [[PREEXIT:BB[0-9_]+]]
+
+; GCN: ; %blocka
+; GCN:      s_xor_b64       s[{{[0-9:]+}}], exec, -1
+; GCN:      s_cmp_eq_u32    s1, 0
+; GCN:      s_cbranch_scc1  [[EXIT:BB[0-9_]+]]
+
+; GCN: [[PREEXIT]]:
+; GCN: [[EXIT]]:
+
+define amdgpu_vs float @test_dont_clobber_scc(i32 inreg %uni, i32 inreg %uni2) #0 {
+entry:
+  %cc.uni = icmp eq i32 %uni, 0
+  br i1 %cc.uni, label %exit, label %blocka
+
+blocka:
+  call void asm sideeffect "; dummy a", ""()
+  %cc.uni2 = icmp eq i32 %uni2, 0
+  br i1 %cc.uni2, label %exit, label %blockb
+
+blockb:
+  call void asm sideeffect "; dummy b", ""()
+  br label %exit
+
+exit:
+  %cc.phi = phi i1 [ true, %entry ], [ false, %blocka ], [ false, %blockb ]
+  call void asm sideeffect "; dummy exit", ""()
+  %r = select i1 %cc.phi, float 1.0, float 2.0
+  ret float %r
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 63a9f1feb6dff4cd86c4963a6514910929967868..5b25271ce1717059f5de4bb3b25a51568d1a532c 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -2,12 +2,16 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}br_i1_phi:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
-; SI: s_and_saveexec_b64
-; SI: v_mov_b32_e32 [[REG]], -1{{$}}
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
-; SI: s_and_saveexec_b64
-; SI: s_endpgm
+
+; SI: ; %bb
+; SI:    s_mov_b64           [[TMP:s\[[0-9]+:[0-9]+\]]], 0
+
+; SI: ; %bb2
+; SI:    s_mov_b64           [[TMP]], exec
+
+; SI: ; %bb3
+; SI:    s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]
+
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
   %tidig = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/idot8.ll b/test/CodeGen/AMDGPU/idot8.ll
index 044d2d3b914ab435266372494e7274f2ba0ea3bb..e0cd2ad506be985e24f3a0ca36129cb49b2f7e26 100644
--- a/test/CodeGen/AMDGPU/idot8.ll
+++ b/test/CodeGen/AMDGPU/idot8.ll
@@ -4635,3 +4635,223 @@ entry:
   store i8 %add8, i8 addrspace(1)* %dst, align 4
   ret void
 }
+
+define amdgpu_kernel void @udot8_variant1(i32 addrspace(1)* %v1addr,
+; GFX7-LABEL: udot8_variant1:
+; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7-NEXT:    s_and_b32 s7, s4, 15
+; GFX7-NEXT:    s_and_b32 s8, s5, 15
+; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s13, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s15, s4, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s19, s4, 0x40018
+; GFX7-NEXT:    s_lshr_b32 s4, s4, 28
+; GFX7-NEXT:    v_mov_b32_e32 v0, s7
+; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
+; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s12, s5, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s20, s5, 0x40018
+; GFX7-NEXT:    s_lshr_b32 s5, s5, 28
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s9
+; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s11
+; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s13
+; GFX7-NEXT:    v_mad_u32_u24 v0, s14, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s15
+; GFX7-NEXT:    v_mad_u32_u24 v0, s16, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s17
+; GFX7-NEXT:    v_mad_u32_u24 v0, s18, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s19
+; GFX7-NEXT:    v_mad_u32_u24 v0, s20, v1, v0
+; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT:    s_endpgm
+;
+; GFX8-LABEL: udot8_variant1:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s0, s2, 15
+; GFX8-NEXT:    s_and_b32 s1, s3, 15
+; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s15, s2, 0x40018
+; GFX8-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v2, v3
+; GFX8-NEXT:    s_bfe_u32 s6, s3, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s8, s3, 0x40008
+; GFX8-NEXT:    s_bfe_u32 s10, s3, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s12, s3, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s14, s3, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s16, s3, 0x40018
+; GFX8-NEXT:    s_lshr_b32 s3, s3, 28
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s9
+; GFX8-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s11
+; GFX8-NEXT:    v_mad_u32_u24 v2, s12, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s13
+; GFX8-NEXT:    v_mad_u32_u24 v2, s14, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s15
+; GFX8-NEXT:    v_mad_u32_u24 v2, s16, v3, v2
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: udot8_variant1:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_and_b32 s0, s2, 15
+; GFX9-NEXT:    s_and_b32 s1, s3, 15
+; GFX9-NEXT:    s_bfe_u32 s5, s2, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s7, s2, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s9, s2, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s11, s2, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s13, s2, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s15, s2, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 28
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mad_u32_u24 v2, s1, v2, v3
+; GFX9-NEXT:    s_bfe_u32 s6, s3, 0x40004
+; GFX9-NEXT:    s_bfe_u32 s8, s3, 0x40008
+; GFX9-NEXT:    s_bfe_u32 s10, s3, 0x4000c
+; GFX9-NEXT:    s_bfe_u32 s12, s3, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s14, s3, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s16, s3, 0x40018
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 28
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
+; GFX9-NEXT:    v_mad_u32_u24 v2, s6, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    v_mad_u32_u24 v2, s8, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    v_mad_u32_u24 v2, s10, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s11
+; GFX9-NEXT:    v_mad_u32_u24 v2, s12, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_mad_u32_u24 v2, s14, v3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    v_mad_u32_u24 v2, s16, v3, v2
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX9-DL-LABEL: udot8_variant1:
+; GFX9-DL:       ; %bb.0: ; %entry
+; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-DL-NEXT:    v_dot8_u32_u4 v2, s3, v2, v3
+; GFX9-DL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX9-DL-NEXT:    s_endpgm
+                                          i32 addrspace(1)* %v2addr,
+                                          i32 addrspace(1)* %dst) {
+entry:
+  %v1 = load i32, i32 addrspace(1)* %v1addr, align 4
+  %v2 = load i32, i32 addrspace(1)* %v2addr, align 4
+  %and = and i32 %v1, 15
+  %and1 = and i32 %v2, 15
+  %mul1 = mul nuw nsw i32 %and1, %and
+
+  %shr = lshr i32 %v1, 4
+  %and2 = and i32 %shr, 15
+  %shr3 = lshr i32 %v2, 4
+  %and4 = and i32 %shr3, 15
+  %mul2 = mul nuw nsw i32 %and4, %and2
+
+  %shr6 = lshr i32 %v1, 8
+  %and7 = and i32 %shr6, 15
+  %shr8 = lshr i32 %v2, 8
+  %and9 = and i32 %shr8, 15
+  %mul3 = mul nuw nsw i32 %and9, %and7
+
+  %shr12 = lshr i32 %v1, 12
+  %and13 = and i32 %shr12, 15
+  %shr14 = lshr i32 %v2, 12
+  %and15 = and i32 %shr14, 15
+  %mul4 = mul nuw nsw i32 %and15, %and13
+
+  %shr18 = lshr i32 %v1, 16
+  %and19 = and i32 %shr18, 15
+  %shr20 = lshr i32 %v2, 16
+  %and21 = and i32 %shr20, 15
+  %mul5 = mul nuw nsw i32 %and21, %and19
+
+  %shr24 = lshr i32 %v1, 20
+  %and25 = and i32 %shr24, 15
+  %shr26 = lshr i32 %v2, 20
+  %and27 = and i32 %shr26, 15
+  %mul6 = mul nuw nsw i32 %and27, %and25
+
+  %shr30 = lshr i32 %v1, 24
+  %and31 = and i32 %shr30, 15
+  %shr32 = lshr i32 %v2, 24
+  %and33 = and i32 %shr32, 15
+  %mul7 = mul nuw nsw i32 %and33, %and31
+
+  %shr36 = lshr i32 %v1, 28
+  %shr37 = lshr i32 %v2, 28
+  %mul8 = mul nuw nsw i32 %shr37, %shr36
+  %acc = load i32, i32 addrspace(1)* %dst, align 4
+
+  %add1 = add i32 %mul1, %acc
+  %add2 = add i32 %add1, %mul8
+  %add3 = add i32 %add2, %mul2
+  %add4 = add i32 %add3, %mul3
+  %add5 = add i32 %add4, %mul4
+  %add6 = add i32 %add5, %mul5
+  %add7 = add i32 %add6, %mul6
+  %add8 = add i32 %add7, %mul7
+  store i32 %add8, i32 addrspace(1)* %dst, align 4
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 31199b47e20c05b91258d9ae4fc7171b5e0f82fc..8e02303377c001354b7acbe1d24ffa2e2e2490f4 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -386,9 +386,9 @@ bb2:
 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
 
 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
-; GCN: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
-; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
-; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
 
 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
 ; GCN-NEXT: s_waitcnt vmcnt(0)
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index 2856212bc8993dfda77a2c3c52f100c6ac6fcad4..9615efaaa938703b2ba4f30049a9bd25c090a239 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -186,8 +186,8 @@ entry:
 
 ; FIXME: Should not have intermediate sgprs
 ; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
-; CHECK: s_mov_b32 s1, 0
-; CHECK: s_mov_b32 s0, 0x1e240
+; CHECK-DAG: s_mov_b32 s1, 0
+; CHECK-DAG: s_mov_b32 s0, 0x1e240
 ; CHECK: v_mov_b32_e32 v0, s0
 ; CHECK: v_mov_b32_e32 v1, s1
 ; CHECK: use v[0:1]
@@ -198,7 +198,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
-; CHECK: v_mov_b32_e32 v0, -1{{$}}
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
 ; CHECK: ; use v0
 define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
 entry:
@@ -212,10 +213,14 @@ entry:
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK: ; use v0
+; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
+; CHECK: {{buffer|flat}}_store_byte [[STORE]],
 define amdgpu_kernel void @i1_input_phys_vgpr() {
 entry:
   %val = load i1, i1 addrspace(1)* undef
-  call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)
+  %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
+  store i1 %cc, i1 addrspace(1)* undef
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index a62ad820c89d2beece85eb1fd2aa98e3b970e684..692696ff7302b8b0483f3e9c1067d5f13cf9d51a 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -352,7 +352,7 @@ endif:
 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
 
 ; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0
+; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
 
 ; Increment to next element folded into base register, but FileCheck
 ; can't do math expressions
diff --git a/test/CodeGen/AMDGPU/known-never-snan.ll b/test/CodeGen/AMDGPU/known-never-snan.ll
index 864cc745373eeae3af6f16c6afe346da649411c5..abf9b3ecefa72f4655e36f389550bc611d81ef46 100644
--- a/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -99,8 +99,7 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -110,14 +109,46 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
   ret float %med
 }
 
+define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %b.nsnan = fadd float %b, 1.0
+  %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan)
+  %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  ret float %med
+}
+
+define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
+; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %a.nsnan = fadd float %a, 1.0
+  %known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b)
+  %max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
+  %med = call float @llvm.minnum.f32(float %max, float 4.0)
+  ret float %med
+}
+
 define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
 ; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add)
@@ -131,9 +162,9 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
@@ -148,8 +179,8 @@ define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %b.nnan.add = fadd nnan float %b, 1.0
@@ -164,8 +195,9 @@ define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add)
@@ -179,8 +211,9 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_max3_f32 v0, v0, v1, 2.0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
@@ -215,8 +248,8 @@ define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %b.nnan.add = fadd nnan float %b, 1.0
   %cmp = icmp eq i32 %c, 0
@@ -233,8 +266,8 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a
   %cmp = icmp eq i32 %c, 0
@@ -494,6 +527,7 @@ define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_med3_f32 v0, v0, v1, v2
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c)
@@ -507,8 +541,7 @@ define float @v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32(float %a, float
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_min3_f32 v0, v0, v1, v2
-; GCN-NEXT:    v_max_f32_e32 v0, 2.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min0 = call float @llvm.minnum.f32(float %a, float %b)
   %known.not.snan = call float @llvm.minnum.f32(float %min0, float %c)
diff --git a/test/CodeGen/AMDGPU/lds-bounds.ll b/test/CodeGen/AMDGPU/lds-bounds.ll
new file mode 100644
index 0000000000000000000000000000000000000000..80a26281216a918a203b02513415dc39635a156b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lds-bounds.ll
@@ -0,0 +1,129 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOSI %s
+
+@compute_lds = external addrspace(3) global [512 x i32], align 16
+
+; GCN-LABEL: {{^}}store_aligned:
+; GCN: ds_write_b64
+define amdgpu_cs void @store_aligned(i32 addrspace(3)* %ptr) #0 {
+entry:
+  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr, align 8
+  store i32 43, i32 addrspace(3)* %ptr.gep.1
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_aligned:
+; GCN: ds_read_b64
+define amdgpu_cs <2 x float> @load_aligned(i32 addrspace(3)* %ptr) #0 {
+entry:
+  %ptr.gep.1 = getelementptr i32, i32 addrspace(3)* %ptr, i32 1
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr, align 8
+  %v.1 = load i32, i32 addrspace(3)* %ptr.gep.1
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+
+; GCN-LABEL: {{^}}store_global_const_idx:
+; GCN: ds_write2_b32
+define amdgpu_cs void @store_global_const_idx() #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_global_const_idx:
+; GCN: ds_read2_b32
+define amdgpu_cs <2 x float> @load_global_const_idx() #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 3
+  %ptr.b = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 4
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr.a
+  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+
+; GCN-LABEL: {{^}}store_global_var_idx_case1:
+; SI: ds_write_b32
+; SI: ds_write_b32
+; NONSI: ds_write2_b32
+define amdgpu_cs void @store_global_var_idx_case1(i32 %idx) #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_global_var_idx_case1:
+; SI: ds_read_b32
+; SI: ds_read_b32
+; NONSI: ds_read2_b32
+define amdgpu_cs <2 x float> @load_global_var_idx_case1(i32 %idx) #0 {
+entry:
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr.a
+  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+
+; GCN-LABEL: {{^}}store_global_var_idx_case2:
+; GCN: ds_write2_b32
+define amdgpu_cs void @store_global_var_idx_case2(i32 %idx) #0 {
+entry:
+  %idx.and = and i32 %idx, 255
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  store i32 42, i32 addrspace(3)* %ptr.a
+  store i32 43, i32 addrspace(3)* %ptr.b
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}load_global_var_idx_case2:
+; GCN: ds_read2_b32
+define amdgpu_cs <2 x float> @load_global_var_idx_case2(i32 %idx) #0 {
+entry:
+  %idx.and = and i32 %idx, 255
+  %ptr.a = getelementptr [512 x i32], [512 x i32] addrspace(3)* @compute_lds, i32 0, i32 %idx.and
+  %ptr.b = getelementptr i32, i32 addrspace(3)* %ptr.a, i32 1
+
+  %v.0 = load i32, i32 addrspace(3)* %ptr.a
+  %v.1 = load i32, i32 addrspace(3)* %ptr.b
+
+  %r.0 = insertelement <2 x i32> undef, i32 %v.0, i32 0
+  %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
+  %bc = bitcast <2 x i32> %r.1 to <2 x float>
+  ret <2 x float> %bc
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index 34b842d8436f0de161b02c5a09d4d83984cfb679..63c1556212de54c04fbad8a0ab81cd78d0026ebe 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Enable for VI.
 
@@ -144,20 +144,24 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
-; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
 
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
+; SI: ; %entry
+; SI:     v_cmp_eq_u32_e64   [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
+; SI:     s_mov_b64          vcc, 0
+; SI:     s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]
 
+; SI: ; %bb
+; SI:     buffer_load_dword  [[LOAD:v[0-9]+]],
+; SI:     v_cmp_ne_u32_e32   vcc, 0, [[LOAD]]
+; SI:     s_and_b64          vcc, vcc, exec
+
+; SI: ; %exit
+; SI:     s_or_b64           exec, exec, [[SAVE]]
+; SI-NOT: vcc
+; SI:     v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+; SI:     buffer_store_dword
+; SI:     s_endpgm
 
-; SI: BB9_2:
-; SI: s_or_b64 exec, exec, [[SAVE]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-; SI: buffer_store_dword
-; SI: s_endpgm
 define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
new file mode 100644
index 0000000000000000000000000000000000000000..96f0210825c643d9acc274cc7eca8025d45a3249
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.a16.dim.ll
@@ -0,0 +1,530 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load_1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %t = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_cube:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1darray:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %slice = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i16(i32 15, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darray:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2dmsaa:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %fragid = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_2darraymsaa:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %fragid = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_1d:
+; GCN: image_load_mip v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %mip = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16(i32 15, i16 %s, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_2d:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_3d:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %r, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_cube:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_1darray:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %slice = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i16(i32 15, i16 %s, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_mip_2darray:
+; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i16(i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2d:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %t = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_3d:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %r, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_cube:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.cube.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1darray:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %slice = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.1darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2darray:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.2darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2dmsaa:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %fragid = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_2darraymsaa:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %fragid = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_1d:
+; GCN: image_store_mip v[0:3], v4, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %mip = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_2d:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.mip.2d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_3d:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %r = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.mip.3d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %r, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_cube:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.mip.cube.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_1darray:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %slice = extractelement <2 x i16> %coords_lo, i32 1
+  %mip = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_mip_2darray:
+; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %s = extractelement <2 x i16> %coords_lo, i32 0
+  %t = extractelement <2 x i16> %coords_lo, i32 1
+  %slice = extractelement <2 x i16> %coords_hi, i32 0
+  %mip = extractelement <2 x i16> %coords_hi, i32 1
+  call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, i16 %t, i16 %slice, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}getresinfo_1d:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2d:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_3d:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_cube:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_1darray:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2darray:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2dmsaa:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}getresinfo_2darraymsaa:
+; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm a16 da{{$}}
+define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32 15, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_V1:
+; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm a16
+define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call float @llvm.amdgcn.image.load.1d.f32.i16(i32 8, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret float %v
+}
+
+; GCN-LABEL: {{^}}load_1d_V2:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm a16
+define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i16(i32 9, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d_V1:
+; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm a16
+define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.f32.i16(float %vdata, i32 2, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_V2:
+; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm a16
+define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float> %vdata, i32 12, i16 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}load_1d_glc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc a16{{$}}
+define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_slc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc a16{{$}}
+define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load_1d_glc_slc:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc a16{{$}}
+define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}store_1d_glc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc a16{{$}}
+define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 1)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_slc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc a16{{$}}
+define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 2)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_1d_glc_slc:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc a16{{$}}
+define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) {
+main_body:
+  %s = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %vdata, i32 15, i16 %s, <8 x i32> %rsrc, i32 0, i32 3)
+  ret void
+}
+
+; GCN-LABEL: {{^}}getresinfo_dmask0:
+; GCN-NOT: image
+; GCN: ; return to shader part epilog
+define amdgpu_ps <4 x float> @getresinfo_dmask0(<8 x i32> inreg %rsrc, <4 x float> %vdata, <2 x i16> %coords) #0 {
+main_body:
+  %mip = extractelement <2 x i16> %coords, i32 0
+  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32 0, i16 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %r
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+
+declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i16(i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #1
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i16(<4 x float>, i32, i16, i16, i16, i16, <8 x i32>, i32, i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+
+declare float @llvm.amdgcn.image.load.1d.f32.i16(i32, i16, <8 x i32>, i32, i32) #1
+declare float @llvm.amdgcn.image.load.2d.f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i16(i32, i16, <8 x i32>, i32, i32) #1
+declare void @llvm.amdgcn.image.store.1d.f32.i16(float, i32, i16, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1d.v2f32.i16(<2 x float>, i32, i16, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1fbfccb0e39a3acd3e90818e66810ac8550045b6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load.f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.1d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.2d:
+; GCN: image_load v[0:1], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps <4 x half> @load.v3f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f16.3d:
+; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps <4 x half> @load.v4f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x half> %v
+}
+
+declare <4 x half> @llvm.amdgcn.image.load.1d.v4f16.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x half> @llvm.amdgcn.image.load.2d.v4f16.i16(i32, i16, i16, <8 x i32>, i32, i32) #2
+declare <4 x half> @llvm.amdgcn.image.load.3d.v4f16.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d857ae115a7c3e1699133a9744c04b1ced8d1fd4
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}load.f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.1d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.2d:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v2f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v3f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps <4 x float> @load.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+; GCN-LABEL: {{^}}load.v4f32.3d:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps <4 x float> @load.v4f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret <4 x float> %v
+}
+
+declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32, i16, i16, <8 x i32>, i32, i32) #2
+declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i16(i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..48d26f7db20a184ca8bf25c7ef8da0005889fc23
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.d16.ll
@@ -0,0 +1,140 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}store.f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps void @store.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps void @store.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps void @store.v3f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f16.1d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps void @store.v4f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half> %bitcast, i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps void @store.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps void @store.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps void @store.v3f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f16.2d:
+; GCN: image_store v[1:2], v0, s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps void @store.v4f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half> %bitcast, i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
+define amdgpu_ps void @store.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
+define amdgpu_ps void @store.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 unorm a16 d16
+define amdgpu_ps void @store.v3f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f16.3d:
+; GCN: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm a16 d16
+define amdgpu_ps void @store.v4f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <2 x i32> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  %bitcast = bitcast <2 x i32> %val to <4 x half>
+  call void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half> %bitcast, i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f16.i16(<4 x half>, i32, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.2d.v4f16.i16(<4 x half>, i32, i16, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.3d.v4f16.i16(<4 x half>, i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f5ec31ba7815c735f34bf0a5dbd76aa8313b4005
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.store.a16.ll
@@ -0,0 +1,128 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}store.f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps void @store.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps void @store.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps void @store.v3f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f32.1d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store.v4f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps void @store.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps void @store.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps void @store.v3f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f32.2d:
+; GCN: image_store v[1:4], v0, s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store.v4f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords, i32 0
+  %y = extractelement <2 x i16> %coords, i32 1
+  call void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, i16 %y, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0x1 unorm a16
+define amdgpu_ps void @store.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v2f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0x3 unorm a16
+define amdgpu_ps void @store.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 3, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v3f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0x7 unorm a16
+define amdgpu_ps void @store.v3f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 7, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}store.v4f32.3d:
+; GCN: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm a16
+define amdgpu_ps void @store.v4f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi, <4 x float> %val) {
+main_body:
+  %x = extractelement <2 x i16> %coords_lo, i32 0
+  %y = extractelement <2 x i16> %coords_lo, i32 1
+  %z = extractelement <2 x i16> %coords_hi, i32 0
+  call void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float> %val, i32 15, i16 %x, i16 %y, i16 %z, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.2d.v4f32.i16(<4 x float>, i32, i16, i16, <8 x i32>, i32, i32) #2
+declare void @llvm.amdgcn.image.store.3d.v4f32.i16(<4 x float>, i32, i16, i16, i16, <8 x i32>, i32, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 244fa6785626b0033c53915d7212f36fff04c311..9fe0c0f8fc1fda1ad9efe9641fd245aced9433d8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_dpp v2, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
+; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
 @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
 define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
 bb:
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 13fdd288f9d5a0af4fdc5b4cd21a3f0d9b00f17f..12573a5fee3b621b86748e2e077193b9336d6c2f 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,23 +1,91 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
 declare half @llvm.maxnum.f16(half %a, half %b)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
 declare <3 x half> @llvm.maxnum.v3f16(<3 x half> %a, <3 x half> %b)
 declare <4 x half> @llvm.maxnum.v4f16(<4 x half> %a, <4 x half> %b)
 
-; GCN-LABEL: {{^}}maxnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16(
+; SI-LABEL: maxnum_f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_max_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -29,15 +97,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16_imm_a(
+; SI-LABEL: maxnum_f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -47,15 +165,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_max_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_f16_imm_b(
+; SI-LABEL: maxnum_f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,34 +233,79 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @maxnum_v2f16(
+; SI-LABEL: maxnum_v2f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT:    s_lshr_b32 s0, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s5, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -104,29 +317,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @maxnum_v2f16_imm_a(
+; SI-LABEL: maxnum_v2f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -136,31 +384,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @maxnum_v2f16_imm_b(
+; SI-LABEL: maxnum_v2f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v2f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e32 v0, 4.0, v0
+; VI-NEXT:    v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v2f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -171,10 +452,94 @@ entry:
 }
 
 ; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}maxnum_v3f16:
-; GFX9: v_pk_max_f16
-; GFX9: v_pk_max_f16
 define amdgpu_kernel void @maxnum_v3f16(
+; SI-LABEL: maxnum_v3f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v2, v3, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v3f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s6, s6
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v3f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
     <3 x half> addrspace(1)* %b) {
@@ -186,13 +551,107 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}maxnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
 define amdgpu_kernel void @maxnum_v4f16(
+; SI-LABEL: maxnum_v4f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    s_lshr_b32 s4, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    s_lshr_b32 s4, s7, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v3, v3, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_max_f32_e32 v2, v2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v0, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: maxnum_v4f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v0, s7, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    v_max_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    v_max_f16_e32 v0, v2, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v3, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: maxnum_v4f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
     <4 x half> addrspace(1)* %b) {
@@ -204,28 +663,87 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmax_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_max_f16_sdwa v[[MAX_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_max_f16_sdwa v[[MAX_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_max_f16_e32 v[[MAX_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MAX_LO_LO]], v[[MAX_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MAX_HI_LO]], v[[MAX_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
 define amdgpu_kernel void @fmax_v4f16_imm_a(
+; SI-LABEL: fmax_v4f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: fmax_v4f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e64 v3, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e32 v0, 0x4800, v2
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmax_v4f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v1, v0, s8
+; GFX9-NEXT:    v_pk_max_f16 v0, v2, s9
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %b) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b34ad6f6890cd53082a11e94ed7b8faf80170b75..cdf05094f692e0c985c8981880574424434ab4dd 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,23 +1,91 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
 
 declare half @llvm.minnum.f16(half %a, half %b)
 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
 declare <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b)
 declare <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b)
 
-; GCN-LABEL: {{^}}minnum_f16:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
-define amdgpu_kernel void @minnum_f16(
+define amdgpu_kernel void @minnum_f16_ieee(
+; SI-LABEL: minnum_f16_ieee:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s2, s10
+; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0
+; SI-NEXT:    s_mov_b32 s8, s4
+; SI-NEXT:    s_mov_b32 s9, s5
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_ieee:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v1, v1, v1
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_ieee:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s10, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s11, s3
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -29,15 +97,88 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_f16_imm_a:
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
+define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) {
+; SI-LABEL: minnum_f16_no_ieee:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: minnum_f16_no_ieee:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %r.val = call half @llvm.minnum.f16(half %a, half %b)
+  ret half %r.val
+}
+
 define amdgpu_kernel void @minnum_f16_imm_a(
+; SI-LABEL: minnum_f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -47,15 +188,65 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_f16_imm_b:
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_min_f32_e32 v[[R_F32:[0-9]+]], 4.0, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; GFX89:  v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @minnum_f16_imm_b(
+; SI-LABEL: minnum_f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s8, s6
+; SI-NEXT:    s_mov_b32 s9, s7
+; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s2
+; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_mov_b32 s4, s6
+; GFX9-NEXT:    s_mov_b32 s5, s7
+; GFX9-NEXT:    s_mov_b32 s6, s2
+; GFX9-NEXT:    s_mov_b32 s7, s3
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,33 +256,79 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NOT: and
-; VI:    v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
-
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
-define amdgpu_kernel void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16_ieee(
+; SI-LABEL: minnum_v2f16_ieee:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; SI-NEXT:    s_lshr_b32 s0, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v2, v3, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_ieee:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s5, s5
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_ieee:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dword s5, s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -103,29 +340,94 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
-; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
-; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI-NOT: and
-; SIVI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
+define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) {
+; SI-LABEL: minnum_v2f16_no_ieee:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_min_f32_e32 v0, v0, v2
+; SI-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: minnum_v2f16_no_ieee:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_min_f16_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: minnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    ; return to shader part epilog
+  %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r.val
+}
 
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @minnum_v2f16_imm_a(
+; SI-LABEL: minnum_v2f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_min_f16_e32 v0, 0x4200, v0
+; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x44004200
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -135,31 +437,64 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
-; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
-; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-
-; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; SIVI-NOT: and
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
-
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
-; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
-
-; GCN: buffer_store_dword v[[R_V2_F16]]
 define amdgpu_kernel void @minnum_v2f16_imm_b(
+; SI-LABEL: minnum_v2f16_imm_b:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v2f16_imm_b:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4200
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_load_dword s4, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v0, s4, s4
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_min_f16_e32 v0, 4.0, v0
+; VI-NEXT:    v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v2f16_imm_b:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_load_dword s4, s[6:7], 0x0
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    s_mov_b32 s4, 0x42004400
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -170,10 +505,94 @@ entry:
 }
 
 ; FIXME: Scalarize with undef half
-; GCN-LABEL: {{^}}minnum_v3f16:
-; GFX9: v_pk_min_f16
-; GFX9: v_pk_min_f16
 define amdgpu_kernel void @minnum_v3f16(
+; SI-LABEL: minnum_v3f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    s_lshr_b32 s4, s8, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s8
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s9
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v2, v3, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v3f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s6, s6, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v1, s6, s6
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_min_f16_e32 v1, v2, v1
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v3f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s4, s4
+; GFX9-NEXT:    v_pk_max_f16 v0, s6, s6
+; GFX9-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s7, s7
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT:    buffer_store_short v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <3 x half> addrspace(1)* %r,
     <3 x half> addrspace(1)* %a,
     <3 x half> addrspace(1)* %b) {
@@ -185,13 +604,107 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}minnum_v4f16:
-; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
 define amdgpu_kernel void @minnum_v4f16(
+; SI-LABEL: minnum_v4f16:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s4
+; SI-NEXT:    s_lshr_b32 s4, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    s_lshr_b32 s4, s7, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s4, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s6
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v3, v3, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v5
+; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_min_f32_e32 v2, v2, v5
+; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v0, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: minnum_v4f16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    v_max_f16_e64 v0, s7, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    s_lshr_b32 s7, s7, 16
+; VI-NEXT:    v_min_f16_e32 v0, v1, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v1, s7, s7
+; VI-NEXT:    v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_max_f16_e64 v0, s6, s6
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    s_lshr_b32 s5, s6, 16
+; VI-NEXT:    v_min_f16_e32 v0, v2, v0
+; VI-NEXT:    v_max_f16_e64 v2, s5, s5
+; VI-NEXT:    v_max_f16_e64 v3, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: minnum_v4f16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[8:9], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v0
+; GFX9-NEXT:    v_pk_max_f16 v2, s6, s6
+; GFX9-NEXT:    v_pk_max_f16 v0, s4, s4
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %a,
     <4 x half> addrspace(1)* %b) {
@@ -203,28 +716,87 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmin_v4f16_imm_a:
-; GFX89-DAG: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
-; GFX9-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x44004200
-; GFX9-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x40004800
-
-; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], [[K0]]
-; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], [[K1]]
-; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
-
-; VI-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x4000
-; VI-DAG: v_mov_b32_e32 [[K4:v[0-9]+]], 0x4400
-
-; VI-DAG: v_min_f16_sdwa v[[MIN_HI_HI:[0-9]+]], v[[A_HI]], [[K4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_HI_LO:[0-9]+]], 0x4200, v[[A_HI]]
-; VI-DAG: v_min_f16_sdwa v[[MIN_LO_HI:[0-9]+]], v[[A_LO]], [[K2]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-DAG: v_min_f16_e32 v[[MIN_LO_LO:[0-9]+]], 0x4800, v[[A_LO]]
-
-; VI-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[MIN_LO_LO]], v[[MIN_LO_HI]]
-; VI-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], v[[MIN_HI_LO]], v[[MIN_HI_HI]]
-
-; VI: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}
 define amdgpu_kernel void @fmin_v4f16_imm_a(
+; SI-LABEL: fmin_v4f16_imm_a:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
+; SI-NEXT:    s_lshr_b32 s5, s5, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
+; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: fmin_v4f16_imm_a:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e64 v1, s5, s5
+; VI-NEXT:    s_lshr_b32 s5, s5, 16
+; VI-NEXT:    v_max_f16_e64 v3, s5, s5
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_min_f16_e32 v1, 0x4200, v1
+; VI-NEXT:    s_lshr_b32 s4, s4, 16
+; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_min_f16_e32 v0, 0x4800, v2
+; VI-NEXT:    v_max_f16_e64 v2, s4, s4
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: fmin_v4f16_imm_a:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s8, 0x44004200
+; GFX9-NEXT:    s_mov_b32 s9, 0x40004800
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s0, s4
+; GFX9-NEXT:    s_mov_b32 s1, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, s5, s5
+; GFX9-NEXT:    v_pk_max_f16 v2, s4, s4
+; GFX9-NEXT:    v_pk_min_f16 v1, v0, s8
+; GFX9-NEXT:    v_pk_min_f16 v0, v2, s9
+; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
     <4 x half> addrspace(1)* %r,
     <4 x half> addrspace(1)* %b) {
 entry:
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
index bcab550f6c749abfda10140852ebae7be2645c9c..5913e7275e5246c1d8bb01c39df3706edcaff996 100644
--- a/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -96,8 +96,8 @@ entry:
 ; GFX9-NOT: m0
 ; SICIVI: s_mov_b32 m0
 
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 
 
 ; EG: LDS_READ_RET
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
index 928eecaae02f21c893b50e01b37c2e1b5d27af47..790715cda728e27f575b5b36d1170a1420b5802d 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
 
 ; Allocate two stack slots of 2052 bytes each requiring a total of 4104 bytes.
 ; Extracting the last element of each does not fit into the offset field of
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index b2641cd4d2e4104947d61ce50f2de4fae4476cf7..f37b3a3637a4364591ee330f49c22806e59dd746 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -5,41 +5,45 @@
 
 ; OPT-LABEL: @break_loop(
 ; OPT: bb1:
-; OPT: call i64 @llvm.amdgcn.break(i64
+; OPT: icmp slt i32
 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT: load volatile
+; OPT: icmp slt i32
 ; OPT: xor i1 %cmp1
-; OPT: call i64 @llvm.amdgcn.if.break(
 ; OPT: br label %Flow
 
 ; OPT: Flow:
+; OPT: call i64 @llvm.amdgcn.if.break(
 ; OPT: call i1 @llvm.amdgcn.loop(i64
 ; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1
 
 ; OPT: bb9:
 ; OPT: call void @llvm.amdgcn.end.cf(i64
 
-; TODO: Can remove exec fixes in return block
 ; GCN-LABEL: {{^}}break_loop:
-; GCN: s_mov_b64 [[INITMASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN:      s_mov_b64         [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
-; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]]
-; GCN: v_cmp_lt_i32_e32 vcc, -1
-; GCN: s_and_b64 vcc, exec, vcc
-; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]
-
-; GCN: ; %bb.2: ; %bb4
-; GCN: buffer_load_dword
-; GCN: v_cmp_ge_i32_e32 vcc,
-; GCN: s_or_b64 [[MASK]], vcc, [[INITMASK]]
-
-; GCN: [[FLOW]]:
-; GCN: s_mov_b64 [[INITMASK]], [[MASK]]
-; GCN: s_andn2_b64 exec, exec, [[MASK]]
-; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]
+; GCN:      v_cmp_lt_i32_e32  vcc, -1
+; GCN:      s_and_b64         vcc, exec, vcc
+; GCN:      s_or_b64          [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
+; GCN:      s_cbranch_vccnz   [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: ; %bb4
+; GCN:      buffer_load_dword
+; GCN:      v_cmp_ge_i32_e32  vcc,
+; GCN:      s_andn2_b64       [[INNER_MASK]], [[INNER_MASK]], exec
+; GCN:      s_and_b64         [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN:      s_or_b64          [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
+
+; GCN: [[FLOW]]: ; %Flow
+; GCN:      s_and_b64         [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
+; GCN:      s_or_b64          [[TMP1]], [[TMP1]], [[OUTER_MASK]]
+; GCN:      s_mov_b64         [[OUTER_MASK]], [[TMP1]]
+; GCN:      s_andn2_b64       exec, exec, [[TMP1]]
+; GCN-NEXT: s_cbranch_execnz  [[LOOP_ENTRY]]
 
 ; GCN: ; %bb.4: ; %bb9
 ; GCN-NEXT: s_endpgm
@@ -66,25 +70,26 @@ bb9:
 
 ; OPT-LABEL: @undef_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
-; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 undef, i64 %phi.broken)
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
-; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
@@ -119,25 +124,26 @@ bb9:                                              ; preds = %Flow
 
 ; OPT-LABEL: @constexpr_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
-; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), i64 %phi.broken)
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
-; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
@@ -169,25 +175,26 @@ bb9:                                              ; preds = %Flow
 
 ; OPT-LABEL: @true_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
-; OPT: %0 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
-; OPT: br i1 %cmp0, label %bb4, label %Flow
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
 
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
-; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
@@ -219,7 +226,7 @@ bb9:                                              ; preds = %Flow
 
 ; OPT-LABEL: @false_phi_cond_break_loop(
 ; OPT: bb1:
-; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ]
 ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
 ; OPT-NOT: call
 ; OPT: br i1 %cmp0, label %bb4, label %Flow
@@ -227,17 +234,17 @@ bb9:                                              ; preds = %Flow
 ; OPT: bb4:
 ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
-; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
 ; OPT-NEXT: br label %Flow
 
 ; OPT: Flow:
-; OPT-NEXT: %loop.phi = phi i64 [ %0, %bb4 ], [ %phi.broken, %bb1 ]
 ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
-; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ]
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken)
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0)
 ; OPT-NEXT: br i1 %1, label %bb9, label %bb1
 
 ; OPT: bb9:                                              ; preds = %Flow
-; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0)
 ; OPT-NEXT: store volatile i32 7
 ; OPT-NEXT: ret void
 define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
diff --git a/test/CodeGen/AMDGPU/mad-mix-hi.ll b/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 53a00c240d3e064f3d1f6281e1ca56b2d61b2814..6c27690fb2b77bcda315f12ce8a724bf5db3dbf9 100644
--- a/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -49,9 +49,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
-; GFX9: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT: s_setpc_b64
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
@@ -66,9 +66,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
 }
 
 ; GCN-LABEL: {{^}}v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
-; GFX9: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT: s_setpc_b64
 define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
   %src0.ext = fpext half %src0 to float
diff --git a/test/CodeGen/AMDGPU/misched-killflags.mir b/test/CodeGen/AMDGPU/misched-killflags.mir
index 811ef0d13751522d554207fe03005fb9c2c9ff39..0c58042d5082a79f960b8a2e8a9dce992cbe61f0 100644
--- a/test/CodeGen/AMDGPU/misched-killflags.mir
+++ b/test/CodeGen/AMDGPU/misched-killflags.mir
@@ -26,20 +26,20 @@ body: |
     S_ENDPGM
 ...
 # CHECK-LABEL: name: func0
-# CHECK: $sgpr10 = S_MOV_B32 5
-# CHECK: $sgpr9 = S_MOV_B32 4
-# CHECK: $sgpr8 = S_MOV_B32 3
-# CHECK: $sgpr33 = S_MOV_B32 killed $sgpr7
+# CHECK-DAG: $sgpr10 = S_MOV_B32 5
+# CHECK-DAG: $sgpr9 = S_MOV_B32 4
+# CHECK-DAG: $sgpr8 = S_MOV_B32 3
+# CHECK-DAG: $sgpr33 = S_MOV_B32 killed $sgpr7
 # CHECK: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr8_sgpr9_sgpr10_sgpr11
+# CHECK: $sgpr32 = S_MOV_B32 $sgpr33
 # CHECK: BUNDLE implicit-def $sgpr6_sgpr7, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $scc {
 # CHECK:   $sgpr6_sgpr7 = S_GETPC_B64
 # CHECK:   $sgpr6 = S_ADD_U32 internal $sgpr6, 0, implicit-def $scc
 # CHECK:   $sgpr7 = S_ADDC_U32 internal $sgpr7, 0, implicit-def $scc, implicit internal $scc
 # CHECK: }
-# CHECK: $sgpr4 = S_MOV_B32 $sgpr33
+# CHECK: $sgpr4 = S_MOV_B32 killed $sgpr33
 # CHECK: $vgpr1 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
 # CHECK: $vgpr2 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11
 # CHECK: $vgpr3 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec
-# CHECK: $sgpr32 = S_MOV_B32 killed $sgpr33
 # CHECK: S_NOP 0, implicit killed $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3
 # CHECK: S_ENDPGM
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 9e6efc565e44ea06ac4fb51e5a0b4a57ed5ac61b..679fd7c987038ba7060e19c0ec5811923ad245d9 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -59,31 +59,48 @@
 
 
 ; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
-; GCN: v_cmp_lt_i32_e32 vcc, 1
-; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 
+; GCN:      s_mov_b64           [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
+; GCN:      v_cmp_lt_i32_e32    vcc, 1,
+; GCN:      s_mov_b64           [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
+; GCN:      s_and_saveexec_b64
+; GCN:      s_xor_b64
+
+; GCN: ; %LeafBlock1
+; GCN-NEXT: s_mov_b64           [[EXIT0]], exec
+; GCN-NEXT: v_cmp_ne_u32_e32    vcc, 2,
+; GCN-NEXT: s_and_b64           [[EXIT1]], vcc, exec
+
+; GCN: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64
+; GCN-NEXT: s_xor_b64
 
 ; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: ; %LeafBlock
+; GCN-DAG:  v_cmp_eq_u32_e32    vcc, 1,
+; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
+; GCN-DAG:  s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
+; GCN-DAG:  s_andn2_b64         [[EXIT1]], [[EXIT1]], exec
+; GCN-DAG:  s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG:  s_and_b64           [[TMP1]], [[TMP1]], exec
+; GCN-DAG:  s_or_b64            [[EXIT0]], [[EXIT0]], [[TMP0]]
+; GCN-DAG:  s_or_b64            [[EXIT1]], [[EXIT1]], [[TMP1]]
 
 ; GCN: ; %Flow4
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_or_b64            exec, exec,
+; GCN-NEXT: s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
+; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit1
-; GCN: ds_write_b32
+; GCN:      ds_write_b32
+; GCN:      s_andn2_b64         [[EXIT0]], [[EXIT0]], exec
 
-; GCN: %Flow5
-; GCN-NEXT: s_or_b64 exec, exec
-; GCN: v_cmp_ne_u32_e32 vcc, 0
-; GCN-NEXT: s_and_saveexec_b64
+; GCN: ; %Flow5
+; GCN-NEXT: s_or_b64            exec, exec,
+; GCN-NEXT; s_and_saveexec_b64  {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
 
 ; GCN: ; %exit0
-; GCN: buffer_store_dword
+; GCN:      buffer_store_dword
 
 ; GCN: ; %UnifiedReturnBlock
 ; GCN-NEXT: s_endpgm
@@ -312,13 +329,12 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
 ; IR: Flow2:
-; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
-; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
 
 ; IR: UnifiedReturnBlock:
-; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: %UnifiedRetVal = phi float [ 2.000000e+00, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
 ; IR: ret float %UnifiedRetVal
 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
 entry:
@@ -353,8 +369,8 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; GCN: {{^}}[[FLOW]]:
 ; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
 
-; GCN: v_mov_b32_e32 v0, 2.0
 ; GCN: s_or_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 2.0
 ; GCN-NOT: s_and_b64 exec, exec
 ; GCN: v_mov_b32_e32 v0, 1.0
 
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index 3f7df7b6005ef99d881223fc4da056a71e278a4e..4c1a769d599588529c2e7066212017450d19e1dd 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -10,41 +10,57 @@
 ;
 ; OPT: Flow:
 ;
-; Ensure two else.break calls, for both the inner and outer loops
+; Ensure two if.break calls, for both the inner and outer loops
 
-; OPT:        call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
-; OPT-NEXT:   call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
-; OPT-NEXT:   call void @llvm.amdgcn.end.cf
+; OPT:        call void @llvm.amdgcn.end.cf
+; OPT-NEXT:   call i64 @llvm.amdgcn.if.break(i1
+; OPT-NEXT:   call i1 @llvm.amdgcn.loop(i64
+; OPT-NEXT:   call i64 @llvm.amdgcn.if.break(i1
 ;
 ; OPT: Flow1:
 
 ; GCN-LABEL: {{^}}multi_else_break:
 
+; GCN: ; %main_body
+; GCN:      s_mov_b64           [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+; GCN:      s_mov_b64           [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
-
-; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
-; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
-
-; Ensure extra or eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: s_mov_b64
-; GCN-NEXT: s_and_b64 [[MASKED_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
-
-; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
-; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
-
-; Ensure copy is eliminated
-; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
-; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
-; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+; GCN:      s_or_b64            [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
+; GCN:      s_or_b64            [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
+; GCN:      s_and_saveexec_b64  [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+
+; FIXME: duplicate comparison
+; GCN: ; %ENDIF
+; GCN-DAG:  v_cmp_eq_u32_e32    vcc,
+; GCN-DAG:  v_cmp_ne_u32_e64    [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
+; GCN-DAG:  s_andn2_b64         [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN-DAG:  s_andn2_b64         [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN-DAG:  s_and_b64           [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN-DAG:  s_and_b64           [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
+; GCN-DAG:  s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
+; GCN-DAG:  s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
+
+; GCN: ; %Flow
+; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC]]
+; GCN:      s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
+; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
+; GCN:      s_cbranch_execnz    [[INNER_LOOP]]
+
+; GCN: ; %Flow2
+; GCN:      s_or_b64            exec, exec, [[TMP0]]
+; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
+; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
+; GCN:      s_cbranch_execnz    [[OUTER_LOOP]]
+
+; GCN: ; %IF
+; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
   br label %LOOP.outer
@@ -68,20 +84,46 @@ ENDIF:                                            ; preds = %LOOP
 }
 
 ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
-; OPT: llvm.amdgcn.break
-; OPT: llvm.amdgcn.loop
 ; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.loop
 ; OPT: llvm.amdgcn.if.break
 ; OPT: llvm.amdgcn.end.cf
 
 ; GCN-LABEL: {{^}}multi_if_break_loop:
-; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+; GCN:      s_mov_b64          [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+; GCN:      s_mov_b64          [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+
+; GCN: ; %LeafBlock1
+; GCN:      s_mov_b64
+; GCN:      s_mov_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+
+; GCN: ; %case1
+; GCN:      buffer_load_dword  [[LOAD2:v[0-9]+]],
+; GCN:      v_cmp_ge_i32_e32   vcc, {{v[0-9]+}}, [[LOAD2]]
+; GCN:      s_orn2_b64         [[BREAK]], vcc, exec
+
+; GCN: ; %Flow3
+; GCN:      s_branch           [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN:      s_mov_b64          [[BREAK]], -1{{$}}
+
+; GCN: [[FLOW]]: ; %Flow
+
+; GCN: ; %case0
+; GCN:      buffer_load_dword  [[LOAD1:v[0-9]+]],
+; GCN-DAG:  s_andn2_b64        [[BREAK]], [[BREAK]], exec
+; GCN-DAG:  v_cmp_ge_i32_e32   vcc, {{v[0-9]+}}, [[LOAD1]]
+; GCN-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
+; GCN:      s_or_b64           [[BREAK]], [[BREAK]], [[TMP]]
+
+; GCN: ; %Flow4
+; GCN:      s_and_b64          [[BREAK]], exec, [[BREAK]]
+; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT]]
+; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
+; GCN-NEXT: s_cbranch_execnz
 
-; Uses a copy intsead of an or
-; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
-; GCN: s_or_b64 [[BREAK_REG]], exec, [[BREAK_REG]]
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/nested-calls.ll b/test/CodeGen/AMDGPU/nested-calls.ll
index 462274c65e7f93fb0429d761157a647219d15429..7fbcb9706a891a5e4f5cfa4dcdb9828f4bc7373b 100644
--- a/test/CodeGen/AMDGPU/nested-calls.ll
+++ b/test/CodeGen/AMDGPU/nested-calls.ll
@@ -33,8 +33,8 @@ define void @test_func_call_external_void_func_i32_imm() #0 {
 ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x1400{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
+; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
 ; GCN: s_swappc_b64
 ; GCN: s_sub_u32 s32, s32, 0x1400{{$}}
 ; GCN: s_setpc_b64
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 8489a785310eda580c251c294cff67ef190e92a1..a007c965f949b0c2ee8eea1c08f6a75bf05d40bb 100644
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -10,7 +10,7 @@
 ; IR-LABEL: @reduced_nested_loop_conditions(
 
 ; IR: bb5:
-; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ]
+; IR-NEXT: %phi.broken = phi i64 [ %3, %bb10 ], [ 0, %bb ]
 ; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
 ; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1
 ; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7)
@@ -19,25 +19,23 @@
 ; IR-NEXT: br i1 %1, label %bb8, label %Flow
 
 ; IR: bb8:
-; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
 ; IR-NEXT: br label %bb13
 
 ; IR: bb10:
-; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ]
-; IR-NEXT: %tmp11 = phi i32 [ %5, %Flow ]
-; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; IR-NEXT: %tmp11 = phi i32 [ %6, %Flow ]
+; IR-NEXT: %tmp12 = phi i1 [ %5, %Flow ]
+; IR-NEXT: %3 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken)
+; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %3)
 ; IR-NEXT: br i1 %4, label %bb23, label %bb5
 
 ; IR: Flow:
-; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ]
-; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
-; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1)
+; IR-NEXT: %5 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ]
+; IR-NEXT: %6 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
 ; IR-NEXT: br label %bb10
 
 ; IR: bb13:
-; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ]
-; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ]
+; IR-NEXT: %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ]
 ; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32>
 ; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20
 
@@ -48,13 +46,12 @@
 ; IR-NEXT: br label %bb20
 
 ; IR: bb20:
-; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ]
-; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ]
 ; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
+; IR-NEXT: %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ]
 ; IR-NEXT: br label %bb9
 
 ; IR: bb23:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %3)
 ; IR-NEXT: ret void
 
 ; GCN-LABEL: {{^}}reduced_nested_loop_conditions:
@@ -125,7 +122,7 @@ bb23:                                             ; preds = %bb10
 
 ; IR: Flow3:
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %21)
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %14)
 ; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
 ; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
 ; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
@@ -147,25 +144,24 @@ bb23:                                             ; preds = %bb10
 ; IR-NEXT: %8 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp15)
 
 ; IR: Flow1:
-; IR-NEXT: %loop.phi = phi i64 [ %18, %bb21 ], [ %phi.broken, %bb14 ]
 ; IR-NEXT: %11 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %bb14 ]
 ; IR-NEXT: %12 = phi i32 [ %tmp10, %bb21 ], [ undef, %bb14 ]
-; IR-NEXT: %13 = phi i1 [ %17, %bb21 ], [ false, %bb14 ]
-; IR-NEXT: %14 = phi i1 [ false, %bb21 ], [ true, %bb14 ]
-; IR-NEXT: %15 = call i64 @llvm.amdgcn.else.break(i64 %10, i64 %loop.phi)
+; IR-NEXT: %13 = phi i1 [ %18, %bb21 ], [ true, %bb14 ]
+; IR-NEXT: %14 = phi i1 [ %18, %bb21 ], [ false, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ false, %bb21 ], [ true, %bb14 ]
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %10)
-; IR-NEXT: %16 = call i1 @llvm.amdgcn.loop(i64 %15)
-; IR-NEXT: br i1 %16, label %Flow2, label %bb14
+; IR-NEXT: %16 = call i64 @llvm.amdgcn.if.break(i1 %13, i64 %phi.broken)
+; IR-NEXT: %17 = call i1 @llvm.amdgcn.loop(i64 %16)
+; IR-NEXT: br i1 %17, label %Flow2, label %bb14
 
 ; IR: bb21:
 ; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %17 = xor i1 %tmp12, true
-; IR-NEXT: %18 = call i64 @llvm.amdgcn.if.break(i1 %17, i64 %phi.broken)
+; IR-NEXT: %18 = xor i1 %tmp12, true
 ; IR-NEXT: br label %Flow1
 
 ; IR: Flow2:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %15)
-; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %14)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
 ; IR-NEXT: %20 = extractvalue { i1, i64 } %19, 0
 ; IR-NEXT: %21 = extractvalue { i1, i64 } %19, 1
 ; IR-NEXT: br i1 %20, label %bb31.loopexit, label %Flow3
diff --git a/test/CodeGen/AMDGPU/noop-shader-O0.ll b/test/CodeGen/AMDGPU/noop-shader-O0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..af47170a4a9c968c3d54e76e58704a6979e5f7ad
--- /dev/null
+++ b/test/CodeGen/AMDGPU/noop-shader-O0.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Ensure NOOP shaders compile at OptNone.
+
+; Confirm registers reserved in SIMachineFunctionInfo are those expected during
+; lowering, even when e.g. spilling is required due to being at OptNone.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+target triple = "amdgcn-amd-amdpal"
+
+define amdgpu_vs void @noop_vs() {
+; GCN-LABEL: noop_vs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_ls void @noop_ls() {
+; GCN-LABEL: noop_ls:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_hs void @noop_hs() {
+; GCN-LABEL: noop_hs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_es void @noop_es() {
+; GCN-LABEL: noop_es:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_gs void @noop_gs() {
+; GCN-LABEL: noop_gs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_ps void @noop_ps() {
+; GCN-LABEL: noop_ps:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define amdgpu_cs void @noop_cs() {
+; GCN-LABEL: noop_cs:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_endpgm
+entry:
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
index d31d636cc41ac119efacc48e8ae5f1558db7002c..a38bacd97a67efd830d4a95a29c2e3b779105f73 100644
--- a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
+++ b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -82,95 +82,95 @@
 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
 
 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
-; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
-; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
-; GCN-NEXT: v_writelane_b32 v0, s9, 53
-; GCN-NEXT: v_writelane_b32 v0, s10, 54
-; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
-
-; GCN-NEXT: v_writelane_b32 v0, s84, 56
-; GCN-NEXT: v_writelane_b32 v0, s85, 57
-; GCN-NEXT: v_writelane_b32 v0, s86, 58
-; GCN-NEXT: v_writelane_b32 v0, s87, 59
-; GCN-NEXT: v_writelane_b32 v0, s88, 60
-; GCN-NEXT: v_writelane_b32 v0, s89, 61
-; GCN-NEXT: v_writelane_b32 v0, s90, 62
-; GCN-NEXT: v_writelane_b32 v0, s91, 63
-; GCN-NEXT: v_writelane_b32 v1, s12, 0
-; GCN-NEXT: v_writelane_b32 v1, s13, 1
-; GCN-NEXT: v_writelane_b32 v1, s14, 2
-; GCN-NEXT: v_writelane_b32 v1, s15, 3
-; GCN-NEXT: v_writelane_b32 v1, s16, 4
-; GCN-NEXT: v_writelane_b32 v1, s17, 5
-; GCN-NEXT: v_writelane_b32 v1, s18, 6
-; GCN-NEXT: v_writelane_b32 v1, s19, 7
-; GCN-NEXT: v_writelane_b32 v1, s20, 8
-; GCN-NEXT: v_writelane_b32 v1, s21, 9
-; GCN-NEXT: v_writelane_b32 v1, s22, 10
-; GCN-NEXT: v_writelane_b32 v1, s23, 11
-; GCN-NEXT: v_writelane_b32 v1, s24, 12
-; GCN-NEXT: v_writelane_b32 v1, s25, 13
-; GCN-NEXT: v_writelane_b32 v1, s26, 14
-; GCN-NEXT: v_writelane_b32 v1, s27, 15
-; GCN-NEXT: v_writelane_b32 v1, s28, 16
-; GCN-NEXT: v_writelane_b32 v1, s29, 17
-; GCN-NEXT: v_writelane_b32 v1, s30, 18
-; GCN-NEXT: v_writelane_b32 v1, s31, 19
-; GCN-NEXT: v_writelane_b32 v1, s32, 20
-; GCN-NEXT: v_writelane_b32 v1, s33, 21
-; GCN-NEXT: v_writelane_b32 v1, s34, 22
-; GCN-NEXT: v_writelane_b32 v1, s35, 23
-; GCN-NEXT: v_writelane_b32 v1, s36, 24
-; GCN-NEXT: v_writelane_b32 v1, s37, 25
-; GCN-NEXT: v_writelane_b32 v1, s38, 26
-; GCN-NEXT: v_writelane_b32 v1, s39, 27
-; GCN-NEXT: v_writelane_b32 v1, s40, 28
-; GCN-NEXT: v_writelane_b32 v1, s41, 29
-; GCN-NEXT: v_writelane_b32 v1, s42, 30
-; GCN-NEXT: v_writelane_b32 v1, s43, 31
-; GCN-NEXT: v_writelane_b32 v1, s44, 32
-; GCN-NEXT: v_writelane_b32 v1, s45, 33
-; GCN-NEXT: v_writelane_b32 v1, s46, 34
-; GCN-NEXT: v_writelane_b32 v1, s47, 35
-; GCN-NEXT: v_writelane_b32 v1, s48, 36
-; GCN-NEXT: v_writelane_b32 v1, s49, 37
-; GCN-NEXT: v_writelane_b32 v1, s50, 38
-; GCN-NEXT: v_writelane_b32 v1, s51, 39
-; GCN-NEXT: v_writelane_b32 v1, s52, 40
-; GCN-NEXT: v_writelane_b32 v1, s53, 41
-; GCN-NEXT: v_writelane_b32 v1, s54, 42
-; GCN-NEXT: v_writelane_b32 v1, s55, 43
-; GCN-NEXT: v_writelane_b32 v1, s56, 44
-; GCN-NEXT: v_writelane_b32 v1, s57, 45
-; GCN-NEXT: v_writelane_b32 v1, s58, 46
-; GCN-NEXT: v_writelane_b32 v1, s59, 47
-; GCN-NEXT: v_writelane_b32 v1, s60, 48
-; GCN-NEXT: v_writelane_b32 v1, s61, 49
-; GCN-NEXT: v_writelane_b32 v1, s62, 50
-; GCN-NEXT: v_writelane_b32 v1, s63, 51
-; GCN-NEXT: v_writelane_b32 v1, s64, 52
-; GCN-NEXT: v_writelane_b32 v1, s65, 53
-; GCN-NEXT: v_writelane_b32 v1, s66, 54
-; GCN-NEXT: v_writelane_b32 v1, s67, 55
-; GCN-NEXT: v_writelane_b32 v1, s68, 56
-; GCN-NEXT: v_writelane_b32 v1, s69, 57
-; GCN-NEXT: v_writelane_b32 v1, s70, 58
-; GCN-NEXT: v_writelane_b32 v1, s71, 59
-; GCN-NEXT: v_writelane_b32 v1, s72, 60
-; GCN-NEXT: v_writelane_b32 v1, s73, 61
-; GCN-NEXT: v_writelane_b32 v1, s74, 62
-; GCN-NEXT: v_writelane_b32 v1, s75, 63
-; GCN-NEXT: v_writelane_b32 v2, s76, 0
-; GCN-NEXT: v_writelane_b32 v2, s77, 1
-; GCN-NEXT: v_writelane_b32 v2, s78, 2
-; GCN-NEXT: v_writelane_b32 v2, s79, 3
-; GCN-NEXT: v_writelane_b32 v2, s80, 4
-; GCN-NEXT: v_writelane_b32 v2, s81, 5
-; GCN-NEXT: v_writelane_b32 v2, s82, 6
-; GCN-NEXT: v_writelane_b32 v2, s83, 7
+; GCN: v_writelane_b32 v0, s12, 48
+; GCN-NEXT: v_writelane_b32 v0, s13, 49
+; GCN-NEXT: v_writelane_b32 v0, s14, 50
+; GCN-NEXT: v_writelane_b32 v0, s15, 51
+; GCN-NEXT: v_writelane_b32 v0, s16, 52
+; GCN-NEXT: v_writelane_b32 v0, s17, 53
+; GCN-NEXT: v_writelane_b32 v0, s18, 54
+; GCN-NEXT: v_writelane_b32 v0, s19, 55
+
+; GCN-NEXT: v_writelane_b32 v0, s20, 56
+; GCN-NEXT: v_writelane_b32 v0, s21, 57
+; GCN-NEXT: v_writelane_b32 v0, s22, 58
+; GCN-NEXT: v_writelane_b32 v0, s23, 59
+; GCN-NEXT: v_writelane_b32 v0, s24, 60
+; GCN-NEXT: v_writelane_b32 v0, s25, 61
+; GCN-NEXT: v_writelane_b32 v0, s26, 62
+; GCN-NEXT: v_writelane_b32 v0, s27, 63
+; GCN-NEXT: v_writelane_b32 v1, s28, 0
+; GCN-NEXT: v_writelane_b32 v1, s29, 1
+; GCN-NEXT: v_writelane_b32 v1, s30, 2
+; GCN-NEXT: v_writelane_b32 v1, s31, 3
+; GCN-NEXT: v_writelane_b32 v1, s32, 4
+; GCN-NEXT: v_writelane_b32 v1, s33, 5
+; GCN-NEXT: v_writelane_b32 v1, s34, 6
+; GCN-NEXT: v_writelane_b32 v1, s35, 7
+; GCN-NEXT: v_writelane_b32 v1, s36, 8
+; GCN-NEXT: v_writelane_b32 v1, s37, 9
+; GCN-NEXT: v_writelane_b32 v1, s38, 10
+; GCN-NEXT: v_writelane_b32 v1, s39, 11
+; GCN-NEXT: v_writelane_b32 v1, s40, 12
+; GCN-NEXT: v_writelane_b32 v1, s41, 13
+; GCN-NEXT: v_writelane_b32 v1, s42, 14
+; GCN-NEXT: v_writelane_b32 v1, s43, 15
+; GCN-NEXT: v_writelane_b32 v1, s44, 16
+; GCN-NEXT: v_writelane_b32 v1, s45, 17
+; GCN-NEXT: v_writelane_b32 v1, s46, 18
+; GCN-NEXT: v_writelane_b32 v1, s47, 19
+; GCN-NEXT: v_writelane_b32 v1, s48, 20
+; GCN-NEXT: v_writelane_b32 v1, s49, 21
+; GCN-NEXT: v_writelane_b32 v1, s50, 22
+; GCN-NEXT: v_writelane_b32 v1, s51, 23
+; GCN-NEXT: v_writelane_b32 v1, s52, 24
+; GCN-NEXT: v_writelane_b32 v1, s53, 25
+; GCN-NEXT: v_writelane_b32 v1, s54, 26
+; GCN-NEXT: v_writelane_b32 v1, s55, 27
+; GCN-NEXT: v_writelane_b32 v1, s56, 28
+; GCN-NEXT: v_writelane_b32 v1, s57, 29
+; GCN-NEXT: v_writelane_b32 v1, s58, 30
+; GCN-NEXT: v_writelane_b32 v1, s59, 31
+; GCN-NEXT: v_writelane_b32 v1, s60, 32
+; GCN-NEXT: v_writelane_b32 v1, s61, 33
+; GCN-NEXT: v_writelane_b32 v1, s62, 34
+; GCN-NEXT: v_writelane_b32 v1, s63, 35
+; GCN-NEXT: v_writelane_b32 v1, s64, 36
+; GCN-NEXT: v_writelane_b32 v1, s65, 37
+; GCN-NEXT: v_writelane_b32 v1, s66, 38
+; GCN-NEXT: v_writelane_b32 v1, s67, 39
+; GCN-NEXT: v_writelane_b32 v1, s68, 40
+; GCN-NEXT: v_writelane_b32 v1, s69, 41
+; GCN-NEXT: v_writelane_b32 v1, s70, 42
+; GCN-NEXT: v_writelane_b32 v1, s71, 43
+; GCN-NEXT: v_writelane_b32 v1, s72, 44
+; GCN-NEXT: v_writelane_b32 v1, s73, 45
+; GCN-NEXT: v_writelane_b32 v1, s74, 46
+; GCN-NEXT: v_writelane_b32 v1, s75, 47
+; GCN-NEXT: v_writelane_b32 v1, s76, 48
+; GCN-NEXT: v_writelane_b32 v1, s77, 49
+; GCN-NEXT: v_writelane_b32 v1, s78, 50
+; GCN-NEXT: v_writelane_b32 v1, s79, 51
+; GCN-NEXT: v_writelane_b32 v1, s80, 52
+; GCN-NEXT: v_writelane_b32 v1, s81, 53
+; GCN-NEXT: v_writelane_b32 v1, s82, 54
+; GCN-NEXT: v_writelane_b32 v1, s83, 55
+; GCN-NEXT: v_writelane_b32 v1, s84, 56
+; GCN-NEXT: v_writelane_b32 v1, s85, 57
+; GCN-NEXT: v_writelane_b32 v1, s86, 58
+; GCN-NEXT: v_writelane_b32 v1, s87, 59
+; GCN-NEXT: v_writelane_b32 v1, s88, 60
+; GCN-NEXT: v_writelane_b32 v1, s89, 61
+; GCN-NEXT: v_writelane_b32 v1, s90, 62
+; GCN-NEXT: v_writelane_b32 v1, s91, 63
+; GCN-NEXT: v_writelane_b32 v2, s4, 0
+; GCN-NEXT: v_writelane_b32 v2, s5, 1
+; GCN-NEXT: v_writelane_b32 v2, s6, 2
+; GCN-NEXT: v_writelane_b32 v2, s7, 3
+; GCN-NEXT: v_writelane_b32 v2, s8, 4
+; GCN-NEXT: v_writelane_b32 v2, s9, 5
+; GCN-NEXT: v_writelane_b32 v2, s10, 6
+; GCN-NEXT: v_writelane_b32 v2, s11, 7
 ; GCN: s_cbranch_scc1
 
 
@@ -184,6 +184,25 @@
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 48
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 55
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 56
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 63
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 ; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 0
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 1
@@ -265,26 +284,6 @@
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 63
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
-; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
-
-; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 56
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 63
-; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
-
 ; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 8
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 9
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 10
@@ -335,14 +334,14 @@
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 47
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 48
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 55
+; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 {
   %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -396,39 +395,39 @@ ret:
 ; GCN: def s[4:19]
 ; GCN: def s[20:35]
 
-; GCN: v_writelane_b32 v0, s4, 50
-; GCN-NEXT: v_writelane_b32 v0, s5, 51
-; GCN-NEXT: v_writelane_b32 v0, s6, 52
-; GCN-NEXT: v_writelane_b32 v0, s7, 53
-; GCN-NEXT: v_writelane_b32 v0, s8, 54
-; GCN-NEXT: v_writelane_b32 v0, s9, 55
-; GCN-NEXT: v_writelane_b32 v0, s10, 56
-; GCN-NEXT: v_writelane_b32 v0, s11, 57
-; GCN-NEXT: v_writelane_b32 v0, s12, 58
-; GCN-NEXT: v_writelane_b32 v0, s13, 59
-; GCN-NEXT: v_writelane_b32 v0, s14, 60
-; GCN-NEXT: v_writelane_b32 v0, s15, 61
-; GCN-NEXT: v_writelane_b32 v0, s16, 62
-; GCN-NEXT: v_writelane_b32 v0, s17, 63
-; GCN-NEXT: v_writelane_b32 v1, s18, 0
-; GCN-NEXT: v_writelane_b32 v1, s19, 1
-
-; GCN: v_readlane_b32 s4, v0, 50
-; GCN-NEXT: v_readlane_b32 s5, v0, 51
-; GCN-NEXT: v_readlane_b32 s6, v0, 52
-; GCN-NEXT: v_readlane_b32 s7, v0, 53
-; GCN-NEXT: v_readlane_b32 s8, v0, 54
-; GCN-NEXT: v_readlane_b32 s9, v0, 55
-; GCN-NEXT: v_readlane_b32 s10, v0, 56
-; GCN-NEXT: v_readlane_b32 s11, v0, 57
-; GCN-NEXT: v_readlane_b32 s12, v0, 58
-; GCN-NEXT: v_readlane_b32 s13, v0, 59
-; GCN-NEXT: v_readlane_b32 s14, v0, 60
-; GCN-NEXT: v_readlane_b32 s15, v0, 61
-; GCN-NEXT: v_readlane_b32 s16, v0, 62
-; GCN-NEXT: v_readlane_b32 s17, v0, 63
-; GCN-NEXT: v_readlane_b32 s18, v1, 0
-; GCN-NEXT: v_readlane_b32 s19, v1, 1
+; GCN: v_writelane_b32 v0, s4, 48
+; GCN-NEXT: v_writelane_b32 v0, s5, 49
+; GCN-NEXT: v_writelane_b32 v0, s6, 50
+; GCN-NEXT: v_writelane_b32 v0, s7, 51
+; GCN-NEXT: v_writelane_b32 v0, s8, 52
+; GCN-NEXT: v_writelane_b32 v0, s9, 53
+; GCN-NEXT: v_writelane_b32 v0, s10, 54
+; GCN-NEXT: v_writelane_b32 v0, s11, 55
+; GCN-NEXT: v_writelane_b32 v0, s12, 56
+; GCN-NEXT: v_writelane_b32 v0, s13, 57
+; GCN-NEXT: v_writelane_b32 v0, s14, 58
+; GCN-NEXT: v_writelane_b32 v0, s15, 59
+; GCN-NEXT: v_writelane_b32 v0, s16, 60
+; GCN-NEXT: v_writelane_b32 v0, s17, 61
+; GCN-NEXT: v_writelane_b32 v0, s18, 62
+; GCN-NEXT: v_writelane_b32 v0, s19, 63
+
+; GCN: v_readlane_b32 s4, v0, 48
+; GCN-NEXT: v_readlane_b32 s5, v0, 49
+; GCN-NEXT: v_readlane_b32 s6, v0, 50
+; GCN-NEXT: v_readlane_b32 s7, v0, 51
+; GCN-NEXT: v_readlane_b32 s8, v0, 52
+; GCN-NEXT: v_readlane_b32 s9, v0, 53
+; GCN-NEXT: v_readlane_b32 s10, v0, 54
+; GCN-NEXT: v_readlane_b32 s11, v0, 55
+; GCN-NEXT: v_readlane_b32 s12, v0, 56
+; GCN-NEXT: v_readlane_b32 s13, v0, 57
+; GCN-NEXT: v_readlane_b32 s14, v0, 58
+; GCN-NEXT: v_readlane_b32 s15, v0, 59
+; GCN-NEXT: v_readlane_b32 s16, v0, 60
+; GCN-NEXT: v_readlane_b32 s17, v0, 61
+; GCN-NEXT: v_readlane_b32 s18, v0, 62
+; GCN-NEXT: v_readlane_b32 s19, v0, 63
 define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 {
   %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
   %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@@ -493,8 +492,8 @@ ret:
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 31
 
 ; GCN: def s[0:1]
-; GCN:      v_writelane_b32 v23, s0, 32
-; GCN-NEXT: v_writelane_b32 v23, s1, 33
+; GCN:      v_writelane_b32 v23, s20, 32
+; GCN-NEXT: v_writelane_b32 v23, s21, 33
 
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 34
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 35
@@ -513,20 +512,6 @@ ret:
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 48
 ; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49
 
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: s_cbranch_scc1
@@ -551,7 +536,9 @@ ret:
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 
-; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 34
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 34
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37
@@ -564,9 +551,7 @@ ret:
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45
 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 47
-; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 48
-; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 49
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 47
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
 ; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16
@@ -587,25 +572,10 @@ ret:
 ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31
 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
 
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
 
-; GCN: v_readlane_b32 s0, v23, 32
-; GCN: v_readlane_b32 s1, v23, 33
+; GCN: v_readfirstlane_b32 s1, v0
 ; GCN: ;;#ASMSTART
 ; GCN: ; use s[0:1]
 define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 {
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 19e89ce97a98bb2185afdfa6e1ce6ef095a4e3e8..5d8863f43377f07f33212baf91f4940395ee408b 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -1,8 +1,4 @@
-; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck %s
-
-; FIXME: Error is misleading because it's not an indirect call.
-
-; CHECK: error: <unknown>:0:0: in function crash_call_constexpr_cast void (): unsupported indirect call to function foo
+; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck %s
 
 ; Make sure that AMDGPUPromoteAlloca doesn't crash if the called
 ; function is a constantexpr cast of a function.
@@ -10,14 +6,18 @@
 declare void @foo(float addrspace(5)*) #0
 declare void @foo.varargs(...) #0
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
+; CHECK-LABEL: @crash_call_constexpr_cast(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (float addrspace(5)*)* @foo to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
   ret void
 }
 
-; XCHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
+; CHECK-LABEL: @crash_call_constexpr_cast_varargs(
+; CHECK: alloca
+; CHECK: call void
 define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32, addrspace(5)
   call void bitcast (void (...)* @foo.varargs to void (i32 addrspace(5)*)*)(i32 addrspace(5)* %alloca) #0
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index ebef612299054b487bbfb5432ce3204ee563e274..8d12a725594a2c586a5794170a9e2d0790322580 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -1,8 +1,11 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca -disable-promote-alloca-to-lds< %s | FileCheck -check-prefix=NOLDS %s
 
 ; This normally would be fixed by instcombine to be compare to the GEP
 ; indices
 
+; NOLDS-NOT: addrspace(3)
+
 ; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer(
 ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_same_derived_pointer.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
diff --git a/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
new file mode 100644
index 0000000000000000000000000000000000000000..80112160412c1b96f1e005a788b74f1db64e3335
--- /dev/null
+++ b/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -0,0 +1,189 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+
+; GCN-LABEL: {{^}}float4_alloca_store4:
+; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
+
+; GFX-NOT: buffer_
+; GCN:  v_readfirstlane_b32
+; GFX8: v_movrels_b32
+; GFX9: s_set_gpr_idx_on
+; GFX9: s_set_gpr_idx_off
+
+; OPT:  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
+; OPT:  %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
+; OPT:  store float %1, float addrspace(1)* %out, align 4
+
+define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4
+  %load = load float, float addrspace(5)* %gep, align 4
+  store float %load, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}float4_alloca_load4:
+; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4
+
+; GFX-NOT: buffer_
+; GCN:  v_readfirstlane_b32
+; GFX8: v_movreld_b32
+; GFX9: s_set_gpr_idx_on
+; GFX9: s_set_gpr_idx_off
+
+; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
+; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
+; OPT:  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+
+define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store float 1.0, float addrspace(5)* %gep, align 4
+  %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
+  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_alloca_store4:
+; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
+; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
+
+; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
+; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
+; OPT: store half %1, half addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x half>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2
+  %load = load half, half addrspace(5)* %gep, align 2
+  store half %load, half addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_alloca_load4:
+; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
+
+; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
+; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
+; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x half>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store half 1.0, half addrspace(5)* %gep, align 4
+  %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
+  store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}short4_alloca_store4:
+; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
+; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
+
+; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
+; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
+; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x i16>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
+  %load = load i16, i16 addrspace(5)* %gep, align 2
+  store i16 %load, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}short4_alloca_load4:
+; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4
+
+; GFX-NOT: buffer_
+; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
+; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
+
+; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
+; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
+; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
+; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+
+define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+entry:
+  %alloca = alloca <4 x i16>, align 16, addrspace(5)
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %c1 = icmp uge i32 %x, 3
+  %c2 = icmp uge i32 %y, 3
+  %sel1 = select i1 %c1, i32 1, i32 2
+  %sel2 = select i1 %c2, i32 0, i32 %sel1
+  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
+  store i16 1, i16 addrspace(5)* %gep, align 4
+  %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
+  store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
diff --git a/test/CodeGen/AMDGPU/reduction.ll b/test/CodeGen/AMDGPU/reduction.ll
index 74ca4a668f93865c444c0d16f372b628af515c1f..0c605f79d980bfe6c45606f388753710e11203ff 100644
--- a/test/CodeGen/AMDGPU/reduction.ll
+++ b/test/CodeGen/AMDGPU/reduction.ll
@@ -434,12 +434,23 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
-; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-
-; VI:      v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -451,12 +462,24 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}reduction_minnum_v4f16:
-; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
 
-; VI:      v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_minnum_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -467,13 +490,36 @@ entry:
   ret half %res
 }
 
+; FIXME: Need to preserve fast math flags when fmaxnum matched
+; directly from the IR to avoid unnecessary quieting.
+
 ; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
-; GFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; XVI: s_waitcnt
+; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v1
+; XVI-NEXT: v_max_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
 
-; VI:      v_max_f16_sdwa
-; VI-NEXT: v_max_f16_e32
-; VI-NEXT: v_max_f16_e32
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
+
+; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -486,13 +532,37 @@ entry:
   ret half %res
 }
 
+; FIXME: Need to preserve fast math flags when fmaxnum matched
+; directly from the IR to avoid unnecessary quieting.
+
 ; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
-; GFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
-; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; XGFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+; XVI: s_waitcnt
+; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v1
+; XVI-NEXT: v_min_f16_e32 v0, v0, v2
+; XVI-NEXT: s_setpc_b64
+
+; GFX9: s_waitcnt
+; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
+; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
+; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
+
+; FIXME: Extra canonicalize leftover
+; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]
+
+
+; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
 
-; VI:      v_min_f16_sdwa
-; VI-NEXT: v_min_f16_e32
-; VI-NEXT: v_min_f16_e32
+; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
+; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
+; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
 define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
 entry:
   %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
index 693b7d827c56d2efa67e426edaca61a6cad5b640..d69bbda463c9d683cf63ea926902147c4ca10765 100644
--- a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
+++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -5,7 +5,7 @@
 # a slot index.
 
 # CHECK: %13.sub2:sgpr_128 = S_MOV_B32 0
-# CHECK: DBG_VALUE{{.*}}debug-use %13.sub2
+# CHECK: DBG_VALUE{{.*}} %13.sub2
 
 --- |
   define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void }
diff --git a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index 182096305f40432d41de4a27fd1024de283428d8..3d52f5aad04c2561c5bdfb27a0c774f5f99a8cd6 100644
--- a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -169,7 +169,7 @@
 ---
 
 # CHECK: name: sched_dbg_value_crash
-# CHECK: DBG_VALUE debug-use %99, debug-use $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
+# CHECK: DBG_VALUE %99, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
 
 name:            sched_dbg_value_crash
 alignment:       0
@@ -319,7 +319,7 @@ body:             |
     %124:vgpr_32 = IMPLICIT_DEF
     %125:vgpr_32 = IMPLICIT_DEF
     %126:vgpr_32 = IMPLICIT_DEF
-    DBG_VALUE debug-use %103, debug-use _, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
+    DBG_VALUE %103, _, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8
     ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32
     %127:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc
     $sgpr4 = COPY $sgpr101
diff --git a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index ff3d0fc9bfa06e9818ca85bf7603ba12c027b10d..96ebb6f83628daa80131f2a21d68eed680ab1635 100644
--- a/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare void @llvm.amdgcn.s.barrier() nounwind convergent
 
diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll
index e4405900840a0a9a297c29bcee1bf8ea11c8371a..b2781a77811b853240c0c741639efa024bab0f02 100644
--- a/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
 
 ; This used to fail due to a v_add_i32 instruction with an illegal immediate
 ; operand that was created during Local Stack Slot Allocation. Test case derived
diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll
index 33028f17531361d210054dbf907494e86b706c2a..f773357976cce0246658dbe1d2d3dcbcfec47a43 100644
--- a/test/CodeGen/AMDGPU/select-opt.ll
+++ b/test/CodeGen/AMDGPU/select-opt.ll
@@ -137,7 +137,6 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, flo
 ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
 ; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
-; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
 define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/selected-stack-object.ll b/test/CodeGen/AMDGPU/selected-stack-object.ll
deleted file mode 100644
index 50ca59ace94e3c1091eb9a4bf4b8500e7ed8da3d..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/selected-stack-object.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; "Assertion failure" should be caught with both XFAIL * and +Asserts.
-; XFAIL: *
-; REQUIRES: asserts
-
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-
-; See also local-stack-slot-bug.ll
-; This fails because a stack object is created during instruction selection.
-
-; CHECK-LABEL: {{^}}main:
-define amdgpu_ps float @main(i32 %idx) {
-main_body:
-  %v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
-  ret float %v1
-}
diff --git a/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ae50d4f18c446bf5cb889e263a58091fda1a5f9f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
+define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte:
+; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
+define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %and = and i32 %load, 8
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: const_load_shrink_dword_to_unaligned_byte:
+; GCN: global_load_ushort
+define amdgpu_kernel void @const_load_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
+  %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
+  %load = load i32, i32 addrspace(4)* %ptr, align 2
+  %and = and i32 %load, 524288
+  %cmp = icmp eq i32 %and, 0
+  %sel = select i1 %cmp, i32 0, i32 -1
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index 04df33b8dd4802f9dd9e9b974e4c77ec9618bfd1..3db6fd2d898b05e8b6d8e5f8b5b8841e5d2b9d91 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -100,22 +100,22 @@ endif:
   ret void
 }
 
-; FIXME: Should write to different SGPR pairs instead of copying to
-; VALU for i1 phi.
-
 ; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
-
-; SI: BB{{[0-9]+}}_2:
-; SI: buffer_load_dword [[AVAL:v[0-9]+]]
-; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
-; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
-
-; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
-; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
-; SI: buffer_store_dword [[RESULT]]
+
+; SI: ; %else
+; SI:      buffer_load_dword  [[AVAL:v[0-9]+]]
+; SI:      v_cmp_gt_i32_e64   [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
+
+; SI: ; %if
+; SI:      buffer_load_dword  [[AVAL:v[0-9]+]]
+; SI:      v_cmp_eq_u32_e32   [[CMP_ELSE:vcc]], 0, [[AVAL]]
+; SI-DAG:  s_andn2_b64        [[PHI]], [[PHI]], exec
+; SI-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
+; SI:      s_or_b64           [[PHI]], [[PHI]], [[TMP]]
+
+; SI: ; %endif
+; SI:      v_cndmask_b32_e64  [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
+; SI:      buffer_store_dword [[RESULT]],
 define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 73e56593ce8bb41b3bb09e796d34e38e8e3a3afa..6215a486a3683d350cb9cf04dd256b59a99cda7a 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
@@ -27,18 +27,23 @@ ENDIF:
 
 
 ; FUNC-LABEL: {{^}}phi_cond_outside_loop:
-; FIXME: This could be folded into the s_or_b64 instruction
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
-; SI: [[LOOP_LABEL:[A-Z0-9]+]]
-; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
-; SI_IF_BREAK instruction:
-; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]
+; SI:     s_mov_b64         [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
+; SI:     s_mov_b64         [[PHI:s\[[0-9]+:[0-9]+\]]], 0
 
-; SI_LOOP instruction:
-; SI: s_andn2_b64 exec, exec, [[BREAK]]
-; SI: s_cbranch_execnz [[LOOP_LABEL]]
-; SI: s_endpgm
+; SI: ; %else
+; SI:     v_cmp_eq_u32_e64  [[TMP:s\[[0-9]+:[0-9]+\]]],
+; SI:     s_and_b64         [[PHI]], [[TMP]], exec
+
+; SI: ; %endif
+
+; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
+; SI:     s_mov_b64         [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; SI:     s_and_b64         [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
+; SI:     s_or_b64          [[LEFT]], [[TMP1]], [[TMP]]
+; SI:     s_andn2_b64       exec, exec, [[LEFT]]
+; SI:     s_cbranch_execnz  [[LOOP_LABEL]]
+; SI:     s_endpgm
 
 define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
 entry:
@@ -90,19 +95,21 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; This broke the old AMDIL cfg structurizer
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
 ; SI:      v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
-; SI:      s_and_b64 vcc, exec, [[CMP4]]
-; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]
-; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]
-; SI-NEXT: BB{{[0-9_]+}}:
-; SI-NEXT: buffer_store_dword
+; SI:      s_and_b64        [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
+; SI:      s_mov_b64        vcc, [[CMP4M]]
+; SI-NEXT: s_cbranch_vccnz  [[CONVEX_EXIT:BB[0-9_]+]]
+; SI-NEXT: s_branch         [[FOR_COND_PREHDR:BB[0-9_]+]]
+
+; SI: ; %if.else
+; SI:      buffer_store_dword
 
 ; SI:      [[INFLOOP:BB[0-9]+_[0-9]+]]:
 
-; SI:      [[BR1]]:
-; SI-NEXT: s_and_b64 vcc, exec,
-; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
+; SI:      [[CONVEX_EXIT]]:
+; SI:      s_mov_b64        vcc,
+; SI-NEXT: s_cbranch_vccnz  [[ENDPGM:BB[0-9]+_[0-9]+]]
 ; SI:      s_branch [[INFLOOP]]
-; SI-NEXT: [[BR2]]:
+; SI-NEXT: [[FOR_COND_PREHDR]]:
 ; SI:      s_cbranch_vccz [[ENDPGM]]
 
 ; SI:      [[ENDPGM]]:
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index 683c6695322742b70de6ab2b8516b11ab264895b..c4964e68e28009386d25c0e22d01b6067afdf8ff 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling,-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 612943b66c4aa1e3dc8c7d81f02edfadab6d6d0e..c87145a1a5ba7a740dcc8be4a03ad6ad7893bcfe 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -495,6 +495,52 @@ main_body:
   ret void
 }
 
+; SMRD load with a non-const non-uniform offset of > 4 dwords (requires splitting)
+; GCN-LABEL: {{^}}smrd_load_nonconst3:
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst3(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst4:
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0xff8, v0 ;
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0xff8, v0 ;
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst4(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %off.2 = add i32 %off, 4088
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
+; GCN-LABEL: {{^}}smrd_load_nonconst5:
+; SICIVI: v_add_{{i32|u32}}_e32 v{{[0-9]+}}, vcc, 0x1004, v0
+; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x1004, v0
+; GCN-DAG: buffer_load_dwordx4 v[0:3], v{{[0-9]+}}, s[0:3], 0 offen ;
+; GCN-DAG: buffer_load_dwordx4 v[4:7], v{{[0-9]+}}, s[0:3], 0 offen offset:16 ;
+; GCN-DAG: buffer_load_dwordx4 v[8:11], v{{[0-9]+}}, s[0:3], 0 offen offset:32 ;
+; GCN-DAG: buffer_load_dwordx4 v[12:15], v{{[0-9]+}}, s[0:3], 0 offen offset:48 ;
+; GCN: ; return to shader part epilog
+define amdgpu_ps <16 x float> @smrd_load_nonconst5(<4 x i32> inreg %rsrc, i32 %off) #0 {
+main_body:
+  %off.2 = add i32 %off, 4100
+  %ld = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %off.2, i32 0)
+  %bc = bitcast <16 x i32> %ld to <16 x float>
+  ret <16 x float> %bc
+}
+
 ; SMRD load dwordx2
 ; GCN-LABEL: {{^}}smrd_load_dwordx2:
 ; SIVIGFX9: s_buffer_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
@@ -511,6 +557,63 @@ main_body:
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_uniform_loop:
+;
+; TODO: this should use an s_buffer_load
+;
+; GCN: buffer_load_dword
+define amdgpu_ps float @smrd_uniform_loop(<4 x i32> inreg %desc, i32 %bound) #0 {
+main_body:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop ]
+  %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop ]
+  %offset = shl i32 %counter, 2
+  %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+  %sum.next = fadd float %sum, %v
+  %counter.next = add i32 %counter, 1
+  %cc = icmp uge i32 %counter.next, %bound
+  br i1 %cc, label %exit, label %loop
+
+exit:
+  ret float %sum.next
+}
+
+
+; GCN-LABEL: {{^}}smrd_uniform_loop2:
+; (this test differs from smrd_uniform_loop by the more complex structure of phis,
+; which used to confuse the DivergenceAnalysis after structurization)
+;
+; TODO: we should keep the loop counter in an SGPR and use an S_BUFFER_LOAD
+;
+; GCN: buffer_load_dword
+define amdgpu_ps float @smrd_uniform_loop2(<4 x i32> inreg %desc, i32 %bound, i32 %bound.a) #0 {
+main_body:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %main_body ], [ %counter.next, %loop.a ], [ %counter.next, %loop.b ]
+  %sum = phi float [ 0.0, %main_body ], [ %sum.next, %loop.a ], [ %sum.next.b, %loop.b ]
+  %offset = shl i32 %counter, 2
+  %v = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 %offset)
+  %sum.next = fadd float %sum, %v
+  %counter.next = add i32 %counter, 1
+  %cc = icmp uge i32 %counter.next, %bound
+  br i1 %cc, label %exit, label %loop.a
+
+loop.a:
+  %cc.a = icmp uge i32 %counter.next, %bound.a
+  br i1 %cc, label %loop, label %loop.b
+
+loop.b:
+  %sum.next.b = fadd float %sum.next, 1.0
+  br label %loop
+
+exit:
+  ret float %sum.next
+}
+
 
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 5984d5a30f303bb8c3d357c41596bd4def444e8e..509b7a2dd68556c2e3f2ba57d279aa209a81ac48 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -1,8 +1,8 @@
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling  -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
-; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=1 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga  -verify-machineinstrs < %s | FileCheck -check-prefix=TOVGPR -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOVMEM -check-prefix=GCN %s
+; RUN: llc -O0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-spill-sgpr-to-smem=1 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=GCN %s
 
 ; XXX - Why does it like to use vcc?
 
@@ -13,29 +13,29 @@
 ; GCN-DAG: s_cmp_lg_u32
 
 ; TOVGPR-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
-; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 0
+; TOVGPR: v_writelane_b32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]], 2
 
 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
-; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Spill
 
 ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
-; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x300{{$}}
 ; TOSMEM-NOT: [[M0_COPY]]
 ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
 
 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ENDIF]]:
-; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0
+; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 2
 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Reload
 ; TOVMEM: s_waitcnt vmcnt(0)
 ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x300{{$}}
 ; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload
 ; TOSMEM-NOT: [[M0_RESTORE]]
 ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
@@ -80,7 +80,7 @@ endif:
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM: s_add_u32 m0, s7, 0x400
+; TOSMEM: s_add_u32 m0, s7, 0x500
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 
@@ -162,17 +162,17 @@ endif:
 ; FIXME: RegScavenger::isRegUsed() always returns true if m0 is reserved, so we have to save and restore it
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x100
-; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
-; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
 ; FIXME-TOSMEM-NOT: m0
+; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
+; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
 
 ; TOSMEM: s_mov_b32 m0, -1
 
 ; TOSMEM: s_mov_b32 s0, m0
-; TOSMEM: s_add_u32 m0, s3, 0x100
+; TOSMEM: s_add_u32 m0, s3, 0x200
 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
 ; TOSMEM: s_mov_b32 m0, s0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -180,7 +180,7 @@ endif:
 ; TOSMEM: ds_write_b64
 
 ; FIXME-TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x300
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
 ; FIXME-TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index ebba35a6689af2fc99a8c4da30b79f76ae77ec9e..b8824be4725b6d52f1d707a32edb9b466a33d46c 100644
--- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -45,11 +45,11 @@ ret:
 
 ; ALL-LABEL: {{^}}spill_sgpr_x4:
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
 ; SMEM: s_dcache_wb
 ; SMEM: s_endpgm
 
@@ -94,15 +94,15 @@ ret:
 ; ALL-LABEL: {{^}}spill_sgpr_x8:
 
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS:[0-9]+:[0-9]+]]{{\]}}, m0 ; 16-byte Folded Spill
 ; SMEM: s_add_u32 m0, s3, 0x110{{$}}
-; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
+; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
 ; SMEM: s_add_u32 m0, s3, 0x100{{$}}
-; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
 ; SMEM: s_add_u32 m0, s3, 0x110{{$}}
-; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
+; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[VALS]]{{\]}}, m0 ; 16-byte Folded Reload
 
 ; SMEM: s_dcache_wb
 ; SMEM: s_endpgm
diff --git a/test/CodeGen/AMDGPU/sub_i1.ll b/test/CodeGen/AMDGPU/sub_i1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6861d32dccf5096f9ccbb0a13262e58a9395b7d2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sub_i1.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
+; GCN-LABEL: {{^}}sub_var_var_i1:
+; GCN: s_xor_b64
+define amdgpu_kernel void @sub_var_var_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+  %a = load volatile i1, i1 addrspace(1)* %in0
+  %b = load volatile i1, i1 addrspace(1)* %in1
+  %sub = sub i1 %a, %b
+  store i1 %sub, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_var_imm_i1:
+; GCN: s_not_b64
+define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
+  %a = load volatile i1, i1 addrspace(1)* %in
+  %sub = sub i1 %a, 1
+  store i1 %sub, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_i1_cf:
+; GCN: ; %endif
+; GCN: s_not_b64
+define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %d_cmp = icmp ult i32 %tid, 16
+  br i1 %d_cmp, label %if, label %else
+
+if:
+  %0 = load volatile i1, i1 addrspace(1)* %a
+  br label %endif
+
+else:
+  %1 = load volatile i1, i1 addrspace(1)* %b
+  br label %endif
+
+endif:
+  %2 = phi i1 [%0, %if], [%1, %else]
+  %3 = sub i1 %2, -1
+  store i1 %3, i1 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index d4c05fb56827385af5bcf5257b3ee190c00cbe1e..0d6bb66179775d559b7397e4ea0957a7c1222edc 100644
--- a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -57,18 +57,18 @@ bb11:                                             ; preds = %bb9
 
 ; CHECK-LABEL: {{^}}partially_undef_copy:
 ; CHECK: v_mov_b32_e32 v5, 5
-; CHECK: v_mov_b32_e32 v6, 6
+; CHECK-DAG: v_mov_b32_e32 v6, 6
 
-; CHECK: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5
+; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_LO:[0-9]+]], v5
 
 ; Undef copy
-; CHECK: v_mov_b32_e32 v1, v6
+; CHECK-DAG: v_mov_b32_e32 v1, v6
 
 ; undef copy
-; CHECK: v_mov_b32_e32 v2, v7
+; CHECK-DAG: v_mov_b32_e32 v2, v7
 
-; CHECK: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8
-; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
+; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_HI:[0-9]+]], v8
+; CHECK-DAG: v_mov_b32_e32 v[[OUTPUT_LO]], v6
 
 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
 define amdgpu_kernel void @partially_undef_copy() #0 {
diff --git a/test/CodeGen/AMDGPU/unsupported-calls.ll b/test/CodeGen/AMDGPU/unsupported-calls.ll
index 2b6e15b79a405ac7bbd0d80ecd3a895a19683c61..303a0d6a1140dd15ef803cef107e5a420eece16a 100644
--- a/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -53,7 +53,7 @@ define void @test_call_varargs() {
 
 declare i32 @extern_variadic(...)
 
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported indirect call to function extern_variadic
+; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to variadic function extern_variadic
 ; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
 define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
   %add = fadd <4 x float> %arg0, %arg1
diff --git a/test/CodeGen/AMDGPU/v_swap_b32.mir b/test/CodeGen/AMDGPU/v_swap_b32.mir
new file mode 100644
index 0000000000000000000000000000000000000000..f0ce14bb9ddc726c49097e01c2202b92c9eb9c39
--- /dev/null
+++ b/test/CodeGen/AMDGPU/v_swap_b32.mir
@@ -0,0 +1,564 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: swap_phys_condensed
+# GCN: bb.0:
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return
+---
+name:            swap_phys_condensed
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+    $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+# GCN-LABEL: name: swap_phys_sparse
+# GCN: bb.0:
+# GCN-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return
+---
+name:            swap_phys_sparse
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+    $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+    $vgpr3 = V_MOV_B32_e32 killed $vgpr4, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 killed $vgpr6, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+
+# GCN-LABEL: name: swap_phys_liveout
+# GCN: bb.0:
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0, $vgpr1 = V_SWAP_B32 $vgpr1, $vgpr0, implicit $exec
+# GCN-NEXT: S_SETPC_B64_return
+---
+name:            swap_phys_liveout
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31
+    $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr2, implicit $vgpr1
+...
+
+# GCN-LABEL: name: swap_phys_b64
+# GCN: bb.0:
+# GCN-NEXT: $vgpr0, $vgpr2 = V_SWAP_B32 $vgpr2, $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr1, $vgpr3 = V_SWAP_B32 $vgpr3, $vgpr1, implicit $exec
+---
+name:            swap_phys_b64
+body:             |
+  bb.0:
+    $vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1
+    $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3
+    $vgpr2_vgpr3 = COPY killed $vgpr4_vgpr5
+...
+
+# GCN-LABEL: name: swap_phys_overlap_x
+# GCN: bb.0:
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+---
+name:            swap_phys_overlap_x
+body:             |
+  bb.0:
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr3_vgpr4 = V_ADD_F64 0, $vgpr0_vgpr1, 0, $vgpr3_vgpr4, 0, 0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+...
+
+# GCN-LABEL: name: swap_phys_clobber_y
+# GCN: bb.0:
+# GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+# GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+---
+name:            swap_phys_clobber_y
+body:             |
+  bb.0:
+    $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+    $vgpr1 = V_MOV_B32_e32 killed $vgpr2, implicit $exec
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_copy_condense
+# GCN: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+---
+name:            swap_virt_copy_condense
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_copy_sparse
+# GCN: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+---
+name:            swap_virt_copy_sparse
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    S_NOP 0
+    %0 = COPY %1
+    S_NOP 0
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg
+# GCN: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+---
+name:            swap_virt_copy_subreg
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %2.sub1 = COPY %0.sub1
+    %0.sub0 = COPY %1.sub0
+    %0.sub1 = COPY %1.sub1
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_mov
+# GCN: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+---
+name:            swap_virt_mov
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = V_MOV_B32_e32 %0, implicit $exec
+    %0 = V_MOV_B32_e32 %1, implicit $exec
+    %1 = V_MOV_B32_e32 %2, implicit $exec
+...
+
+# GCN-LABEL: name: swap_virt_read_x
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %3:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_read_x
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %3 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_read_t_twice
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %3:vgpr_32 = COPY %2
+# GCN-NEXT: %0:vgpr_32, %1:vgpr_32 = V_SWAP_B32 %1, %0, implicit $exec
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_read_t_twice
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %3 = COPY %2
+    %0 = COPY %1
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_y
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_y
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = IMPLICIT_DEF
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_x1
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_x1
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %0 = IMPLICIT_DEF
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_x2
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_x2
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = IMPLICIT_DEF
+    %0 = COPY %1
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_clobber_t
+# GCN: bb.0:
+# GCN-NEXT: %0:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:vgpr_32 = COPY %0
+# GCN-NEXT: %0:vgpr_32 = COPY %1
+# GCN-NEXT: %2:vgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:vgpr_32 = COPY %2
+# GCN-NEXT: S_ENDPGM
+
+---
+name:            swap_virt_clobber_t
+registers:
+  - { id: 0, class: vgpr_32 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %2 = IMPLICIT_DEF
+    %1 = COPY %2
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_overlap_x_full
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %3:vreg_64 = COPY %0
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+---
+name:            swap_virt_copy_subreg_overlap_x_full
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %3 = COPY %0
+    %0.sub0 = COPY %1.sub0
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_overlap_x_part
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %3:vreg_64 = COPY %0.sub0_sub1
+# GCN-NEXT: %0.sub0:vreg_128 = COPY %1.sub0
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+---
+name:            swap_virt_copy_subreg_overlap_x_part
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+  - { id: 3, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %3 = COPY %0.sub0_sub1
+    %0.sub0 = COPY %1.sub0
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_wide_y
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0
+# GCN-NEXT: %1:vreg_64 = COPY %2
+---
+name:            swap_virt_copy_subreg_wide_y
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %0.sub0 = COPY %1.sub0
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_b64
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_64, %1.sub1:vreg_64 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+---
+name:            swap_virt_b64
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_b128
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub0:vreg_128, %1.sub0:vreg_128 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_128, %1.sub1:vreg_128 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+# GCN-NEXT: %0.sub2:vreg_128, %1.sub2:vreg_128 = V_SWAP_B32 %1.sub2, %0.sub2, implicit $exec
+# GCN-NEXT: %0.sub3:vreg_128, %1.sub3:vreg_128 = V_SWAP_B32 %1.sub3, %0.sub3, implicit $exec
+---
+name:            swap_virt_b128
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_b128_sub0_1
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub0:vreg_128, %1.sub0:vreg_128 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+# GCN-NEXT: %0.sub1:vreg_128, %1.sub1:vreg_128 = V_SWAP_B32 %1.sub1, %0.sub1, implicit $exec
+# GCN-NEXT: S_ENDPGM
+---
+name:            swap_virt_b128_sub0_1
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0_sub1 = COPY %0.sub0_sub1
+    %0.sub0_sub1 = COPY %1.sub0_sub1
+    %1.sub0_sub1 = COPY %2.sub0_sub1
+    S_ENDPGM
+...
+
+# GCN-LABEL: name: swap_virt_b128_sub2_3
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_128 = IMPLICIT_DEF
+# GCN-NEXT: %0.sub2:vreg_128, %1.sub2:vreg_128 = V_SWAP_B32 %1.sub2, %0.sub2, implicit $exec
+# GCN-NEXT: %0.sub3:vreg_128, %1.sub3:vreg_128 = V_SWAP_B32 %1.sub3, %0.sub3, implicit $exec
+# GCN-NEXT: S_ENDPGM
+---
+name:            swap_virt_b128_sub2_3
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128 }
+  - { id: 2, class: vreg_128 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub2_sub3 = COPY %0.sub2_sub3
+    %0.sub2_sub3 = COPY %1.sub2_sub3
+    %1.sub2_sub3 = COPY %2.sub2_sub3
+    S_ENDPGM
+...
+
+
+# GCN-LABEL: name: swap_virt_s_to_s
+# GCN: bb.0:
+# GCN-NEXT: %0:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %1:sgpr_32 = IMPLICIT_DEF
+# GCN-NEXT: %2:sgpr_32 = COPY %0
+# GCN-NEXT: %0:sgpr_32 = COPY %1
+# GCN-NEXT: %1:sgpr_32 = COPY %2
+---
+name:            swap_virt_s_to_s
+registers:
+  - { id: 0, class: sgpr_32 }
+  - { id: 1, class: sgpr_32 }
+  - { id: 2, class: sgpr_32 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = COPY %0
+    %0 = COPY %1
+    %1 = COPY %2
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_impdef_super
+# GCN: %0.sub0:vreg_64, %1.sub0:vreg_64 = V_SWAP_B32 %1.sub0, %0.sub0, implicit $exec
+---
+name:            swap_virt_copy_subreg_impdef_super
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0, implicit-def %2, implicit $exec
+    %2.sub1 = COPY %0.sub1
+    %0.sub0 = COPY %1.sub0
+    %0.sub1 = COPY %1.sub1
+    %1.sub0 = COPY %2.sub0
+...
+
+# GCN-LABEL: name: swap_virt_copy_subreg_impuse_x
+# GCN: bb.0:
+# GCN-NEXT: %0:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %1:vreg_64 = IMPLICIT_DEF
+# GCN-NEXT: %2.sub0:vreg_64 = COPY %0.sub0
+# GCN-NEXT: %2.sub1:vreg_64 = COPY %0.sub1
+# GCN-NEXT: %0.sub0:vreg_64 = COPY %1.sub0, implicit %0
+# GCN-NEXT: %0.sub1:vreg_64 = COPY %1.sub1
+# GCN-NEXT: %1.sub0:vreg_64 = COPY %2.sub0
+# GCN-NEXT: S_ENDPGM
+---
+name:            swap_virt_copy_subreg_impuse_x
+registers:
+  - { id: 0, class: vreg_64 }
+  - { id: 1, class: vreg_64 }
+  - { id: 2, class: vreg_64 }
+body:             |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2.sub0 = COPY %0.sub0
+    %2.sub1 = COPY %0.sub1
+    %0.sub0 = COPY %1.sub0, implicit %0
+    %0.sub1 = COPY %1.sub1
+    %1.sub0 = COPY %2.sub0
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index 58bd9a0cdef67300b0c73e1dd62b9a67feae1d10..ca85f0bee4c8c479a390c13f08a69c321ce7441a 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -8,23 +8,22 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 
 ; waitcnt should be inserted after exec modification
-; SI: v_cmp_lt_i32_e32 vcc, 0,
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI:      v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
+; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
 ; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
 ; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
 ; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
 
 ; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
-; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
-; SI: v_mov_b32_e32 v{{[0-9]}}, -1
-; SI: s_and_saveexec_b64
+; SI:      s_mov_b64 s[{{[0-9]:[0-9]}}], -1
+; SI:      s_and_saveexec_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification
 ; SI: [[FLOW_BB]]:
 ; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
-; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
@@ -212,21 +211,18 @@ exit:
 ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
-; SI: s_xor_b64 [[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]]
 ; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
 
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
 ; SI: buffer_store_dword
-; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
-; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
 
 ; SI: [[LABEL_FLOW]]:
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
-; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
-; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]]
-; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]]
-; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]]
-; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
+; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
+; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
 
 ; SI: [[LABEL_EXIT]]:
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index 9cdc333cbc0b29b006cdfa656797f74a754bfeb8..32607c75e67db09d5c52dc0537e0276765f309ee 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
-; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
+; RUN: llc -march=amdgcn  -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
 ; RUN: llc -march=amdgcn  -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index c743d6a48ae9fb78215e83fa9ac76f6eed51c5d3..e803bd40684ccdb312dc4830734818a7161ff50b 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; This ends up using all 255 registers and requires register
 ; scavenging which will fail to find an unsued register.
diff --git a/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/test/CodeGen/AMDGPU/waitcnt-looptest.ll
index a941e5fb1f7d122068dbdeddd0d6c93954b5ec46..08267b76aefb5a7d162e275573416b04ba0acbfd 100644
--- a/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ b/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}testKernel
 ; GCN: BB0_1:
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_f32_e64
+; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_cmp_eq_f32_e32
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/test/CodeGen/AMDGPU/zext-lid.ll b/test/CodeGen/AMDGPU/zext-lid.ll
index 9a9c1fe7550077f98feceb14efd5eeb715c2a339..e257980dc0e55f112060aa60b7e7e55e5b174892 100644
--- a/test/CodeGen/AMDGPU/zext-lid.ll
+++ b/test/CodeGen/AMDGPU/zext-lid.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn < %s | FileCheck %s
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
 
 ; CHECK-NOT: and_b32
@@ -43,10 +44,21 @@ bb:
   ret void
 }
 
+; When EarlyCSE is not run this call produces a range max with 0 active bits,
+; which is a special case as an AssertZext from width 0 is invalid.
+; OPT-LABEL: @zext_grp_size_1x1x1
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !4
+define amdgpu_kernel void @zext_grp_size_1x1x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !1 {
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = and i32 %tmp, 1
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
 ; OPT-LABEL: @zext_grp_size_512
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !5
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !6
 define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -74,7 +86,7 @@ entry:
 }
 
 ; OPT-LABEL: @func_test_workitem_id_x_default_range(
-; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7
 define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -96,11 +108,13 @@ attributes #3 = { nounwind readnone }
 attributes #4 = { nounwind }
 
 !0 = !{i32 32, i32 4, i32 1}
+!1 = !{i32 1, i32 1, i32 1}
 
 ; OPT: !0 = !{i32 0, i32 128}
 ; OPT: !1 = !{i32 32, i32 4, i32 1}
 ; OPT: !2 = !{i32 0, i32 32}
 ; OPT: !3 = !{i32 0, i32 4}
 ; OPT: !4 = !{i32 0, i32 1}
-; OPT: !5 = !{i32 0, i32 512}
-; OPT: !6 = !{i32 0, i32 1024}
+; OPT: !5 = !{i32 1, i32 1, i32 1}
+; OPT: !6 = !{i32 0, i32 512}
+; OPT: !7 = !{i32 0, i32 1024}
diff --git a/test/CodeGen/ARM/ARMLoadStoreDBG.mir b/test/CodeGen/ARM/ARMLoadStoreDBG.mir
index 76f1523f77903889a9a3de548983e79f43718ae5..ce33dcf52ec450cd1a8a0ac2c4cfe44138781ec0 100644
--- a/test/CodeGen/ARM/ARMLoadStoreDBG.mir
+++ b/test/CodeGen/ARM/ARMLoadStoreDBG.mir
@@ -120,19 +120,19 @@ body:             |
   bb.0.entry:
     liveins: $r0, $r1, $r2, $r3, $lr, $r7
 
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     t2CMPri $r3, 4, 14, $noreg, implicit-def $cpsr, debug-location !31
     t2Bcc %bb.2.if.end, 2, killed $cpsr
 
   bb.1:
     liveins: $lr, $r7
 
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     $r0 = t2MOVi -1, 14, $noreg, $noreg
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     tBX_RET 14, $noreg, implicit $r0, debug-location !34
 
   bb.2.if.end:
@@ -142,12 +142,12 @@ body:             |
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     $r1 = COPY killed $r2, debug-location !32
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     $r2 = COPY killed $r3, debug-location !32
     tBL 14, $noreg, @g, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit-def $sp, debug-location !32
     $r0 = t2MOVi 0, 14, $noreg, $noreg
diff --git a/test/CodeGen/ARM/arm-cgp-calls.ll b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
similarity index 82%
rename from test/CodeGen/ARM/arm-cgp-calls.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-calls.ll
index b9cff6e307a48f0c61faa6a1b1ca96f324e992ce..10cd6671ffc470844b8afdaee8665a874195775b 100644
--- a/test/CodeGen/ARM/arm-cgp-calls.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-calls.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=armv8 -arm-disable-cgp=false %s -o - | FileCheck %s
 
 ; Check that the pass doesn't try to promote the immediate parameters.
-; CHECK-COMMON-LABEL: call_with_imms
-; CHECK-COMMON-NOT:   uxt
+; CHECK-LABEL: call_with_imms
+; CHECK-NOT:   uxt
 define i8 @call_with_imms(i8* %arg) {
   %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0)
   %cmp = icmp eq i8 %call, 0
@@ -12,23 +12,23 @@ define i8 @call_with_imms(i8* %arg) {
 }
 
 ; Test that the call result is still extended.
-; CHECK-COMMON-LABEL: test_call:
-; CHECK-COMMON: bl
-; CHECK-COMMONNEXT: sxtb r1, r0
+; CHECK-LABEL: test_call:
+; CHECK: bl
+; CHECK-NEXT: sxtb r1, r0
 define i16 @test_call(i8 zeroext %arg) {
   %call = call i8 @dummy_i8(i8 %arg)
   %cmp = icmp ult i8 %call, 128
   %conv = zext i1 %cmp to i16
-  ret i16 %conv 
+  ret i16 %conv
 }
 
 ; Test that the transformation bails when it finds that i16 is larger than i8.
 ; TODO: We should be able to remove the uxtb in these cases.
 ; CHECK-LABEL: promote_i8_sink_i16_1
-; CHECK-COMMON: bl dummy_i8
-; CHECK-COMMON: adds r0, #1
-; CHECK-COMMON: uxtb r0, r0
-; CHECK-COMMON: cmp r0
+; CHECK: bl dummy_i8
+; CHECK: add{{.*}} r0, #1
+; CHECK: uxtb r0, r0
+; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
   %add = add nuw i8 %call, 1
@@ -39,11 +39,11 @@ define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroe
   ret i16 %res
 }
 
-; CHECK-COMMON-LABEL: promote_i8_sink_i16_2
-; CHECK-COMMON: bl dummy_i8
-; CHECK-COMMON: adds r0, #1
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON: cmp r0
+; CHECK-LABEL: promote_i8_sink_i16_2
+; CHECK: bl dummy_i8
+; CHECK: add{{.*}} r0, #1
+; CHECK-NOT: uxt
+; CHECK: cmp r0
 define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroext %arg2) {
   %call = tail call zeroext i8 @dummy_i8(i8 %arg0)
   %add = add nuw i8 %call, 1
@@ -57,9 +57,9 @@ define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroex
 @uc = global i8 42, align 1
 @LL = global i64 0, align 8
 
-; CHECK-COMMON-LABEL: zext_i64
-; CHECK-COMMON: ldrb
-; CHECK-COMMON: strd
+; CHECK-LABEL: zext_i64
+; CHECK: ldrb
+; CHECK: strd
 define void @zext_i64() {
 entry:
   %0 = load i8, i8* @uc, align 1
@@ -74,8 +74,8 @@ entry:
 @a = global i16* null, align 4
 @b = global i32 0, align 4
 
-; CHECK-COMMON-LABEL: constexpr
-; CHECK-COMMON: uxth
+; CHECK-LABEL: constexpr
+; CHECK: uxth
 define i32 @constexpr() {
 entry:
   store i32 ptrtoint (i32* @b to i32), i32* @b, align 4
@@ -89,12 +89,11 @@ entry:
   ret i32 undef
 }
 
-; The call to safe_lshift_func takes two parameters, but they're the same value just one is zext.
-; The transform won't happen because of the zext.
-; CHECK-COMMON-LABEL: call_zext_i8_i32
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON: cmp
-; CHECK-COMMON: uxtb
+; The call to safe_lshift_func takes two parameters, but they're the same value
+; just one is zext. We do support zext now, so the transformation should
+; trigger and we don't want see uxtb here.
+; CHECK-LABEL: call_zext_i8_i32
+; CHECK-NOT: uxt
 define fastcc i32 @call_zext_i8_i32(i32 %p_45, i8 zeroext %p_46) {
 for.cond8.preheader:
   %call217 = call fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 zeroext undef)
@@ -119,9 +118,9 @@ for.end411:                                       ; preds = %for.cond8.preheader
 @g_82 = hidden local_unnamed_addr global i32 0, align 4
 
 ; Test that the transform bails on finding %conv4, a trunc
-; CHECK-COMMON-LABEL: call_return_pointer
-; CHECK-COMMON: sxth
-; CHECK-COMMON-NOT: uxt
+; CHECK-LABEL: call_return_pointer
+; CHECK: sxth
+; CHECK: uxt
 define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 {
 entry:
   %conv1 = zext i8 %p_13 to i16
@@ -145,11 +144,9 @@ if.then:                                          ; preds = %for.cond
   br label %for.cond.backedge
 }
 
-; Transform will bail because of the zext
 ; Check that d.sroa.0.0.be is promoted passed directly into the tail call.
-; CHECK-COMMON-LABEL: check_zext_phi_call_arg
-; CHECK-COMMON: uxt
-; CHECK-COMMON: uxt
+; CHECK-LABEL: check_zext_phi_call_arg
+; CHECK-NOT: uxt
 define i32 @check_zext_phi_call_arg() {
 entry:
   br label %for.cond
@@ -169,6 +166,19 @@ if.then:                                          ; preds = %for.cond
   br label %for.cond.backedge
 }
 
+%struct.atomic_flag = type { i8 }
+
+; CHECK-LABEL: atomic_flag_test_and_set
+; CHECK-NOT: uxt
+define zeroext i1 @atomic_flag_test_and_set(%struct.atomic_flag* %object) {
+entry:
+  %_Value = getelementptr inbounds %struct.atomic_flag, %struct.atomic_flag* %object, i32 0, i32 0
+  %call = tail call arm_aapcscc zeroext i8 @__atomic_exchange_1(i8* %_Value, i8 zeroext 1, i32 5) #1
+  %0 = and i8 %call, 1
+  %tobool = icmp ne i8 %0, 0
+  ret i1 %tobool
+}
+
 declare i32 @assert(...)
 declare i8 @dummy_i8(i8)
 declare i8 @dummy2(i8*, i8, i8)
@@ -176,6 +186,7 @@ declare i16 @dummy3(i16)
 
 declare dso_local i32 @e(...) local_unnamed_addr #1
 declare dso_local zeroext i16 @f(...) local_unnamed_addr #1
+declare dso_local arm_aapcscc i8 @__atomic_exchange_1(i8*, i8, i32) local_unnamed_addr
 
 declare noalias i16** @func_62(i8 zeroext %p_63, i32 %p_64, i16 signext %p_65, i32* nocapture readnone %p_66)
 declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2)
diff --git a/test/CodeGen/ARM/arm-cgp-casts.ll b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
similarity index 90%
rename from test/CodeGen/ARM/arm-cgp-casts.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-casts.ll
index 23467c9a20f4d25937081e3433af800c5fb39f3d..431846482c60c9622ef96250294841ed5296009f 100644
--- a/test/CodeGen/ARM/arm-cgp-casts.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-casts.ll
@@ -232,9 +232,10 @@ exit:
 ; promote %1 for the call - unless we can generate a uadd16.
 ; CHECK-COMMON-LABEL: zext_load_sink_call:
 ; CHECK-COMMON: uxt
-; uadd16
-; cmp
-; CHECK-COMMON: uxt
+; CHECK-DSP-IMM: uadd16
+; CHECK-COMMON: cmp
+; CHECK-NODSP: uxt
+; CHECK-DSP-IMM-NOT: uxt
 define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) {
 entry:
   %0 = load i16, i16* %ptr, align 4
@@ -338,3 +339,27 @@ declare i32 @dummy(i32, i32)
 @d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1
 @sh1 = hidden local_unnamed_addr global i16 0, align 2
 @d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2
+
+; CHECK-LABEL: two_stage_zext_trunc_mix
+; CHECK-NOT: uxt
+define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) {
+entry:
+  %__size_.i.i.i.i = bitcast i32** %__str to i8*
+  %0 = load i8, i8* %__size_.i.i.i.i, align 4
+  %1 = and i8 %0, 1
+  %tobool.i.i.i.i = icmp eq i8 %1, 0
+  %__size_.i5.i.i = getelementptr inbounds i32*, i32** %__str, i32 %__n1
+  %cast = bitcast i32** %__size_.i5.i.i to i32*
+  %2 = load i32, i32* %cast, align 4
+  %3 = lshr i8 %0, 1
+  %4 = zext i8 %3 to i32
+  %cond.i.i = select i1 %tobool.i.i.i.i, i32 %4, i32 %2
+  %__size_.i.i.i.i.i = bitcast i32* %this to i8*
+  %5 = load i8, i8* %__size_.i.i.i.i.i, align 4
+  %6 = and i8 %5, 1
+  %tobool.i.i.i.i.i = icmp eq i8 %6, 0
+  %7 = getelementptr inbounds i8, i8* %__size_.i.i.i.i, i32 %__pos1
+  %8 = getelementptr inbounds i8, i8* %__size_.i.i.i.i, i32 %__pos2
+  %res = select i1 %tobool.i.i.i.i.i,  i8* %7, i8* %8
+  ret i8* %res
+}
diff --git a/test/CodeGen/ARM/arm-cgp-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
similarity index 94%
rename from test/CodeGen/ARM/arm-cgp-icmps.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
index fca0be6da1fd6f893a28d857f12ee62e6bdd7a26..8ff7db51e65f5112b36f24929ba35b5630dfeb17 100644
--- a/test/CodeGen/ARM/arm-cgp-icmps.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
@@ -310,3 +310,23 @@ entry:
   ret i32 %conv1
 }
 
+; CHECK-COMMON-LABEL: mul_with_neg_imm
+; CHECK-COMMON-NOT: uxtb
+; CHECK-COMMON:     and [[BIT0:r[0-9]+]], r0, #1
+; CHECK-COMMON:     add.w [[MUL32:r[0-9]+]], [[BIT0]], [[BIT0]], lsl #5
+; CHECK-COMMON:     cmp.w r0, [[MUL32]], lsl #2
+define void @mul_with_neg_imm(i32, i32* %b) {
+entry:
+  %1 = trunc i32 %0 to i8
+  %2 = and i8 %1, 1
+  %conv.i = mul nuw i8 %2, -124
+  %tobool = icmp eq i8 %conv.i, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 0, i32* %b, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/test/CodeGen/ARM/arm-cgp-overflow.ll b/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
similarity index 78%
rename from test/CodeGen/ARM/arm-cgp-overflow.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
index d0c191cc542f44eb7e2a5a656507d0adfc060bd0..8e10876c0b108d7605aba1931a9142e92041e3cb 100644
--- a/test/CodeGen/ARM/arm-cgp-overflow.ll
+++ b/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
@@ -168,6 +168,7 @@ define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
   ret i32 %res
 }
 
+; CHECK-LABEL: unsafe_sub_underflow_neg
 ; CHECK:  subs r0, #4
 ; CHECK:  uxtb [[EXT:r[0-9]+]], r0
 ; CHECK:  cmp [[EXT]], #253
@@ -178,3 +179,54 @@ define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) {
   %res = select i1 %cmp, i32 8, i32 16
   ret i32 %res
 }
+
+; CHECK:      rsb.w [[RSUB:r[0-9]+]], r0, #248
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[RSUB]], #252
+define i32 @safe_sub_imm_var(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %sub = sub nuw nsw i8 -8, %0
+  %cmp = icmp ugt i8 %sub, 252
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+; CHECK-LABEL: safe_sub_var_imm
+; CHECK:      add.w [[ADD:r[0-9]+]], r0, #8
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[ADD]], #252
+define i32 @safe_sub_var_imm(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %sub = sub nuw nsw i8 %0, -8
+  %cmp = icmp ugt i8 %sub, 252
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+; CHECK-LABEL: safe_add_imm_var
+; CHECK:      add.w [[ADD:r[0-9]+]], r0, #129
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[ADD]], #127
+define i32 @safe_add_imm_var(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %add = add nuw nsw i8 -127, %0
+  %cmp = icmp ugt i8 %add, 127
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
+
+; CHECK-LABEL: safe_add_var_imm
+; CHECK:      sub.w [[SUB:r[0-9]+]], r0, #127
+; CHECK-NOT:  uxt
+; CHECK:      cmp [[SUB]], #127
+define i32 @safe_add_var_imm(i8* %b) {
+entry:
+  %0 = load i8, i8* %b, align 1
+  %add = add nuw nsw i8 %0, -127
+  %cmp = icmp ugt i8 %add, 127
+  %conv4 = zext i1 %cmp to i32
+  ret i32 %conv4
+}
diff --git a/test/CodeGen/ARM/arm-cgp-phis-ret.ll b/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-phis-ret.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
diff --git a/test/CodeGen/ARM/arm-cgp-pointers.ll b/test/CodeGen/ARM/CGP/arm-cgp-pointers.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-pointers.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-pointers.ll
diff --git a/test/CodeGen/ARM/arm-cgp-signed-icmps.ll b/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-signed-icmps.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll
diff --git a/test/CodeGen/ARM/arm-cgp-signed.ll b/test/CodeGen/ARM/CGP/arm-cgp-signed.ll
similarity index 100%
rename from test/CodeGen/ARM/arm-cgp-signed.ll
rename to test/CodeGen/ARM/CGP/arm-cgp-signed.ll
diff --git a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
index 019298d208087899e3249b13c56657cec705fce4..e75df160e0040b15e09e6a55a010cf9589808663 100644
--- a/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
+++ b/test/CodeGen/ARM/Windows/chkstk-movw-movt-isel.ll
@@ -19,9 +19,9 @@ entry:
 
 ; CHECK-LABEL: isel
 ; CHECK: push {r4, r5, r6, lr}
-; CHECK: movw r12, #0
-; CHECK: movt r12, #0
-; CHECK: movw r4, #{{\d*}}
+; CHECK-DAG: movw r12, #0
+; CHECK-DAG: movt r12, #0
+; CHECK-DAG: movw r4, #{{\d*}}
 ; CHECK: blx r12
 ; CHECK: sub.w sp, sp, r4
 
diff --git a/test/CodeGen/ARM/Windows/chkstk.ll b/test/CodeGen/ARM/Windows/chkstk.ll
index 330c1f45850093ed156d7c41adc60d9c1aa47208..8fd414614598613025de02169c29f5d9c964242d 100644
--- a/test/CodeGen/ARM/Windows/chkstk.ll
+++ b/test/CodeGen/ARM/Windows/chkstk.ll
@@ -16,9 +16,9 @@ entry:
 ; CHECK-DEFAULT-CODE-MODEL: 	sub.w sp, sp, r4
 
 ; CHECK-LARGE-CODE-MODEL: check_watermark:
-; CHECK-LARGE-CODE-MODEL: 	movw r12, :lower16:__chkstk
-; CHECK-LARGE-CODE-MODEL: 	movt r12, :upper16:__chkstk
-; CHECK-LARGE-CODE-MODEL: 	movw r4, #1024
+; CHECK-LARGE-CODE-MODEL-DAG: 	movw r12, :lower16:__chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: 	movt r12, :upper16:__chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: 	movw r4, #1024
 ; CHECK-LARGE-CODE-MODEL: 	blx r12
 ; CHECK-LARGE-CODE-MODEL: 	sub.w sp, sp, r4
 
diff --git a/test/CodeGen/ARM/Windows/memset.ll b/test/CodeGen/ARM/Windows/memset.ll
index c9b22f47a15227658f902ec9ed751a000688437c..8cb257c156606d87c5582552aacf9f0427ddf4ff 100644
--- a/test/CodeGen/ARM/Windows/memset.ll
+++ b/test/CodeGen/ARM/Windows/memset.ll
@@ -10,9 +10,9 @@ entry:
   unreachable
 }
 
-; CHECK: movw r0, :lower16:source
-; CHECK: movt r0, :upper16:source
 ; CHECK: movs r1, #0
 ; CHECK: mov.w r2, #512
+; CHECK: movw r0, :lower16:source
+; CHECK: movt r0, :upper16:source
 ; CHECK: memset
 
diff --git a/test/CodeGen/ARM/analyze-branch-bkpt.ll b/test/CodeGen/ARM/analyze-branch-bkpt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cba89fe9987001a820a6745a58343f1c67d94be6
--- /dev/null
+++ b/test/CodeGen/ARM/analyze-branch-bkpt.ll
@@ -0,0 +1,61 @@
+; RUN: llc -o - %s -mtriple thumbv4-unknown-linux-android | FileCheck --check-prefix=V4 %s
+; RUN: llc -o - %s -mtriple thumbv5-unknown-linux-android | FileCheck --check-prefix=V5 %s
+
+; V4: udf #254
+; V5: bkpt #0
+
+define i1 @a(i32 %b) !dbg !3 {
+  br i1 undef, label %c, label %d, !dbg !4
+
+d:                                                ; preds = %0
+  call void @llvm.debugtrap()
+  br label %ah, !dbg !4
+
+c:                                                ; preds = %0
+  %aj = icmp ne i20 undef, 5
+  br label %ah, !dbg !4
+
+ah:                                               ; preds = %c, %d
+  %ak = phi i1 [ false, %d ], [ %aj, %c ]
+  call void @llvm.dbg.value(metadata i1 %ak, metadata !7, metadata !DIExpression()), !dbg !9
+  switch i32 %b, label %al [
+    i32 0, label %am
+    i32 10, label %an
+  ]
+
+an:                                               ; preds = %ah
+  %ch = select i1 %ak, i32 0, i32 5
+  br label %am, !dbg !10
+
+al:                                               ; preds = %ah
+  br label %am, !dbg !9
+
+am:                                               ; preds = %al, %an, %ah
+  %1 = phi i32 [ 0, %al ], [ %ch, %an ], [ %b, %ah ]
+  unreachable
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+
+; Function Attrs: nounwind
+declare void @llvm.debugtrap() #1
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "a", directory: "")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!4 = !DILocation(line: 0, scope: !5, inlinedAt: !6)
+!5 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!6 = !DILocation(line: 0, scope: !3)
+!7 = !DILocalVariable(scope: !8)
+!8 = distinct !DISubprogram(scope: null, isLocal: false, isDefinition: true, isOptimized: false, unit: !0)
+!9 = !DILocation(line: 0, scope: !8, inlinedAt: !6)
+!10 = !DILocation(line: 0, scope: !11, inlinedAt: !6)
+!11 = !DILexicalBlock(scope: !8)
diff --git a/test/CodeGen/ARM/and-load-combine.ll b/test/CodeGen/ARM/and-load-combine.ll
index 09acefad30588aee94da9bb31dee5bbc64000c8b..8f08909c816d1e584140934ed12560cbc5dd7fe6 100644
--- a/test/CodeGen/ARM/and-load-combine.ll
+++ b/test/CodeGen/ARM/and-load-combine.ll
@@ -28,8 +28,7 @@ define arm_aapcscc zeroext i1 @cmp_xor8_short_short(i16* nocapture readonly %a,
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -74,8 +73,7 @@ define arm_aapcscc zeroext i1 @cmp_xor8_short_int(i16* nocapture readonly %a, i3
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -121,8 +119,7 @@ define arm_aapcscc zeroext i1 @cmp_xor8_int_int(i32* nocapture readonly %a, i32*
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -167,8 +164,7 @@ define arm_aapcscc zeroext i1 @cmp_xor16(i32* nocapture readonly %a, i32* nocapt
 ; THUMB1-NEXT:    ldrh r0, [r0]
 ; THUMB1-NEXT:    ldrh r1, [r1]
 ; THUMB1-NEXT:    eors r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -213,8 +209,7 @@ define arm_aapcscc zeroext i1 @cmp_or8_short_short(i16* nocapture readonly %a, i
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -259,8 +254,7 @@ define arm_aapcscc zeroext i1 @cmp_or8_short_int(i16* nocapture readonly %a, i32
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -306,8 +300,7 @@ define arm_aapcscc zeroext i1 @cmp_or8_int_int(i32* nocapture readonly %a, i32*
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -352,8 +345,7 @@ define arm_aapcscc zeroext i1 @cmp_or16(i32* nocapture readonly %a, i32* nocaptu
 ; THUMB1-NEXT:    ldrh r0, [r0]
 ; THUMB1-NEXT:    ldrh r1, [r1]
 ; THUMB1-NEXT:    orrs r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -398,8 +390,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_short(i16* nocapture readonly %a,
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    ldrb r2, [r0]
 ; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r2
+; THUMB1-NEXT:    rsbs r0, r2, #0
 ; THUMB1-NEXT:    adcs r0, r2
 ; THUMB1-NEXT:    bx lr
 ;
@@ -444,8 +435,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a, i3
 ; THUMB1-NEXT:    ldrb r0, [r0]
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    ands r1, r0
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -491,8 +481,7 @@ define arm_aapcscc zeroext i1 @cmp_and8_int_int(i32* nocapture readonly %a, i32*
 ; THUMB1-NEXT:    ldrb r1, [r1]
 ; THUMB1-NEXT:    ldrb r2, [r0]
 ; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r2
+; THUMB1-NEXT:    rsbs r0, r2, #0
 ; THUMB1-NEXT:    adcs r0, r2
 ; THUMB1-NEXT:    bx lr
 ;
@@ -537,8 +526,7 @@ define arm_aapcscc zeroext i1 @cmp_and16(i32* nocapture readonly %a, i32* nocapt
 ; THUMB1-NEXT:    ldrh r1, [r1]
 ; THUMB1-NEXT:    ldrh r2, [r0]
 ; THUMB1-NEXT:    ands r2, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r2
+; THUMB1-NEXT:    rsbs r0, r2, #0
 ; THUMB1-NEXT:    adcs r0, r2
 ; THUMB1-NEXT:    bx lr
 ;
@@ -881,8 +869,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; THUMB1-NEXT:    ands r0, r1
 ; THUMB1-NEXT:    uxtb r1, r2
 ; THUMB1-NEXT:    subs r1, r0, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
@@ -929,8 +916,7 @@ define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
 ; THUMB1-NEXT:    ands r0, r1
 ; THUMB1-NEXT:    uxtb r1, r2
 ; THUMB1-NEXT:    subs r1, r0, r1
-; THUMB1-NEXT:    movs r0, #0
-; THUMB1-NEXT:    subs r0, r0, r1
+; THUMB1-NEXT:    rsbs r0, r1, #0
 ; THUMB1-NEXT:    adcs r0, r1
 ; THUMB1-NEXT:    bx lr
 ;
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index c6ca6a624b11699b395db3f1a38fe4b4119f3f99..b81cf443e53751177565278bb73223df33b70865 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -153,18 +153,17 @@ define i32 @test_tst_assessment(i32 %a, i32 %b) {
 ; THUMB-NEXT:    movs r2, r0
 ; THUMB-NEXT:    movs r0, #1
 ; THUMB-NEXT:    ands r0, r2
-; THUMB-NEXT:    subs r2, r0, #1
 ; THUMB-NEXT:    lsls r1, r1, #31
 ; THUMB-NEXT:    beq .LBB2_2
 ; THUMB-NEXT:  @ %bb.1:
-; THUMB-NEXT:    movs r0, r2
+; THUMB-NEXT:    subs r0, r0, #1
 ; THUMB-NEXT:  .LBB2_2:
 ; THUMB-NEXT:    bx lr
 ;
 ; T2-LABEL: test_tst_assessment:
 ; T2:       @ %bb.0:
-; T2-NEXT:    lsls r1, r1, #31
 ; T2-NEXT:    and r0, r0, #1
+; T2-NEXT:    lsls r1, r1, #31
 ; T2-NEXT:    it ne
 ; T2-NEXT:    subne r0, #1
 ; T2-NEXT:    bx lr
diff --git a/test/CodeGen/ARM/arm-shrink-wrapping.ll b/test/CodeGen/ARM/arm-shrink-wrapping.ll
index c943f60c56df2fe765fae0a34d855bd32aa95215..bf4f1bd0d0caf45bae708726e3be8e92dc72cbd0 100644
--- a/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ b/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -104,10 +104,10 @@ declare i32 @doSomething(i32, i32*)
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]]
-; THUMB-NEXT: add [[SUM]], [[TMP]]
+; ARM: add [[SUM]], [[TMP]], [[SUM]]
+; THUMB: add [[SUM]], [[TMP]]
+; ARM-NEXT: subs [[IV]], [[IV]], #1
+; THUMB-NEXT: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP]]
 ;
 ; Next BB.
@@ -169,10 +169,10 @@ declare i32 @something(...)
 ; Next BB.
 ; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
 ; ARM: add [[SUM]], [[TMP]], [[SUM]]
 ; THUMB: add [[SUM]], [[TMP]]
+; ARM: subs [[IV]], [[IV]], #1
+; THUMB: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP_LABEL]]
 ; Next BB.
 ; CHECK: @ %for.exit
@@ -228,10 +228,10 @@ for.end:                                          ; preds = %for.body
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]]
-; THUMB-NEXT: add [[SUM]], [[TMP]]
+; ARM: add [[SUM]], [[TMP]], [[SUM]]
+; THUMB: add [[SUM]], [[TMP]]
+; ARM-NEXT: subs [[IV]], [[IV]], #1
+; THUMB-NEXT: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP]]
 ;
 ; Next BB.
@@ -307,10 +307,10 @@ declare void @somethingElse(...)
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
 ; CHECK: mov{{(\.w)?}} [[TMP:r[0-9]+]], #1
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; ARM-NEXT: add [[SUM]], [[TMP]], [[SUM]]
-; THUMB-NEXT: add [[SUM]], [[TMP]]
+; ARM: add [[SUM]], [[TMP]], [[SUM]]
+; THUMB: add [[SUM]], [[TMP]]
+; ARM-NEXT: subs [[IV]], [[IV]], #1
+; THUMB-NEXT: subs [[IV]], #1
 ; CHECK-NEXT: bne [[LOOP]]
 ;
 ; Next BB.
diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll
index fd87e50d0b77a9350e69e368a920492977fa61d4..b5214f8d67e5ca88531161457d1320c8162ae4b3 100644
--- a/test/CodeGen/ARM/atomic-cmpxchg.ll
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -24,8 +24,7 @@ entry:
 ; CHECK-THUMB: bl __sync_val_compare_and_swap_1
 ; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
 ; CHECK-THUMB: subs [[R1:r[0-7]]], r0, {{r[0-9]+}}
-; CHECK-THUMB: movs r0, #0
-; CHECK-THUMB: subs r0, r0, [[R1]]
+; CHECK-THUMB: rsbs r0, [[R1]], #0
 ; CHECK-THUMB: adcs r0, [[R1]]
 
 ; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8:
@@ -47,8 +46,7 @@ entry:
 ; CHECK-THUMBV6-NEXT:  bl __sync_val_compare_and_swap_1
 ; CHECK-THUMBV6-NEXT:  uxtb r1, r4
 ; CHECK-THUMBV6-NEXT:  subs [[R1:r[0-7]]], r0, {{r[0-9]+}}
-; CHECK-THUMBV6-NEXT:  movs r0, #0
-; CHECK-THUMBV6-NEXT:  subs r0, r0, [[R1]]
+; CHECK-THUMBV6-NEXT:  rsbs r0, [[R1]], #0
 ; CHECK-THUMBV6-NEXT:  adcs r0, [[R1]]
 
 ; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
diff --git a/test/CodeGen/ARM/cmn.ll b/test/CodeGen/ARM/cmn.ll
index 9321527a9e25ef62d9baaf07e7abb68f7d156af7..fbcee5196b63dccb03980f1cf32e292341880bcd 100644
--- a/test/CodeGen/ARM/cmn.ll
+++ b/test/CodeGen/ARM/cmn.ll
@@ -15,16 +15,15 @@ define i32 @compare_i_gt(i32 %a) {
 ;
 ; T1-LABEL: compare_i_gt:
 ; T1:       @ %bb.0: @ %entry
-; T1-NEXT:    mov r1, r0
-; T1-NEXT:    movs r0, #77
-; T1-NEXT:    mvns r3, r0
-; T1-NEXT:    movs r0, #42
-; T1-NEXT:    movs r2, #24
-; T1-NEXT:    cmp r1, r3
+; T1-NEXT:    movs r1, #77
+; T1-NEXT:    mvns r1, r1
+; T1-NEXT:    cmp r0, r1
 ; T1-NEXT:    bgt .LBB0_2
 ; T1-NEXT:  @ %bb.1: @ %entry
-; T1-NEXT:    mov r0, r2
-; T1-NEXT:  .LBB0_2: @ %entry
+; T1-NEXT:    movs r0, #24
+; T1-NEXT:    bx lr
+; T1-NEXT:  .LBB0_2:
+; T1-NEXT:    movs r0, #42
 ; T1-NEXT:    bx lr
 entry:
   %cmp = icmp sgt i32 %a, -78
@@ -44,14 +43,13 @@ define i32 @compare_r_eq(i32 %a, i32 %b) {
 ;
 ; T1-LABEL: compare_r_eq:
 ; T1:       @ %bb.0: @ %entry
-; T1-NEXT:    mov r2, r0
-; T1-NEXT:    movs r0, #42
-; T1-NEXT:    movs r3, #24
-; T1-NEXT:    cmn r2, r1
+; T1-NEXT:    cmn r0, r1
 ; T1-NEXT:    beq .LBB1_2
 ; T1-NEXT:  @ %bb.1: @ %entry
-; T1-NEXT:    mov r0, r3
-; T1-NEXT:  .LBB1_2: @ %entry
+; T1-NEXT:    movs r0, #24
+; T1-NEXT:    bx lr
+; T1-NEXT:  .LBB1_2:
+; T1-NEXT:    movs r0, #42
 ; T1-NEXT:    bx lr
 entry:
   %sub = sub nsw i32 0, %b
diff --git a/test/CodeGen/ARM/cmpxchg.mir b/test/CodeGen/ARM/cmpxchg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6ae7e6372493bc192293412162f79a424173f21c
--- /dev/null
+++ b/test/CodeGen/ARM/cmpxchg.mir
@@ -0,0 +1,24 @@
+# RUN: llc -o - %s -mtriple=armv7-unknown-linux-gnu -verify-machineinstrs -run-pass=arm-pseudo | FileCheck %s
+---
+# CHECK-LABEL: name: func
+name: func
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0_r1, $r4_r5, $r3, $lr
+    dead early-clobber renamable $r0_r1, dead early-clobber renamable $r2 = CMP_SWAP_64 killed renamable $r3, killed renamable $r4_r5, renamable $r4_r5 :: (volatile load store monotonic monotonic 8)
+    ; CHECK: bb.0:
+    ; CHECK:   liveins: $r0_r1, $r4_r5, $r3, $lr
+    ; CHECK: bb.1:
+    ; CHEKC:   liveins: $r4_r5, $r3
+    ; CHECK:   $r0_r1 = LDREXD $r3, 14, $noreg
+    ; CHECK:   CMPrr killed $r0, $r4, 14, $noreg, implicit-def $cpsr
+    ; CHECK:   CMPrr killed $r1, $r5, 0, killed $cpsr, implicit-def $cpsr
+    ; CHECK:   Bcc %bb.3, 1, killed $cpsr
+    ; CHECK: bb.2:
+    ; CHECK:   liveins: $r4_r5, $r3
+    ; CHECK:   early-clobber $r2 = STREXD $r4_r5, $r3, 14, $noreg
+    ; CHECK:   CMPri killed $r2, 0, 14, $noreg, implicit-def $cpsr
+    ; CHECK:   Bcc %bb.1, 1, killed $cpsr
+    ; CHECK: bb.3:
+...
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
index 0ae2d5f6f2f2ba7498a54d2ed8c5d004c2ef0059..2c0aa98eae037c31651c7766e9f8a493778d6fd8 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm-wrback.ll
@@ -18,9 +18,9 @@
 ; CHECK-NEXT:  Data
 ; CHECK-SAME:  Latency=3
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=3
+; CHECK-SAME:  Latency=0
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=4
+; CHECK-SAME:  Latency=0
 define i32 @bar(i32 %a1, i32 %b1, i32 %c1) minsize optsize {
   %1 = load i32, i32* @a, align 4
   %2 = load i32, i32* @b, align 4
diff --git a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
index bc7a14b1028ef299cb39c106ed4a863c1a1446d4..02d1c2f55f994b9df8fbcd14a562003410833156 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-ldm.ll
@@ -11,7 +11,7 @@
 ; CHECK:       Data
 ; CHECK-SAME:  Latency=3
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=3
+; CHECK-SAME:  Latency=0
 
 define i32 @foo(i32* %a) nounwind optsize {
 entry:
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
index b5edcc30422932172381242611a89d143ec766ff..1baf472ca49d66893d491bb1c15de0ce43762acf 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm-wrback.ll
@@ -20,9 +20,9 @@
 ; CHECK-NEXT:  Data
 ; CHECK-SAME:  Latency=5
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=5
+; CHECK-SAME:  Latency=0
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=6
+; CHECK-SAME:  Latency=0
 define i32 @bar(i32* %iptr) minsize optsize {
   %1 = load double, double* @a, align 8
   %2 = load double, double* @b, align 8
diff --git a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
index 12c7b3270c3b32f0cd4e9010cc139745e96d8dfd..8da133e806ef6ae194fbde8cd6f58955c91e7629 100644
--- a/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
+++ b/test/CodeGen/ARM/cortex-a57-misched-vldm.ll
@@ -11,9 +11,9 @@
 ; CHECK:       Data
 ; CHECK-SAME:  Latency=5
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=5
+; CHECK-SAME:  Latency=0
 ; CHECK-NEXT:  Data
-; CHECK-SAME:  Latency=6
+; CHECK-SAME:  Latency=0
 
 define double @foo(double* %a) nounwind optsize {
 entry:
diff --git a/test/CodeGen/ARM/cttz_vector.ll b/test/CodeGen/ARM/cttz_vector.ll
index bed644980415238bdc318f10816b0f0497190c97..f27c1e4b4173303d63f2845fb72768628d6983f6 100644
--- a/test/CodeGen/ARM/cttz_vector.ll
+++ b/test/CodeGen/ARM/cttz_vector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple armv7-linux-gnueabihf -mattr=+neon | FileCheck %s
 
 ; This test checks the @llvm.cttz.* intrinsics for vectors.
@@ -23,7 +24,14 @@ declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1)
 ;------------------------------------------------------------------------------
 
 define void @test_v1i8(<1 x i8>* %p) {
-; CHECK-LABEL: test_v1i8
+; CHECK-LABEL: test_v1i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrb r1, [r0]
+; CHECK-NEXT:    orr r1, r1, #256
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i8>, <1 x i8>* %p
   %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 false)
   store <1 x i8> %tmp, <1 x i8>* %p
@@ -32,6 +40,21 @@ define void @test_v1i8(<1 x i8>* %p) {
 
 define void @test_v2i8(<2 x i8>* %p) {
 ; CHECK-LABEL: test_v2i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.16 {d16[0]}, [r0:16]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vorr.i32 d16, #0x100
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vmov.32 r1, d16[1]
+; CHECK-NEXT:    vmov.32 r2, d16[0]
+; CHECK-NEXT:    strb r1, [r0, #1]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i8>, <2 x i8>* %p
   %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 false)
   store <2 x i8> %tmp, <2 x i8>* %p
@@ -40,6 +63,19 @@ define void @test_v2i8(<2 x i8>* %p) {
 
 define void @test_v4i8(<4 x i8>* %p) {
 ; CHECK-LABEL: test_v4i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmov.i16 d19, #0x1
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vorr.i16 d16, #0x100
+; CHECK-NEXT:    vneg.s16 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vsub.i16 d16, d16, d19
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i8>, <4 x i8>* %p
   %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 false)
   store <4 x i8> %tmp, <4 x i8>* %p
@@ -48,13 +84,15 @@ define void @test_v4i8(<4 x i8>* %p) {
 
 define void @test_v8i8(<8 x i8>* %p) {
 ; CHECK-LABEL: test_v8i8:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i8	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s8	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i8	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i8 d18, #0x1
+; CHECK-NEXT:    vneg.s8 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i8 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i8>, <8 x i8>* %p
   %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 false)
   store <8 x i8> %tmp, <8 x i8>* %p
@@ -63,13 +101,15 @@ define void @test_v8i8(<8 x i8>* %p) {
 
 define void @test_v16i8(<16 x i8>* %p) {
 ; CHECK-LABEL: test_v16i8:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i8	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s8	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i8	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i8 q10, #0x1
+; CHECK-NEXT:    vneg.s8 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <16 x i8>, <16 x i8>* %p
   %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false)
   store <16 x i8> %tmp, <16 x i8>* %p
@@ -78,6 +118,13 @@ define void @test_v16i8(<16 x i8>* %p) {
 
 define void @test_v1i16(<1 x i16>* %p) {
 ; CHECK-LABEL: test_v1i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrh r1, [r0]
+; CHECK-NEXT:    orr r1, r1, #65536
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i16>, <1 x i16>* %p
   %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 false)
   store <1 x i16> %tmp, <1 x i16>* %p
@@ -86,6 +133,18 @@ define void @test_v1i16(<1 x i16>* %p) {
 
 define void @test_v2i16(<2 x i16>* %p) {
 ; CHECK-LABEL: test_v2i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vorr.i32 d16, #0x10000
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vuzp.16 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i16>, <2 x i16>* %p
   %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 false)
   store <2 x i16> %tmp, <2 x i16>* %p
@@ -94,14 +153,16 @@ define void @test_v2i16(<2 x i16>* %p) {
 
 define void @test_v4i16(<4 x i16>* %p) {
 ; CHECK-LABEL: test_v4i16:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i16	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s16	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i16	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vpaddl.u8	[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i16 d18, #0x1
+; CHECK-NEXT:    vneg.s16 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i16 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i16>, <4 x i16>* %p
   %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 false)
   store <4 x i16> %tmp, <4 x i16>* %p
@@ -110,14 +171,16 @@ define void @test_v4i16(<4 x i16>* %p) {
 
 define void @test_v8i16(<8 x i16>* %p) {
 ; CHECK-LABEL: test_v8i16:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i16	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s16	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i16	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i16 q10, #0x1
+; CHECK-NEXT:    vneg.s16 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i16 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i16>, <8 x i16>* %p
   %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false)
   store <8 x i16> %tmp, <8 x i16>* %p
@@ -126,6 +189,12 @@ define void @test_v8i16(<8 x i16>* %p) {
 
 define void @test_v1i32(<1 x i32>* %p) {
 ; CHECK-LABEL: test_v1i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i32>, <1 x i32>* %p
   %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 false)
   store <1 x i32> %tmp, <1 x i32>* %p
@@ -134,15 +203,17 @@ define void @test_v1i32(<1 x i32>* %p) {
 
 define void @test_v2i32(<2 x i32>* %p) {
 ; CHECK-LABEL: test_v2i32:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s32	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i32	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vpaddl.u8	[[D1]], [[D1]]
-; CHECK: vpaddl.u16	[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i32 d18, #0x1
+; CHECK-NEXT:    vneg.s32 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i32 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i32>, <2 x i32>* %p
   %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
   store <2 x i32> %tmp, <2 x i32>* %p
@@ -151,15 +222,17 @@ define void @test_v2i32(<2 x i32>* %p) {
 
 define void @test_v4i32(<4 x i32>* %p) {
 ; CHECK-LABEL: test_v4i32:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s32	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i32	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vpaddl.u8	[[Q1]], [[Q1]]
-; CHECK: vpaddl.u16	[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i32 q10, #0x1
+; CHECK-NEXT:    vneg.s32 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i32 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i32>, <4 x i32>* %p
   %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false)
   store <4 x i32> %tmp, <4 x i32>* %p
@@ -168,17 +241,19 @@ define void @test_v4i32(<4 x i32>* %p) {
 
 define void @test_v1i64(<1 x i64>* %p) {
 ; CHECK-LABEL: test_v1i64:
-; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x0
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i64	[[D3:d[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[D2]], [[D2]], [[D1]]
-; CHECK: vand		[[D2]], [[D1]], [[D2]]
-; CHECK: vadd.i64	[[D2]], [[D2]], [[D3]]
-; CHECK: vcnt.8		[[D2]], [[D2]]
-; CHECK: vpaddl.u8	[[D2]], [[D2]]
-; CHECK: vpaddl.u16	[[D2]], [[D2]]
-; CHECK: vpaddl.u32	[[D2]], [[D2]]
-; CHECK: vstr		[[D2]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x0
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vmov.i64 d18, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 d16, d16, d17
+; CHECK-NEXT:    vand d16, d17, d16
+; CHECK-NEXT:    vadd.i64 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vpaddl.u32 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i64>, <1 x i64>* %p
   %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 false)
   store <1 x i64> %tmp, <1 x i64>* %p
@@ -187,17 +262,19 @@ define void @test_v1i64(<1 x i64>* %p) {
 
 define void @test_v2i64(<2 x i64>* %p) {
 ; CHECK-LABEL: test_v2i64:
-; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x0
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i64	[[Q3:q[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[Q2]], [[Q2]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q2]], [[Q1]], [[Q2]]
-; CHECK: vadd.i64	[[Q2]], [[Q2]], [[Q3]]
-; CHECK: vcnt.8		[[Q2]], [[Q2]]
-; CHECK: vpaddl.u8	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u16	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u32	[[Q2]], [[Q2]]
-; CHECK: vst1.64	{d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x0
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmov.i64 q10, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    vand q8, q9, q8
+; CHECK-NEXT:    vadd.i64 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i64>, <2 x i64>* %p
   %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false)
   store <2 x i64> %tmp, <2 x i64>* %p
@@ -207,7 +284,13 @@ define void @test_v2i64(<2 x i64>* %p) {
 ;------------------------------------------------------------------------------
 
 define void @test_v1i8_zero_undef(<1 x i8>* %p) {
-; CHECK-LABEL: test_v1i8_zero_undef
+; CHECK-LABEL: test_v1i8_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrb r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i8>, <1 x i8>* %p
   %tmp = call <1 x i8> @llvm.cttz.v1i8(<1 x i8> %a, i1 true)
   store <1 x i8> %tmp, <1 x i8>* %p
@@ -216,6 +299,20 @@ define void @test_v1i8_zero_undef(<1 x i8>* %p) {
 
 define void @test_v2i8_zero_undef(<2 x i8>* %p) {
 ; CHECK-LABEL: test_v2i8_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.16 {d16[0]}, [r0:16]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vmov.32 r1, d16[1]
+; CHECK-NEXT:    vmov.32 r2, d16[0]
+; CHECK-NEXT:    strb r1, [r0, #1]
+; CHECK-NEXT:    strb r2, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i8>, <2 x i8>* %p
   %tmp = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
   store <2 x i8> %tmp, <2 x i8>* %p
@@ -224,6 +321,17 @@ define void @test_v2i8_zero_undef(<2 x i8>* %p) {
 
 define void @test_v4i8_zero_undef(<4 x i8>* %p) {
 ; CHECK-LABEL: test_v4i8_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u8 q8, d16
+; CHECK-NEXT:    vneg.s16 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i16 d17, #0xf
+; CHECK-NEXT:    vclz.i16 d16, d16
+; CHECK-NEXT:    vsub.i16 d16, d17, d16
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i8>, <4 x i8>* %p
   %tmp = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
   store <4 x i8> %tmp, <4 x i8>* %p
@@ -232,13 +340,15 @@ define void @test_v4i8_zero_undef(<4 x i8>* %p) {
 
 define void @test_v8i8_zero_undef(<8 x i8>* %p) {
 ; CHECK-LABEL: test_v8i8_zero_undef:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i8	[[D2:d[0-9]+]], #0x1
-; CHECK: vneg.s8	[[D3:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D3]]
-; CHECK: vsub.i8	[[D1]], [[D1]], [[D2]]
-; CHECK: vcnt.8		[[D1]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vmov.i8 d18, #0x1
+; CHECK-NEXT:    vneg.s8 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vsub.i8 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i8>, <8 x i8>* %p
   %tmp = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
   store <8 x i8> %tmp, <8 x i8>* %p
@@ -247,13 +357,15 @@ define void @test_v8i8_zero_undef(<8 x i8>* %p) {
 
 define void @test_v16i8_zero_undef(<16 x i8>* %p) {
 ; CHECK-LABEL: test_v16i8_zero_undef:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i8	[[Q2:q[0-9]+]], #0x1
-; CHECK: vneg.s8	[[Q3:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q3]]
-; CHECK: vsub.i8	[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vcnt.8		[[Q1]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vmov.i8 q10, #0x1
+; CHECK-NEXT:    vneg.s8 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vsub.i8 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <16 x i8>, <16 x i8>* %p
   %tmp = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
   store <16 x i8> %tmp, <16 x i8>* %p
@@ -262,6 +374,12 @@ define void @test_v16i8_zero_undef(<16 x i8>* %p) {
 
 define void @test_v1i16_zero_undef(<1 x i16>* %p) {
 ; CHECK-LABEL: test_v1i16_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrh r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i16>, <1 x i16>* %p
   %tmp = call <1 x i16> @llvm.cttz.v1i16(<1 x i16> %a, i1 true)
   store <1 x i16> %tmp, <1 x i16>* %p
@@ -270,6 +388,17 @@ define void @test_v1i16_zero_undef(<1 x i16>* %p) {
 
 define void @test_v2i16_zero_undef(<2 x i16>* %p) {
 ; CHECK-LABEL: test_v2i16_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vneg.s32 d18, d16
+; CHECK-NEXT:    vand d16, d16, d18
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vuzp.16 d16, d17
+; CHECK-NEXT:    vst1.32 {d16[0]}, [r0:32]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i16>, <2 x i16>* %p
   %tmp = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
   store <2 x i16> %tmp, <2 x i16>* %p
@@ -278,13 +407,15 @@ define void @test_v2i16_zero_undef(<2 x i16>* %p) {
 
 define void @test_v4i16_zero_undef(<4 x i16>* %p) {
 ; CHECK-LABEL: test_v4i16_zero_undef:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vneg.s16	[[D2:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D2]]
-; CHECK: vmov.i16	[[D3:d[0-9]+]], #0xf
-; CHECK: vclz.i16	[[D1]], [[D1]]
-; CHECK: vsub.i16	[[D1]], [[D3]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vneg.s16 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vmov.i16 d17, #0xf
+; CHECK-NEXT:    vclz.i16 d16, d16
+; CHECK-NEXT:    vsub.i16 d16, d17, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i16>, <4 x i16>* %p
   %tmp = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
   store <4 x i16> %tmp, <4 x i16>* %p
@@ -293,13 +424,15 @@ define void @test_v4i16_zero_undef(<4 x i16>* %p) {
 
 define void @test_v8i16_zero_undef(<8 x i16>* %p) {
 ; CHECK-LABEL: test_v8i16_zero_undef:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vneg.s16	[[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vmov.i16	[[Q3:q[0-9]+]], #0xf
-; CHECK: vclz.i16	[[Q1]], [[Q1]]
-; CHECK: vsub.i16	[[Q1]], [[Q3]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vneg.s16 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vmov.i16 q9, #0xf
+; CHECK-NEXT:    vclz.i16 q8, q8
+; CHECK-NEXT:    vsub.i16 q8, q9, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <8 x i16>, <8 x i16>* %p
   %tmp = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
   store <8 x i16> %tmp, <8 x i16>* %p
@@ -308,6 +441,12 @@ define void @test_v8i16_zero_undef(<8 x i16>* %p) {
 
 define void @test_v1i32_zero_undef(<1 x i32>* %p) {
 ; CHECK-LABEL: test_v1i32_zero_undef:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldr r1, [r0]
+; CHECK-NEXT:    rbit r1, r1
+; CHECK-NEXT:    clz r1, r1
+; CHECK-NEXT:    str r1, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i32>, <1 x i32>* %p
   %tmp = call <1 x i32> @llvm.cttz.v1i32(<1 x i32> %a, i1 true)
   store <1 x i32> %tmp, <1 x i32>* %p
@@ -316,13 +455,15 @@ define void @test_v1i32_zero_undef(<1 x i32>* %p) {
 
 define void @test_v2i32_zero_undef(<2 x i32>* %p) {
 ; CHECK-LABEL: test_v2i32_zero_undef:
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vneg.s32	[[D2:d[0-9]+]], [[D1]]
-; CHECK: vand		[[D1]], [[D1]], [[D2]]
-; CHECK: vmov.i32	[[D3:d[0-9]+]], #0x1f
-; CHECK: vclz.i32	[[D1]], [[D1]]
-; CHECK: vsub.i32	[[D1]], [[D3]], [[D1]]
-; CHECK: vstr		[[D1]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vneg.s32 d17, d16
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vmov.i32 d17, #0x1f
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vsub.i32 d16, d17, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i32>, <2 x i32>* %p
   %tmp = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
   store <2 x i32> %tmp, <2 x i32>* %p
@@ -331,13 +472,15 @@ define void @test_v2i32_zero_undef(<2 x i32>* %p) {
 
 define void @test_v4i32_zero_undef(<4 x i32>* %p) {
 ; CHECK-LABEL: test_v4i32_zero_undef:
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vneg.s32	[[Q2:q[0-9]+]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q1]], [[Q1]], [[Q2]]
-; CHECK: vmov.i32	[[Q3:q[0-9]+]], #0x1f
-; CHECK: vclz.i32	[[Q1]], [[Q1]]
-; CHECK: vsub.i32	[[Q1]], [[Q3]], [[Q1]]
-; CHECK: vst1.64	{[[D1]], [[D2]]}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vneg.s32 q9, q8
+; CHECK-NEXT:    vand q8, q8, q9
+; CHECK-NEXT:    vmov.i32 q9, #0x1f
+; CHECK-NEXT:    vclz.i32 q8, q8
+; CHECK-NEXT:    vsub.i32 q8, q9, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <4 x i32>, <4 x i32>* %p
   %tmp = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
   store <4 x i32> %tmp, <4 x i32>* %p
@@ -346,17 +489,19 @@ define void @test_v4i32_zero_undef(<4 x i32>* %p) {
 
 define void @test_v1i64_zero_undef(<1 x i64>* %p) {
 ; CHECK-LABEL: test_v1i64_zero_undef:
-; CHECK: vmov.i32	[[D2:d[0-9]+]], #0x0
-; CHECK: vldr		[[D1:d[0-9]+]], [r0]
-; CHECK: vmov.i64	[[D3:d[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[D2]], [[D2]], [[D1]]
-; CHECK: vand		[[D2]], [[D1]], [[D2]]
-; CHECK: vadd.i64	[[D2]], [[D2]], [[D3]]
-; CHECK: vcnt.8		[[D2]], [[D2]]
-; CHECK: vpaddl.u8	[[D2]], [[D2]]
-; CHECK: vpaddl.u16	[[D2]], [[D2]]
-; CHECK: vpaddl.u32	[[D2]], [[D2]]
-; CHECK: vstr		[[D2]], [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 d16, #0x0
+; CHECK-NEXT:    vldr d17, [r0]
+; CHECK-NEXT:    vmov.i64 d18, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 d16, d16, d17
+; CHECK-NEXT:    vand d16, d17, d16
+; CHECK-NEXT:    vadd.i64 d16, d16, d18
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vpaddl.u32 d16, d16
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <1 x i64>, <1 x i64>* %p
   %tmp = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %a, i1 true)
   store <1 x i64> %tmp, <1 x i64>* %p
@@ -365,17 +510,19 @@ define void @test_v1i64_zero_undef(<1 x i64>* %p) {
 
 define void @test_v2i64_zero_undef(<2 x i64>* %p) {
 ; CHECK-LABEL: test_v2i64_zero_undef:
-; CHECK: vmov.i32	[[Q2:q[0-9]+]], #0x0
-; CHECK: vld1.64	{[[D1:d[0-9]+]], [[D2:d[0-9]+]]}, [r0]
-; CHECK: vmov.i64	[[Q3:q[0-9]+]], #0xffffffffffffffff
-; CHECK: vsub.i64	[[Q2]], [[Q2]], [[Q1:q[0-9]+]]
-; CHECK: vand		[[Q2]], [[Q1]], [[Q2]]
-; CHECK: vadd.i64	[[Q2]], [[Q2]], [[Q3]]
-; CHECK: vcnt.8		[[Q2]], [[Q2]]
-; CHECK: vpaddl.u8	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u16	[[Q2]], [[Q2]]
-; CHECK: vpaddl.u32	[[Q2]], [[Q2]]
-; CHECK: vst1.64	{d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i32 q8, #0x0
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
+; CHECK-NEXT:    vmov.i64 q10, #0xffffffffffffffff
+; CHECK-NEXT:    vsub.i64 q8, q8, q9
+; CHECK-NEXT:    vand q8, q9, q8
+; CHECK-NEXT:    vadd.i64 q8, q8, q10
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    bx lr
   %a = load <2 x i64>, <2 x i64>* %p
   %tmp = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
   store <2 x i64> %tmp, <2 x i64>* %p
diff --git a/test/CodeGen/ARM/dbg-range-extension.mir b/test/CodeGen/ARM/dbg-range-extension.mir
index 0dd9ed2b207feb0693a225c937ba67cd1665d8dd..f2b174a8ac0bdb33b3a13cf9cc693f7d5e780eb6 100644
--- a/test/CodeGen/ARM/dbg-range-extension.mir
+++ b/test/CodeGen/ARM/dbg-range-extension.mir
@@ -23,37 +23,37 @@
 # CHECK: [[VAR_I:![0-9]+]] = !DILocalVariable(name: "i",
 
 # CHECK: bb.0.entry
-# CHECK: DBG_VALUE debug-use $r0, debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_A:\$r[0-9]+]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_B:\$r[0-9]+]], debug-use $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE $r0, $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_A:\$r[0-9]+]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_B:\$r[0-9]+]], $noreg, [[VAR_B]]
 
 # CHECK: bb.1.if.then
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_C:\$r[0-9]+]], debug-use $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_C:\$r[0-9]+]], $noreg, [[VAR_C]]
 # CHECK: DBG_VALUE 1, 0, [[VAR_I]]
 
 # CHECK: bb.2.for.body
-# CHECK: DBG_VALUE debug-use [[REG_I:\$r[0-9]+]], debug-use $noreg, [[VAR_I]]
-# CHECK: DBG_VALUE debug-use [[REG_C]], debug-use $noreg, [[VAR_C]]
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_I]], debug-use $noreg, [[VAR_I]]
+# CHECK: DBG_VALUE [[REG_I:\$r[0-9]+]], $noreg, [[VAR_I]]
+# CHECK: DBG_VALUE [[REG_C]], $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_I]], $noreg, [[VAR_I]]
 
 # CHECK: bb.3.for.cond
-# CHECK: DBG_VALUE debug-use [[REG_C]], debug-use $noreg, [[VAR_C]]
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
-# CHECK: DBG_VALUE debug-use [[REG_I]], debug-use $noreg, [[VAR_I]]
+# CHECK: DBG_VALUE [[REG_C]], $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_I]], $noreg, [[VAR_I]]
 
 # CHECK: bb.4.for.cond.cleanup
-# CHECK: DBG_VALUE debug-use [[REG_C]], debug-use $noreg, [[VAR_C]]
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_C]], $noreg, [[VAR_C]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
 
 # CHECK: bb.5.if.end
-# CHECK: DBG_VALUE debug-use [[REG_B]], debug-use $noreg, [[VAR_B]]
-# CHECK: DBG_VALUE debug-use [[REG_A]], debug-use $noreg, [[VAR_A]]
+# CHECK: DBG_VALUE [[REG_B]], $noreg, [[VAR_B]]
+# CHECK: DBG_VALUE [[REG_A]], $noreg, [[VAR_A]]
 --- |
   ; ModuleID = '/data/kwalker/work/OpenSource-llvm/llvm/test/CodeGen/ARM/dbg-range-extension.ll'
   source_filename = "/data/kwalker/work/OpenSource-llvm/llvm/test/CodeGen/ARM/dbg-range-extension.ll"
@@ -219,14 +219,14 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r6, -16
     frame-setup CFI_INSTRUCTION offset $r5, -20
     frame-setup CFI_INSTRUCTION offset $r4, -24
-    DBG_VALUE debug-use $r0, debug-use $noreg, !13, !20, debug-location !21
+    DBG_VALUE $r0, $noreg, !13, !20, debug-location !21
     $r4 = MOVr killed $r0, 14, $noreg, $noreg
-    DBG_VALUE debug-use $r4, debug-use $noreg, !13, !20, debug-location !21
+    DBG_VALUE $r4, $noreg, !13, !20, debug-location !21
     $r0 = MOVi 10, 14, $noreg, _, debug-location !22
     $r1 = MOVi 11, 14, $noreg, _, debug-location !22
     BL @func2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp, implicit-def $r0, debug-location !22
     $r5 = MOVr killed $r0, 14, $noreg, _, debug-location !22
-    DBG_VALUE debug-use $r5, debug-use $noreg, !14, !20, debug-location !23
+    DBG_VALUE $r5, $noreg, !14, !20, debug-location !23
     CMPri $r4, 0, 14, $noreg, implicit-def $cpsr, debug-location !25
     Bcc %bb.5.if.end, 0, killed $cpsr
   
@@ -237,7 +237,7 @@ body:             |
     $r1 = MOVi 13, 14, $noreg, _, debug-location !26
     BL @func2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp, implicit-def $r0, debug-location !26
     $r6 = MOVr killed $r0, 14, $noreg, _, debug-location !26
-    DBG_VALUE debug-use $r6, debug-use $noreg, !15, !20, debug-location !27
+    DBG_VALUE $r6, $noreg, !15, !20, debug-location !27
     $r7 = MOVi 1, 14, $noreg, $noreg
     DBG_VALUE 1, 0, !18, !20, debug-location !28
     B %bb.3.for.cond
@@ -249,12 +249,12 @@ body:             |
     $r0 = MOVr $r7, 14, $noreg, _, debug-location !36
     BL @func2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit killed $r1, implicit-def $sp, implicit-def dead $r0, debug-location !36
     $r7 = ADDri killed $r7, 1, 14, $noreg, _, debug-location !38
-    DBG_VALUE debug-use $r7, debug-use $noreg, !18, !20, debug-location !28
+    DBG_VALUE $r7, $noreg, !18, !20, debug-location !28
   
   bb.3.for.cond:
     liveins: $r4, $r5, $r6, $r7
   
-    DBG_VALUE debug-use $r7, debug-use $noreg, !18, !20, debug-location !28
+    DBG_VALUE $r7, $noreg, !18, !20, debug-location !28
     CMPrr $r7, $r4, 14, $noreg, implicit-def $cpsr, debug-location !33
     Bcc %bb.2.for.body, 11, killed $cpsr, debug-location !33
   
diff --git a/test/CodeGen/ARM/debugtrap.ll b/test/CodeGen/ARM/debugtrap.ll
index 5064a4ec2ca906d3a4142314b6b8ff656ab773f5..88ca81c4f2cf98063913860247249e7c88f955ce 100644
--- a/test/CodeGen/ARM/debugtrap.ll
+++ b/test/CodeGen/ARM/debugtrap.ll
@@ -1,7 +1,10 @@
 ; This test ensures the @llvm.debugtrap() call is not removed when generating
 ; the 'pop' instruction to restore the callee saved registers on ARM.
 
-; RUN: llc < %s -mtriple=armv7 -O0 -filetype=asm | FileCheck %s 
+; RUN: llc < %s -mtriple=armv4 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V4 %s
+; RUN: llc < %s -mtriple=armv5 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V5 %s
+; RUN: llc < %s -mtriple=thumbv4 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V4 %s
+; RUN: llc < %s -mtriple=thumbv5 -O0 -filetype=asm | FileCheck --check-prefixes=CHECK,V5 %s
 
 declare void @llvm.debugtrap() nounwind
 declare void @foo() nounwind
@@ -9,8 +12,9 @@ declare void @foo() nounwind
 define void @test() nounwind {
 entry:
   ; CHECK: bl foo
+  ; V4-NEXT: udf #254
+  ; V5-NEXT: bkpt #0
   ; CHECK-NEXT: pop
-  ; CHECK-NEXT: .inst 0xe7ffdefe
   call void @foo()
   call void @llvm.debugtrap()
   ret void
diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll
index 71cd73a4a25d10e00c5f09c57b96484ad282b5ef..9dab0abedb64f6245ca5ac408ea2a678c7892772 100644
--- a/test/CodeGen/ARM/fast-isel-align.ll
+++ b/test/CodeGen/ARM/fast-isel-align.ll
@@ -26,12 +26,12 @@
 define void @unaligned_store(float %x, float %y) nounwind {
 entry:
 ; ARM: @unaligned_store
-; ARM: vmov r1, s0
-; ARM: str r1, [r0]
+; ARM: vmov [[REG:r[0-9]+]], s0
+; ARM: str [[REG]], [{{r[0-9]+}}]
 
 ; THUMB: @unaligned_store
-; THUMB: vmov r1, s0
-; THUMB: str r1, [r0]
+; THUMB: vmov [[REG:r[0-9]+]], s0
+; THUMB: str [[REG]], [{{r[0-9]+}}]
 
   %add = fadd float %x, %y
   %0 = load %struct.anon*, %struct.anon** @a, align 4
diff --git a/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll b/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll
index ca512970c9cf1786da333fe55547dad450eb7846..f49c907c4145a62b37877e9c9fedeaf9d2b39b4a 100644
--- a/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll
+++ b/test/CodeGen/ARM/fast-isel-ldrh-strh-arm.ll
@@ -82,7 +82,8 @@ entry:
 ; ARM: t9
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 -8
   store i16 0, i16* %add.ptr, align 2
-; ARM: strh	r1, [r0, #-16]
+; ARM: movw [[REG0:r[0-9]+]], #0
+; ARM: strh [[REG0]], [{{r[0-9]+}}, #-16]
   ret void
 }
 
@@ -93,9 +94,10 @@ entry:
 ; ARM: t10
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 -128
   store i16 0, i16* %add.ptr, align 2
-; ARM: mvn r{{[1-9]}}, #255
-; ARM: add r0, r0, r{{[1-9]}}
-; ARM: strh r{{[1-9]}}, [r0]
+; ARM: mvn r1, #255
+; ARM: add [[REG0:r[0-9]+]], r0, r1
+; ARM: movw [[REG1:r[0-9]+]], #0
+; ARM: strh [[REG1]], {{\[}}[[REG0]]]
   ret void
 }
 
@@ -104,7 +106,8 @@ entry:
 ; ARM: t11
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 8
   store i16 0, i16* %add.ptr, align 2
-; ARM: strh r{{[1-9]}}, [r0, #16]
+; ARM: movw [[REG1:r[0-9]+]], #0
+; ARM: strh [[REG1]], [{{r[0-9]+}}, #16]
   ret void
 }
 
@@ -115,8 +118,9 @@ entry:
 ; ARM: t12
   %add.ptr = getelementptr inbounds i16, i16* %a, i64 128
   store i16 0, i16* %add.ptr, align 2
-; ARM: add r0, r0, #256
-; ARM: strh r{{[1-9]}}, [r0]
+; ARM: add [[REG0:r[0-9]+]], r0, #256
+; ARM: movw [[REG1:r[0-9]+]], #0
+; ARM: strh [[REG1]], {{\[}}[[REG0]]]
   ret void
 }
 
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 502285e85dfdf1ca815c7a9e3fc3d19f24ca613c..3661beab5c02842ada763ea7cf64e0dfa5e12df6 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -149,21 +149,21 @@ define void @test4() {
 
 ; THUMB: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr.n r0, .LCPI)}}
 ; THUMB: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}}
-; THUMB: ldr r0, [r0]
-; THUMB: ldr r1, [r0]
-; THUMB: adds r1, #1
-; THUMB: str r1, [r0]
+; THUMB: ldr [[REG:r[0-9]+]], [r0]
+; THUMB: ldr [[REG1:r[0-9]+]], {{\[}}[[REG]]]
+; THUMB: adds [[REG1]], #1
+; THUMB: str [[REG1]], {{\[}}[[REG]]]
 
 ; ARM-MACHO: {{(movw r0, :lower16:L_test4g\$non_lazy_ptr)|(ldr r0, .LCPI)}}
 ; ARM-MACHO: {{(movt r0, :upper16:L_test4g\$non_lazy_ptr)?}}
-; ARM-MACHO: ldr r0, [r0]
+; ARM-MACHO: ldr [[REG:r[0-9]+]], [r0]
 
-; ARM-ELF: movw r0, :lower16:test4g
-; ARM-ELF: movt r0, :upper16:test4g
+; ARM-ELF: movw [[REG:r[0-9]+]], :lower16:test4g
+; ARM-ELF: movt [[REG]], :upper16:test4g
 
-; ARM: ldr r1, [r0]
-; ARM: add r1, r1, #1
-; ARM: str r1, [r0]
+; ARM: ldr [[REG1:r[0-9]+]], {{\[}}[[REG]]]
+; ARM: add [[REG2:r[0-9]+]], [[REG1]], #1
+; ARM: str [[REG2]], {{\[}}[[REG]]]
 }
 
 ; ARM: @urem_fold
diff --git a/test/CodeGen/ARM/fmacs.ll b/test/CodeGen/ARM/fmacs.ll
index 027991ef2c95d74658b8c57e46b4612e859808b4..140ab933d0cdedab885d9d5cee41bdd7b5f101d9 100644
--- a/test/CodeGen/ARM/fmacs.ll
+++ b/test/CodeGen/ARM/fmacs.ll
@@ -27,10 +27,11 @@ entry:
 	ret float %1
 }
 
-define float @vlma_minsize(float %acc, float %a, float %b) #0 {
+define float @vmla_minsize(float %acc, float %a, float %b) #0 {
 entry:
-; VMLA-LABEL: vlma_minsize:
-; VLMA:       vmla.f32  s0, s1, s2
+; VMLA-LABEL: vmla_minsize:
+; VMLA:       vmla.f32  s0, s1, s2
+; VMLA-NEXT:  bx  lr
 
   %0 = fmul float %a, %b
   %1 = fadd float %acc, %0
diff --git a/test/CodeGen/ARM/fold-sext-sextload.ll b/test/CodeGen/ARM/fold-sext-sextload.ll
index 484e93f59d48aba8616ae48e7f2608d19819b7d2..96e2e78a47d4b8bf99f428467eaadaea54b646c5 100644
--- a/test/CodeGen/ARM/fold-sext-sextload.ll
+++ b/test/CodeGen/ARM/fold-sext-sextload.ll
@@ -1,15 +1,14 @@
 ; RUN: llc -mtriple armv7 %s -stop-before=livedebugvalues -o - | FileCheck %s
 
-define <4 x i8> @i(<4 x i8>*) !dbg !8 {
-  %2 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
+define <4 x i8> @i(<4 x i8>*, <4 x i8>) !dbg !8 {
+  %3 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
   ; CHECK: $[[reg:.*]] = VLD1LNd32 {{.*}} debug-location !14 :: (load 4 from %ir.0)
-  ; CHECK-NEXT: VMOVLsv8i16 {{.*}} $[[reg]], {{.*}} debug-location !14
-  ; CHECK-NEXT: VMOVLsv4i32 {{.*}} $[[reg]], {{.*}} debug-location !14
-
-  %3 = sdiv <4 x i8> zeroinitializer, %2, !dbg !15
-  call void @llvm.dbg.value(metadata <4 x i8> %2, metadata !11, metadata !DIExpression()), !dbg !14
-  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !13, metadata !DIExpression()), !dbg !15
-  ret <4 x i8> %3, !dbg !16
+  ; CHECK: VMOVLsv8i16 {{.*}} $[[reg]], {{.*}} debug-location !14
+  ; CHECK: VMOVLsv4i32 {{.*}} $[[reg]], {{.*}} debug-location !14
+  %4 = sdiv <4 x i8> %1, %3, !dbg !15
+  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !11, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata <4 x i8> %4, metadata !13, metadata !DIExpression()), !dbg !15
+  ret <4 x i8> %4, !dbg !16
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
diff --git a/test/CodeGen/ARM/fold-zext-zextload.ll b/test/CodeGen/ARM/fold-zext-zextload.ll
index 3ff0dd885a80eaf014cd1911f4c3227c4c822056..25e226fda6642e7ced5693851c2c86be04318a10 100644
--- a/test/CodeGen/ARM/fold-zext-zextload.ll
+++ b/test/CodeGen/ARM/fold-zext-zextload.ll
@@ -1,15 +1,14 @@
 ; RUN: llc -mtriple armv7 %s -stop-before=livedebugvalues -o - | FileCheck %s
 
-define <4 x i8> @i(<4 x i8>*) !dbg !8 {
-  %2 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
+define <4 x i8> @i(<4 x i8>*, <4 x i8>) !dbg !8 {
+  %3 = load <4 x i8>, <4 x i8>* %0, align 4, !dbg !14
   ; CHECK: $[[reg:.*]] = VLD1LNd32 {{.*}} debug-location !14 :: (load 4 from %ir.0)
   ; CHECK-NEXT: VMOVLuv8i16 {{.*}} $[[reg]], {{.*}} debug-location !14
   ; CHECK-NEXT: VMOVLuv4i32 {{.*}} $[[reg]], {{.*}} debug-location !14
-
-  %3 = udiv <4 x i8> zeroinitializer, %2, !dbg !15
-  call void @llvm.dbg.value(metadata <4 x i8> %2, metadata !11, metadata !DIExpression()), !dbg !14
-  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !13, metadata !DIExpression()), !dbg !15
-  ret <4 x i8> %3, !dbg !16
+  %4 = udiv <4 x i8> %1, %3, !dbg !15
+  call void @llvm.dbg.value(metadata <4 x i8> %3, metadata !11, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata <4 x i8> %4, metadata !13, metadata !DIExpression()), !dbg !15
+  ret <4 x i8> %4, !dbg !16
 }
 
 declare void @llvm.dbg.value(metadata, metadata, metadata)
diff --git a/test/CodeGen/ARM/fp16-instructions.ll b/test/CodeGen/ARM/fp16-instructions.ll
index eb5ec5eb87d93c84d8a58f71145626abae0aad31..670fcf58b1edbabd22835a1aeada67c9bf2420a9 100644
--- a/test/CodeGen/ARM/fp16-instructions.ll
+++ b/test/CodeGen/ARM/fp16-instructions.ll
@@ -935,9 +935,9 @@ entry:
 ; CHECK-SOFTFP-FP16-T32:       vmov	[[S6:s[0-9]]], r0
 ; CHECK-SOFTFP-FP16-T32:       vldr	s0, .LCP{{.*}}
 ; CHECK-SOFTFP-FP16-T32:       vcvtb.f32.f16	[[S6]], [[S6]]
-; CHECK-SOFTFP-FP16-T32:       vmov.f32	[[S2:s[0-9]]], #-2.000000e+00
-; CHECK-SOFTFP-FP16-T32:       vcmp.f32	[[S6]], s0
 ; CHECK-SOFTFP-FP16-T32:       vldr	[[S4:s[0-9]]], .LCPI{{.*}}
+; CHECK-SOFTFP-FP16-T32:       vcmp.f32	[[S6]], s0
+; CHECK-SOFTFP-FP16-T32:       vmov.f32	[[S2:s[0-9]]], #-2.000000e+00
 ; CHECK-SOFTFP-FP16-T32:       vmrs	APSR_nzcv, fpscr
 ; CHECK-SOFTFP-FP16-T32:       it eq
 ; CHECK-SOFTFP-FP16-T32:       vmoveq.f32	[[S4]], [[S2]]
@@ -1043,7 +1043,7 @@ entry:
 
 ; CHECK-SPILL-RELOAD-LABEL: fn1:
 ; CHECK-SPILL-RELOAD:       vstr.16 s0, [sp, #{{.}}]  @ 2-byte Spill
-; CHECK-SPILL-RELOAD-NEXT:  bl  fn2
+; CHECK-SPILL-RELOAD:  bl  fn2
 ; CHECK-SPILL-RELOAD-NEXT:  vldr.16 s0, [sp, #{{.}}]  @ 2-byte Reload
 }
 
diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll
index dae9ef2ea83ac4ae91afc98e128346811854ddff..d7eaddc9e408a0416d9055a6df0bbe5d54df2322 100644
--- a/test/CodeGen/ARM/fp16-promote.ll
+++ b/test/CodeGen/ARM/fp16-promote.ll
@@ -644,7 +644,7 @@ define void @test_maxnum(half* %p, half* %q) #0 {
   ret void
 }
 
-; CHECK-ALL-LABEL: test_minnan:
+; CHECK-ALL-LABEL: test_minimum:
 ; CHECK-FP16: vmov.f32 s0, #1.000000e+00
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-LIBCALL: bl __aeabi_h2f
@@ -654,7 +654,7 @@ define void @test_maxnum(half* %p, half* %q) #0 {
 ; CHECK-NOVFP: bl __aeabi_fcmpge
 ; CHECK-FP16: vcvtb.f16.f32
 ; CHECK-LIBCALL: bl __aeabi_f2h
-define void @test_minnan(half* %p) #0 {
+define void @test_minimum(half* %p) #0 {
   %a = load half, half* %p, align 2
   %c = fcmp ult half %a, 1.0
   %r = select i1 %c, half %a, half 1.0
@@ -662,7 +662,7 @@ define void @test_minnan(half* %p) #0 {
   ret void
 }
 
-; CHECK-ALL-LABEL: test_maxnan:
+; CHECK-ALL-LABEL: test_maximum:
 ; CHECK-FP16: vmov.f32 s0, #1.000000e+00
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-LIBCALL: bl __aeabi_h2f
@@ -672,7 +672,7 @@ define void @test_minnan(half* %p) #0 {
 ; CHECK-NOVFP: bl __aeabi_fcmple
 ; CHECK-FP16: vcvtb.f16.f32
 ; CHECK-LIBCALL: bl __aeabi_f2h
-define void @test_maxnan(half* %p) #0 {
+define void @test_maximum(half* %p) #0 {
   %a = load half, half* %p, align 2
   %c = fcmp ugt half %a, 1.0
   %r = select i1 %c, half %a, half 1.0
diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll
index 6f6cdc11491e88916ad05218fbfaa92058959d85..6b922895b0060736b2c506ec035a6ce0c0d9b3a7 100644
--- a/test/CodeGen/ARM/fusedMAC.ll
+++ b/test/CodeGen/ARM/fusedMAC.ll
@@ -1,4 +1,8 @@
 ; RUN: llc < %s -mtriple=armv7-eabi -mattr=+neon,+vfp4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m7  -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m4  -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+; RUN: llc < %s -mtriple=arm-arm-eabi -mcpu=cortex-m33 -fp-contract=fast | FileCheck %s -check-prefix=DONT-FUSE
+
 ; Check generated fused MAC and MLS.
 
 define double @fusedMACTest1(double %d1, double %d2, double %d3) {
@@ -12,6 +16,11 @@ define double @fusedMACTest1(double %d1, double %d2, double %d3) {
 define float @fusedMACTest2(float %f1, float %f2, float %f3) {
 ;CHECK-LABEL: fusedMACTest2:
 ;CHECK: vfma.f32
+
+;DONT-FUSE-LABEL: fusedMACTest2:
+;DONT-FUSE:       vmul.f32
+;DONT-FUSE-NEXT:  vadd.f32
+
   %1 = fmul float %f1, %f2
   %2 = fadd float %1, %f3
   ret float %2
diff --git a/test/CodeGen/ARM/inlineasm-X-allocation.ll b/test/CodeGen/ARM/inlineasm-X-allocation.ll
index e88d668f5ccfff11cf8328b2c129f63316522238..b2cb932f905520756602c460449fd4cdb0a93f0f 100644
--- a/test/CodeGen/ARM/inlineasm-X-allocation.ll
+++ b/test/CodeGen/ARM/inlineasm-X-allocation.ll
@@ -1,21 +1,20 @@
-; RUN: llc -mtriple=armv7-none-eabi -mattr=-neon,-vfpv2 %s -o - | FileCheck %s  -check-prefix=novfp
-; RUN: llc -mtriple=armv7-none-eabi -mattr=+neon %s -float-abi=hard -o - | FileCheck %s -check-prefix=vfp
+; RUN: llc -mtriple=armv7-none-eabi -mattr=-neon,-vfp2 %s -o - | FileCheck %s  -check-prefixes=COMMON,NOVFP
+; RUN: llc -mtriple=armv7-none-eabi -mattr=+neon %s -float-abi=hard -o - | FileCheck %s -check-prefixes=COMMON,VFP
 
-; vfp-LABEL: f1
-; vfp-CHECK: vadd.f32 s0, s0, s0
+; The intent here is to test "X", which says that any operand whatsoever is allowed.
+; Using this mechanism, we want to test toggling allocating GPR or SPR registers
+; depending on whether the float registers are available. Thus, the mnemonic is
+; totally irrelevant here, which is why we use FOO and also comment it out using "@"
+; to avoid assembler errors.
 
-; In the novfp case, the compiler is forced to assign a core register.
-; Although this register class can't be used with the vadd.f32 instruction,
-; the compiler behaved as expected since it is allowed to emit anything.
+; Note that this kind of IR can be generated by a function such as:
+;  void f1(float f) {asm volatile ("@FOO $0, $0" : : "X" (f));}
 
-; novfp-LABEL: f1
-; novfp-CHECK: vadd.f32 r0, r0, r0
-
-; This can be generated by a function such as:
-;  void f1(float f) {asm volatile ("add.f32 $0, $0, $0" : : "X" (f));}
-
-define arm_aapcs_vfpcc void @f1(float %f) {
+define arm_aapcs_vfpcc void @func(float %f) {
+; COMMON-LABEL: func
+; NOVFP:        FOO r0, r0
+; VFP:          FOO s0, s0
 entry:
-  call void asm sideeffect "vadd.f32 $0, $0, $0", "X" (float %f) nounwind
+  call void asm sideeffect "@FOO $0, $0", "X" (float %f) nounwind
   ret void
 }
diff --git a/test/CodeGen/ARM/intrinsics-overflow.ll b/test/CodeGen/ARM/intrinsics-overflow.ll
index 835be7e949d3bc8f1a478cb0882d307f3dc47769..d4c20dfacce6d21f3d22f34a2da1cb687aed35ff 100644
--- a/test/CodeGen/ARM/intrinsics-overflow.ll
+++ b/test/CodeGen/ARM/intrinsics-overflow.ll
@@ -38,14 +38,9 @@ define i32 @sadd_overflow(i32 %a, i32 %b) #0 {
   ; ARM: movvc r[[R0]], #0
   ; ARM: mov pc, lr
 
-  ; THUMBV6: mov  r[[R2:[0-9]+]], r[[R0:[0-9]+]]
-  ; THUMBV6: adds r[[R3:[0-9]+]], r[[R0]], r[[R1:[0-9]+]]
-  ; THUMBV6: movs r[[R0]], #0
-  ; THUMBV6: movs r[[R1]], #1
-  ; THUMBV6: cmp  r[[R3]], r[[R2]]
-  ; THUMBV6: bvc  .L[[LABEL:.*]]
-  ; THUMBV6: mov  r[[R0]], r[[R1]]
-  ; THUMBV6: .L[[LABEL]]:
+  ; THUMBV6: adds    r1, r0, r1
+  ; THUMBV6: cmp     r1, r0
+  ; THUMBV6: bvc     .LBB1_2
 
   ; THUMBV7: adds  r[[R2:[0-9]+]], r[[R0]], r[[R1:[0-9]+]]
   ; THUMBV7: mov.w r[[R0:[0-9]+]], #1
@@ -94,12 +89,8 @@ define i32 @ssub_overflow(i32 %a, i32 %b) #0 {
   ; ARM: cmp r[[R0]], r[[R1]]
   ; ARM: movvc r[[R2]], #0
 
-  ; THUMBV6: movs    r[[R0]], #0
-  ; THUMBV6: movs    r[[R3:[0-9]+]], #1
-  ; THUMBV6: cmp     r[[R2]], r[[R1:[0-9]+]]
-  ; THUMBV6: bvc     .L[[LABEL:.*]]
-  ; THUMBV6: mov     r[[R0]], r[[R3]]
-  ; THUMBV6: .L[[LABEL]]:
+  ; THUMBV6: cmp     r0, r1
+  ; THUMBV6: bvc     .LBB3_2
 
   ; THUMBV7: movs  r[[R2:[0-9]+]], #1
   ; THUMBV7: cmp   r[[R0:[0-9]+]], r[[R1:[0-9]+]]
diff --git a/test/CodeGen/ARM/memfunc.ll b/test/CodeGen/ARM/memfunc.ll
index b415ff7b7f419518dff8a879f95e53762da7be59..6c0668a53e82678dc478f585310ef68579c37454 100644
--- a/test/CodeGen/ARM/memfunc.ll
+++ b/test/CodeGen/ARM/memfunc.ll
@@ -388,6 +388,7 @@ entry:
 @arr7 = external global [7 x i8], align 1
 @arr8 = internal global [128 x i8] undef
 @arr9 = weak_odr global [128 x i8] undef
+@arr10 = dso_local global [8 x i8] c"\01\02\03\04\05\06\07\08", align 1
 define void @f9(i8* %dest, i32 %n) "no-frame-pointer-elim"="true" {
 entry:
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr1, i32 0, i32 0), i32 %n, i1 false)
@@ -399,7 +400,7 @@ entry:
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @arr7, i32 0, i32 0), i32 %n, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @arr8, i32 0, i32 0), i32 %n, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([128 x i8], [128 x i8]* @arr9, i32 0, i32 0), i32 %n, i1 false)
-
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @arr10, i32 0, i32 0), i32 %n, i1 false)
   unreachable
 }
 
@@ -427,6 +428,11 @@ entry:
 ; CHECK-GNUEABI: arr8,128,16
 ; CHECK: .p2align 4
 ; CHECK: arr9:
+; CHECK-IOS: .p2align 3
+; CHECK-DARWIN: .p2align 2
+; CHECK-EABI: .p2align 2
+; CHECK-GNUEABI: .p2align 2
+; CHECK: arr10:
 
 ; CHECK-NOT: arr7:
 
diff --git a/test/CodeGen/ARM/popcnt.ll b/test/CodeGen/ARM/popcnt.ll
index fd61811f49cfcc3848ac0fe36bcae074a6ce29f4..e3ce5cd1ff995b2ed3a3a03bf23ff9389b04907e 100644
--- a/test/CodeGen/ARM/popcnt.ll
+++ b/test/CodeGen/ARM/popcnt.ll
@@ -1,17 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 ; Implement ctpop with vcnt
 
 define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vcnt8:
-;CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vcnt8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %tmp1)
 	ret <8 x i8> %tmp2
 }
 
 define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: vcntQ8:
-;CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vcntQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %tmp1)
 	ret <16 x i8> %tmp2
@@ -19,11 +29,12 @@ define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
 
 define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
 ; CHECK-LABEL: vcnt16:
-; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %tmp1)
 	ret <4 x i16> %tmp2
@@ -31,11 +42,13 @@ define <4 x i16> @vcnt16(<4 x i16>* %A) nounwind {
 
 define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
 ; CHECK-LABEL: vcntQ16:
-; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %tmp1)
 	ret <8 x i16> %tmp2
@@ -43,14 +56,13 @@ define <8 x i16> @vcntQ16(<8 x i16>* %A) nounwind {
 
 define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
 ; CHECK-LABEL: vcnt32:
-; CHECK: vcnt.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev16.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vadd.i8 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vuzp.8 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev32.16 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vuzp.16 {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %tmp1)
 	ret <2 x i32> %tmp2
@@ -58,14 +70,14 @@ define <2 x i32> @vcnt32(<2 x i32>* %A) nounwind {
 
 define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
 ; CHECK-LABEL: vcntQ32:
-; CHECK: vcnt.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vrev16.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vadd.i8 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vuzp.8 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
-; CHECK: vrev32.16 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vuzp.16 {{q[0-9]+}}, {{q[0-9]+}}
-; CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %tmp1)
 	ret <4 x i32> %tmp2
@@ -73,6 +85,14 @@ define <4 x i32> @vcntQ32(<4 x i32>* %A) nounwind {
 
 define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
 ; CHECK-LABEL: vcnt64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcnt.8 d16, d16
+; CHECK-NEXT:    vpaddl.u8 d16, d16
+; CHECK-NEXT:    vpaddl.u16 d16, d16
+; CHECK-NEXT:    vpaddl.u32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <1 x i64>, <1 x i64>* %A
 	%tmp2 = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %tmp1)
 	ret <1 x i64> %tmp2
@@ -80,6 +100,15 @@ define <1 x i64> @vcnt64(<1 x i64>* %A) nounwind {
 
 define <2 x i64> @vcntQ64(<2 x i64>* %A) nounwind {
 ; CHECK-LABEL: vcntQ64:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcnt.8 q8, q8
+; CHECK-NEXT:    vpaddl.u8 q8, q8
+; CHECK-NEXT:    vpaddl.u16 q8, q8
+; CHECK-NEXT:    vpaddl.u32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i64>, <2 x i64>* %A
 	%tmp2 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %tmp1)
 	ret <2 x i64> %tmp2
@@ -95,48 +124,75 @@ declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>) nounwind readnone
 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
 
 define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vclz8:
-;CHECK: vclz.i8 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vclz8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vclz.i8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %tmp1, i1 0)
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vclz16:
-;CHECK: vclz.i16 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vclz16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vclz.i16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %tmp1, i1 0)
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vclz32:
-;CHECK: vclz.i32 {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-LABEL: vclz32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vclz.i32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %tmp1, i1 0)
 	ret <2 x i32> %tmp2
 }
 
 define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: vclzQ8:
-;CHECK: vclz.i8 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vclzQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vclz.i8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %tmp1, i1 0)
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
-;CHECK-LABEL: vclzQ16:
-;CHECK: vclz.i16 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vclzQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vclz.i16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %tmp1, i1 0)
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: vclzQ32:
-;CHECK: vclz.i32 {{q[0-9]+}}, {{q[0-9]+}}
+; CHECK-LABEL: vclzQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vclz.i32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %tmp1, i1 0)
 	ret <4 x i32> %tmp2
@@ -151,48 +207,75 @@ declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) nounwind readnone
 declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
 
 define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vclss8:
-;CHECK: vcls.s8
+; CHECK-LABEL: vclss8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcls.s8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vclss16:
-;CHECK: vcls.s16
+; CHECK-LABEL: vclss16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcls.s16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vclss32:
-;CHECK: vcls.s32
+; CHECK-LABEL: vclss32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vcls.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
 	ret <2 x i32> %tmp2
 }
 
 define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: vclsQs8:
-;CHECK: vcls.s8
+; CHECK-LABEL: vclsQs8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcls.s8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
 	%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
-;CHECK-LABEL: vclsQs16:
-;CHECK: vcls.s16
+; CHECK-LABEL: vclsQs16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcls.s16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i16>, <8 x i16>* %A
 	%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: vclsQs32:
-;CHECK: vcls.s32
+; CHECK-LABEL: vclsQs32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT:    vcls.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i32>, <4 x i32>* %A
 	%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
 	ret <4 x i32> %tmp2
diff --git a/test/CodeGen/ARM/readonly-aliases.ll b/test/CodeGen/ARM/readonly-aliases.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c90650d3a81de68d040642b218746a7c021cbe9f
--- /dev/null
+++ b/test/CodeGen/ARM/readonly-aliases.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple thumbv7-unknown-linux-android -filetype asm -o - %s | FileCheck %s
+
+@a = protected constant <{ i32, i32 }> <{ i32 0, i32 0 }>
+@b = protected alias i32, getelementptr(i32, i32* getelementptr inbounds (<{ i32, i32 }>, <{ i32, i32 }>* @a, i32 0, i32 1), i32 -1)
+
+declare void @f(i32*)
+
+define void @g() {
+entry:
+  call void @f(i32* @b)
+  ret void
+}
+
+; CHECK-LABEL: g:
+; CHECK: movw [[REGISTER:r[0-9]+]], :lower16:b
+; CHECK: movt [[REGISTER]], :upper16:b
+
diff --git a/test/CodeGen/ARM/sched-it-debug-nodes.mir b/test/CodeGen/ARM/sched-it-debug-nodes.mir
index 8d0688ef01d354c6fcdc0a21c8d4280be0eb7a9f..ec42e7df3b2fd7fd1489a64139a57f125e1699ef 100644
--- a/test/CodeGen/ARM/sched-it-debug-nodes.mir
+++ b/test/CodeGen/ARM/sched-it-debug-nodes.mir
@@ -33,7 +33,7 @@
   ; hopefully, triggering an assert).
 
   ; CHECK: BUNDLE implicit-def dead $itstate{{.*}} {
-  ; CHECK: DBG_VALUE debug-use $r1, debug-use $noreg, !"u"
+  ; CHECK: DBG_VALUE $r1, $noreg, !"u"
   ; CHECK-NOT: DBG_VALUE killed $r1, $noreg, !"u"
 
   declare arm_aapcscc void @g(%struct.s*, i8*, i32) #1
@@ -131,23 +131,23 @@ body:             |
   bb.0.entry:
     liveins: $r0, $r1, $r2, $r3, $lr, $r7
 
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     t2CMPri $r3, 4, 14, $noreg, implicit-def $cpsr, debug-location !31
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     $r0 = t2MOVi -1, 3, $cpsr, $noreg, implicit undef $r0
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
     tBX_RET 3, $cpsr, implicit $r0, debug-location !34
     $sp = frame-setup t2STMDB_UPD $sp, 14, $noreg, killed $r7, killed $lr
     frame-setup CFI_INSTRUCTION def_cfa_offset 8
     frame-setup CFI_INSTRUCTION offset $lr, -4
     frame-setup CFI_INSTRUCTION offset $r7, -8
-    DBG_VALUE debug-use $r0, debug-use $noreg, !18, !27, debug-location !28
-    DBG_VALUE debug-use $r1, debug-use $noreg, !19, !27, debug-location !28
-    DBG_VALUE debug-use $r2, debug-use $noreg, !20, !27, debug-location !28
-    DBG_VALUE debug-use $r3, debug-use $noreg, !21, !27, debug-location !28
+    DBG_VALUE $r0, $noreg, !18, !27, debug-location !28
+    DBG_VALUE $r1, $noreg, !19, !27, debug-location !28
+    DBG_VALUE $r2, $noreg, !20, !27, debug-location !28
+    DBG_VALUE $r3, $noreg, !21, !27, debug-location !28
     $r1 = tMOVr killed $r2, 14, $noreg, debug-location !32
     $r2 = tMOVr killed $r3, 14, $noreg, debug-location !32
     tBL 14, $noreg, @g, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit $r0, implicit $r1, implicit $r2, implicit-def $sp, debug-location !32
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index c0cebf833a06786c6187579e86fbf057b5abb736..1e27024e7c79a21a411bda13d9f20620901fd4de 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -24,12 +24,8 @@ entry:
 ; ARMT2: movwgt [[R]], #123
 
 ; THUMB1-LABEL: t1:
-; THUMB1: mov     r1, r0
-; THUMB1: movs    r2, #255
-; THUMB1: adds    r2, #102
-; THUMB1: movs    r0, #123
-; THUMB1: cmp     r1, #1
-; THUMB1: bgt
+; THUMB1: cmp     r0, #1
+; THUMB1: bgt     .LBB0_2
 
 ; THUMB2-LABEL: t1:
 ; THUMB2: movw [[R:r[0-1]]], #357
@@ -75,8 +71,7 @@ entry:
 ; ARMT2: lsr r0, r0, #5
 
 ; THUMB1-LABEL: t3:
-; THUMB1: movs r1, #0
-; THUMB1: subs r1, r1, r0
+; THUMB1: rsbs r1, r0, #0
 ; THUMB1: adcs r0, r1
 
 ; THUMB2-LABEL: t3:
@@ -120,8 +115,7 @@ entry:
 
 ; THUMB1-LABEL: t5:
 ; THUMB1-NOT: bne
-; THUMB1: movs r0, #0
-; THUMB1: subs r0, r0, r1
+; THUMB1: rsbs r0, r1, #0
 ; THUMB1: adcs r0, r1
 
 ; THUMB2-LABEL: t5:
@@ -144,7 +138,7 @@ entry:
 
 ; THUMB1-LABEL: t6:
 ; THUMB1: cmp r{{[0-9]+}}, #0
-; THUMB1: bne
+; THUMB1: beq
 
 ; THUMB2-LABEL: t6:
 ; THUMB2-NOT: mov
@@ -200,8 +194,7 @@ entry:
 ; THUMB1: bl t7
 ; THUMB1: mov r1, r0
 ; THUMB1: subs r2, r4, #5
-; THUMB1: movs r0, #0
-; THUMB1: subs r0, r0, r2
+; THUMB1: rsbs r0, r2, #0
 ; THUMB1: adcs r0, r2
 
 ; THUMB2-LABEL: t8:
@@ -306,8 +299,7 @@ entry:
 ; ARMT2: lsr r0, r0, #5
 
 ; THUMB1-LABEL: t10:
-; THUMB1: movs r0, #0
-; THUMB1: subs r0, r0, r1
+; THUMB1: rsbs r0, r1, #0
 ; THUMB1: adcs r0, r1
 
 ; THUMB2-LABEL: t10:
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index e9394a7207388932421f9546c30bedb9579e6f9e..639b88183cc421db2ba400e570c39c9a5e4e8a63 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -80,8 +80,8 @@ define double @f7(double %a, double %b) {
 ; block generated, odds are good that we have close to the ideal code for this:
 ;
 ; CHECK-NEON-LABEL: f8:
-; CHECK-NEON:      movw    [[R3:r[0-9]+]], #1123
 ; CHECK-NEON:      adr     [[R2:r[0-9]+]], LCPI7_0
+; CHECK-NEON:      movw    [[R3:r[0-9]+]], #1123
 ; CHECK-NEON-NEXT: cmp     r0, [[R3]]
 ; CHECK-NEON-NEXT: it      eq
 ; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
index ba996e5ddd8814c41a329c6af866168158f711c0..79048348e9bb645f9fdc2bf03bf12200c040b494 100644
--- a/test/CodeGen/ARM/smml.ll
+++ b/test/CodeGen/ARM/smml.ll
@@ -44,7 +44,7 @@ declare void @opaque(i32)
 define void @test_used_flags(i32 %in1, i32 %in2) {
 ; CHECK-LABEL: test_used_flags:
 ; CHECK-THUMB: movs    r2, #0
-; CHECK-THUMB: subs    r0, r2, r0
+; CHECK-THUMB: rsbs    r0, r0, #0
 ; CHECK-THUMB: sbcs    r2, r1
 ; CHECK-THUMB: bge
 ; CHECK-V6: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 585218cf337c1133fcb494e068e79b83f54890f6..c45f7133febebdb8d642ab859f7f6e3a639fe1c6 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -59,25 +59,25 @@ entry:
 define void @t2() nounwind {
 entry:
 ; DARWIN-LABEL: t2:
-; DARWIN: trap
+; DARWIN: udf #254
 
 ; FUNC-LABEL: t2:
 ; FUNC: bl __trap
 
 ; NACL-LABEL: t2:
-; NACL: .inst 0xe7fedef0
+; NACL: bkpt #0
 
 ; ARM-LABEL: t2:
-; ARM: .inst 0xe7ffdefe
+; ARM: bkpt #0
 
 ; THUMB-LABEL: t2:
-; THUMB: .inst.n 0xdefe
+; THUMB: bkpt #0
 
-; ENCODING-NACL: f0 de fe e7 trap
+; ENCODING-NACL: 70 00 20 e1 bkpt #0
 
-; ENCODING-ARM: fe de ff e7 trap
+; ENCODING-ARM: 70 00 20 e1 bkpt #0
 
-; ENCODING-THUMB: fe de trap
+; ENCODING-THUMB: 00 be bkpt #0
 
   call void @llvm.debugtrap()
   unreachable
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index f0a95c833c6b10eb71d85e1ada3441dfdc645e0d..e8c52e1b58df011c13c47e99609296e23303c8ca 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -4,8 +4,8 @@
 define void @PR13378() nounwind {
 ; This was orriginally a crasher trying to schedule the instructions.
 ; CHECK-LABEL:      PR13378:
-; CHECK:        vld1.32
-; CHECK-NEXT:   vmov.i32
+; CHECK:        vmov.i32
+; CHECK-NEXT:   vld1.32
 ; CHECK-NEXT:   vst1.32
 ; CHECK-NEXT:   vst1.32
 ; CHECK-NEXT:   vmov.f32
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
index c08ed81d042a4ed17030277d0671fcfd747aed23..de234b6879eead81aa5a46ca2601b0aa5adc28df 100644
--- a/test/CodeGen/ARM/vcombine.ll
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -39,8 +39,8 @@ define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
 
-; CHECK-LE: vmov r0, r1, [[LD0]]
 ; CHECK-LE: vmov r2, r3, [[LD1]]
+; CHECK-LE: vmov r0, r1, [[LD0]]
 
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
@@ -56,8 +56,8 @@ define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
 
-; CHECK-LE: vmov r0, r1, [[LD0]]
 ; CHECK-LE: vmov r2, r3, [[LD1]]
+; CHECK-LE: vmov r0, r1, [[LD0]]
 
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
@@ -72,11 +72,11 @@ define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
 ; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
 ; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
 
-; CHECK-LE: vmov r0, r1, [[LD0]]
 ; CHECK-LE: vmov r2, r3, [[LD1]]
+; CHECK-LE: vmov r0, r1, [[LD0]]
 
-; CHECK-BE: vmov r1, r0, [[LD0]]
 ; CHECK-BE: vmov r3, r2, [[LD1]]
+; CHECK-BE: vmov r1, r0, [[LD0]]
 	%tmp1 = load <1 x i64>, <1 x i64>* %A
 	%tmp2 = load <1 x i64>, <1 x i64>* %B
 	%tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index b7693c797635787a5656a43380d1e8fb696019c7..5127dab2656434756312f5516da6ea9628ca07ad 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -1,9 +1,12 @@
-; RUN: llc -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs %s -o - \
-; RUN:	| FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -verify-machineinstrs | FileCheck %s
 
 define <8 x i8> @v_dup8(i8 %A) nounwind {
-;CHECK-LABEL: v_dup8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_dup8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
 	%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
 	%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
@@ -16,8 +19,11 @@ define <8 x i8> @v_dup8(i8 %A) nounwind {
 }
 
 define <4 x i16> @v_dup16(i16 %A) nounwind {
-;CHECK-LABEL: v_dup16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_dup16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
 	%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
 	%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
@@ -26,24 +32,34 @@ define <4 x i16> @v_dup16(i16 %A) nounwind {
 }
 
 define <2 x i32> @v_dup32(i32 %A) nounwind {
-;CHECK-LABEL: v_dup32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dup32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
 	%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
 	ret <2 x i32> %tmp2
 }
 
 define <2 x float> @v_dupfloat(float %A) nounwind {
-;CHECK-LABEL: v_dupfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dupfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
 	%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
 	ret <2 x float> %tmp2
 }
 
 define <16 x i8> @v_dupQ8(i8 %A) nounwind {
-;CHECK-LABEL: v_dupQ8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_dupQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
 	%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
 	%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
@@ -64,8 +80,12 @@ define <16 x i8> @v_dupQ8(i8 %A) nounwind {
 }
 
 define <8 x i16> @v_dupQ16(i16 %A) nounwind {
-;CHECK-LABEL: v_dupQ16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_dupQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
 	%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
 	%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
@@ -78,8 +98,12 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind {
 }
 
 define <4 x i32> @v_dupQ32(i32 %A) nounwind {
-;CHECK-LABEL: v_dupQ32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dupQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
 	%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
 	%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
@@ -88,8 +112,12 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind {
 }
 
 define <4 x float> @v_dupQfloat(float %A) nounwind {
-;CHECK-LABEL: v_dupQfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_dupQfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
 	%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
 	%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
@@ -100,163 +128,248 @@ define <4 x float> @v_dupQfloat(float %A) nounwind {
 ; Check to make sure it works with shuffles, too.
 
 define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
-;CHECK-LABEL: v_shuffledup8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_shuffledup8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
-;CHECK-LABEL: v_shuffledup16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_shuffledup16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
-;CHECK-LABEL: v_shuffledup32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledup32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
 	ret <2 x i32> %tmp2
 }
 
 define <2 x float> @v_shuffledupfloat(float %A) nounwind {
-;CHECK-LABEL: v_shuffledupfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledupfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 d16, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <2 x float> undef, float %A, i32 0
 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
 	ret <2 x float> %tmp2
 }
 
 define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
-;CHECK-LABEL: v_shuffledupQ8:
-;CHECK: vdup.8
+; CHECK-LABEL: v_shuffledupQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.8 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
 	%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
-;CHECK-LABEL: v_shuffledupQ16:
-;CHECK: vdup.16
+; CHECK-LABEL: v_shuffledupQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.16 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
 	%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
-;CHECK-LABEL: v_shuffledupQ32:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledupQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
 	%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
 	ret <4 x i32> %tmp2
 }
 
 define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
-;CHECK-LABEL: v_shuffledupQfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: v_shuffledupQfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = insertelement <4 x float> undef, float %A, i32 0
 	%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
 	ret <4 x float> %tmp2
 }
 
 define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vduplane8:
-;CHECK: vdup.8
+; CHECK-LABEL: vduplane8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.8 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	ret <8 x i8> %tmp2
 }
 
 define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vduplane16:
-;CHECK: vdup.16
+; CHECK-LABEL: vduplane16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.16 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
 	ret <4 x i16> %tmp2
 }
 
 define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vduplane32:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplane32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
 	ret <2 x i32> %tmp2
 }
 
 define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
-;CHECK-LABEL: vduplanefloat:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplanefloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 d16, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x float>, <2 x float>* %A
 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
 	ret <2 x float> %tmp2
 }
 
 define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: vduplaneQ8:
-;CHECK: vdup.8
+; CHECK-LABEL: vduplaneQ8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.8 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <8 x i8>, <8 x i8>* %A
 	%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	ret <16 x i8> %tmp2
 }
 
 define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: vduplaneQ16:
-;CHECK: vdup.16
+; CHECK-LABEL: vduplaneQ16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.16 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <4 x i16>, <4 x i16>* %A
 	%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
 	ret <8 x i16> %tmp2
 }
 
 define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: vduplaneQ32:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplaneQ32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x i32>, <2 x i32>* %A
 	%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
 	ret <4 x i32> %tmp2
 }
 
 define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
-;CHECK-LABEL: vduplaneQfloat:
-;CHECK: vdup.32
+; CHECK-LABEL: vduplaneQfloat:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vldr d16, [r0]
+; CHECK-NEXT:    vdup.32 q8, d16[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
 	%tmp1 = load <2 x float>, <2 x float>* %A
 	%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
 	ret <4 x float> %tmp2
 }
 
 define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: foo:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x i64> %0
 }
 
 define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: bar:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
   ret <2 x i64> %0
 }
 
 define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: baz:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r0, r2
+; CHECK-NEXT:    mov r1, r3
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   ret <2 x double> %0
 }
 
 define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone {
+; CHECK-LABEL: qux:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    mov r2, r0
+; CHECK-NEXT:    mov r3, r1
+; CHECK-NEXT:    mov pc, lr
 entry:
   %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0>
   ret <2 x double> %0
 }
 
 ; Radar 7373643
-;CHECK-LABEL: redundantVdup:
-;CHECK: vmov.i8
-;CHECK-NOT: vdup.8
-;CHECK: vstr
 define void @redundantVdup(<8 x i8>* %ptr) nounwind {
+; CHECK-LABEL: redundantVdup:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov.i8 d16, #0x80
+; CHECK-NEXT:    vstr d16, [r0]
+; CHECK-NEXT:    mov pc, lr
   %1 = insertelement <8 x i8> undef, i8 -128, i32 0
   %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
   store <8 x i8> %2, <8 x i8>* %ptr, align 8
@@ -264,8 +377,13 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind {
 }
 
 define <4 x i32> @tdupi(i32 %x, i32 %y) {
-;CHECK-LABEL: tdupi:
-;CHECK: vdup.32
+; CHECK-LABEL: tdupi:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q8, r0
+; CHECK-NEXT:    vmov.32 d17[1], r1
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %1 = insertelement <4 x i32> undef, i32 %x, i32 0
   %2 = insertelement <4 x i32> %1, i32 %x, i32 1
   %3 = insertelement <4 x i32> %2, i32 %x, i32 2
@@ -274,8 +392,13 @@ define <4 x i32> @tdupi(i32 %x, i32 %y) {
 }
 
 define <4 x float> @tdupf(float %x, float %y) {
-;CHECK-LABEL: tdupf:
-;CHECK: vdup.32
+; CHECK-LABEL: tdupf:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vdup.32 q0, r0
+; CHECK-NEXT:    vmov s3, r1
+; CHECK-NEXT:    vmov r0, r1, d0
+; CHECK-NEXT:    vmov r2, r3, d1
+; CHECK-NEXT:    mov pc, lr
   %1 = insertelement <4 x float> undef, float %x, i32 0
   %2 = insertelement <4 x float> %1, float %x, i32 1
   %3 = insertelement <4 x float> %2, float %x, i32 2
@@ -286,9 +409,15 @@ define <4 x float> @tdupf(float %x, float %y) {
 ; This test checks that when splatting an element from a vector into another,
 ; the value isn't moved out to GPRs first.
 define <4 x i32> @tduplane(<4 x i32> %invec) {
-;CHECK-LABEL: tduplane:
-;CHECK-NOT: vmov {{.*}}, d16[1]
-;CHECK: vdup.32 {{.*}}, d16[1]
+; CHECK-LABEL: tduplane:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r0, #255
+; CHECK-NEXT:    vdup.32 q8, d16[1]
+; CHECK-NEXT:    vmov.32 d17[1], r0
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %in = extractelement <4 x i32> %invec, i32 1
   %1 = insertelement <4 x i32> undef, i32 %in, i32 0
   %2 = insertelement <4 x i32> %1, i32 %in, i32 1
@@ -298,8 +427,13 @@ define <4 x i32> @tduplane(<4 x i32> %invec) {
 }
 
 define <2 x float> @check_f32(<4 x float> %v) nounwind {
-;CHECK-LABEL: check_f32:
-;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+; CHECK-LABEL: check_f32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.32 d16, d17[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <4 x float> %v, i32 3
   %1 = insertelement  <2 x float> undef, float %x, i32 0
   %2 = insertelement  <2 x float> %1, float %x, i32 1
@@ -307,8 +441,13 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind {
 }
 
 define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
-;CHECK-LABEL: check_i32:
-;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+; CHECK-LABEL: check_i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.32 d16, d17[1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <4 x i32> %v, i32 3
   %1 = insertelement  <2 x i32> undef, i32 %x, i32 0
   %2 = insertelement  <2 x i32> %1, i32 %x, i32 1
@@ -316,8 +455,13 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
 }
 
 define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
-;CHECK-LABEL: check_i16:
-;CHECK: vdup.16 {{.*}}, d{{..}}[3]
+; CHECK-LABEL: check_i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.16 d16, d16[3]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <8 x i16> %v, i32 3
   %1 = insertelement  <4 x i16> undef, i16 %x, i32 0
   %2 = insertelement  <4 x i16> %1, i16 %x, i32 1
@@ -325,8 +469,13 @@ define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
 }
 
 define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
-;CHECK-LABEL: check_i8:
-;CHECK: vdup.8 {{.*}}, d{{..}}[3]
+; CHECK-LABEL: check_i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vdup.8 d16, d16[3]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <16 x i8> %v, i32 3
   %1 = insertelement  <8  x i8> undef, i8 %x, i32 0
   %2 = insertelement  <8  x i8> %1, i8 %x, i32 1
@@ -336,8 +485,17 @@ define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
 ; Check that an SPR splat produces a vdup.
 
 define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
-;CHECK-LABEL: check_spr_splat2:
-;CHECK: vdup.32 d
+; CHECK-LABEL: check_spr_splat2:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    lsl r2, r2, #16
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    asr r2, r2, #16
+; CHECK-NEXT:    vmov s0, r2
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vdup.32 d17, d0[0]
+; CHECK-NEXT:    vsub.f32 d16, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <2 x float> undef, float %conv, i32 0
   %splat.splat = shufflevector <2 x float> %splat.splatinsert, <2 x float> undef, <2 x i32> zeroinitializer
@@ -346,8 +504,18 @@ define <2 x float> @check_spr_splat2(<2 x float> %p, i16 %q) {
 }
 
 define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
-;CHECK-LABEL: check_spr_splat4:
-;CHECK: vld1.16
+; CHECK-LABEL: check_spr_splat4:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrsh r12, [sp]
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov s0, r12
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vdup.32 q9, d0[0]
+; CHECK-NEXT:    vsub.f32 q8, q9, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 0
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
@@ -356,8 +524,18 @@ define <4 x float> @check_spr_splat4(<4 x float> %p, i16 %q) {
 }
 ; Same codegen as above test; scalar is splatted using vld1, so shuffle index is irrelevant.
 define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
-;CHECK-LABEL: check_spr_splat4_lane1:
-;CHECK: vld1.16
+; CHECK-LABEL: check_spr_splat4_lane1:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldrsh r12, [sp]
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov s0, r12
+; CHECK-NEXT:    vcvt.f32.s32 s0, s0
+; CHECK-NEXT:    vdup.32 q9, d0[0]
+; CHECK-NEXT:    vsub.f32 q8, q9, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
   %conv = sitofp i16 %q to float
   %splat.splatinsert = insertelement <4 x float> undef, float %conv, i32 1
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -370,12 +548,25 @@ define <4 x float> @check_spr_splat4_lane1(<4 x float> %p, i16 %q) {
 
 define <8 x i8> @check_i8_varidx(<16 x i8> %v, i32 %idx) {
 ; CHECK-LABEL: check_i8_varidx:
-; CHECK: mov r[[FP:[0-9]+]], sp
-; CHECK: ldr r[[IDX:[0-9]+]], [r[[FP]], #4]
-; CHECK: mov r[[SPCOPY:[0-9]+]], sp
-; CHECK: and r[[MASKED_IDX:[0-9]+]], r[[IDX]], #15
-; CHECK: vst1.64 {d{{.*}}, d{{.*}}}, [r[[SPCOPY]]:128], r[[MASKED_IDX]]
-; CHECK: vld1.8 {d{{.*}}[]}, [r[[SPCOPY]]]
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r11}
+; CHECK-NEXT:    push {r11}
+; CHECK-NEXT:    .setfp r11, sp
+; CHECK-NEXT:    mov r11, sp
+; CHECK-NEXT:    .pad #28
+; CHECK-NEXT:    sub sp, sp, #28
+; CHECK-NEXT:    bic sp, sp, #15
+; CHECK-NEXT:    ldr r12, [r11, #4]
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    and r0, r12, #15
+; CHECK-NEXT:    vst1.64 {d16, d17}, [r1:128], r0
+; CHECK-NEXT:    vld1.8 {d16[]}, [r1]
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov sp, r11
+; CHECK-NEXT:    pop {r11}
+; CHECK-NEXT:    mov pc, lr
   %x = extractelement <16 x i8> %v, i32 %idx
   %1 = insertelement  <8 x i8> undef, i8 %x, i32 0
   %2 = insertelement  <8 x i8> %1, i8 %x, i32 1
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index d054bfda615e0059d54d0dddf735414e728e09e9..1aaffcc302d2ab408a2e52a0b7725dc18f24c250 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -48,7 +48,7 @@ define <4 x i8> @h(<4 x float> %v) {
 }
 
 ; CHECK-LABEL: i:
-define <4 x i8> @i(<4 x i8>* %x) {
+define <4 x i8> @i(<4 x i8>* %x, <4 x i8> %y) {
 ; Note: vld1 here is reasonably important. Mixing VFP and NEON
 ; instructions is bad on some cores
   ; CHECK: vld1
@@ -59,7 +59,7 @@ define <4 x i8> @i(<4 x i8>* %x) {
   ; CHECK: vmul
   ; CHECK: vmovn
   %1 = load <4 x i8>, <4 x i8>* %x, align 4
-  %2 = sdiv <4 x i8> zeroinitializer, %1
+  %2 = sdiv <4 x i8> %y, %1
   ret <4 x i8> %2
 }
 ; CHECK-LABEL: j:
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index 96cafdec7bf17fec4721307f6721236ecea03006..281fe2537a47230551acf60928f65ac61bd09c2d 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -324,23 +324,23 @@ define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8
 ; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8.
 ; CHECK-LABEL: cmpsel_trunc:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	add	r12, sp, #16
-; CHECK-NEXT: 	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	mov	r12, sp
-; CHECK-NEXT:	vld1.64	{d18, d19}, [r12]
-; CHECK-NEXT:	add	r12, sp, #48
-; CHECK-NEXT:	vld1.64	{d20, d21}, [r12]
-; CHECK-NEXT:	add	r12, sp, #32
-; CHECK-NEXT:	vcgt.u32	q8, q10, q8
-; CHECK-NEXT:	vld1.64	{d20, d21}, [r12]
-; CHECK-NEXT:	vcgt.u32	q9, q10, q9
-; CHECK-NEXT:	vmov	d20, r2, r3
-; CHECK-NEXT:	vmovn.i32	d17, q8
-; CHECK-NEXT:	vmovn.i32	d16, q9
-; CHECK-NEXT:	vmov	d18, r0, r1
-; CHECK-NEXT:	vmovn.i16	d16, q8
-; CHECK-NEXT:	vbsl	d16, d18, d20
-; CHECK-NEXT:	vmov	r0, r1, d16
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    add r12, sp, #48
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT:    add r12, sp, #32
+; CHECK-NEXT:    vcgt.u32 q8, q10, q8
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT:    vcgt.u32 q9, q10, q9
+; CHECK-NEXT:    vmov d20, r2, r3
+; CHECK-NEXT:    vmovn.i32 d17, q8
+; CHECK-NEXT:    vmovn.i32 d16, q9
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vmovn.i16 d16, q8
+; CHECK-NEXT:    vbsl d16, d18, d20
+; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    mov pc, lr
   %c = icmp ult <8 x i32> %cmp0, %cmp1
   %res = select <8 x i1> %c, <8 x i8> %in0, <8 x i8> %in1
@@ -353,28 +353,28 @@ define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8
 define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
 ; CHECK-LABEL: vuzp_trunc_and_shuffle:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	.save	{r11, lr}
-; CHECK-NEXT:	push	{r11, lr}
-; CHECK-NEXT:	add	r12, sp, #8
-; CHECK-NEXT:	add	lr, sp, #24
-; CHECK-NEXT:	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	ldr	r12, [sp, #40]
-; CHECK-NEXT:	vld1.64	{d18, d19}, [lr]
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vld1.32	{d18[0]}, [r12:32]
-; CHECK-NEXT:	vmov.i8	d19, #0x7
-; CHECK-NEXT:	vmovl.u8	q10, d18
-; CHECK-NEXT:	vmovn.i32	d16, q8
-; CHECK-NEXT:	vneg.s8	d17, d19
-; CHECK-NEXT:	vmov	d18, r2, r3
-; CHECK-NEXT:	vuzp.8	d16, d20
-; CHECK-NEXT:	vshl.i8	d16, d16, #7
-; CHECK-NEXT:	vshl.s8	d16, d16, d17
-; CHECK-NEXT:	vmov	d17, r0, r1
-; CHECK-NEXT:	vbsl	d16, d17, d18
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	pop	{r11, lr}
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    add r12, sp, #8
+; CHECK-NEXT:    add lr, sp, #24
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    ldr r12, [sp, #40]
+; CHECK-NEXT:    vld1.64 {d18, d19}, [lr]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vld1.32 {d18[0]}, [r12:32]
+; CHECK-NEXT:    vmov.i8 d19, #0x7
+; CHECK-NEXT:    vmovl.u8 q10, d18
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vneg.s8 d17, d19
+; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vuzp.8 d16, d20
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshl.s8 d16, d16, d17
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    pop {r11, lr}
+; CHECK-NEXT:    mov pc, lr
                          <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
   %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
@@ -389,22 +389,22 @@ define <8 x i8> @vuzp_trunc_and_shuffle(<8 x i8> %tr0, <8 x i8> %tr1,
 define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1,
 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_right:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	mov	r12, sp
-; CHECK-NEXT:	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	add	r12, sp, #16
-; CHECK-NEXT:	vld1.64	{d18, d19}, [r12]
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vmov.i8	d18, #0x7
-; CHECK-NEXT:	vmovn.i32	d16, q8
-; CHECK-NEXT:	vuzp.8	d16, d17
-; CHECK-NEXT:	vneg.s8	d17, d18
-; CHECK-NEXT:	vshl.i8	d16, d16, #7
-; CHECK-NEXT:	vmov	d18, r2, r3
-; CHECK-NEXT:	vshl.s8	d16, d16, d17
-; CHECK-NEXT:	vmov	d17, r0, r1
-; CHECK-NEXT:	vbsl	d16, d17, d18
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vmov.i8 d18, #0x7
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vuzp.8 d16, d17
+; CHECK-NEXT:    vneg.s8 d17, d18
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vshl.s8 d16, d16, d17
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
                          <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
   %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
@@ -417,23 +417,23 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_right(<8 x i8> %tr0, <8 x i8> %tr1
 define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
 ; CHECK-LABEL: vuzp_trunc_and_shuffle_undef_left:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	mov	r12, sp
-; CHECK-NEXT:	vld1.64	{d16, d17}, [r12]
-; CHECK-NEXT:	add	r12, sp, #16
-; CHECK-NEXT:	vld1.64	{d18, d19}, [r12]
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vldr	d18, .LCPI22_0
-; CHECK-NEXT:	vmov.i8	d19, #0x7
-; CHECK-NEXT:	vmovn.i32	d16, q8
-; CHECK-NEXT:	vtbl.8	d16, {d16}, d18
-; CHECK-NEXT:	vneg.s8	d17, d19
-; CHECK-NEXT:	vmov	d18, r2, r3
-; CHECK-NEXT:	vshl.i8	d16, d16, #7
-; CHECK-NEXT:	vshl.s8	d16, d16, d17
-; CHECK-NEXT:	vmov	d17, r0, r1
-; CHECK-NEXT:	vbsl	d16, d17, d18
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d16, d17}, [r12]
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vldr d18, .LCPI22_0
+; CHECK-NEXT:    vmov.i8 d19, #0x7
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vtbl.8 d16, {d16}, d18
+; CHECK-NEXT:    vneg.s8 d17, d19
+; CHECK-NEXT:    vmov d18, r2, r3
+; CHECK-NEXT:    vshl.i8 d16, d16, #7
+; CHECK-NEXT:    vshl.s8 d16, d16, d17
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vbsl d16, d17, d18
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 3
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI22_0:
@@ -459,55 +459,55 @@ define <8 x i8> @vuzp_trunc_and_shuffle_undef_left(<8 x i8> %tr0, <8 x i8> %tr1,
 define <10 x i8> @vuzp_wide_type(<10 x i8> %tr0, <10 x i8> %tr1,
 ; CHECK-LABEL: vuzp_wide_type:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:	.save	{r4, lr}
-; CHECK-NEXT:	push	{r4, lr}
-; CHECK-NEXT:	add	r12, sp, #32
-; CHECK-NEXT:	add	lr, sp, #48
-; CHECK-NEXT:	vld1.32	{d17[0]}, [r12:32]
-; CHECK-NEXT:	add	r12, sp, #24
-; CHECK-NEXT:	vld1.32	{d16[0]}, [r12:32]
-; CHECK-NEXT:	add	r12, sp, #56
-; CHECK-NEXT:	vld1.32	{d19[0]}, [r12:32]
-; CHECK-NEXT:	ldr	r12, [sp, #68]
-; CHECK-NEXT:	vld1.32	{d18[0]}, [lr:32]
-; CHECK-NEXT:	add	lr, sp, #40
-; CHECK-NEXT:	vld1.32	{d20[0]}, [lr:32]
-; CHECK-NEXT:	ldr	r4, [r12]
-; CHECK-NEXT:	vmov.32	d23[0], r4
-; CHECK-NEXT:	add	r4, sp, #64
-; CHECK-NEXT:	vld1.32	{d24[0]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #36
-; CHECK-NEXT:	vld1.32	{d17[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #28
-; CHECK-NEXT:	vcgt.u32	q10, q12, q10
-; CHECK-NEXT:	vmov.u8	lr, d23[3]
-; CHECK-NEXT:	vld1.32	{d16[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #60
-; CHECK-NEXT:	vld1.32	{d19[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, sp, #52
-; CHECK-NEXT:	vld1.32	{d18[1]}, [r4:32]
-; CHECK-NEXT:	add	r4, r12, #4
-; CHECK-NEXT:	vcgt.u32	q8, q9, q8
-; CHECK-NEXT:	vmovn.i32	d19, q10
-; CHECK-NEXT:	vldr	d20, .LCPI23_0
-; CHECK-NEXT:	vmovn.i32	d18, q8
-; CHECK-NEXT:	vmovn.i16	d22, q9
-; CHECK-NEXT:	vmov.i8	q9, #0x7
-; CHECK-NEXT:	vmov.8	d17[0], lr
-; CHECK-NEXT:	vneg.s8	q9, q9
-; CHECK-NEXT:	vtbl.8	d16, {d22, d23}, d20
-; CHECK-NEXT:	vld1.8	{d17[1]}, [r4]
-; CHECK-NEXT:	add	r4, sp, #8
-; CHECK-NEXT:	vshl.i8	q8, q8, #7
-; CHECK-NEXT:	vld1.64	{d20, d21}, [r4]
-; CHECK-NEXT:	vshl.s8	q8, q8, q9
-; CHECK-NEXT:	vmov	d19, r2, r3
-; CHECK-NEXT:	vmov	d18, r0, r1
-; CHECK-NEXT:	vbsl	q8, q9, q10
-; CHECK-NEXT:	vmov	r0, r1, d16
-; CHECK-NEXT:	vmov	r2, r3, d17
-; CHECK-NEXT:	pop	{r4, lr}
-; CHECK-NEXT:	mov	pc, lr
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    add r12, sp, #32
+; CHECK-NEXT:    add lr, sp, #48
+; CHECK-NEXT:    vld1.32 {d17[0]}, [r12:32]
+; CHECK-NEXT:    add r12, sp, #24
+; CHECK-NEXT:    vld1.32 {d16[0]}, [r12:32]
+; CHECK-NEXT:    add r12, sp, #56
+; CHECK-NEXT:    vld1.32 {d19[0]}, [r12:32]
+; CHECK-NEXT:    vld1.32 {d18[0]}, [lr:32]
+; CHECK-NEXT:    add lr, sp, #40
+; CHECK-NEXT:    vld1.32 {d20[0]}, [lr:32]
+; CHECK-NEXT:    ldr r12, [sp, #68]
+; CHECK-NEXT:    ldr r4, [r12]
+; CHECK-NEXT:    vmov.32 d23[0], r4
+; CHECK-NEXT:    add r4, sp, #64
+; CHECK-NEXT:    vld1.32 {d24[0]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #36
+; CHECK-NEXT:    vcgt.u32 q10, q12, q10
+; CHECK-NEXT:    vld1.32 {d17[1]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #28
+; CHECK-NEXT:    vld1.32 {d16[1]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #60
+; CHECK-NEXT:    vld1.32 {d19[1]}, [r4:32]
+; CHECK-NEXT:    add r4, sp, #52
+; CHECK-NEXT:    vld1.32 {d18[1]}, [r4:32]
+; CHECK-NEXT:    add r4, r12, #4
+; CHECK-NEXT:    vcgt.u32 q8, q9, q8
+; CHECK-NEXT:    vmovn.i32 d19, q10
+; CHECK-NEXT:    vmov.u8 lr, d23[3]
+; CHECK-NEXT:    vldr d20, .LCPI23_0
+; CHECK-NEXT:    vmovn.i32 d18, q8
+; CHECK-NEXT:    vmovn.i16 d22, q9
+; CHECK-NEXT:    vmov.i8 q9, #0x7
+; CHECK-NEXT:    vneg.s8 q9, q9
+; CHECK-NEXT:    vmov.8 d17[0], lr
+; CHECK-NEXT:    vtbl.8 d16, {d22, d23}, d20
+; CHECK-NEXT:    vld1.8 {d17[1]}, [r4]
+; CHECK-NEXT:    add r4, sp, #8
+; CHECK-NEXT:    vshl.i8 q8, q8, #7
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r4]
+; CHECK-NEXT:    vshl.s8 q8, q8, q9
+; CHECK-NEXT:    vmov d19, r2, r3
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vbsl q8, q9, q10
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    pop {r4, lr}
+; CHECK-NEXT:    mov pc, lr
 ; CHECK-NEXT:    .p2align 3
 ; CHECK-NEXT:  @ %bb.1:
 ; CHECK-NEXT:  .LCPI23_0:
diff --git a/test/CodeGen/ARM/wide-compares.ll b/test/CodeGen/ARM/wide-compares.ll
index 9b22f5fedfeb4c65a1588bc636180299333af7bd..6584f0c7616c52878c34ab1a961e76731a9bbb27 100644
--- a/test/CodeGen/ARM/wide-compares.ll
+++ b/test/CodeGen/ARM/wide-compares.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=armv7-unknown-linux < %s | FileCheck --check-prefix=CHECK-ARM %s
-; RUN: llc -mtriple=thumbv6-unknown-linux < %s | FileCheck --check-prefix=CHECK-THUMB1 %s
-; RUN: llc -mtriple=thumbv7-unknown-linux < %s | FileCheck --check-prefix=CHECK-THUMB2 %s
+; RUN: llc -mtriple=armv7-unknown-linux < %s -verify-machineinstrs | FileCheck --check-prefix=CHECK-ARM %s
+; RUN: llc -mtriple=thumb-eabi < %s  -verify-machineinstrs | FileCheck --check-prefix=CHECK-THUMB1-NOMOV %s
+; RUN: llc -mtriple=thumbv6-unknown-linux < %s -verify-machineinstrs | FileCheck --check-prefix=CHECK-THUMB1 %s
+; RUN: llc -mtriple=thumbv7-unknown-linux < %s -verify-machineinstrs | FileCheck --check-prefix=CHECK-THUMB2 %s
 
 define i32 @test_slt1(i64 %a, i64 %b) {
 ; CHECK-ARM-LABEL: test_slt1:
@@ -13,6 +14,18 @@ define i32 @test_slt1(i64 %a, i64 %b) {
 ; CHECK-ARM-NEXT:    mov r0, r12
 ; CHECK-ARM-NEXT:    bx lr
 ;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt1:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    subs r0, r0, r2
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r1, r3
+; CHECK-THUMB1-NOMOV-NEXT:    bge .LBB0_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %bb1
+; CHECK-THUMB1-NOMOV-NEXT:    movs r0, #1
+; CHECK-THUMB1-NOMOV-NEXT:    bx lr
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB0_2: @ %bb2
+; CHECK-THUMB1-NOMOV-NEXT:    movs r0, #2
+; CHECK-THUMB1-NOMOV-NEXT:    bx lr
+;
 ; CHECK-THUMB1-LABEL: test_slt1:
 ; CHECK-THUMB1:       @ %bb.0: @ %entry
 ; CHECK-THUMB1-NEXT:    subs r0, r0, r2
@@ -57,6 +70,23 @@ define void @test_slt2(i64 %a, i64 %b) {
 ; CHECK-ARM-NEXT:    bl g
 ; CHECK-ARM-NEXT:    pop {r11, pc}
 ;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt2:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    .save {r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    push {r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    subs r0, r0, r2
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r1, r3
+; CHECK-THUMB1-NOMOV-NEXT:    bge .LBB1_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %bb1
+; CHECK-THUMB1-NOMOV-NEXT:    bl f
+; CHECK-THUMB1-NOMOV-NEXT:    b .LBB1_3
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB1_2: @ %bb2
+; CHECK-THUMB1-NOMOV-NEXT:    bl g
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB1_3: @ %bb1
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r7}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r0}
+; CHECK-THUMB1-NOMOV-NEXT:    bx r0
+;
 ; CHECK-THUMB1-LABEL: test_slt2:
 ; CHECK-THUMB1:       @ %bb.0: @ %entry
 ; CHECK-THUMB1-NEXT:    push {r7, lr}
@@ -95,3 +125,193 @@ bb2:
 
 declare void @f()
 declare void @g()
+
+define i64 @test_slt_select(i64 %c, i64 %d, i64 %a, i64 %b) {
+; CHECK-ARM-LABEL: test_slt_select:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    push {r4, r5, r6, r7, r11, lr}
+; CHECK-ARM-NEXT:    ldr r12, [sp, #32]
+; CHECK-ARM-NEXT:    mov r6, #0
+; CHECK-ARM-NEXT:    ldr lr, [sp, #24]
+; CHECK-ARM-NEXT:    ldr r7, [sp, #36]
+; CHECK-ARM-NEXT:    ldr r5, [sp, #28]
+; CHECK-ARM-NEXT:    subs r4, lr, r12
+; CHECK-ARM-NEXT:    sbcs r7, r5, r7
+; CHECK-ARM-NEXT:    movwlo r6, #1
+; CHECK-ARM-NEXT:    cmp r6, #0
+; CHECK-ARM-NEXT:    moveq r0, r2
+; CHECK-ARM-NEXT:    moveq r1, r3
+; CHECK-ARM-NEXT:    pop {r4, r5, r6, r7, r11, pc}
+;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt_select:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    .pad #4
+; CHECK-THUMB1-NOMOV-NEXT:    sub sp, #4
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r4, [sp, #36]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r5, [sp, #28]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r6, [sp, #32]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r7, [sp, #24]
+; CHECK-THUMB1-NOMOV-NEXT:    subs r6, r7, r6
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r5, r4
+; CHECK-THUMB1-NOMOV-NEXT:    blo .LBB2_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    beq .LBB2_3
+; CHECK-THUMB1-NOMOV-NEXT:    b .LBB2_4
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_2:
+; CHECK-THUMB1-NOMOV-NEXT:    movs r4, #1
+; CHECK-THUMB1-NOMOV-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    bne .LBB2_4
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_3: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r0, r2
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_4: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    bne .LBB2_6
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.5: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r1, r3
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB2_6: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    add sp, #4
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r4, r5, r6, r7}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r2}
+; CHECK-THUMB1-NOMOV-NEXT:    bx r2
+;
+; CHECK-THUMB1-LABEL: test_slt_select:
+; CHECK-THUMB1:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-THUMB1-NEXT:    sub sp, #4
+; CHECK-THUMB1-NEXT:    ldr r4, [sp, #36]
+; CHECK-THUMB1-NEXT:    ldr r5, [sp, #28]
+; CHECK-THUMB1-NEXT:    ldr r6, [sp, #32]
+; CHECK-THUMB1-NEXT:    ldr r7, [sp, #24]
+; CHECK-THUMB1-NEXT:    subs r6, r7, r6
+; CHECK-THUMB1-NEXT:    sbcs r5, r4
+; CHECK-THUMB1-NEXT:    blo .LBB2_2
+; CHECK-THUMB1-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NEXT:    movs r4, #0
+; CHECK-THUMB1-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NEXT:    beq .LBB2_3
+; CHECK-THUMB1-NEXT:    b .LBB2_4
+; CHECK-THUMB1-NEXT:  .LBB2_2:
+; CHECK-THUMB1-NEXT:    movs r4, #1
+; CHECK-THUMB1-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NEXT:    bne .LBB2_4
+; CHECK-THUMB1-NEXT:  .LBB2_3: @ %entry
+; CHECK-THUMB1-NEXT:    mov r0, r2
+; CHECK-THUMB1-NEXT:  .LBB2_4: @ %entry
+; CHECK-THUMB1-NEXT:    cmp r4, #0
+; CHECK-THUMB1-NEXT:    beq .LBB2_6
+; CHECK-THUMB1-NEXT:  @ %bb.5: @ %entry
+; CHECK-THUMB1-NEXT:    add sp, #4
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-THUMB1-NEXT:  .LBB2_6: @ %entry
+; CHECK-THUMB1-NEXT:    mov r1, r3
+; CHECK-THUMB1-NEXT:    add sp, #4
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r6, r7, pc}
+;
+; CHECK-THUMB2-LABEL: test_slt_select:
+; CHECK-THUMB2:       @ %bb.0: @ %entry
+; CHECK-THUMB2-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-THUMB2-NEXT:    sub sp, #4
+; CHECK-THUMB2-NEXT:    ldrd r12, r7, [sp, #32]
+; CHECK-THUMB2-NEXT:    movs r6, #0
+; CHECK-THUMB2-NEXT:    ldrd lr, r5, [sp, #24]
+; CHECK-THUMB2-NEXT:    subs.w r4, lr, r12
+; CHECK-THUMB2-NEXT:    sbcs.w r7, r5, r7
+; CHECK-THUMB2-NEXT:    it lo
+; CHECK-THUMB2-NEXT:    movlo r6, #1
+; CHECK-THUMB2-NEXT:    cmp r6, #0
+; CHECK-THUMB2-NEXT:    itt eq
+; CHECK-THUMB2-NEXT:    moveq r0, r2
+; CHECK-THUMB2-NEXT:    moveq r1, r3
+; CHECK-THUMB2-NEXT:    add sp, #4
+; CHECK-THUMB2-NEXT:    pop {r4, r5, r6, r7, pc}
+entry:
+    %cmp = icmp ult i64 %a, %b
+    %r1 = select i1 %cmp, i64 %c, i64 %d
+    ret i64 %r1
+}
+
+define {i32, i32} @test_slt_not(i32 %c, i32 %d, i64 %a, i64 %b) {
+; CHECK-ARM-LABEL: test_slt_not:
+; CHECK-ARM:       @ %bb.0: @ %entry
+; CHECK-ARM-NEXT:    ldr r12, [sp]
+; CHECK-ARM-NEXT:    mov r1, #0
+; CHECK-ARM-NEXT:    ldr r0, [sp, #4]
+; CHECK-ARM-NEXT:    subs r2, r2, r12
+; CHECK-ARM-NEXT:    sbcs r0, r3, r0
+; CHECK-ARM-NEXT:    mov r0, #0
+; CHECK-ARM-NEXT:    movwge r1, #1
+; CHECK-ARM-NEXT:    movwlt r0, #1
+; CHECK-ARM-NEXT:    bx lr
+;
+; CHECK-THUMB1-NOMOV-LABEL: test_slt_not:
+; CHECK-THUMB1-NOMOV:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    push {r4, r5, r7, lr}
+; CHECK-THUMB1-NOMOV-NEXT:    movs r1, #1
+; CHECK-THUMB1-NOMOV-NEXT:    movs r4, #0
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r0, [sp, #20]
+; CHECK-THUMB1-NOMOV-NEXT:    ldr r5, [sp, #16]
+; CHECK-THUMB1-NOMOV-NEXT:    subs r2, r2, r5
+; CHECK-THUMB1-NOMOV-NEXT:    sbcs r3, r0
+; CHECK-THUMB1-NOMOV-NEXT:    push {r1}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r0}
+; CHECK-THUMB1-NOMOV-NEXT:    blt .LBB3_2
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    push {r4}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r0}
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB3_2: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    bge .LBB3_4
+; CHECK-THUMB1-NOMOV-NEXT:  @ %bb.3: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    movs r1, r4
+; CHECK-THUMB1-NOMOV-NEXT:  .LBB3_4: @ %entry
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r4, r5, r7}
+; CHECK-THUMB1-NOMOV-NEXT:    pop {r2}
+; CHECK-THUMB1-NOMOV-NEXT:    bx r2
+;
+; CHECK-THUMB1-LABEL: test_slt_not:
+; CHECK-THUMB1:       @ %bb.0: @ %entry
+; CHECK-THUMB1-NEXT:    push {r4, r5, r7, lr}
+; CHECK-THUMB1-NEXT:    movs r1, #1
+; CHECK-THUMB1-NEXT:    movs r4, #0
+; CHECK-THUMB1-NEXT:    ldr r0, [sp, #20]
+; CHECK-THUMB1-NEXT:    ldr r5, [sp, #16]
+; CHECK-THUMB1-NEXT:    subs r2, r2, r5
+; CHECK-THUMB1-NEXT:    sbcs r3, r0
+; CHECK-THUMB1-NEXT:    mov r0, r1
+; CHECK-THUMB1-NEXT:    bge .LBB3_3
+; CHECK-THUMB1-NEXT:  @ %bb.1: @ %entry
+; CHECK-THUMB1-NEXT:    blt .LBB3_4
+; CHECK-THUMB1-NEXT:  .LBB3_2: @ %entry
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-THUMB1-NEXT:  .LBB3_3: @ %entry
+; CHECK-THUMB1-NEXT:    mov r0, r4
+; CHECK-THUMB1-NEXT:    bge .LBB3_2
+; CHECK-THUMB1-NEXT:  .LBB3_4: @ %entry
+; CHECK-THUMB1-NEXT:    mov r1, r4
+; CHECK-THUMB1-NEXT:    pop {r4, r5, r7, pc}
+;
+; CHECK-THUMB2-LABEL: test_slt_not:
+; CHECK-THUMB2:       @ %bb.0: @ %entry
+; CHECK-THUMB2-NEXT:    ldr.w r12, [sp]
+; CHECK-THUMB2-NEXT:    movs r1, #0
+; CHECK-THUMB2-NEXT:    ldr r0, [sp, #4]
+; CHECK-THUMB2-NEXT:    subs.w r2, r2, r12
+; CHECK-THUMB2-NEXT:    sbcs.w r0, r3, r0
+; CHECK-THUMB2-NEXT:    mov.w r0, #0
+; CHECK-THUMB2-NEXT:    ite lt
+; CHECK-THUMB2-NEXT:    movlt r0, #1
+; CHECK-THUMB2-NEXT:    movge r1, #1
+; CHECK-THUMB2-NEXT:    bx lr
+entry:
+    %cmp = icmp slt i64 %a, %b
+    %not = xor i1 %cmp, true
+    %r1 = zext i1 %cmp to i32
+    %r2 = zext i1 %not to i32
+    %z = insertvalue { i32, i32 } undef, i32 %r1, 0
+    %z2 = insertvalue { i32, i32 } %z, i32 %r2, 1
+    ret { i32, i32 } %z2
+}
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
index df69f5fffa5490998449e96293f96298e580ee88..72b20d39d68fc1d701cd5625b44e40b203ccb65b 100644
--- a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ-same-src-dst.mir
@@ -25,11 +25,11 @@ body: |
 
     ; CHECK-LABEL: test_lddwrdptrq
 
-    ; CHECK:      ldd [[SCRATCH:r[0-9]+]], Z+10
+    ; CHECK:      ldd [[SCRATCH:r[0-9]+]], Y+10
     ; CHECK-NEXT: push [[SCRATCH]]
-    ; CHECK-NEXT: ldd [[SCRATCH]], Z+11
-    ; CHECK-NEXT: mov r31, [[SCRATCH]]
-    ; CHECK-NEXT: pop r30
+    ; CHECK-NEXT: ldd [[SCRATCH]], Y+11
+    ; CHECK-NEXT: mov r29, [[SCRATCH]]
+    ; CHECK-NEXT: pop r28
 
-    early-clobber $r31r30 = LDDWRdPtrQ undef $r31r30, 10
+    early-clobber $r29r28 = LDDWRdPtrQ undef $r29r28, 10
 ...
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
index 59b3ce8b60289afe26ad560a11616988fa2dc02c..96d3809ed2d71ffa9ac92ae1d35abe499bf2dc18 100644
--- a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
@@ -18,8 +18,8 @@ body: |
 
     ; CHECK-LABEL: test_lddwrdptrq
 
-    ; CHECK:      ldd     r30, Y+10
-    ; CHECK-NEXT: ldd     r31, Y+11
+    ; CHECK:      ldd     r28, Z+10
+    ; CHECK-NEXT: ldd     r29, Z+11
 
-    early-clobber $r31r30 = LDDWRdPtrQ undef $r29r28, 10
+    early-clobber $r29r28 = LDDWRdPtrQ undef $r31r30, 10
 ...
diff --git a/test/CodeGen/AVR/rust-avr-bug-112.ll b/test/CodeGen/AVR/rust-avr-bug-112.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7cf14330cdc1e91b5f5bd3a6d704264ff76e5aee
--- /dev/null
+++ b/test/CodeGen/AVR/rust-avr-bug-112.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+; The avr-rust bug can be found here:
+; https://github.com/avr-rust/rust/issues/112
+;
+; In this test, the codegen stage generates a FRMIDX
+; instruction. Later in the pipeline, the frame index
+; gets expanded into a 16-bit MOVWRdRr instruction.
+;
+; There was a bug in the FRMIDX->MOVWRdRr expansion logic
+; that could leave the MOVW instruction with an extraneous
+; operand, left over from the original FRMIDX.
+;
+; This would trigger an assertion:
+;
+;   Assertion failed: ((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
+;                       OpNo < MCID->getNumOperands() || isMetaDataOp) &&
+;                       "Trying to add an operand to a machine instr that is already done!"),
+;   function addOperand, file llvm/lib/CodeGen/MachineInstr.cpp
+;
+; The logic has since been fixed.
+
+; CHECK-LABEL: "core::str::slice_error_fail"
+define void @"core::str::slice_error_fail"(i16 %arg) personality i32 (...) addrspace(1)* @rust_eh_personality {
+start:
+  %char_range = alloca { i16, i16 }, align 1
+  br i1 undef, label %"<core::option::Option<T>>::unwrap.exit.thread", label %bb11.i.i
+
+"<core::option::Option<T>>::unwrap.exit.thread":
+  br label %"core::char::methods::<impl char>::len_utf8.exit"
+
+bb11.i.i:
+  %tmp = bitcast { i16, i16 }* %char_range to i8*
+  %tmp1 = icmp ult i32 undef, 65536
+  %..i = select i1 %tmp1, i16 3, i16 4
+  br label %"core::char::methods::<impl char>::len_utf8.exit"
+
+"core::char::methods::<impl char>::len_utf8.exit":
+  %tmp2 = phi i8* [ %tmp, %bb11.i.i ], [ undef, %"<core::option::Option<T>>::unwrap.exit.thread" ]
+  %_0.0.i12 = phi i16 [ %..i, %bb11.i.i ], [ 1, %"<core::option::Option<T>>::unwrap.exit.thread" ]
+  %tmp3 = add i16 %_0.0.i12, %arg
+  store i16 %tmp3, i16* undef, align 1
+  store i8* %tmp2, i8** undef, align 1
+  unreachable
+}
+
+declare i32 @rust_eh_personality(...) addrspace(1)
+
diff --git a/test/CodeGen/AVR/rust-avr-bug-37.ll b/test/CodeGen/AVR/rust-avr-bug-37.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9c269d3dab11a1ebf3d584affd2509013289a8c9
--- /dev/null
+++ b/test/CodeGen/AVR/rust-avr-bug-37.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+%"fmt::Formatter" = type { i32, { i8*, void (i8*)** } }
+
+@str.1b = external constant [0 x i8]
+
+define void @"TryFromIntError::Debug"(%"fmt::Formatter"* dereferenceable(32)) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
+; CHECK-LABEL: "TryFromIntError::Debug"
+start:
+  %builder = alloca i8, align 8
+  %1 = getelementptr inbounds %"fmt::Formatter", %"fmt::Formatter"* %0, i16 0, i32 1
+  %2 = bitcast { i8*, void (i8*)** }* %1 to {}**
+  %3 = load {}*, {}** %2, align 2
+  %4 = getelementptr inbounds %"fmt::Formatter", %"fmt::Formatter"* %0, i16 0, i32 1, i32 1
+  %5 = load void (i8*)**, void (i8*)*** %4, align 2
+  %6 = getelementptr inbounds void (i8*)*, void (i8*)** %5, i16 3
+  %7 = bitcast void (i8*)** %6 to i8 ({}*, i8*, i16)**
+  %8 = load i8 ({}*, i8*, i16)*, i8 ({}*, i8*, i16)** %7, align 2
+  %9 = tail call i8 %8({}* nonnull %3, i8* noalias nonnull readonly getelementptr inbounds ([0 x i8], [0 x i8]* @str.1b, i16 0, i16 0), i16 15)
+  unreachable
+}
+
+declare i32 @rust_eh_personality(...) unnamed_addr
+
+attributes #0 = { uwtable }
\ No newline at end of file
diff --git a/test/CodeGen/AVR/rust-avr-bug-95.ll b/test/CodeGen/AVR/rust-avr-bug-95.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9534ceb26e75e73daf0ee3c09e27cd7bb8628938
--- /dev/null
+++ b/test/CodeGen/AVR/rust-avr-bug-95.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=avr | FileCheck %s
+
+%"fmt::Formatter.1.77.153.229.305.381.1673" = type { [0 x i8], i32, [0 x i8], i32, [0 x i8], i8, [0 x i8], %"option::Option<usize>.0.76.152.228.304.380.1672", [0 x i8], %"option::Option<usize>.0.76.152.228.304.380.1672", [0 x i8], { {}*, {}* }, [0 x i8], { i8*, i8* }, [0 x i8], { [0 x { i8*, i8* }]*, i16 }, [0 x i8] }
+%"option::Option<usize>.0.76.152.228.304.380.1672" = type { [0 x i8], i8, [2 x i8] }
+
+@str.4S = external constant [5 x i8]
+
+; Function Attrs: uwtable
+define void @"_ZN65_$LT$lib..str..Chars$LT$$u27$a$GT$$u20$as$u20$lib..fmt..Debug$GT$3fmt17h76a537e22649f739E"(%"fmt::Formatter.1.77.153.229.305.381.1673"* dereferenceable(27) %__arg_0) unnamed_addr #0 personality i32 (...)* @rust_eh_personality {
+; CHECK-LABEL: "_ZN65_$LT$lib..str..Chars$LT$$u27$a$GT$$u20$as$u20$lib..fmt..Debug$GT$3fmt17h76a537e22649f739E"
+start:
+  %0 = getelementptr inbounds %"fmt::Formatter.1.77.153.229.305.381.1673", %"fmt::Formatter.1.77.153.229.305.381.1673"* %__arg_0, i16 0, i32 11, i32 0
+  %1 = load {}*, {}** %0, align 1, !noalias !0, !nonnull !9
+  %2 = getelementptr inbounds %"fmt::Formatter.1.77.153.229.305.381.1673", %"fmt::Formatter.1.77.153.229.305.381.1673"* %__arg_0, i16 0, i32 11, i32 1
+  %3 = bitcast {}** %2 to i1 ({}*, [0 x i8]*, i16)***
+  %4 = load i1 ({}*, [0 x i8]*, i16)**, i1 ({}*, [0 x i8]*, i16)*** %3, align 1, !noalias !0, !nonnull !9
+  %5 = getelementptr inbounds i1 ({}*, [0 x i8]*, i16)*, i1 ({}*, [0 x i8]*, i16)** %4, i16 3
+  %6 = load i1 ({}*, [0 x i8]*, i16)*, i1 ({}*, [0 x i8]*, i16)** %5, align 1, !invariant.load !9, !noalias !0, !nonnull !9
+  %7 = tail call zeroext i1 %6({}* nonnull %1, [0 x i8]* noalias nonnull readonly bitcast ([5 x i8]* @str.4S to [0 x i8]*), i16 5), !noalias !10
+  unreachable
+}
+
+declare i32 @rust_eh_personality(...) unnamed_addr
+
+attributes #0 = { uwtable }
+
+!0 = !{!1, !3, !5, !6, !8}
+!1 = distinct !{!1, !2, !"_ZN3lib3fmt9Formatter9write_str17ha1a9656fc66ccbe5E: %data.0"}
+!2 = distinct !{!2, !"_ZN3lib3fmt9Formatter9write_str17ha1a9656fc66ccbe5E"}
+!3 = distinct !{!3, !4, !"_ZN3lib3fmt8builders16debug_struct_new17h352a1de8f89c2bc3E: argument 0"}
+!4 = distinct !{!4, !"_ZN3lib3fmt8builders16debug_struct_new17h352a1de8f89c2bc3E"}
+!5 = distinct !{!5, !4, !"_ZN3lib3fmt8builders16debug_struct_new17h352a1de8f89c2bc3E: %name.0"}
+!6 = distinct !{!6, !7, !"_ZN3lib3fmt9Formatter12debug_struct17ha1ff79f633171b68E: argument 0"}
+!7 = distinct !{!7, !"_ZN3lib3fmt9Formatter12debug_struct17ha1ff79f633171b68E"}
+!8 = distinct !{!8, !7, !"_ZN3lib3fmt9Formatter12debug_struct17ha1ff79f633171b68E: %name.0"}
+!9 = !{}
+!10 = !{!3, !6}
\ No newline at end of file
diff --git a/test/CodeGen/Generic/is-constant.ll b/test/CodeGen/Generic/is-constant.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2e1f4261d6aad76cfddd3efdc21bdafb8e45f552
--- /dev/null
+++ b/test/CodeGen/Generic/is-constant.ll
@@ -0,0 +1,114 @@
+; RUN: opt -O2 -S < %s  | FileCheck %s
+; RUN: llc -o /dev/null 2>&1 < %s
+; RUN: llc -O0 -o /dev/null 2>&1 < %s
+
+;; The llc runs above are just to ensure it doesn't blow up upon
+;; seeing an is_constant intrinsic.
+
+declare i1 @llvm.is.constant.i32(i32 %a)
+declare i1 @llvm.is.constant.i64(i64 %a)
+declare i1 @llvm.is.constant.i256(i256 %a)
+declare i1 @llvm.is.constant.v2i64(<2 x i64> %a)
+declare i1 @llvm.is.constant.f32(float %a)
+declare i1 @llvm.is.constant.sl_i32i32s({i32, i32} %a)
+declare i1 @llvm.is.constant.a2i64([2 x i64] %a)
+declare i1 @llvm.is.constant.p0i64(i64* %a)
+
+;; Basic test that optimization folds away the is.constant when given
+;; a constant.
+define i1 @test_constant() #0 {
+; CHECK-LABEL: @test_constant(
+; CHECK-NOT: llvm.is.constant
+; CHECK: ret i1 true
+%y = call i1 @llvm.is.constant.i32(i32 44)
+  ret i1 %y
+}
+
+;; And test that the intrinsic sticks around when given a
+;; non-constant.
+define i1 @test_nonconstant(i32 %x) #0 {
+; CHECK-LABEL: @test_nonconstant(
+; CHECK: @llvm.is.constant
+  %y = call i1 @llvm.is.constant.i32(i32 %x)
+  ret i1 %y
+}
+
+;; Ensure that nested is.constants fold.
+define i32 @test_nested() #0 {
+; CHECK-LABEL: @test_nested(
+; CHECK-NOT: llvm.is.constant
+; CHECK: ret i32 13
+  %val1 = call i1 @llvm.is.constant.i32(i32 27)
+  %val2 = zext i1 %val1 to i32
+  %val3 = add i32 %val2, 12
+  %1 = call i1 @llvm.is.constant.i32(i32 %val3)
+  %2 = zext i1 %1 to i32
+  %3 = add i32 %2, 12
+  ret i32 %3
+}
+
+@G = global [2 x i64] zeroinitializer
+define i1 @test_global() #0 {
+; CHECK-LABEL: @test_global(
+; CHECK: llvm.is.constant
+  %ret = call i1 @llvm.is.constant.p0i64(i64* getelementptr ([2 x i64], [2 x i64]* @G, i32 0, i32 0))
+  ret i1 %ret
+}
+
+define i1 @test_diff() #0 {
+; CHECK-LABEL: @test_diff(
+  %ret = call i1 @llvm.is.constant.i64(i64 sub (
+      i64 ptrtoint (i64* getelementptr inbounds ([2 x i64], [2 x i64]* @G, i64 0, i64 1) to i64),
+      i64 ptrtoint ([2 x i64]* @G to i64)))
+  ret i1 %ret
+}
+
+define i1 @test_various_types(i256 %int, float %float, <2 x i64> %vec, {i32, i32} %struct, [2 x i64] %arr, i64* %ptr) #0 {
+; CHECK-LABEL: @test_various_types(
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK: llvm.is.constant
+; CHECK-NOT: llvm.is.constant
+  %v1 = call i1 @llvm.is.constant.i256(i256 %int)
+  %v2 = call i1 @llvm.is.constant.f32(float %float)
+  %v3 = call i1 @llvm.is.constant.v2i64(<2 x i64> %vec)
+  %v4 = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} %struct)
+  %v5 = call i1 @llvm.is.constant.a2i64([2 x i64] %arr)
+  %v6 = call i1 @llvm.is.constant.p0i64(i64* %ptr)
+
+  %c1 = call i1 @llvm.is.constant.i256(i256 -1)
+  %c2 = call i1 @llvm.is.constant.f32(float 17.0)
+  %c3 = call i1 @llvm.is.constant.v2i64(<2 x i64> <i64 -1, i64 44>)
+  %c4 = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} {i32 -1, i32 32})
+  %c5 = call i1 @llvm.is.constant.a2i64([2 x i64] [i64 -1, i64 32])
+  %c6 = call i1 @llvm.is.constant.p0i64(i64* inttoptr (i32 42 to i64*))
+
+  %x1 = add i1 %v1, %c1
+  %x2 = add i1 %v2, %c2
+  %x3 = add i1 %v3, %c3
+  %x4 = add i1 %v4, %c4
+  %x5 = add i1 %v5, %c5
+  %x6 = add i1 %v6, %c6
+
+  %res2 = add i1 %x1, %x2
+  %res3 = add i1 %res2, %x3
+  %res4 = add i1 %res3, %x4
+  %res5 = add i1 %res4, %x5
+  %res6 = add i1 %res5, %x6
+
+  ret i1 %res6
+}
+
+define i1 @test_various_types2() #0 {
+; CHECK-LABEL: @test_various_types2(
+; CHECK: ret i1 false
+  %r = call i1 @test_various_types(i256 -1, float 22.0, <2 x i64> <i64 -1, i64 44>,
+                     {i32, i32} {i32 -1, i32 55}, [2 x i64] [i64 -1, i64 55],
+		     i64* inttoptr (i64 42 to i64*))
+  ret i1 %r
+}
+
+attributes #0 = { nounwind uwtable }
diff --git a/test/CodeGen/Generic/zero-probability.mir b/test/CodeGen/Generic/zero-probability.mir
deleted file mode 100644
index 6a9ab67cb2662859691356d711adcd47cd8c56a8..0000000000000000000000000000000000000000
--- a/test/CodeGen/Generic/zero-probability.mir
+++ /dev/null
@@ -1,39 +0,0 @@
-# RUN: llc -o /dev/null %s 
-# REQUIRES: asserts
-# Makes sure that having a probability of 0x00000000 to branch to a successor
-# doesn't hit an APInt assert in the MIParser.
-
---- |
-  define i32 @main() local_unnamed_addr #0 {
-  entry:
-    ret i32 0
-  
-  other:
-    ret i32 0
-  }
-  
-  attributes #0 = { nounwind }
-  
-  !llvm.module.flags = !{!0, !1}
-  !llvm.ident = !{!2}
-  
-  !0 = !{i32 1, !"wchar_size", i32 4}
-  !1 = !{i32 7, !"PIC Level", i32 2}
-  !2 = !{!"clang version 6.0.0"}
-  !3 = !{!"branch_weights", i32 0, i32 -1}
-
-...
----
-name:            main
-alignment:       2
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true   
-body:             |
-  bb.0.entry:
-    successors: %bb.1.other(0x00000000)
-  bb.1.other:
-
-...
diff --git a/test/CodeGen/Hexagon/cfi-late.ll b/test/CodeGen/Hexagon/cfi-late.ll
index b5bdb59cc15b4602da008ffaa931994817df7bff..460b645b4a4e6d31125d303aa3f5a59a5cb2d058 100644
--- a/test/CodeGen/Hexagon/cfi-late.ll
+++ b/test/CodeGen/Hexagon/cfi-late.ll
@@ -32,8 +32,8 @@ declare i32 @bar(i32, i32) #1
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
 
-attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv4" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv4" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { "target-cpu"="hexagonv5" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
 
diff --git a/test/CodeGen/Hexagon/clr_set_toggle.ll b/test/CodeGen/Hexagon/clr_set_toggle.ll
index 9318f2d8a6b0b593ed823964a4681c81d94f7641..43c866c7b7655ab30f30731ca05d89ed83f0a204 100644
--- a/test/CodeGen/Hexagon/clr_set_toggle.ll
+++ b/test/CodeGen/Hexagon/clr_set_toggle.ll
@@ -70,7 +70,7 @@ entry:
 define zeroext i16 @my_setbit(i16 zeroext %crc) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit
-; CHECK: memh(r{{[0-9]+}}+#{{[0-9]+}}) = setbit(#15)
+; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#15)
   %crc.addr = alloca i16, align 2
   store i16 %crc, i16* %crc.addr, align 2
   %0 = load i16, i16* %crc.addr, align 2
diff --git a/test/CodeGen/Hexagon/constant_compound.ll b/test/CodeGen/Hexagon/constant_compound.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4ca2dc5d4ededf1cc31983633dec5ecd6167aa5e
--- /dev/null
+++ b/test/CodeGen/Hexagon/constant_compound.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=hexagon < %s 2>&1 | FileCheck %s
+
+; Generating a compound instruction with a constant is not profitable.
+; The constant needs to be kept in a register before it is fed to compound
+; instruction.
+; Before, we are generating
+; ra = #65820;
+; rb = lsr(rb, #8);
+; rc ^= and (rb, ra)
+; Now, we are generating
+; ra = and (#65820, lsr(ra, #8));
+; rb = xor(rb, ra)
+
+; CHECK: and(##65280,lsr(r
+; CHECK-NOT : ^= and
+
+define dso_local zeroext i16 @test_compound(i16 zeroext %varA, i16 zeroext %varB) local_unnamed_addr #0 {
+entry:
+  %tmp = zext i16 %varB to i32
+  %tmp1 = and i16 %varA, 255
+  %tmp2 = zext i16 %tmp1 to i32
+  %.masked.i = and i32 %tmp, 255
+  %tmp3 = xor i32 %.masked.i, %tmp2
+  %tmp4 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp3, i32 255) #2
+  %tmp5 = trunc i64 %tmp4 to i32
+  %tmp6 = and i32 %tmp5, 255
+  %tmp7 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp6, i32 81922) #2
+  %tmp8 = trunc i64 %tmp7 to i32
+  %tmp9 = xor i32 %tmp8, %tmp
+  %tmp10 = lshr i32 %tmp9, 8
+  %tmp11 = lshr i16 %varA, 8
+  %conv2 = zext i16 %tmp11 to i32
+  %tmp12 = and i32 %tmp10, 65280
+  %.masked.i7 = and i32 %tmp10, 255
+  %tmp13 = xor i32 %.masked.i7, %conv2
+  %tmp14 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp13, i32 255) #2
+  %tmp15 = trunc i64 %tmp14 to i32
+  %tmp16 = and i32 %tmp15, 255
+  %tmp17 = tail call i64 @llvm.hexagon.M4.pmpyw(i32 %tmp16, i32 81922) #2
+  %tmp18 = trunc i64 %tmp17 to i32
+  %tmp19 = xor i32 %tmp12, %tmp18
+  %tmp20 = lshr i32 %tmp19, 8
+  %tmp21 = trunc i32 %tmp20 to i16
+  ret i16 %tmp21
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M4.pmpyw(i32, i32) #1
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv65" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/Hexagon/double.ll b/test/CodeGen/Hexagon/double.ll
index b4d025cd7fd05200acda47c047e63ade0dccaa51..336f32fee61172ebfcd496d79c33fd7d44803831 100644
--- a/test/CodeGen/Hexagon/double.ll
+++ b/test/CodeGen/Hexagon/double.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK: __hexagon_adddf3
 ; CHECK: __hexagon_subdf3
 
-define void @foo(double* %acc, double %num, double %num2) nounwind {
-entry:
-  %acc.addr = alloca double*, align 4
-  %num.addr = alloca double, align 8
-  %num2.addr = alloca double, align 8
-  store double* %acc, double** %acc.addr, align 4
-  store double %num, double* %num.addr, align 8
-  store double %num2, double* %num2.addr, align 8
-  %0 = load double*, double** %acc.addr, align 4
-  %1 = load double, double* %0
-  %2 = load double, double* %num.addr, align 8
-  %add = fadd double %1, %2
-  %3 = load double, double* %num2.addr, align 8
-  %sub = fsub double %add, %3
-  %4 = load double*, double** %acc.addr, align 4
-  store double %sub, double* %4
+define void @f0(double* %a0, double %a1, double %a2) #0 {
+b0:
+  %v0 = alloca double*, align 4
+  %v1 = alloca double, align 8
+  %v2 = alloca double, align 8
+  store double* %a0, double** %v0, align 4
+  store double %a1, double* %v1, align 8
+  store double %a2, double* %v2, align 8
+  %v3 = load double*, double** %v0, align 4
+  %v4 = load double, double* %v3
+  %v5 = load double, double* %v1, align 8
+  %v6 = fadd double %v4, %v5
+  %v7 = load double, double* %v2, align 8
+  %v8 = fsub double %v6, %v7
+  %v9 = load double*, double** %v0, align 4
+  store double %v8, double* %v9
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/early-if-debug.mir b/test/CodeGen/Hexagon/early-if-debug.mir
index 27e6124d35231db01283d184e528cdf6b00967ff..b76f41019a047c7c2d6eb03ae8afcf4b026702be 100644
--- a/test/CodeGen/Hexagon/early-if-debug.mir
+++ b/test/CodeGen/Hexagon/early-if-debug.mir
@@ -6,11 +6,11 @@
 # CHECK: %0:intregs = COPY $r0
 # CHECK: %1:predregs = C2_cmpeqi %0, 0
 # CHECK: %2:intregs = A2_tfrsi 123
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
-# CHECK: DBG_VALUE debug-use %0, debug-use $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
+# CHECK: DBG_VALUE %0, $noreg
 # CHECK: %3:intregs = A2_tfrsi 321
 # CHECK: %5:intregs = C2_mux %1, %2, %3
 
@@ -40,11 +40,11 @@ body:             |
     J2_jump %bb.1, implicit-def dead $pc
 
   bb.1:
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
-    DBG_VALUE debug-use %0, debug-use $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
+    DBG_VALUE %0, $noreg, !1, !1
     %3 = A2_tfrsi 321
 
   bb.2:
diff --git a/test/CodeGen/Hexagon/float.ll b/test/CodeGen/Hexagon/float.ll
index 03d1fbf44cb679fea613d8818c54431f8c377260..cc024a76d037c2e23b8b4d0f602a08db82519587 100644
--- a/test/CodeGen/Hexagon/float.ll
+++ b/test/CodeGen/Hexagon/float.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: __hexagon_addsf3
-; CHECK: __hexagon_subsf3
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: sfadd
+; CHECK: sfsub
 
-define void @foo(float* %acc, float %num, float %num2) nounwind {
-entry:
-  %acc.addr = alloca float*, align 4
-  %num.addr = alloca float, align 4
-  %num2.addr = alloca float, align 4
-  store float* %acc, float** %acc.addr, align 4
-  store float %num, float* %num.addr, align 4
-  store float %num2, float* %num2.addr, align 4
-  %0 = load float*, float** %acc.addr, align 4
-  %1 = load float, float* %0
-  %2 = load float, float* %num.addr, align 4
-  %add = fadd float %1, %2
-  %3 = load float, float* %num2.addr, align 4
-  %sub = fsub float %add, %3
-  %4 = load float*, float** %acc.addr, align 4
-  store float %sub, float* %4
+define void @f0(float* %a0, float %a1, float %a2) #0 {
+b0:
+  %v0 = alloca float*, align 4
+  %v1 = alloca float, align 4
+  %v2 = alloca float, align 4
+  store float* %a0, float** %v0, align 4
+  store float %a1, float* %v1, align 4
+  store float %a2, float* %v2, align 4
+  %v3 = load float*, float** %v0, align 4
+  %v4 = load float, float* %v3
+  %v5 = load float, float* %v1, align 4
+  %v6 = fadd float %v4, %v5
+  %v7 = load float, float* %v2, align 4
+  %v8 = fsub float %v6, %v7
+  %v9 = load float*, float** %v0, align 4
+  store float %v8, float* %v9
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll b/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
index 03d1fbf44cb679fea613d8818c54431f8c377260..cc024a76d037c2e23b8b4d0f602a08db82519587 100644
--- a/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
+++ b/test/CodeGen/Hexagon/floatconvert-ieee-rnd-near.ll
@@ -1,22 +1,24 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: __hexagon_addsf3
-; CHECK: __hexagon_subsf3
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: sfadd
+; CHECK: sfsub
 
-define void @foo(float* %acc, float %num, float %num2) nounwind {
-entry:
-  %acc.addr = alloca float*, align 4
-  %num.addr = alloca float, align 4
-  %num2.addr = alloca float, align 4
-  store float* %acc, float** %acc.addr, align 4
-  store float %num, float* %num.addr, align 4
-  store float %num2, float* %num2.addr, align 4
-  %0 = load float*, float** %acc.addr, align 4
-  %1 = load float, float* %0
-  %2 = load float, float* %num.addr, align 4
-  %add = fadd float %1, %2
-  %3 = load float, float* %num2.addr, align 4
-  %sub = fsub float %add, %3
-  %4 = load float*, float** %acc.addr, align 4
-  store float %sub, float* %4
+define void @f0(float* %a0, float %a1, float %a2) #0 {
+b0:
+  %v0 = alloca float*, align 4
+  %v1 = alloca float, align 4
+  %v2 = alloca float, align 4
+  store float* %a0, float** %v0, align 4
+  store float %a1, float* %v1, align 4
+  store float %a2, float* %v2, align 4
+  %v3 = load float*, float** %v0, align 4
+  %v4 = load float, float* %v3
+  %v5 = load float, float* %v1, align 4
+  %v6 = fadd float %v4, %v5
+  %v7 = load float, float* %v2, align 4
+  %v8 = fsub float %v6, %v7
+  %v9 = load float*, float** %v0, align 4
+  store float %v8, float* %v9
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/gp-plus-offset-load.ll b/test/CodeGen/Hexagon/gp-plus-offset-load.ll
index 57783d421a404b42c0bf73077c6c4521742f3c2f..2514d4109c0944470579b2f28fbff67543fc73f9 100644
--- a/test/CodeGen/Hexagon/gp-plus-offset-load.ll
+++ b/test/CodeGen/Hexagon/gp-plus-offset-load.ll
@@ -1,51 +1,57 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate load instructions with global + offset
 
-%struct.struc = type { i8, i8, i16, i32 }
 
-@foo = common global %struct.struc zeroinitializer, align 4
+%s.0 = type { i8, i8, i16, i32 }
 
-define void @loadWord(i32 %val1, i32 %val2, i32* nocapture %ival) nounwind {
-; CHECK: r{{[0-9]+}} = memw(##foo+4)
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+@g0 = common global %s.0 zeroinitializer, align 4
 
-if.then:                                          ; preds = %entry
-  %0 = load i32, i32* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 3), align 4
-  store i32 %0, i32* %ival, align 4
-  br label %if.end
+; CHECK-LABEL: f0:
+; CHECK: r{{[0-9]+}} = memw(##g0+4)
+define void @f0(i32 %a0, i32 %a1, i32* nocapture %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b1:                                               ; preds = %b0
+  %v1 = load i32, i32* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 3), align 4
+  store i32 %v1, i32* %a2, align 4
+  br label %b2
+
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
-define void @loadByte(i32 %val1, i32 %val2, i8* nocapture %ival) nounwind {
-; CHECK: r{{[0-9]+}} = memub(##foo+1)
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f1:
+; CHECK: r{{[0-9]+}} = memub(##g0+1)
+define void @f1(i32 %a0, i32 %a1, i8* nocapture %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  %0 = load i8, i8* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 1), align 1
-  store i8 %0, i8* %ival, align 1
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = load i8, i8* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 1), align 1
+  store i8 %v1, i8* %a2, align 1
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
-define void @loadHWord(i32 %val1, i32 %val2, i16* %ival) nounwind {
-; CHECK: r{{[0-9]+}} = memuh(##foo+2)
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f2:
+; CHECK: r{{[0-9]+}} = memuh(##g0+2)
+define void @f2(i32 %a0, i32 %a1, i16* %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  %0 = load i16, i16* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 2), align 2
-  store i16 %0, i16* %ival, align 2
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = load i16, i16* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 2), align 2
+  store i16 %v1, i16* %a2, align 2
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/gp-plus-offset-store.ll b/test/CodeGen/Hexagon/gp-plus-offset-store.ll
index 66391b954d0ec4bc650a2af130961f741a925e78..91e412f7c1353b64c34fb36cc1e6ba3c0f55414a 100644
--- a/test/CodeGen/Hexagon/gp-plus-offset-store.ll
+++ b/test/CodeGen/Hexagon/gp-plus-offset-store.ll
@@ -1,35 +1,38 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate store instructions with global + offset
 
-%struct.struc = type { i8, i8, i16, i32 }
+%s.0 = type { i8, i8, i16, i32 }
 
-@foo = common global %struct.struc zeroinitializer, align 4
+@g0 = common global %s.0 zeroinitializer, align 4
 
-define void @storeByte(i32 %val1, i32 %val2, i8 zeroext %ival) nounwind {
-; CHECK: memb(##foo+1) = r{{[0-9]+}}
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f0:
+; CHECK: memb(##g0+1) = r{{[0-9]+}}
+define void @f0(i32 %a0, i32 %a1, i8 zeroext %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  store i8 %ival, i8* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 1), align 1
-  br label %if.end
+b1:                                               ; preds = %b0
+  store i8 %a2, i8* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 1), align 1
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
-define void @storeHW(i32 %val1, i32 %val2, i16 signext %ival) nounwind {
-; CHECK: memh(##foo+2) = r{{[0-9]+}}
-entry:
-  %cmp = icmp sgt i32 %val1, %val2
-  br i1 %cmp, label %if.then, label %if.end
+; CHECK-LABEL: f1:
+; CHECK: memh(##g0+2) = r{{[0-9]+}}
+define void @f1(i32 %a0, i32 %a1, i16 signext %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.then:                                          ; preds = %entry
-  store i16 %ival, i16* getelementptr inbounds (%struct.struc, %struct.struc* @foo, i32 0, i32 2), align 2
-  br label %if.end
+b1:                                               ; preds = %b0
+  store i16 %a2, i16* getelementptr inbounds (%s.0, %s.0* @g0, i32 0, i32 2), align 2
+  br label %b2
 
-if.end:                                           ; preds = %if.then, %entry
+b2:                                               ; preds = %b1, %b0
   ret void
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/gp-rel.ll b/test/CodeGen/Hexagon/gp-rel.ll
index ef913134f7cb0bc00c2f96f7576f5ff487503d82..3ce40bb54704b32c72bb61c25a92eb8c2423a8ba 100644
--- a/test/CodeGen/Hexagon/gp-rel.ll
+++ b/test/CodeGen/Hexagon/gp-rel.ll
@@ -1,33 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that gp-relative instructions are being generated.
 
-@a = common global i32 0, align 4
-@b = common global i32 0, align 4
-@c = common global i32 0, align 4
+; CHECK: r{{[0-9]+}} = memw(gp+#g0)
+; CHECK: r{{[0-9]+}} = memw(gp+#g1)
+; CHECK: if (p{{[0-3]}}) memw(##g2) = r{{[0-9]+}}
 
-define i32 @foo(i32 %p) #0 {
-entry:
-; CHECK: r{{[0-9]+}} = memw(gp+#a)
-; CHECK: r{{[0-9]+}} = memw(gp+#b)
-; CHECK: if (p{{[0-3]}}) memw(##c) = r{{[0-9]+}}
-  %0 = load i32, i32* @a, align 4
-  %1 = load i32, i32* @b, align 4
-  %add = add nsw i32 %1, %0
-  %cmp = icmp eq i32 %0, %1
-  br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
 
-entry.if.end_crit_edge:
-  %.pre = load i32, i32* @c, align 4
-  br label %if.end
+define i32 @f0(i32 %a0) #0 {
+b0:
+  %v0 = load i32, i32* @g0, align 4
+  %v1 = load i32, i32* @g1, align 4
+  %v2 = add nsw i32 %v1, %v0
+  %v3 = icmp eq i32 %v0, %v1
+  br i1 %v3, label %b2, label %b1
 
-if.then:
-  %add1 = add nsw i32 %add, %0
-  store i32 %add1, i32* @c, align 4
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v4 = load i32, i32* @g2, align 4
+  br label %b3
 
-if.end:
-  %2 = phi i32 [ %.pre, %entry.if.end_crit_edge ], [ %add1, %if.then ]
-  %cmp2 = icmp eq i32 %add, %2
-  %sel1 = select i1 %cmp2, i32 %2, i32 %1
-  ret i32 %sel1
+b2:                                               ; preds = %b0
+  %v5 = add nsw i32 %v2, %v0
+  store i32 %v5, i32* @g2, align 4
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
+  %v6 = phi i32 [ %v4, %b1 ], [ %v5, %b2 ]
+  %v7 = icmp eq i32 %v2, %v6
+  %v8 = select i1 %v7, i32 %v6, i32 %v1
+  ret i32 %v8
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-cleanup.ll b/test/CodeGen/Hexagon/hwloop-cleanup.ll
index 56a6fedf81ef8a2ef9028d9d8993f30d1f2c34b2..71e1bf10fe64bde197087e63e7d0916f1ed56ed2 100644
--- a/test/CodeGen/Hexagon/hwloop-cleanup.ll
+++ b/test/CodeGen/Hexagon/hwloop-cleanup.ll
@@ -1,87 +1,91 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -no-phi-elim-live-out-early-exit \
-; RUN:    < %s | FileCheck %s
+; RUN: llc -march=hexagon -no-phi-elim-live-out-early-exit < %s | FileCheck %s
 ; Check that we remove the compare and induction variable instructions
 ; after generating hardware loops.
 ; Bug 6685.
 
+; CHECK-LABEL: f0:
 ; CHECK: loop0
 ; CHECK-NOT: r{{[0-9]+}} = add(r{{[0-9]+}},#-1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 
-define i32 @test1(i32* nocapture %b, i32 %n) nounwind readonly {
-entry:
-  %cmp1 = icmp sgt i32 %n, 0
-  br i1 %cmp1, label %for.body.preheader, label %for.end
+define i32 @f0(i32* nocapture %a0, i32 %a1) #0 {
+b0:
+  %v0 = icmp sgt i32 %a1, 0
+  br i1 %v0, label %b1, label %b4
 
-for.body.preheader:
-  br label %for.body
+b1:                                               ; preds = %b0
+  br label %b2
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %sum.03 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx.phi = phi i32* [ %arrayidx.inc, %for.body ], [ %b, %for.body.preheader ]
-  %i.02 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %add = add nsw i32 %0, %sum.03
-  %inc = add nsw i32 %i.02, 1
-  %exitcond = icmp eq i32 %inc, %n
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end.loopexit, label %for.body
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %v5, %b2 ], [ 0, %b1 ]
+  %v2 = phi i32* [ %v8, %b2 ], [ %a0, %b1 ]
+  %v3 = phi i32 [ %v6, %b2 ], [ 0, %b1 ]
+  %v4 = load i32, i32* %v2, align 4
+  %v5 = add nsw i32 %v4, %v1
+  %v6 = add nsw i32 %v3, 1
+  %v7 = icmp eq i32 %v6, %a1
+  %v8 = getelementptr i32, i32* %v2, i32 1
+  br i1 %v7, label %b3, label %b2
 
-for.end.loopexit:
-  br label %for.end
+b3:                                               ; preds = %b2
+  br label %b4
 
-for.end:
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
-  ret i32 %sum.0.lcssa
+b4:                                               ; preds = %b3, %b0
+  %v9 = phi i32 [ 0, %b0 ], [ %v5, %b3 ]
+  ret i32 %v9
 }
 
 ; This test checks that that initial loop count value is removed.
+; CHECK-LABEL: f1:
 ; CHECK-NOT: ={{.}}#40
 ; CHECK: loop0
 ; CHECK-NOT: r{{[0-9]+}} = add(r{{[0-9]+}},#-1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 
-define i32 @test2(i32* nocapture %b) nounwind readonly {
-entry:
-  br label %for.body
+define i32 @f1(i32* nocapture %a0) #0 {
+b0:
+  br label %b1
 
-for.body:
-  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
-  %arrayidx.phi = phi i32* [ %b, %entry ], [ %arrayidx.inc, %for.body ]
-  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %add = add nsw i32 %0, %sum.02
-  %inc = add nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 40
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end, label %for.body
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ 0, %b0 ], [ %v4, %b1 ]
+  %v1 = phi i32* [ %a0, %b0 ], [ %v7, %b1 ]
+  %v2 = phi i32 [ 0, %b0 ], [ %v5, %b1 ]
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = add nsw i32 %v3, %v0
+  %v5 = add nsw i32 %v2, 1
+  %v6 = icmp eq i32 %v5, 40
+  %v7 = getelementptr i32, i32* %v1, i32 1
+  br i1 %v6, label %b2, label %b1
 
-for.end:
-  ret i32 %add
+b2:                                               ; preds = %b1
+  ret i32 %v4
 }
 
 ; This test checks that we don't remove the induction variable since it's used.
+; CHECK-LABEL: f2:
 ; CHECK: loop0
 ; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}},#1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
-define i32 @test3(i32* nocapture %b) nounwind {
-entry:
-  br label %for.body
 
-for.body:
-  %arrayidx.phi = phi i32* [ %b, %entry ], [ %arrayidx.inc, %for.body ]
-  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  store i32 %i.01, i32* %arrayidx.phi, align 4
-  %inc = add nsw i32 %i.01, 1
-  %exitcond = icmp eq i32 %inc, 40
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end, label %for.body
+define i32 @f2(i32* nocapture %a0) #1 {
+b0:
+  br label %b1
 
-for.end:
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32* [ %a0, %b0 ], [ %v4, %b1 ]
+  %v1 = phi i32 [ 0, %b0 ], [ %v2, %b1 ]
+  store i32 %v1, i32* %v0, align 4
+  %v2 = add nsw i32 %v1, 1
+  %v3 = icmp eq i32 %v2, 40
+  %v4 = getelementptr i32, i32* %v0, i32 1
+  br i1 %v3, label %b2, label %b1
+
+b2:                                               ; preds = %b1
   ret i32 0
 }
 
-
+attributes #0 = { nounwind readonly "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-const.ll b/test/CodeGen/Hexagon/hwloop-const.ll
index d549c1fef8c030a3e6d7d475cfaf4aa05cdd052e..eb105a33768a31a077c33a885dba034ff868a0d4 100644
--- a/test/CodeGen/Hexagon/hwloop-const.ll
+++ b/test/CodeGen/Hexagon/hwloop-const.ll
@@ -1,27 +1,27 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O2 < %s | FileCheck %s
-; ModuleID = 'hwloop-const.c'
-target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: endloop
+
 target triple = "hexagon-unknown-linux-gnu"
 
-@b = common global [25000 x i32] zeroinitializer, align 8
-@a = common global [25000 x i32] zeroinitializer, align 8
-@c = common global [25000 x i32] zeroinitializer, align 8
+@g0 = common global [25000 x i32] zeroinitializer, align 8
+@g1 = common global [25000 x i32] zeroinitializer, align 8
 
-define i32 @hwloop_bug() nounwind {
-entry:
-  br label %for.body
+define i32 @f0() #0 {
+b0:
+  br label %b1
 
-; CHECK: endloop
-for.body:                                         ; preds = %for.body, %entry
-  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds [25000 x i32], [25000 x i32]* @b, i32 0, i32 %i.02
-  store i32 %i.02, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds [25000 x i32], [25000 x i32]* @a, i32 0, i32 %i.02
-  store i32 %i.02, i32* %arrayidx1, align 4
-  %inc = add nsw i32 %i.02, 1
-  %exitcond = icmp eq i32 %inc, 25000
-  br i1 %exitcond, label %for.end, label %for.body
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ 0, %b0 ], [ %v3, %b1 ]
+  %v1 = getelementptr inbounds [25000 x i32], [25000 x i32]* @g0, i32 0, i32 %v0
+  store i32 %v0, i32* %v1, align 4
+  %v2 = getelementptr inbounds [25000 x i32], [25000 x i32]* @g1, i32 0, i32 %v0
+  store i32 %v0, i32* %v2, align 4
+  %v3 = add nsw i32 %v0, 1
+  %v4 = icmp eq i32 %v3, 25000
+  br i1 %v4, label %b2, label %b1
 
-for.end:                                          ; preds = %for.body
+b2:                                               ; preds = %b1
   ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index 10f3af73de1a66f54645b5034125e6be850d9037..443e4b59e9dd6b7e9794e4d452e0deb2ad11974f 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -1,63 +1,64 @@
-; RUN: llc < %s -march=hexagon -mcpu=hexagonv4 -O2 -disable-lsr | FileCheck %s
-; ModuleID = 'hwloop-dbg.o'
-target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
-target triple = "hexagon"
-
-define void @foo(i32* nocapture %a, i32* nocapture %b) nounwind !dbg !5 {
-entry:
-  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !13, metadata !DIExpression()), !dbg !17
-  tail call void @llvm.dbg.value(metadata i32* %b, i64 0, metadata !14, metadata !DIExpression()), !dbg !18
-  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !DIExpression()), !dbg !19
-  br label %for.body, !dbg !19
+; RUN: llc < %s -march=hexagon -disable-lsr | FileCheck %s
 
-for.body:                                         ; preds = %for.body, %entry
 ; CHECK:     loop0(
 ; CHECK-NOT: add({{r[0-9]*}}, #
 ; CHECK:     endloop0
-  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
-  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %b.addr.01 = phi i32* [ %b, %entry ], [ %incdec.ptr, %for.body ]
-  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.01, i32 1, !dbg !21
-  tail call void @llvm.dbg.value(metadata i32* %incdec.ptr, i64 0, metadata !14, metadata !DIExpression()), !dbg !21
-  %0 = load i32, i32* %b.addr.01, align 4, !dbg !21
-  store i32 %0, i32* %arrayidx.phi, align 4, !dbg !21
-  %inc = add nsw i32 %i.02, 1, !dbg !26
-  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !15, metadata !DIExpression()), !dbg !26
-  %exitcond = icmp eq i32 %inc, 10, !dbg !19
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  br i1 %exitcond, label %for.end, label %for.body, !dbg !19
-
-for.end:                                          ; preds = %for.body
-  ret void, !dbg !27
+
+target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
+target triple = "hexagon"
+
+define void @f0(i32* nocapture %a0, i32* nocapture %a1) #0 !dbg !4 {
+b0:
+  call void @llvm.dbg.value(metadata i32* %a0, metadata !10, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata i32* %a1, metadata !11, metadata !DIExpression()), !dbg !15
+  call void @llvm.dbg.value(metadata i32 0, metadata !12, metadata !DIExpression()), !dbg !16
+  br label %b1, !dbg !16
+
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32* [ %a0, %b0 ], [ %v7, %b1 ]
+  %v1 = phi i32 [ 0, %b0 ], [ %v5, %b1 ]
+  %v2 = phi i32* [ %a1, %b0 ], [ %v3, %b1 ]
+  %v3 = getelementptr inbounds i32, i32* %v2, i32 1, !dbg !18
+  call void @llvm.dbg.value(metadata i32* %v3, metadata !11, metadata !DIExpression()), !dbg !18
+  %v4 = load i32, i32* %v2, align 4, !dbg !18
+  store i32 %v4, i32* %v0, align 4, !dbg !18
+  %v5 = add nsw i32 %v1, 1, !dbg !20
+  call void @llvm.dbg.value(metadata i32 %v5, metadata !12, metadata !DIExpression()), !dbg !20
+  %v6 = icmp eq i32 %v5, 10, !dbg !16
+  %v7 = getelementptr i32, i32* %v0, i32 1
+  br i1 %v6, label %b2, label %b1, !dbg !16
+
+b2:                                               ; preds = %b1
+  ret void, !dbg !21
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind readnone speculatable }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!29}
+!llvm.module.flags = !{!3}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", isOptimized: true, emissionKind: FullDebug, file: !28, enums: !2, retainedTypes: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "QuIC LLVM Hexagon Clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2)
+!1 = !DIFile(filename: "hwloop-dbg.c", directory: "/test")
 !2 = !{}
-!5 = distinct !DISubprogram(name: "foo", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !28, scope: null, type: !7, retainedNodes: !11)
-!6 = !DIFile(filename: "hwloop-dbg.c", directory: "/usr2/kparzysz/s.hex/t")
-!7 = !DISubroutineType(types: !8)
-!8 = !{null, !9, !9}
-!9 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, baseType: !10)
-!10 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!11 = !{!13, !14, !15}
-!13 = !DILocalVariable(name: "a", line: 1, arg: 1, scope: !5, file: !6, type: !9)
-!14 = !DILocalVariable(name: "b", line: 1, arg: 2, scope: !5, file: !6, type: !9)
-!15 = !DILocalVariable(name: "i", line: 2, scope: !16, file: !6, type: !10)
-!16 = distinct !DILexicalBlock(line: 1, column: 26, file: !28, scope: !5)
-!17 = !DILocation(line: 1, column: 15, scope: !5)
-!18 = !DILocation(line: 1, column: 23, scope: !5)
-!19 = !DILocation(line: 3, column: 8, scope: !20)
-!20 = distinct !DILexicalBlock(line: 3, column: 3, file: !28, scope: !16)
-!21 = !DILocation(line: 4, column: 5, scope: !22)
-!22 = distinct !DILexicalBlock(line: 3, column: 28, file: !28, scope: !20)
-!26 = !DILocation(line: 3, column: 23, scope: !20)
-!27 = !DILocation(line: 6, column: 1, scope: !16)
-!28 = !DIFile(filename: "hwloop-dbg.c", directory: "/usr2/kparzysz/s.hex/t")
-!29 = !{i32 1, !"Debug Info Version", i32 3}
-!30 = !{i32 0}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: null, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !9)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 32, align: 32)
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10, !11, !12}
+!10 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 1, type: !7)
+!11 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 1, type: !7)
+!12 = !DILocalVariable(name: "i", scope: !13, file: !1, line: 2, type: !8)
+!13 = distinct !DILexicalBlock(scope: !4, file: !1, line: 1, column: 26)
+!14 = !DILocation(line: 1, column: 15, scope: !4)
+!15 = !DILocation(line: 1, column: 23, scope: !4)
+!16 = !DILocation(line: 3, column: 8, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !13, file: !1, line: 3, column: 3)
+!18 = !DILocation(line: 4, column: 5, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !17, file: !1, line: 3, column: 28)
+!20 = !DILocation(line: 3, column: 23, scope: !17)
+!21 = !DILocation(line: 6, column: 1, scope: !13)
diff --git a/test/CodeGen/Hexagon/hwloop-le.ll b/test/CodeGen/Hexagon/hwloop-le.ll
index 85a1b3db673b7847f08d1701371cf559b21b1353..d78b234d4ecea876cd98282506e40b89d5e407c6 100644
--- a/test/CodeGen/Hexagon/hwloop-le.ll
+++ b/test/CodeGen/Hexagon/hwloop-le.ll
@@ -1,438 +1,408 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
 
-
-; CHECK: test_pos1_ir_sle
+; CHECK-LABEL: f0:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 28395, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 28395, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f0(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 28395, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 28395, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ir_sle
+; CHECK-LABEL: f1:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 9073, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 9073, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f1(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 9073, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 9073, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ir_sle
+; CHECK-LABEL: f2:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 21956, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 21956, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f2(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 21956, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 21956, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ir_sle
+; CHECK-LABEL: f3:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 16782, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 16782, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f3(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 16782, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 16782, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ir_sle
+; CHECK-LABEL: f4:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 19097, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 19097, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f4(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 19097, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 19097, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_ri_sle
+; CHECK-LABEL: f5:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 14040
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp sle i32 %inc, 14040
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f5(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 14040
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp sle i32 %v7, 14040
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ri_sle
+; CHECK-LABEL: f6:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 13710
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp sle i32 %inc, 13710
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f6(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 13710
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp sle i32 %v7, 13710
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ri_sle
+; CHECK-LABEL: f7:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 9920
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp sle i32 %inc, 9920
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f7(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 9920
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp sle i32 %v7, 9920
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ri_sle
+; CHECK-LABEL: f8:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 18924
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp sle i32 %inc, 18924
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f8(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 18924
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp sle i32 %v7, 18924
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ri_sle
+; CHECK-LABEL: f9:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ri_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, 11812
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp sle i32 %inc, 11812
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f9(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, 11812
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp sle i32 %v7, 11812
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_rr_sle
+; CHECK-LABEL: f10:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f10(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_rr_sle
+; CHECK-LABEL: f11:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f11(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_rr_sle
+; CHECK-LABEL: f12:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f12(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_rr_sle
+; CHECK-LABEL: f13:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f13(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_rr_sle
+; CHECK-LABEL: f14:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_rr_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp sle i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp sle i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f14(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sle i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp sle i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/hwloop-ne.ll b/test/CodeGen/Hexagon/hwloop-ne.ll
index 12ef3b5dd0bc54c1ba379cb515adb99ce5460cc3..301a31a7c0b7d1c8904df07c23ca3d68895597a3 100644
--- a/test/CodeGen/Hexagon/hwloop-ne.ll
+++ b/test/CodeGen/Hexagon/hwloop-ne.ll
@@ -1,438 +1,408 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
 
-
-; CHECK: test_pos1_ir_ne
+; CHECK-LABEL: f0:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 32623, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 32623, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f0(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 32623, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 32623, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ir_ne
+; CHECK-LABEL: f1:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 29554, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 29554, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f1(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 29554, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 29554, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ir_ne
+; CHECK-LABEL: f2:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 15692, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 15692, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f2(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 15692, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 15692, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ir_ne
+; CHECK-LABEL: f3:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 10449, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 10449, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f3(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 10449, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 10449, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ir_ne
+; CHECK-LABEL: f4:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ir_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 32087, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ 32087, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f4(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 32087, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ 32087, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_ri_ne
+; CHECK-LABEL: f5:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 3472
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp ne i32 %inc, 3472
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f5(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 3472
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp ne i32 %v7, 3472
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_ri_ne
+; CHECK-LABEL: f6:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 8730
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp ne i32 %inc, 8730
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f6(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 8730
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp ne i32 %v7, 8730
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_ri_ne
+; CHECK-LABEL: f7:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 1493
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp ne i32 %inc, 1493
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f7(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 1493
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp ne i32 %v7, 1493
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_ri_ne
+; CHECK-LABEL: f8:
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 1706
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp ne i32 %inc, 1706
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f8(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 1706
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp ne i32 %v7, 1706
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_ri_ne
+; CHECK-LABEL: f9:
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_ri_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, 1886
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp ne i32 %inc, 1886
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f9(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, 1886
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp ne i32 %v7, 1886
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos1_rr_ne
+; CHECK-LABEL: f10:
 ; CHECK: loop0
 ; a < b
-define void @test_pos1_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 1
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f10(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 1
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos2_rr_ne
+; CHECK-LABEL: f11:
 ; CHECK: loop0
 ; a < b
-define void @test_pos2_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 2
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f11(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 2
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos4_rr_ne
+; CHECK-LABEL: f12:
 ; CHECK: loop0
 ; a < b
-define void @test_pos4_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 4
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f12(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 4
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos8_rr_ne
+; CHECK-LABEL: f13
 ; CHECK: loop0
 ; a < b
-define void @test_pos8_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 8
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f13(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 8
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
-; CHECK: test_pos16_rr_ne
+; CHECK-LABEL: f14
 ; CHECK: loop0
 ; a < b
-define void @test_pos16_rr_ne(i8* nocapture %p, i32 %a, i32 %b) nounwind {
-entry:
-  %cmp3 = icmp slt i32 %a, %b
-  br i1 %cmp3, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  br label %for.body
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.04 = phi i32 [ %a, %for.body.lr.ph ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %i.04
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = zext i8 %0 to i32
-  %add = add nsw i32 %conv, 1
-  %conv1 = trunc i32 %add to i8
-  store i8 %conv1, i8* %arrayidx, align 1
-  %inc = add nsw i32 %i.04, 16
-  %cmp = icmp ne i32 %inc, %b
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %entry
+define void @f14(i8* nocapture %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp slt i32 %a1, %a2
+  br i1 %v0, label %b1, label %b3
+
+b1:                                               ; preds = %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v1 = phi i32 [ %a1, %b1 ], [ %v7, %b2 ]
+  %v2 = getelementptr inbounds i8, i8* %a0, i32 %v1
+  %v3 = load i8, i8* %v2, align 1
+  %v4 = zext i8 %v3 to i32
+  %v5 = add nsw i32 %v4, 1
+  %v6 = trunc i32 %v5 to i8
+  store i8 %v6, i8* %v2, align 1
+  %v7 = add nsw i32 %v1, 16
+  %v8 = icmp ne i32 %v7, %a2
+  br i1 %v8, label %b2, label %b3
+
+b3:                                               ; preds = %b2, %b0
   ret void
 }
 
-
-
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/i16_VarArg.ll b/test/CodeGen/Hexagon/i16_VarArg.ll
index 74d066e4936e1fa1510d985895c880127c8284f8..af2682edc4b38d0bd35e40e2fa1849fe574ff39f 100644
--- a/test/CodeGen/Hexagon/i16_VarArg.ll
+++ b/test/CodeGen/Hexagon/i16_VarArg.ll
@@ -1,40 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: call __hexagon_{{[A-Z_a-z0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dfcmp
 
-@a_str = internal constant [8 x i8] c"a = %f\0A\00"
-@b_str = internal constant [8 x i8] c"b = %f\0A\00"
-@add_str = internal constant [12 x i8] c"a + b = %f\0A\00"
-@sub_str = internal constant [12 x i8] c"a - b = %f\0A\00"
-@mul_str = internal constant [12 x i8] c"a * b = %f\0A\00"
-@div_str = internal constant [12 x i8] c"b / a = %f\0A\00"
-@rem_str = internal constant [13 x i8] c"b %% a = %f\0A\00"
-@lt_str = internal constant [12 x i8] c"a < b = %d\0A\00"
-@le_str = internal constant [13 x i8] c"a <= b = %d\0A\00"
-@gt_str = internal constant [12 x i8] c"a > b = %d\0A\00"
-@ge_str = internal constant [13 x i8] c"a >= b = %d\0A\00"
-@eq_str = internal constant [13 x i8] c"a == b = %d\0A\00"
-@ne_str = internal constant [13 x i8] c"a != b = %d\0A\00"
-@A = global double 2.000000e+00
-@B = global double 5.000000e+00
+@g0 = internal constant [12 x i8] c"a < b = %d\0A\00"
+@g1 = internal constant [13 x i8] c"a <= b = %d\0A\00"
+@g2 = internal constant [12 x i8] c"a > b = %d\0A\00"
+@g3 = internal constant [13 x i8] c"a >= b = %d\0A\00"
+@g4 = internal constant [13 x i8] c"a == b = %d\0A\00"
+@g5 = internal constant [13 x i8] c"a != b = %d\0A\00"
+@g6 = global double 2.000000e+00
+@g7 = global double 5.000000e+00
 
-declare i32 @printf(i8*, ...)
+declare i32 @f0(i8*, ...) #0
 
-define i32 @main() {
-        %a = load double, double* @A
-        %b = load double, double* @B
-        %lt_r = fcmp olt double %a, %b
-        %le_r = fcmp ole double %a, %b
-        %gt_r = fcmp ogt double %a, %b
-        %ge_r = fcmp oge double %a, %b
-        %eq_r = fcmp oeq double %a, %b
-        %ne_r = fcmp une double %a, %b
-        %val1 = zext i1 %lt_r to i16
-        %lt_s = getelementptr [12 x i8], [12 x i8]* @lt_str, i64 0, i64 0
-        %le_s = getelementptr [13 x i8], [13 x i8]* @le_str, i64 0, i64 0
-        %gt_s = getelementptr [12 x i8], [12 x i8]* @gt_str, i64 0, i64 0
-        %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
-        %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
-        %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...) @printf( i8* %lt_s, i16 %val1 )
-        ret i32 0
+define i32 @f1() #0 {
+b0:
+  %v0 = load double, double* @g6
+  %v1 = load double, double* @g7
+  %v2 = fcmp olt double %v0, %v1
+  %v3 = fcmp ole double %v0, %v1
+  %v4 = fcmp ogt double %v0, %v1
+  %v5 = fcmp oge double %v0, %v1
+  %v6 = fcmp oeq double %v0, %v1
+  %v7 = fcmp une double %v0, %v1
+  %v8 = zext i1 %v2 to i16
+  %v9 = getelementptr [12 x i8], [12 x i8]* @g0, i64 0, i64 0
+  %v10 = getelementptr [13 x i8], [13 x i8]* @g1, i64 0, i64 0
+  %v11 = getelementptr [12 x i8], [12 x i8]* @g2, i64 0, i64 0
+  %v12 = getelementptr [13 x i8], [13 x i8]* @g3, i64 0, i64 0
+  %v13 = getelementptr [13 x i8], [13 x i8]* @g4, i64 0, i64 0
+  %v14 = getelementptr [13 x i8], [13 x i8]* @g5, i64 0, i64 0
+  %v15 = call i32 (i8*, ...) @f0(i8* %v9, i16 %v8)
+  ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/i1_VarArg.ll b/test/CodeGen/Hexagon/i1_VarArg.ll
index 4078c0f3f005e9fbd1e3bb2d0ac05838694afc8b..01619bc54246012345f4289f15ed0b70b985b4b1 100644
--- a/test/CodeGen/Hexagon/i1_VarArg.ll
+++ b/test/CodeGen/Hexagon/i1_VarArg.ll
@@ -1,44 +1,40 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: call __hexagon_{{[_A-Za-z0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dfcmp
 
-@a_str = internal constant [8 x i8] c"a = %f\0A\00"
-@b_str = internal constant [8 x i8] c"b = %f\0A\00"
-@add_str = internal constant [12 x i8] c"a + b = %f\0A\00"
-@sub_str = internal constant [12 x i8] c"a - b = %f\0A\00"
-@mul_str = internal constant [12 x i8] c"a * b = %f\0A\00"
-@div_str = internal constant [12 x i8] c"b / a = %f\0A\00"
-@rem_str = internal constant [13 x i8] c"b %% a = %f\0A\00"
-@lt_str = internal constant [12 x i8] c"a < b = %d\0A\00"
-@le_str = internal constant [13 x i8] c"a <= b = %d\0A\00"
-@gt_str = internal constant [12 x i8] c"a > b = %d\0A\00"
-@ge_str = internal constant [13 x i8] c"a >= b = %d\0A\00"
-@eq_str = internal constant [13 x i8] c"a == b = %d\0A\00"
-@ne_str = internal constant [13 x i8] c"a != b = %d\0A\00"
-@A = global double 2.000000e+00
-@B = global double 5.000000e+00
+@g0 = internal constant [12 x i8] c"a < b = %d\0A\00"
+@g1 = internal constant [13 x i8] c"a <= b = %d\0A\00"
+@g2 = internal constant [12 x i8] c"a > b = %d\0A\00"
+@g3 = internal constant [13 x i8] c"a >= b = %d\0A\00"
+@g4 = internal constant [13 x i8] c"a == b = %d\0A\00"
+@g5 = internal constant [13 x i8] c"a != b = %d\0A\00"
+@g6 = global double 2.000000e+00
+@g7 = global double 5.000000e+00
 
-declare i32 @printf(i8*, ...)
+declare i32 @f0(i8*, ...) #0
 
-define i32 @main() {
-        %a = load double, double* @A
-        %b = load double, double* @B
-        %lt_r = fcmp olt double %a, %b
-        %le_r = fcmp ole double %a, %b
-        %gt_r = fcmp ogt double %a, %b
-        %ge_r = fcmp oge double %a, %b
-        %eq_r = fcmp oeq double %a, %b
-        %ne_r = fcmp une double %a, %b
-        %lt_s = getelementptr [12 x i8], [12 x i8]* @lt_str, i64 0, i64 0
-        %le_s = getelementptr [13 x i8], [13 x i8]* @le_str, i64 0, i64 0
-        %gt_s = getelementptr [12 x i8], [12 x i8]* @gt_str, i64 0, i64 0
-        %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
-        %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
-        %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...) @printf( i8* %lt_s, i1 %lt_r )
-        call i32 (i8*, ...) @printf( i8* %le_s, i1 %le_r )
-        call i32 (i8*, ...) @printf( i8* %gt_s, i1 %gt_r )
-        call i32 (i8*, ...) @printf( i8* %ge_s, i1 %ge_r )
-        call i32 (i8*, ...) @printf( i8* %eq_s, i1 %eq_r )
-        call i32 (i8*, ...) @printf( i8* %ne_s, i1 %ne_r )
-        ret i32 0
+define i32 @f1() #0 {
+b0:
+  %v0 = load double, double* @g6
+  %v1 = load double, double* @g7
+  %v2 = fcmp olt double %v0, %v1
+  %v3 = fcmp ole double %v0, %v1
+  %v4 = fcmp ogt double %v0, %v1
+  %v5 = fcmp oge double %v0, %v1
+  %v6 = fcmp oeq double %v0, %v1
+  %v7 = fcmp une double %v0, %v1
+  %v8 = getelementptr [12 x i8], [12 x i8]* @g0, i64 0, i64 0
+  %v9 = getelementptr [13 x i8], [13 x i8]* @g1, i64 0, i64 0
+  %v10 = getelementptr [12 x i8], [12 x i8]* @g2, i64 0, i64 0
+  %v11 = getelementptr [13 x i8], [13 x i8]* @g3, i64 0, i64 0
+  %v12 = getelementptr [13 x i8], [13 x i8]* @g4, i64 0, i64 0
+  %v13 = getelementptr [13 x i8], [13 x i8]* @g5, i64 0, i64 0
+  %v14 = call i32 (i8*, ...) @f0(i8* %v8, i1 %v2)
+  %v15 = call i32 (i8*, ...) @f0(i8* %v9, i1 %v3)
+  %v16 = call i32 (i8*, ...) @f0(i8* %v10, i1 %v4)
+  %v17 = call i32 (i8*, ...) @f0(i8* %v11, i1 %v5)
+  %v18 = call i32 (i8*, ...) @f0(i8* %v12, i1 %v6)
+  %v19 = call i32 (i8*, ...) @f0(i8* %v13, i1 %v7)
+  ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/i8_VarArg.ll b/test/CodeGen/Hexagon/i8_VarArg.ll
index 1353de47a976144b926dd8c80e56179aded78bad..247952d0c5cabe592c90595b22dd0ccac9667103 100644
--- a/test/CodeGen/Hexagon/i8_VarArg.ll
+++ b/test/CodeGen/Hexagon/i8_VarArg.ll
@@ -1,40 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: call __hexagon_{{[A-Z_a-z0-9]+}}
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: dfcmp
 
-@a_str = internal constant [8 x i8] c"a = %f\0A\00"
-@b_str = internal constant [8 x i8] c"b = %f\0A\00"
-@add_str = internal constant [12 x i8] c"a + b = %f\0A\00"
-@sub_str = internal constant [12 x i8] c"a - b = %f\0A\00"
-@mul_str = internal constant [12 x i8] c"a * b = %f\0A\00"
-@div_str = internal constant [12 x i8] c"b / a = %f\0A\00"
-@rem_str = internal constant [13 x i8] c"b %% a = %f\0A\00"
-@lt_str = internal constant [12 x i8] c"a < b = %d\0A\00"
-@le_str = internal constant [13 x i8] c"a <= b = %d\0A\00"
-@gt_str = internal constant [12 x i8] c"a > b = %d\0A\00"
-@ge_str = internal constant [13 x i8] c"a >= b = %d\0A\00"
-@eq_str = internal constant [13 x i8] c"a == b = %d\0A\00"
-@ne_str = internal constant [13 x i8] c"a != b = %d\0A\00"
-@A = global double 2.000000e+00
-@B = global double 5.000000e+00
+@g0 = internal constant [12 x i8] c"a < b = %d\0A\00"
+@g1 = internal constant [13 x i8] c"a <= b = %d\0A\00"
+@g2 = internal constant [12 x i8] c"a > b = %d\0A\00"
+@g3 = internal constant [13 x i8] c"a >= b = %d\0A\00"
+@g4 = internal constant [13 x i8] c"a == b = %d\0A\00"
+@g5 = internal constant [13 x i8] c"a != b = %d\0A\00"
+@g6 = global double 2.000000e+00
+@g7 = global double 5.000000e+00
 
-declare i32 @printf(i8*, ...)
+declare i32 @f0(i8*, ...) #0
 
-define i32 @main() {
-        %a = load double, double* @A
-        %b = load double, double* @B
-        %lt_r = fcmp olt double %a, %b
-        %le_r = fcmp ole double %a, %b
-        %gt_r = fcmp ogt double %a, %b
-        %ge_r = fcmp oge double %a, %b
-        %eq_r = fcmp oeq double %a, %b
-        %ne_r = fcmp une double %a, %b
-        %val1 = zext i1 %lt_r to i8
-        %lt_s = getelementptr [12 x i8], [12 x i8]* @lt_str, i64 0, i64 0
-        %le_s = getelementptr [13 x i8], [13 x i8]* @le_str, i64 0, i64 0
-        %gt_s = getelementptr [12 x i8], [12 x i8]* @gt_str, i64 0, i64 0
-        %ge_s = getelementptr [13 x i8], [13 x i8]* @ge_str, i64 0, i64 0
-        %eq_s = getelementptr [13 x i8], [13 x i8]* @eq_str, i64 0, i64 0
-        %ne_s = getelementptr [13 x i8], [13 x i8]* @ne_str, i64 0, i64 0
-        call i32 (i8*, ...) @printf( i8* %lt_s, i8 %val1 )
-        ret i32 0
+define i32 @f1() #0 {
+b0:
+  %v0 = load double, double* @g6
+  %v1 = load double, double* @g7
+  %v2 = fcmp olt double %v0, %v1
+  %v3 = fcmp ole double %v0, %v1
+  %v4 = fcmp ogt double %v0, %v1
+  %v5 = fcmp oge double %v0, %v1
+  %v6 = fcmp oeq double %v0, %v1
+  %v7 = fcmp une double %v0, %v1
+  %v8 = zext i1 %v2 to i8
+  %v9 = getelementptr [12 x i8], [12 x i8]* @g0, i64 0, i64 0
+  %v10 = getelementptr [13 x i8], [13 x i8]* @g1, i64 0, i64 0
+  %v11 = getelementptr [12 x i8], [12 x i8]* @g2, i64 0, i64 0
+  %v12 = getelementptr [13 x i8], [13 x i8]* @g3, i64 0, i64 0
+  %v13 = getelementptr [13 x i8], [13 x i8]* @g4, i64 0, i64 0
+  %v14 = getelementptr [13 x i8], [13 x i8]* @g5, i64 0, i64 0
+  %v15 = call i32 (i8*, ...) @f0(i8* %v9, i8 %v8)
+  ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/inline-asm-filetype-null.ll b/test/CodeGen/Hexagon/inline-asm-filetype-null.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9fbbcff71f5160f70d402db31e4b1e9ada36455e
--- /dev/null
+++ b/test/CodeGen/Hexagon/inline-asm-filetype-null.ll
@@ -0,0 +1,8 @@
+; RUN: llc -filetype=null < %s
+
+target triple = "hexagon"
+
+define void @foo() {
+  tail call void asm sideeffect "//", ""()
+  ret void
+}
diff --git a/test/CodeGen/Hexagon/machine-cp-clobbers.mir b/test/CodeGen/Hexagon/machine-cp-clobbers.mir
new file mode 100644
index 0000000000000000000000000000000000000000..736eccc217ec9788ffa5222a2fb4b883a46f582a
--- /dev/null
+++ b/test/CodeGen/Hexagon/machine-cp-clobbers.mir
@@ -0,0 +1,51 @@
+# RUN: llc -march=hexagon -o - %s -run-pass=machine-cp | FileCheck %s
+
+---
+name: dont_propagate_past_lower_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_lower_subreg_kill
+    ; CHECK: A2_nop implicit-def $d0
+    ; CHECK: A2_nop implicit-def $r2
+    ; CHECK: A2_nop implicit killed $r2
+    ; CHECK: $d1 = COPY killed $d0
+    ; CHECK: $d2 = COPY $d1
+    ; CHECK: A2_nop implicit $d2
+    A2_nop implicit-def $d0
+    $d1 = COPY killed $d0
+    $d0 = COPY killed $d1
+
+    A2_nop implicit-def $r2
+    A2_nop implicit killed $r2
+
+    $d1 = COPY killed $d0
+    $d2 = COPY $d1
+    A2_nop implicit $d2
+
+...
+
+---
+name: dont_propagate_past_upper_subreg_kill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: dont_propagate_past_upper_subreg_kill
+    ; CHECK: A2_nop implicit-def $d0
+    ; CHECK: A2_nop implicit-def $r3
+    ; CHECK: A2_nop implicit killed $r3
+    ; CHECK: $d1 = COPY killed $d0
+    ; CHECK: $d2 = COPY $d1
+    ; CHECK: A2_nop implicit $d2
+    A2_nop implicit-def $d0
+    $d1 = COPY killed $d0
+    $d0 = COPY killed $d1
+
+    A2_nop implicit-def $r3
+    A2_nop implicit killed $r3
+
+    $d1 = COPY killed $d0
+    $d2 = COPY $d1
+    A2_nop implicit $d2
+
+...
diff --git a/test/CodeGen/Hexagon/macint.ll b/test/CodeGen/Hexagon/macint.ll
index 514ba5b91308b0b0d6ed5b26f6c05b2131c59112..47856f2fcb548d4bea181b27bd4dbe8552816bca 100644
--- a/test/CodeGen/Hexagon/macint.ll
+++ b/test/CodeGen/Hexagon/macint.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4  < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate integer multiply accumulate.
 
 ; CHECK: r{{[0-9]+}} {{\+|\-}}= mpyi(r{{[0-9]+}},
 
-define i32 @main(i32* %a, i32* %b) nounwind {
-  entry:
-  %0 = load i32, i32* %a, align 4
-  %div = udiv i32 %0, 10000
-  %rem = urem i32 %div, 10
-  store i32 %rem, i32* %b, align 4
+define i32 @f0(i32* %a0, i32* %a1) #0 {
+b0:
+  %v0 = load i32, i32* %a0, align 4
+  %v1 = udiv i32 %v0, 10000
+  %v2 = urem i32 %v1, 10
+  store i32 %v2, i32* %a1, align 4
   ret i32 0
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/misaligned-access.ll b/test/CodeGen/Hexagon/misaligned-access.ll
index f4b0cb9cb1e38be83db93cbd3aa9505f92f7b477..7eb85ffcc2296344dc5376d76ff694f8cb6f99bb 100644
--- a/test/CodeGen/Hexagon/misaligned-access.ll
+++ b/test/CodeGen/Hexagon/misaligned-access.ll
@@ -1,16 +1,19 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s
+; RUN: llc -march=hexagon < %s
 ; Check that the mis-aligned load doesn't cause compiler to assert.
 
-declare i32 @_hi(i64) #1
-@temp1 = common global i32 0, align 4
+@g0 = common global i32 0, align 4
 
-define i32 @CSDRSEARCH_executeSearchManager() #0 {
-entry:
-  %temp = alloca i32, align 4
-  %0 = load i32, i32* @temp1, align 4
-  store i32 %0, i32* %temp, align 4
-  %1 = bitcast i32* %temp to i64*
-  %2 = load i64, i64* %1, align 8
-  %call = call i32 @_hi(i64 %2)
-  ret i32 %call
+declare i32 @f0(i64) #0
+
+define i32 @f1() #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = load i32, i32* @g0, align 4
+  store i32 %v1, i32* %v0, align 4
+  %v2 = bitcast i32* %v0 to i64*
+  %v3 = load i64, i64* %v2, align 8
+  %v4 = call i32 @f0(i64 %v3)
+  ret i32 %v4
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/mpy.ll b/test/CodeGen/Hexagon/mpy.ll
index 3ecf7d46ccb07a7fb82c51eb197d59ce34aa38b5..7c1e8c8d3f07f1e4429fdc0473998d47fb311be7 100644
--- a/test/CodeGen/Hexagon/mpy.ll
+++ b/test/CodeGen/Hexagon/mpy.ll
@@ -1,19 +1,21 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK: += mpyi
 
-define void @foo(i32 %acc, i32 %num, i32 %num2) nounwind {
-entry:
-  %acc.addr = alloca i32, align 4
-  %num.addr = alloca i32, align 4
-  %num2.addr = alloca i32, align 4
-  store i32 %acc, i32* %acc.addr, align 4
-  store i32 %num, i32* %num.addr, align 4
-  store i32 %num2, i32* %num2.addr, align 4
-  %0 = load i32, i32* %num.addr, align 4
-  %1 = load i32, i32* %acc.addr, align 4
-  %mul = mul nsw i32 %0, %1
-  %2 = load i32, i32* %num2.addr, align 4
-  %add = add nsw i32 %mul, %2
-  store i32 %add, i32* %num.addr, align 4
+define void @f0(i32 %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = alloca i32, align 4
+  %v2 = alloca i32, align 4
+  store i32 %a0, i32* %v0, align 4
+  store i32 %a1, i32* %v1, align 4
+  store i32 %a2, i32* %v2, align 4
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = load i32, i32* %v0, align 4
+  %v5 = mul nsw i32 %v3, %v4
+  %v6 = load i32, i32* %v2, align 4
+  %v7 = add nsw i32 %v5, %v6
+  store i32 %v7, i32* %v1, align 4
   ret void
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/newvaluejump.ll b/test/CodeGen/Hexagon/newvaluejump.ll
index e1437f369c88e1394d7beae9f576d933ac4daeb7..0697d297d7167c6be159189cdf27dacd536e9945 100644
--- a/test/CodeGen/Hexagon/newvaluejump.ll
+++ b/test/CodeGen/Hexagon/newvaluejump.ll
@@ -1,33 +1,36 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we generate new value jump.
 
-@i = global i32 0, align 4
-@j = global i32 10, align 4
-
-define i32 @foo(i32 %a) nounwind {
-entry:
 ; CHECK: if (cmp.eq(r{{[0-9]+}}.new,#0)) jump{{.}}
-  %addr1 = alloca i32, align 4
-  %addr2 = alloca i32, align 4
-  %0 = load i32, i32* @i, align 4
-  store i32 %0, i32* %addr1, align 4
-  call void @bar(i32 1, i32 2)
-  %1 = load i32, i32* @j, align 4
-  %tobool = icmp ne i32 %1, 0
-  br i1 %tobool, label %if.then, label %if.else
-
-if.then:
-  call void @baz(i32 1, i32 2)
-  br label %if.end
-
-if.else:
-  call void @guy(i32 10, i32 20)
-  br label %if.end
-
-if.end:
+
+@g0 = global i32 0, align 4
+@g1 = global i32 10, align 4
+
+define i32 @f0(i32 %a0) #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = alloca i32, align 4
+  %v2 = load i32, i32* @g0, align 4
+  store i32 %v2, i32* %v0, align 4
+  call void @f2(i32 1, i32 2)
+  %v3 = load i32, i32* @g1, align 4
+  %v4 = icmp ne i32 %v3, 0
+  br i1 %v4, label %b1, label %b2
+
+b1:                                               ; preds = %b0
+  call void @f3(i32 1, i32 2)
+  br label %b3
+
+b2:                                               ; preds = %b0
+  call void @f1(i32 10, i32 20)
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
   ret i32 0
 }
 
-declare void @guy(i32, i32)
-declare void @bar(i32, i32)
-declare void @baz(i32, i32)
+declare void @f1(i32, i32) #0
+declare void @f2(i32, i32) #0
+declare void @f3(i32, i32) #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/packetize_cond_inst.ll b/test/CodeGen/Hexagon/packetize_cond_inst.ll
index 1fc6e82959e39035b86da8da7bf2e03d8f463aa4..8dca8f281147d03b35096a8895042376e80b99ba 100644
--- a/test/CodeGen/Hexagon/packetize_cond_inst.ll
+++ b/test/CodeGen/Hexagon/packetize_cond_inst.ll
@@ -1,10 +1,8 @@
-; RUN: llc -mcpu=hexagonv4 -tail-dup-size=1 < %s | FileCheck %s
+; RUN: llc -march=hexagon -tail-dup-size=1 < %s | FileCheck %s
 
-target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon-unknown--elf"
 
 ; Make sure we put the two conditionally executed adds in a packet.
-; ifcnv_add:
 ;     {
 ;       p0 = cmp.gt(r2, r1)
 ;       if (!p0.new) r0 = add(r2, r1)
@@ -13,20 +11,23 @@ target triple = "hexagon-unknown--elf"
 ; CHECK: cmp
 ; CHECK-NEXT: add
 ; CHECK-NEXT: add
-define i32 @ifcnv_add(i32, i32, i32) nounwind readnone {
-  %4 = icmp sgt i32 %2, %1
-  br i1 %4, label %5, label %7
+define i32 @f0(i32 %a0, i32 %a1, i32 %a2) #0 {
+b0:
+  %v0 = icmp sgt i32 %a2, %a1
+  br i1 %v0, label %b1, label %b2
 
-; <label>:5                                       ; preds = %3
-  %6 = add nsw i32 %0, 10
-  br label %9
+b1:                                               ; preds = %b0
+  %v1 = add nsw i32 %a0, 10
+  br label %b3
 
-; <label>:7                                       ; preds = %3
-  %8 = add nsw i32 %2, %1
-  br label %9
+b2:                                               ; preds = %b0
+  %v2 = add nsw i32 %a2, %a1
+  br label %b3
 
-; <label>:9                                       ; preds = %7, %5
-  %10 = phi i32 [ %6, %5 ], [ %8, %7 ]
-  %11 = add nsw i32 %10, 1
-  ret i32 %11
+b3:                                               ; preds = %b2, %b1
+  %v3 = phi i32 [ %v1, %b1 ], [ %v2, %b2 ]
+  %v4 = add nsw i32 %v3, 1
+  ret i32 %v4
 }
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/pic-sdata.ll b/test/CodeGen/Hexagon/pic-sdata.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3e4dc2dc93e99f645b2283a636bdb3b0b8aa6a6f
--- /dev/null
+++ b/test/CodeGen/Hexagon/pic-sdata.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=8 -relocation-model=static < %s | FileCheck --check-prefixes=CHECK,STATIC %s
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=8 -relocation-model=pic < %s | FileCheck --check-prefixes=CHECK,PIC %s
+
+; If a global has a specified section, it should probably be placed in that
+; section, but with PIC any accesses to globals in small data should still
+; go through GOT.
+
+@g0 = global i32 zeroinitializer
+@g1 = global i32 zeroinitializer, section ".sdata"
+
+; CHECK-LABEL: f0:
+; STATIC: memw(gp+#g0)
+; PIC: r[[R0:[0-9]+]] = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+; PIC: = memw(r[[R0]]+##g0@GOT)
+define i32 @f0() #0 {
+  %v0 = load i32, i32* @g0
+  ret i32 %v0
+}
+
+; CHECK-LABEL: f1:
+; STATIC: memw(gp+#g1)
+; PIC: r[[R1:[0-9]+]] = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+; PIC: = memw(r[[R1]]+##g1@GOT)
+define i32 @f1() #0 {
+  %v0 = load i32, i32* @g1
+  ret i32 %v0
+}
+
+; CHECK-LABEL: f2:
+; STATIC: CONST64(#123456789012345678)
+; PIC: r0 = ##-1506741426
+; PIC: r1 = ##28744523
+define i64 @f2() #0 {
+  ret i64 123456789012345678
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" }
diff --git a/test/CodeGen/Hexagon/postinc-load.ll b/test/CodeGen/Hexagon/postinc-load.ll
index 8d8c93d76bf95b36e1251753fe85627e26a2cead..825e16976a53d8a67af444d6a664e9777deecca6 100644
--- a/test/CodeGen/Hexagon/postinc-load.ll
+++ b/test/CodeGen/Hexagon/postinc-load.ll
@@ -1,29 +1,30 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; Check that post-increment load instructions are being generated.
 ; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}++#4)
 
-define i32 @sum(i32* nocapture %a, i16* nocapture %b, i32 %n) nounwind {
-entry:
-  br label %for.body
+define i32 @f0(i32* nocapture %a0, i16* nocapture %a1, i32 %a2) #0 {
+b0:
+  br label %b1
 
-for.body:
-  %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 10, %entry ]
-  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
-  %arrayidx1.phi = phi i16* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
-  %sum.03 = phi i32 [ 0, %entry ], [ %add2, %for.body ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %1 = load i16, i16* %arrayidx1.phi, align 2
-  %conv = sext i16 %1 to i32
-  %add = add i32 %0, %sum.03
-  %add2 = add i32 %add, %conv
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  %arrayidx1.inc = getelementptr i16, i16* %arrayidx1.phi, i32 1
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %exitcond = icmp eq i32 %lsr.iv.next, 0
-  br i1 %exitcond, label %for.end, label %for.body
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ %v11, %b1 ], [ 10, %b0 ]
+  %v1 = phi i32* [ %a0, %b0 ], [ %v9, %b1 ]
+  %v2 = phi i16* [ %a1, %b0 ], [ %v10, %b1 ]
+  %v3 = phi i32 [ 0, %b0 ], [ %v8, %b1 ]
+  %v4 = load i32, i32* %v1, align 4
+  %v5 = load i16, i16* %v2, align 2
+  %v6 = sext i16 %v5 to i32
+  %v7 = add i32 %v4, %v3
+  %v8 = add i32 %v7, %v6
+  %v9 = getelementptr i32, i32* %v1, i32 1
+  %v10 = getelementptr i16, i16* %v2, i32 1
+  %v11 = add i32 %v0, -1
+  %v12 = icmp eq i32 %v11, 0
+  br i1 %v12, label %b2, label %b1
 
-for.end:
-  ret i32 %add2
+b2:                                               ; preds = %b1
+  ret i32 %v8
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/postinc-store.ll b/test/CodeGen/Hexagon/postinc-store.ll
index 276a7d8e0ff184d337ea320a97a6dea254319e5b..2dabc7991e39ba5edf454d200a44723d036cf6dd 100644
--- a/test/CodeGen/Hexagon/postinc-store.ll
+++ b/test/CodeGen/Hexagon/postinc-store.ll
@@ -1,29 +1,30 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; Check that post-increment store instructions are being generated.
 ; CHECK: memw(r{{[0-9]+}}++#4) = r{{[0-9]+}}
 
-define i32 @sum(i32* nocapture %a, i16* nocapture %b, i32 %n) nounwind {
-entry:
-  br label %for.body
+define i32 @f0(i32* nocapture %a0, i16* nocapture %a1, i32 %a2) #0 {
+b0:
+  br label %b1
 
-for.body:                                         ; preds = %for.body, %entry
-  %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ 10, %entry ]
-  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
-  %arrayidx1.phi = phi i16* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
-  %0 = load i32, i32* %arrayidx.phi, align 4
-  %1 = load i16, i16* %arrayidx1.phi, align 2
-  %conv = sext i16 %1 to i32
-  %factor = mul i32 %0, 2
-  %add3 = add i32 %factor, %conv
-  store i32 %add3, i32* %arrayidx.phi, align 4
+b1:                                               ; preds = %b1, %b0
+  %v0 = phi i32 [ %v10, %b1 ], [ 10, %b0 ]
+  %v1 = phi i32* [ %a0, %b0 ], [ %v8, %b1 ]
+  %v2 = phi i16* [ %a1, %b0 ], [ %v9, %b1 ]
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = load i16, i16* %v2, align 2
+  %v5 = sext i16 %v4 to i32
+  %v6 = mul i32 %v3, 2
+  %v7 = add i32 %v6, %v5
+  store i32 %v7, i32* %v1, align 4
+  %v8 = getelementptr i32, i32* %v1, i32 1
+  %v9 = getelementptr i16, i16* %v2, i32 1
+  %v10 = add i32 %v0, -1
+  %v11 = icmp eq i32 %v10, 0
+  br i1 %v11, label %b2, label %b1
 
-  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
-  %arrayidx1.inc = getelementptr i16, i16* %arrayidx1.phi, i32 1
-  %lsr.iv.next = add i32 %lsr.iv, -1
-  %exitcond = icmp eq i32 %lsr.iv.next, 0
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
+b2:                                               ; preds = %b1
   ret i32 0
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/pred-gp.ll b/test/CodeGen/Hexagon/pred-gp.ll
index 76a621699b627719c82a13690c645bd6d6243f8e..4d50abf628387689402082c58f2c8eb64ed4a725 100644
--- a/test/CodeGen/Hexagon/pred-gp.ll
+++ b/test/CodeGen/Hexagon/pred-gp.ll
@@ -1,28 +1,30 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we are able to predicate instructions with gp-relative
 ; addressing mode.
 
-@d = external global i32
-@c = common global i32 0, align 4
+; CHECK: if ({{!?}}p{{[0-3]+}}{{(.new)?}}) r{{[0-9]+}} = memw(##g{{[01]}})
+; CHECK: if ({{!?}}p{{[0-3]+}}) r{{[0-9]+}} = memw(##g{{[01]}})
 
-; Function Attrs: nounwind
-define i32 @test2(i8 zeroext %a, i8 zeroext %b) #0 {
-; CHECK: if ({{!?}}p{{[0-3]+}}{{(.new)?}}) r{{[0-9]+}} = memw(##{{[cd]}})
-; CHECK: if ({{!?}}p{{[0-3]+}}) r{{[0-9]+}} = memw(##{{[cd]}})
-entry:
-  %cmp = icmp eq i8 %a, %b
-  br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
+@g0 = external global i32
+@g1 = common global i32 0, align 4
 
-entry.if.end_crit_edge:
-  %.pre = load i32, i32* @c, align 4
-  br label %if.end
+define i32 @f0(i8 zeroext %a0, i8 zeroext %a1) #0 {
+b0:
+  %v0 = icmp eq i8 %a0, %a1
+  br i1 %v0, label %b2, label %b1
 
-if.then:
-  %0 = load i32, i32* @d, align 4
-  store i32 %0, i32* @c, align 4
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = load i32, i32* @g1, align 4
+  br label %b3
 
-if.end:
-  %1 = phi i32 [ %.pre, %entry.if.end_crit_edge ], [ %0, %if.then ]
-  ret i32 %1
+b2:                                               ; preds = %b0
+  %v2 = load i32, i32* @g0, align 4
+  store i32 %v2, i32* @g1, align 4
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
+  %v3 = phi i32 [ %v1, %b1 ], [ %v2, %b2 ]
+  ret i32 %v3
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/pred-instrs.ll b/test/CodeGen/Hexagon/pred-instrs.ll
index da8ace98a0b2d69171ab251ee4ef94dbd122a396..27986f872d97858a1ba36ec55b7fb873b2ecc6a1 100644
--- a/test/CodeGen/Hexagon/pred-instrs.ll
+++ b/test/CodeGen/Hexagon/pred-instrs.ll
@@ -1,30 +1,32 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; Check that we are able to predicate instructions.
 
 ; CHECK: if ({{!?}}p{{[0-3]}}{{(.new)?}}) r{{[0-9]+}} = {{and|aslh}}
 ; CHECK: if ({{!?}}p{{[0-3]}}{{(.new)?}}) r{{[0-9]+}} = {{and|aslh}}
-@a = external global i32
-@d = external global i32
 
-; Function Attrs: nounwind
-define i32 @test1(i8 zeroext %la, i8 zeroext %lb) {
-entry:
-  %cmp = icmp eq i8 %la, %lb
-  br i1 %cmp, label %if.then, label %if.else
+@g0 = external global i32
+@g1 = external global i32
 
-if.then:                                          ; preds = %entry
-  %conv1 = zext i8 %la to i32
-  %shl = shl nuw nsw i32 %conv1, 16
-  br label %if.end
+define i32 @f0(i8 zeroext %a0, i8 zeroext %a1) #0 {
+b0:
+  %v0 = icmp eq i8 %a0, %a1
+  br i1 %v0, label %b1, label %b2
 
-if.else:                                          ; preds = %entry
-  %and8 = and i8 %lb, %la
-  %and = zext i8 %and8 to i32
-  br label %if.end
+b1:                                               ; preds = %b0
+  %v1 = zext i8 %a0 to i32
+  %v2 = shl nuw nsw i32 %v1, 16
+  br label %b3
 
-if.end:                                           ; preds = %if.else, %if.then
-  %storemerge = phi i32 [ %and, %if.else ], [ %shl, %if.then ]
-  store i32 %storemerge, i32* @a, align 4
-  %0 = load i32, i32* @d, align 4
-  ret i32 %0
+b2:                                               ; preds = %b0
+  %v3 = and i8 %a1, %a0
+  %v4 = zext i8 %v3 to i32
+  br label %b3
+
+b3:                                               ; preds = %b2, %b1
+  %v5 = phi i32 [ %v4, %b2 ], [ %v2, %b1 ]
+  store i32 %v5, i32* @g0, align 4
+  %v6 = load i32, i32* @g1, align 4
+  ret i32 %v6
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/predicate-copy.ll b/test/CodeGen/Hexagon/predicate-copy.ll
index 552b68794195992316e0cb74a3225ebb6960e542..1b58ec9e790871bc955a3218559d48f6f02e278f 100644
--- a/test/CodeGen/Hexagon/predicate-copy.ll
+++ b/test/CodeGen/Hexagon/predicate-copy.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -O3 < %s | FileCheck %s
+; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
 
 ; CHECK: r{{[0-9]+}} = p{{[0-9]+}}
-define i1 @foo() {
-entry:
+
+define i1 @f0() #0 {
+b0:
   ret i1 false
 }
 
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll
index 3b85c486348d81ed2db66a73475677dc958399e4..dee384520e50c4951a50a4e2d84dca3f76a3884b 100644
--- a/test/CodeGen/Hexagon/remove_lsr.ll
+++ b/test/CodeGen/Hexagon/remove_lsr.ll
@@ -1,6 +1,6 @@
 ; Test fix for PR-13709.
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: foo
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: f0
 ; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
 ; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
 
@@ -13,64 +13,64 @@
 ; This makes the lsr instruction dead and it gets removed subsequently
 ; by a dead code removal pass.
 
-%union.vect64 = type { i64 }
-%union.vect32 = type { i32 }
 
-define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr,
- %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd,
- i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr)
- nounwind {
-entry:
-  %scevgep = getelementptr %union.vect64, %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1
-  %scevgep28 = getelementptr %union.vect32, %union.vect32* %s_odd, i32 1
-  %scevgep32 = getelementptr %union.vect32, %union.vect32* %s_even, i32 1
-  %scevgep36 = getelementptr i8, i8* %scr_s_odd_code_ptr, i32 1
-  %scevgep39 = getelementptr i8, i8* %scr_s_even_code_ptr, i32 1
-  br label %for.body
+%s.0 = type { i64 }
+%s.1 = type { i32 }
 
-for.body:                                         ; preds = %for.body, %entry
-  %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ]
-  %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ]
-  %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ]
-  %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ]
-  %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ]
-  %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ]
-  %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ]
-  %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ]
-  %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ]
-  %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32*
-  %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32*
-  %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64*
-  %0 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021)
-  %conv3 = sext i8 %predicate.022 to i32
-  %1 = trunc i64 %val.021 to i32
-  %2 = trunc i64 %0 to i32
-  %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2)
-  store i32 %3, i32* %lsr.iv3335, align 4
-  %conv8 = sext i8 %predicate_1.023 to i32
-  %4 = lshr i64 %val.021, 32
-  %5 = trunc i64 %4 to i32
-  %6 = lshr i64 %0, 32
-  %7 = trunc i64 %6 to i32
-  %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7)
-  store i32 %8, i32* %lsr.iv2931, align 4
-  %srcval = load i64, i64* %lsr.iv27, align 8
-  %9 = load i8, i8* %lsr.iv40, align 1
-  %10 = load i8, i8* %lsr.iv37, align 1
-  %lftr.wideiv = trunc i32 %lsr.iv42 to i8
-  %exitcond = icmp eq i8 %lftr.wideiv, 32
-  %scevgep26 = getelementptr %union.vect64, %union.vect64* %lsr.iv, i32 1
-  %scevgep30 = getelementptr %union.vect32, %union.vect32* %lsr.iv29, i32 1
-  %scevgep34 = getelementptr %union.vect32, %union.vect32* %lsr.iv33, i32 1
-  %scevgep38 = getelementptr i8, i8* %lsr.iv37, i32 1
-  %scevgep41 = getelementptr i8, i8* %lsr.iv40, i32 1
-  %lsr.iv.next = add i32 %lsr.iv42, 1
-  br i1 %exitcond, label %for.end, label %for.body
+define void @f0(%s.0* nocapture %a0, %s.1* nocapture %a1, %s.1* nocapture %a2, i8* nocapture %a3, i8* nocapture %a4) #0 {
+b0:
+  %v0 = getelementptr %s.0, %s.0* %a0, i32 1
+  %v1 = getelementptr %s.1, %s.1* %a2, i32 1
+  %v2 = getelementptr %s.1, %s.1* %a1, i32 1
+  %v3 = getelementptr i8, i8* %a4, i32 1
+  %v4 = getelementptr i8, i8* %a3, i32 1
+  br label %b1
 
-for.end:                                          ; preds = %for.body
+b1:                                               ; preds = %b1, %b0
+  %v5 = phi i32 [ %v38, %b1 ], [ 2, %b0 ]
+  %v6 = phi i8* [ %v37, %b1 ], [ %v4, %b0 ]
+  %v7 = phi i8* [ %v36, %b1 ], [ %v3, %b0 ]
+  %v8 = phi %s.1* [ %v35, %b1 ], [ %v2, %b0 ]
+  %v9 = phi %s.1* [ %v34, %b1 ], [ %v1, %b0 ]
+  %v10 = phi %s.0* [ %v33, %b1 ], [ %v0, %b0 ]
+  %v11 = phi i8 [ undef, %b0 ], [ %v30, %b1 ]
+  %v12 = phi i8 [ undef, %b0 ], [ %v29, %b1 ]
+  %v13 = phi i64 [ undef, %b0 ], [ %v28, %b1 ]
+  %v14 = bitcast %s.1* %v8 to i32*
+  %v15 = bitcast %s.1* %v9 to i32*
+  %v16 = bitcast %s.0* %v10 to i64*
+  %v17 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %v13)
+  %v18 = sext i8 %v12 to i32
+  %v19 = trunc i64 %v13 to i32
+  %v20 = trunc i64 %v17 to i32
+  %v21 = tail call i32 @llvm.hexagon.C2.mux(i32 %v18, i32 %v19, i32 %v20)
+  store i32 %v21, i32* %v14, align 4
+  %v22 = sext i8 %v11 to i32
+  %v23 = lshr i64 %v13, 32
+  %v24 = trunc i64 %v23 to i32
+  %v25 = lshr i64 %v17, 32
+  %v26 = trunc i64 %v25 to i32
+  %v27 = tail call i32 @llvm.hexagon.C2.mux(i32 %v22, i32 %v24, i32 %v26)
+  store i32 %v27, i32* %v15, align 4
+  %v28 = load i64, i64* %v16, align 8
+  %v29 = load i8, i8* %v6, align 1
+  %v30 = load i8, i8* %v7, align 1
+  %v31 = trunc i32 %v5 to i8
+  %v32 = icmp eq i8 %v31, 32
+  %v33 = getelementptr %s.0, %s.0* %v10, i32 1
+  %v34 = getelementptr %s.1, %s.1* %v9, i32 1
+  %v35 = getelementptr %s.1, %s.1* %v8, i32 1
+  %v36 = getelementptr i8, i8* %v7, i32 1
+  %v37 = getelementptr i8, i8* %v6, i32 1
+  %v38 = add i32 %v5, 1
+  br i1 %v32, label %b2, label %b1
+
+b2:                                               ; preds = %b1
   ret void
 }
 
-declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) #1
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) #1
 
-declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind readnone "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/sdata-load-size.ll b/test/CodeGen/Hexagon/sdata-load-size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..325713f7062a8182b6de1b3ef92d57868cd89643
--- /dev/null
+++ b/test/CodeGen/Hexagon/sdata-load-size.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=hexagon -hexagon-small-data-threshold=8 < %s | FileCheck %s
+; CHECK: = memd(gp+#g0)
+; If an object will be placed in .sdata, do not shrink any references to it.
+; In this case, g0 must be loaded via memd.
+
+target triple = "hexagon"
+
+@g0 = common global i64 0, align 8
+
+define i32 @f0() #0 {
+entry:
+  %v0 = load i64, i64* @g0, align 8
+  %v1 = trunc i64 %v0 to i8
+  %v2 = zext i8 %v1 to i32
+  ret i32 %v2
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+small-data" }
+
diff --git a/test/CodeGen/Hexagon/simpletailcall.ll b/test/CodeGen/Hexagon/simpletailcall.ll
index 287640489a5e7a84e47e7723b3fcd1ee2ea1a773..76854bc1981d6d76b26198618b392ac11bc3b79e 100644
--- a/test/CodeGen/Hexagon/simpletailcall.ll
+++ b/test/CodeGen/Hexagon/simpletailcall.ll
@@ -1,14 +1,16 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: foo_empty
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: f0
 ; CHECK-NOT: allocframe
 ; CHECK-NOT: memd(r29
-; CHECK: jump bar_empty
+; CHECK: jump f1
 
-define void @foo_empty(i32 %h) nounwind {
-entry:
-  %add = add nsw i32 %h, 3
-  %call = tail call i32 bitcast (i32 (...)* @bar_empty to i32 (i32)*)(i32 %add) nounwind
+define void @f0(i32 %a0) #0 {
+b0:
+  %v0 = add nsw i32 %a0, 3
+  %v1 = tail call i32 bitcast (i32 (...)* @f1 to i32 (i32)*)(i32 %v0) #0
   ret void
 }
 
-declare i32 @bar_empty(...)
+declare i32 @f1(...) #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/swp-art-deps-rec.ll b/test/CodeGen/Hexagon/swp-art-deps-rec.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5272faf8f9b8ab8c168ddc6641ca3f28926223a4
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-art-deps-rec.ll
@@ -0,0 +1,109 @@
+; REQUIRES: asserts
+
+; RUN: llc -march=hexagon -mcpu=hexagonv65 -O3 -debug-only=pipeliner \
+; RUN: < %s 2>&1 | FileCheck %s
+
+; Test that the artificial dependences are ignored while computing the
+; circuits.
+
+; The recurrence should be 1 here. If we do not ignore artificial deps,
+; it will be greater.
+; CHECK: rec=1,
+
+define void @foo(i32 %size) #0 {
+entry:
+  %add = add nsw i32 0, 4
+  %shr = ashr i32 %size, 1
+  br i1 undef, label %L57.us, label %L57.us.ur
+
+L57.us:
+  %R9.0470.us = phi i32 [ %sub40.us.3, %L57.us ], [ undef, %entry ]
+  %sub40.us.3 = add i32 %R9.0470.us, -64
+  br i1 undef, label %L57.us, label %for.cond22.for.end_crit_edge.us.ur-lcssa
+
+for.cond22.for.end_crit_edge.us.ur-lcssa:
+  %inc.us.3.lcssa = phi i32 [ undef, %L57.us ]
+  %sub40.us.3.lcssa = phi i32 [ %sub40.us.3, %L57.us ]
+  %0 = icmp eq i32 %inc.us.3.lcssa, %shr
+  br i1 %0, label %for.cond22.for.end_crit_edge.us, label %L57.us.ur
+
+L57.us.ur:
+  %R15_14.0478.us.ur = phi i64 [ %1, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R13_12.0477.us.ur = phi i64 [ %14, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R11_10.0476.us.ur = phi i64 [ %8, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R7_6.0475.us.ur = phi i64 [ %7, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R5_4.2474.us.ur = phi i64 [ %16, %L57.us.ur ], [ undef, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R3_2.0473.us.ur = phi i64 [ %9, %L57.us.ur ], [ 0, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R1_0.0472.us.ur = phi i64 [ %15, %L57.us.ur ], [ undef, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %kk.0471.us.ur = phi i32 [ %inc.us.ur, %L57.us.ur ], [ 0, %entry ], [ %inc.us.3.lcssa, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R9.0470.us.ur = phi i32 [ %sub40.us.ur, %L57.us.ur ], [ undef, %entry ], [ %sub40.us.3.lcssa, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %R8.0469.us.ur = phi i32 [ %sub34.us.ur, %L57.us.ur ], [ undef, %entry ], [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ]
+  %1 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %R15_14.0478.us.ur, i64 %R1_0.0472.us.ur, i64 %R3_2.0473.us.ur)
+  %2 = tail call i64 @llvm.hexagon.S2.shuffeh(i64 %R5_4.2474.us.ur, i64 %R7_6.0475.us.ur)
+  %3 = inttoptr i32 %R9.0470.us.ur to i16*
+  %4 = load i16, i16* %3, align 2
+  %conv27.us.ur = sext i16 %4 to i32
+  %sub28.us.ur = add i32 %R9.0470.us.ur, -8
+  %5 = inttoptr i32 %R8.0469.us.ur to i16*
+  %6 = load i16, i16* %5, align 2
+  %conv30.us.ur = sext i16 %6 to i32
+  %sub31.us.ur = add i32 %R8.0469.us.ur, -8
+  %7 = tail call i64 @llvm.hexagon.A2.combinew(i32 %conv27.us.ur, i32 %conv30.us.ur)
+  %8 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %R11_10.0476.us.ur, i64 %R1_0.0472.us.ur, i64 %2)
+  %9 = tail call i64 @llvm.hexagon.S2.shuffeh(i64 %7, i64 %R5_4.2474.us.ur)
+  %10 = inttoptr i32 %sub31.us.ur to i16*
+  %11 = load i16, i16* %10, align 2
+  %conv33.us.ur = sext i16 %11 to i32
+  %sub34.us.ur = add i32 %R8.0469.us.ur, -16
+  %conv35.us.ur = trunc i64 %9 to i32
+  %12 = inttoptr i32 %sub28.us.ur to i16*
+  %13 = load i16, i16* %12, align 2
+  %conv39.us.ur = sext i16 %13 to i32
+  %sub40.us.ur = add i32 %R9.0470.us.ur, -16
+  %14 = tail call i64 @llvm.hexagon.M2.vdmacs.s0(i64 %R13_12.0477.us.ur, i64 %R1_0.0472.us.ur, i64 %9)
+  %15 = tail call i64 @llvm.hexagon.A2.combinew(i32 %conv35.us.ur, i32 undef)
+  %16 = tail call i64 @llvm.hexagon.A2.combinew(i32 %conv39.us.ur, i32 %conv33.us.ur)
+  %inc.us.ur = add nsw i32 %kk.0471.us.ur, 1
+  %exitcond535.ur = icmp eq i32 %inc.us.ur, %shr
+  br i1 %exitcond535.ur, label %for.cond22.for.end_crit_edge.us.ur-lcssa572, label %L57.us.ur
+
+for.cond22.for.end_crit_edge.us.ur-lcssa572:
+  %.lcssa730 = phi i64 [ %14, %L57.us.ur ]
+  %.lcssa729 = phi i64 [ %8, %L57.us.ur ]
+  %.lcssa728 = phi i64 [ %1, %L57.us.ur ]
+  %extract.t652 = trunc i64 %.lcssa730 to i32
+  %extract661 = lshr i64 %.lcssa729, 32
+  %extract.t662 = trunc i64 %extract661 to i32
+  %extract.t664 = trunc i64 %.lcssa728 to i32
+  br label %for.cond22.for.end_crit_edge.us
+
+for.cond22.for.end_crit_edge.us:
+  %.lcssa551.off0 = phi i32 [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ], [ %extract.t652, %for.cond22.for.end_crit_edge.us.ur-lcssa572 ]
+  %.lcssa550.off32 = phi i32 [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ], [ %extract.t662, %for.cond22.for.end_crit_edge.us.ur-lcssa572 ]
+  %.lcssa549.off0 = phi i32 [ undef, %for.cond22.for.end_crit_edge.us.ur-lcssa ], [ %extract.t664, %for.cond22.for.end_crit_edge.us.ur-lcssa572 ]
+  %17 = inttoptr i32 %add to i32*
+  store i32 %.lcssa549.off0, i32* %17, align 4
+  %add.ptr61.us = getelementptr inbounds i8, i8* null, i32 32
+  %18 = bitcast i8* %add.ptr61.us to i32*
+  store i32 %.lcssa551.off0, i32* %18, align 4
+  %19 = bitcast i8* undef to i32*
+  store i32 %.lcssa550.off32, i32* %19, align 4
+  call void @llvm.trap()
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.M2.vdmacs.s0(i64, i64, i64) #1
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.hexagon.S2.shuffeh(i64, i64) #1
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.trap() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noreturn nounwind }
diff --git a/test/CodeGen/Hexagon/swp-copytophi-dag.ll b/test/CodeGen/Hexagon/swp-copytophi-dag.ll
new file mode 100644
index 0000000000000000000000000000000000000000..69743407c148ca832a37a742e372a926833c6973
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-copytophi-dag.ll
@@ -0,0 +1,74 @@
+; REQUIRES: asserts
+;
+; RUN: llc -march=hexagon -enable-pipeliner=true -debug-only=pipeliner < %s \
+; RUN: 2>&1 | FileCheck %s
+
+; Test that the artificial dependence is created as a result of
+; CopyToPhi DAG mutation.
+; CHECK: Ord  Latency=0 Artificial
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @foo(i64* nocapture readonly %r64, i16 zeroext %n, i16 zeroext %s, i64* nocapture %p64) #0 {
+entry:
+  %conv = zext i16 %n to i32
+  %cmp = icmp eq i16 %n, 0
+  br i1 %cmp, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = load i64, i64* %r64, align 8
+  %v.sroa.0.0.extract.trunc = trunc i64 %tmp to i16
+  %v.sroa.4.0.extract.shift = lshr i64 %tmp, 16
+  %v.sroa.4.0.extract.trunc = trunc i64 %v.sroa.4.0.extract.shift to i16
+  %v.sroa.5.0.extract.shift = lshr i64 %tmp, 32
+  %v.sroa.5.0.extract.trunc = trunc i64 %v.sroa.5.0.extract.shift to i16
+  %v.sroa.6.0.extract.shift = lshr i64 %tmp, 48
+  %v.sroa.6.0.extract.trunc = trunc i64 %v.sroa.6.0.extract.shift to i16
+  %tmp1 = bitcast i64* %p64 to i16*
+  %conv2 = zext i16 %s to i32
+  %add.ptr = getelementptr inbounds i16, i16* %tmp1, i32 %conv2
+  %add.ptr.sum = add nuw nsw i32 %conv2, 1
+  %add.ptr3 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum
+  %add.ptr.sum50 = add nuw nsw i32 %conv2, 2
+  %add.ptr4 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum50
+  %add.ptr.sum51 = add nuw nsw i32 %conv2, 3
+  %add.ptr5 = getelementptr inbounds i16, i16* %tmp1, i32 %add.ptr.sum51
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %add.ptr11.phi = phi i16* [ %add.ptr11.inc, %for.body ], [ %add.ptr, %for.body.preheader ]
+  %add.ptr16.phi = phi i16* [ %add.ptr16.inc, %for.body ], [ %add.ptr3, %for.body.preheader ]
+  %add.ptr21.phi = phi i16* [ %add.ptr21.inc, %for.body ], [ %add.ptr4, %for.body.preheader ]
+  %add.ptr26.phi = phi i16* [ %add.ptr26.inc, %for.body ], [ %add.ptr5, %for.body.preheader ]
+  %i.058.pmt = phi i32 [ %inc.pmt, %for.body ], [ 0, %for.body.preheader ]
+  %v.sroa.0.157 = phi i16 [ %v.sroa.0.0.extract.trunc34, %for.body ], [ %v.sroa.0.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.4.156 = phi i16 [ %v.sroa.4.0.extract.trunc36, %for.body ], [ %v.sroa.4.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.5.155 = phi i16 [ %v.sroa.5.0.extract.trunc38, %for.body ], [ %v.sroa.5.0.extract.trunc, %for.body.preheader ]
+  %v.sroa.6.154 = phi i16 [ %v.sroa.6.0.extract.trunc40, %for.body ], [ %v.sroa.6.0.extract.trunc, %for.body.preheader ]
+  %q64.153.pn = phi i64* [ %q64.153, %for.body ], [ %r64, %for.body.preheader ]
+  %q64.153 = getelementptr inbounds i64, i64* %q64.153.pn, i32 1
+  store i16 %v.sroa.0.157, i16* %add.ptr11.phi, align 2
+  store i16 %v.sroa.4.156, i16* %add.ptr16.phi, align 2
+  store i16 %v.sroa.5.155, i16* %add.ptr21.phi, align 2
+  store i16 %v.sroa.6.154, i16* %add.ptr26.phi, align 2
+  %tmp2 = load i64, i64* %q64.153, align 8
+  %v.sroa.0.0.extract.trunc34 = trunc i64 %tmp2 to i16
+  %v.sroa.4.0.extract.shift35 = lshr i64 %tmp2, 16
+  %v.sroa.4.0.extract.trunc36 = trunc i64 %v.sroa.4.0.extract.shift35 to i16
+  %v.sroa.5.0.extract.shift37 = lshr i64 %tmp2, 32
+  %v.sroa.5.0.extract.trunc38 = trunc i64 %v.sroa.5.0.extract.shift37 to i16
+  %v.sroa.6.0.extract.shift39 = lshr i64 %tmp2, 48
+  %v.sroa.6.0.extract.trunc40 = trunc i64 %v.sroa.6.0.extract.shift39 to i16
+  %inc.pmt = add i32 %i.058.pmt, 1
+  %cmp8 = icmp slt i32 %inc.pmt, %conv
+  %add.ptr11.inc = getelementptr i16, i16* %add.ptr11.phi, i32 4
+  %add.ptr16.inc = getelementptr i16, i16* %add.ptr16.phi, i32 4
+  %add.ptr21.inc = getelementptr i16, i16* %add.ptr21.phi, i32 4
+  %add.ptr26.inc = getelementptr i16, i16* %add.ptr26.phi, i32 4
+  br i1 %cmp8, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv65" }
diff --git a/test/CodeGen/Hexagon/swp-epilog-phis.ll b/test/CodeGen/Hexagon/swp-epilog-phis.ll
deleted file mode 100644
index 1073f1c46b123810c7d5586434999d6138889a6a..0000000000000000000000000000000000000000
--- a/test/CodeGen/Hexagon/swp-epilog-phis.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc -march=hexagon -enable-pipeliner -pipeliner-max-stages=2 \
-; RUN:     -pipeliner-ignore-recmii -disable-hexagon-nv-schedule \
-; RUN:     -hexagon-initial-cfg-cleanup=0 -stats -o /dev/null \
-; RUN:     -enable-aa-sched-mi < %s 2>&1 | FileCheck %s --check-prefix=STATS
-; REQUIRES: asserts
-;
-; Test that we generate the correct phis in the last epilog block when
-; allowing multiple stages.
-;
-; STATS: 1 pipeliner        - Number of loops software pipelined
-
-; Function Attrs: nounwind
-define void @f0() #0 {
-b0:
-  br i1 undef, label %b6, label %b1
-
-b1:                                               ; preds = %b0
-  br i1 undef, label %b6, label %b2
-
-b2:                                               ; preds = %b1
-  br label %b4
-
-b3:                                               ; preds = %b4, %b3
-  %v0 = add nsw i32 0, 57344
-  %v1 = trunc i32 %v0 to i16
-  store i16 %v1, i16* null, align 2, !tbaa !0
-  %v2 = getelementptr inbounds i8, i8* null, i32 undef
-  %v3 = load i8, i8* %v2, align 1, !tbaa !4
-  %v4 = zext i8 %v3 to i32
-  %v5 = shl nuw nsw i32 %v4, 6
-  %v6 = add nsw i32 %v5, 57344
-  %v7 = trunc i32 %v6 to i16
-  store i16 %v7, i16* undef, align 2, !tbaa !0
-  br i1 undef, label %b5, label %b3
-
-b4:                                               ; preds = %b5, %b2
-  %v8 = phi i32 [ 0, %b2 ], [ %v9, %b5 ]
-  br label %b3
-
-b5:                                               ; preds = %b3
-  %v9 = add i32 %v8, 1
-  %v10 = icmp eq i32 %v9, undef
-  br i1 %v10, label %b6, label %b4
-
-b6:                                               ; preds = %b5, %b1, %b0
-  ret void
-}
-
-attributes #0 = { nounwind "target-cpu"="hexagonv55" }
-
-!0 = !{!1, !1, i64 0}
-!1 = !{!"short", !2}
-!2 = !{!"omnipotent char", !3}
-!3 = !{!"Simple C/C++ TBAA"}
-!4 = !{!2, !2, i64 0}
diff --git a/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll b/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll
deleted file mode 100644
index bb45eeac1409d54d3315f24bd29aa37a1e856499..0000000000000000000000000000000000000000
--- a/test/CodeGen/Hexagon/swp-memrefs-epilog1.ll
+++ /dev/null
@@ -1,90 +0,0 @@
-; RUN: llc -march=hexagon -enable-pipeliner < %s | FileCheck %s
-
-; Test that a store and load, that alias, are not put in the same packet. The
-; pipeliner altered the size of the memrefs for these instructions, which
-; resulted in no order dependence between the instructions in the DAG. No order
-; dependence was added since the size was set to UINT_MAX, but there is a
-; computation using the size that overflowed.
-
-; CHECK: endloop0
-; CHECK: memh([[REG:r([0-9]+)]]+#0) =
-; CHECK: = memh([[REG]]++#2)
-
-; Function Attrs: nounwind
-define signext i16 @f0(i16* nocapture readonly %a0, i16* nocapture readonly %a1) local_unnamed_addr #0 {
-b0:
-  %v0 = alloca [40 x i16], align 8
-  %v1 = bitcast [40 x i16]* %v0 to i8*
-  call void @llvm.lifetime.start.p0i8(i64 80, i8* nonnull %v1) #2
-  %v2 = getelementptr inbounds [40 x i16], [40 x i16]* %v0, i32 0, i32 0
-  br label %b1
-
-b1:                                               ; preds = %b1, %b0
-  %v3 = phi i16* [ %a1, %b0 ], [ %v24, %b1 ]
-  %v4 = phi i16* [ %v2, %b0 ], [ %v25, %b1 ]
-  %v5 = phi i32 [ 0, %b0 ], [ %v14, %b1 ]
-  %v6 = phi i32 [ 1, %b0 ], [ %v22, %b1 ]
-  %v7 = phi i32 [ 0, %b0 ], [ %v23, %b1 ]
-  %v8 = load i16, i16* %v3, align 2
-  %v9 = sext i16 %v8 to i32
-  %v10 = tail call i32 @llvm.hexagon.A2.aslh(i32 %v9)
-  %v11 = tail call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v10, i32 1)
-  %v12 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v11)
-  %v13 = trunc i32 %v12 to i16
-  store i16 %v13, i16* %v4, align 2
-  %v14 = add nuw nsw i32 %v5, 1
-  %v15 = icmp eq i32 %v14, 40
-  %v16 = getelementptr inbounds i16, i16* %a0, i32 %v7
-  %v17 = load i16, i16* %v16, align 2
-  %v18 = sext i16 %v17 to i32
-  %v19 = getelementptr inbounds [40 x i16], [40 x i16]* %v0, i32 0, i32 %v7
-  %v20 = load i16, i16* %v19, align 2
-  %v21 = sext i16 %v20 to i32
-  %v22 = tail call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32 %v6, i32 %v18, i32 %v21)
-  %v23 = add nuw nsw i32 %v7, 1
-  %v24 = getelementptr i16, i16* %v3, i32 1
-  %v25 = getelementptr i16, i16* %v4, i32 1
-  br i1 %v15, label %b2, label %b1
-
-b2:                                               ; preds = %b1
-  %v26 = tail call signext i16 @f1(i32 %v22) #0
-  %v27 = sext i16 %v26 to i32
-  %v28 = tail call i32 @llvm.hexagon.S2.asl.r.r.sat(i32 %v22, i32 %v27)
-  %v29 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v28)
-  %v30 = shl i32 %v29, 16
-  %v31 = ashr exact i32 %v30, 16
-  %v32 = icmp slt i32 %v30, 65536
-  br label %b3
-
-b3:                                               ; preds = %b2
-  call void @llvm.lifetime.end.p0i8(i64 80, i8* nonnull %v1) #2
-  ret i16 0
-}
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.A2.aslh(i32) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.A2.asrh(i32) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32, i32, i32) #2
-
-; Function Attrs: nounwind
-declare signext i16 @f1(i32) local_unnamed_addr #0
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.hexagon.S2.asl.r.r.sat(i32, i32) #2
-
-; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { argmemonly nounwind }
-attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/two-addr-tied-subregs.mir b/test/CodeGen/Hexagon/two-addr-tied-subregs.mir
new file mode 100644
index 0000000000000000000000000000000000000000..87e117c461b64d2332076a4fd6086941cea4e079
--- /dev/null
+++ b/test/CodeGen/Hexagon/two-addr-tied-subregs.mir
@@ -0,0 +1,56 @@
+# RUN: llc -march hexagon -run-pass livevars -run-pass twoaddressinstruction  -verify-machineinstrs -o - %s | FileCheck %s
+
+
+###############################################################################
+
+---
+name:            test1
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $d0
+
+    %0:doubleregs = COPY killed $d0
+    %1:intregs = S2_lsr_i_r_acc %0.isub_lo, %0.isub_lo, 16
+
+...
+
+# Verify that both uses if %0.isub_lo are replaced here.
+# (we used to get %1:intregs = S2_lsr_i_r_acc %1, %1.isub_lo, 16)
+#
+# CHECK-LABEL: name:            test1
+# CHECK:  bb.0.entry:
+# CHECK:      %0:doubleregs = COPY killed $d0
+# CHECK-NEXT: %1:intregs = COPY killed %0.isub_lo
+# CHECK-NEXT: %1:intregs = S2_lsr_i_r_acc %1, %1, 16
+
+
+###############################################################################
+
+---
+name:            test2
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $d0
+
+    %0:doubleregs = COPY killed $d0
+    %1:intregs = S2_lsr_i_r_acc %0.isub_lo, %0.isub_hi, 16
+
+...
+
+# Verify that the use of %0.isub_hi isn't replaced here.
+# (we used to get %1:intregs = S2_lsr_i_r_acc %1, %1.isub_hi, 16)
+#
+# We also used to get an incorrect "killed" for %0 in the second COPY.
+# So we verify that we do not get machine verifier complaints here.
+# An improvement could be to get a "killed" attribute on the last
+# use of %0.isub_hi, but we do not need it for the IR to be valid.
+#
+# CHECK-LABEL: name:            test2
+# CHECK:  bb.0.entry:
+# CHECK:      %0:doubleregs = COPY killed $d0
+# CHECK-NEXT: %1:intregs = COPY %0.isub_lo
+# CHECK-NEXT: %1:intregs = S2_lsr_i_r_acc %1, %0.isub_hi, 16
+
+###############################################################################
diff --git a/test/CodeGen/Hexagon/union-1.ll b/test/CodeGen/Hexagon/union-1.ll
index 8f2ff28b38144cee3c939fe7181cd1ddb8ba1671..970ded79deb87f574a641df5debd88e17de302c9 100644
--- a/test/CodeGen/Hexagon/union-1.ll
+++ b/test/CodeGen/Hexagon/union-1.ll
@@ -1,19 +1,21 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: word
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: f0
 ; CHECK-NOT: combine(#0
-; CHECK: jump bar
+; CHECK: jump f1
 
-define void @word(i32* nocapture %a) nounwind {
-entry:
-  %0 = load i32, i32* %a, align 4
-  %1 = zext i32 %0 to i64
-  %add.ptr = getelementptr inbounds i32, i32* %a, i32 1
-  %2 = load i32, i32* %add.ptr, align 4
-  %3 = zext i32 %2 to i64
-  %4 = shl nuw i64 %3, 32
-  %ins = or i64 %4, %1
-  tail call void @bar(i64 %ins) nounwind
+define void @f0(i32* nocapture %a0) #0 {
+b0:
+  %v0 = load i32, i32* %a0, align 4
+  %v1 = zext i32 %v0 to i64
+  %v2 = getelementptr inbounds i32, i32* %a0, i32 1
+  %v3 = load i32, i32* %v2, align 4
+  %v4 = zext i32 %v3 to i64
+  %v5 = shl nuw i64 %v4, 32
+  %v6 = or i64 %v5, %v1
+  tail call void @f1(i64 %v6) #0
   ret void
 }
 
-declare void @bar(i64)
+declare void @f1(i64) #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/vaddh.ll b/test/CodeGen/Hexagon/vaddh.ll
index a4fb33de4ac5b02a49295f915d5da1f9aadf39a5..f139c288bb5b982c8d5179b71f8a10c05a822ab9 100644
--- a/test/CodeGen/Hexagon/vaddh.ll
+++ b/test/CodeGen/Hexagon/vaddh.ll
@@ -1,16 +1,19 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK: vaddh(r{{[0-9]+}},r{{[0-9]+}})
 
-@j = external global i32
-@k = external global i32
+@g0 = external global i32
+@g1 = external global i32
 
-define void @foo() nounwind {
-entry:
-  %0 = load i32, i32* @j, align 4
-  %1 = load i32, i32* @k, align 4
-  %2 = call i32 @llvm.hexagon.A2.svaddh(i32 %0, i32 %1)
-  store i32 %2, i32* @k, align 4
+define void @f0() #0 {
+b0:
+  %v0 = load i32, i32* @g0, align 4
+  %v1 = load i32, i32* @g1, align 4
+  %v2 = call i32 @llvm.hexagon.A2.svaddh(i32 %v0, i32 %v1)
+  store i32 %v2, i32* @g1, align 4
   ret void
 }
 
-declare i32 @llvm.hexagon.A2.svaddh(i32, i32) nounwind readnone
+declare i32 @llvm.hexagon.A2.svaddh(i32, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
+attributes #1 = { nounwind readnone "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/Hexagon/validate-offset.ll b/test/CodeGen/Hexagon/validate-offset.ll
index 8de006c80b11ce1c47f9806cd8313fd356ca109c..ed98f281e4b2861de1145094ba4747f7d277fbf6 100644
--- a/test/CodeGen/Hexagon/validate-offset.ll
+++ b/test/CodeGen/Hexagon/validate-offset.ll
@@ -1,36 +1,38 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s -O0
+; RUN: llc -march=hexagon -O0 < %s
 
 ; This is a regression test which makes sure that the offset check
 ; is available for STRiw_indexed instruction. This is required
 ; by 'Hexagon Expand Predicate Spill Code' pass.
 
-define i32 @f(i32 %a, i32 %b) nounwind {
-entry:
-  %retval = alloca i32, align 4
-  %a.addr = alloca i32, align 4
-  %b.addr = alloca i32, align 4
-  store i32 %a, i32* %a.addr, align 4
-  store i32 %b, i32* %b.addr, align 4
-  %0 = load i32, i32* %a.addr, align 4
-  %1 = load i32, i32* %b.addr, align 4
-  %cmp = icmp sgt i32 %0, %1
-  br i1 %cmp, label %if.then, label %if.else
+define i32 @f0(i32 %a0, i32 %a1) #0 {
+b0:
+  %v0 = alloca i32, align 4
+  %v1 = alloca i32, align 4
+  %v2 = alloca i32, align 4
+  store i32 %a0, i32* %v1, align 4
+  store i32 %a1, i32* %v2, align 4
+  %v3 = load i32, i32* %v1, align 4
+  %v4 = load i32, i32* %v2, align 4
+  %v5 = icmp sgt i32 %v3, %v4
+  br i1 %v5, label %b1, label %b2
 
-if.then:
-  %2 = load i32, i32* %a.addr, align 4
-  %3 = load i32, i32* %b.addr, align 4
-  %add = add nsw i32 %2, %3
-  store i32 %add, i32* %retval
-  br label %return
+b1:                                               ; preds = %b0
+  %v6 = load i32, i32* %v1, align 4
+  %v7 = load i32, i32* %v2, align 4
+  %v8 = add nsw i32 %v6, %v7
+  store i32 %v8, i32* %v0
+  br label %b3
 
-if.else:
-  %4 = load i32, i32* %a.addr, align 4
-  %5 = load i32, i32* %b.addr, align 4
-  %sub = sub nsw i32 %4, %5
-  store i32 %sub, i32* %retval
-  br label %return
+b2:                                               ; preds = %b0
+  %v9 = load i32, i32* %v1, align 4
+  %v10 = load i32, i32* %v2, align 4
+  %v11 = sub nsw i32 %v9, %v10
+  store i32 %v11, i32* %v0
+  br label %b3
 
-return:
-  %6 = load i32, i32* %retval
-  ret i32 %6
+b3:                                               ; preds = %b2, %b1
+  %v12 = load i32, i32* %v0
+  ret i32 %v12
 }
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" }
diff --git a/test/CodeGen/MIR/X86/diexpr-win32.mir b/test/CodeGen/MIR/X86/diexpr-win32.mir
index 3388ef714d42c88b072ab64d3feb40ec1458d430..7c0461233bb9828cad174bc2c59156d296f6a540 100644
--- a/test/CodeGen/MIR/X86/diexpr-win32.mir
+++ b/test/CodeGen/MIR/X86/diexpr-win32.mir
@@ -15,7 +15,7 @@
 # CHECK-NEXT: }
 # CHECK-NEXT: DefRangeFramePointerRelSym {
 # CHECK-NEXT:   Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
-# CHECK-NEXT:   Offset: 8
+# CHECK-NEXT:   Offset: 12
 # CHECK-NEXT:   LocalVariableAddrRange {
 # CHECK-NEXT:     OffsetStart:
 # CHECK-NEXT:     ISectStart:
@@ -32,7 +32,7 @@
 # CHECK-NEXT: }
 # CHECK-NEXT: DefRangeFramePointerRelSym {
 # CHECK-NEXT:   Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
-# CHECK-NEXT:   Offset: 4
+# CHECK-NEXT:   Offset: 8
 # CHECK-NEXT:   LocalVariableAddrRange {
 # CHECK-NEXT:     OffsetStart: .text+0x5
 # CHECK-NEXT:     ISectStart: 0x0
@@ -193,8 +193,8 @@ body:             |
     CFI_INSTRUCTION def_cfa_offset 8
     CFI_INSTRUCTION offset $esi, -8
     $esi = MOV32rm $esp, 1, _, 8, _ :: (load 4 from %fixed-stack.2)
-    DBG_VALUE debug-use $esp, 0, !26, !10, debug-location !25
-    DBG_VALUE debug-use $esp, 0, !23, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref), debug-location !25
+    DBG_VALUE $esp, 0, !26, !10, debug-location !25
+    DBG_VALUE $esp, 0, !23, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref), debug-location !25
     CALLpcrel32 @getString, csr_32, implicit $esp, implicit-def $esp, implicit-def $eax, debug-location !29
     $ecx = MOV32rm $eax, 1, _, 0, _, debug-location !29 :: (dereferenceable load 4 from %ir.1)
     $edx = MOV32rm $eax, 1, _, 4, _, debug-location !29 :: (dereferenceable load 4 from %ir.1 + 4)
@@ -245,7 +245,7 @@ body:             |
   bb.0.entry:
     $eax = MOV32rm $esp, 1, _, 4, _ :: (load 4 from %fixed-stack.1)
     $eax = MOV32rm killed $eax, 1, _, 0, _, debug-location !34 :: (load 4 from %ir.0)
-    DBG_VALUE debug-use $eax, 0, !35, !DIExpression(DW_OP_constu, 4, DW_OP_minus), debug-location !34
+    DBG_VALUE $eax, 0, !35, !DIExpression(DW_OP_constu, 4, DW_OP_minus), debug-location !34
     $eax = ADD32rm killed $eax, $esp, 1, _, 8, _, implicit-def dead $eflags, debug-location !36 :: (load 4 from %fixed-stack.0)
     RET 0, $eax, debug-location !36
 
diff --git a/test/CodeGen/MIR/X86/instructions-debug-location.mir b/test/CodeGen/MIR/X86/instructions-debug-location.mir
index ec819628f44647549e32e2cc3995c7d28aaebb1b..8b6c5cbf5261b8f36e25a0a1bf849f37ac3b8c02 100644
--- a/test/CodeGen/MIR/X86/instructions-debug-location.mir
+++ b/test/CodeGen/MIR/X86/instructions-debug-location.mir
@@ -59,10 +59,14 @@ stack:
 body: |
   bb.0.entry:
     liveins: $edi
-    ; CHECK: DBG_VALUE debug-use $noreg, 0, !11, !DIExpression(), debug-location !12
+    ; CHECK: DBG_VALUE $noreg, 0, !11, !DIExpression(), debug-location !12
+    ; CHECK: DBG_VALUE $noreg, 0, !11, !DIExpression(), debug-location !12
     ; CHECK: $eax = COPY %0, debug-location !13
     ; CHECK: RETQ $eax, debug-location !13
     %0 = COPY $edi
+    DBG_VALUE _, 0, !12, !DIExpression(), debug-location !13
+    ; Test whether debug-use is still recognized for compatibility with old
+    ; files.
     DBG_VALUE debug-use _, 0, !12, !DIExpression(), debug-location !13
     MOV32mr %stack.0.x.addr, 1, _, 0, _, %0
     $eax = COPY %0, debug-location !14
diff --git a/test/CodeGen/MIR/X86/pr38773.mir b/test/CodeGen/MIR/X86/pr38773.mir
index 0cf0bb25b9e56de4d205c268bd4ca9761f81f050..19b0debf2979012dfbbf11c0c3b14998f4bb6051 100644
--- a/test/CodeGen/MIR/X86/pr38773.mir
+++ b/test/CodeGen/MIR/X86/pr38773.mir
@@ -97,8 +97,8 @@ body:             |
     IDIV32r killed renamable $ecx, implicit-def $eax, implicit-def dead $edx, implicit-def dead $eflags, implicit $eax, implicit killed $edx
     renamable $ecx = COPY $eax
     ; CHECK:        IDIV32r killed renamable $ecx
-    ; CHECK-NEXT:   DBG_VALUE debug-use $eax, debug-use $noreg, !12, !DIExpression(), debug-location !13
-    DBG_VALUE debug-use $ecx, debug-use $noreg, !12, !DIExpression(), debug-location !13
+    ; CHECK-NEXT:   DBG_VALUE $eax, $noreg, !12, !DIExpression(), debug-location !13
+    DBG_VALUE $ecx, $noreg, !12, !DIExpression(), debug-location !13
     $eax = COPY killed renamable $ecx
     RET 0, $eax
 
diff --git a/test/CodeGen/MIR/X86/zero-probability.mir b/test/CodeGen/MIR/X86/zero-probability.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a02002299016a8544098ea26ef126d5fe0aa06ad
--- /dev/null
+++ b/test/CodeGen/MIR/X86/zero-probability.mir
@@ -0,0 +1,14 @@
+# RUN: llc -run-pass=none -o /dev/null %s
+# REQUIRES: asserts
+# REQUIRES: default_triple
+# Makes sure that having a probability of 0x00000000 to branch to a successor
+# doesn't hit an APInt assert in the MIParser.
+
+---
+name:            main
+body:             |
+  bb.0:
+    successors: %bb.1(0x00000000)
+  bb.1:
+
+...
diff --git a/test/CodeGen/MSP430/AddrMode-bis-rx.ll b/test/CodeGen/MSP430/AddrMode-bis-rx.ll
index f4cb30f2d014c38a358524fec3337d79fefeadfb..948b67eb66c8dfafbf016d215da1a641c7e54c79 100644
--- a/test/CodeGen/MSP430/AddrMode-bis-rx.ll
+++ b/test/CodeGen/MSP430/AddrMode-bis-rx.ll
@@ -8,7 +8,7 @@ define i16 @am1(i16 %x, i16* %a) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am1:
-; CHECK:		bis.w	0(r13), r12
+; CHECK:		bis	0(r13), r12
 
 @foo = external global i16
 
@@ -18,7 +18,7 @@ define i16 @am2(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am2:
-; CHECK:		bis.w	&foo, r12
+; CHECK:		bis	&foo, r12
 
 @bar = internal constant [2 x i8] [ i8 32, i8 64 ]
 
@@ -37,7 +37,7 @@ define i16 @am4(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am4:
-; CHECK:		bis.w	&32, r12
+; CHECK:		bis	&32, r12
 
 define i16 @am5(i16 %x, i16* %a) nounwind {
 	%1 = getelementptr i16, i16* %a, i16 2
@@ -46,7 +46,7 @@ define i16 @am5(i16 %x, i16* %a) nounwind {
 	ret i16 %3
 }
 ; CHECK-LABEL: am5:
-; CHECK:		bis.w	4(r13), r12
+; CHECK:		bis	4(r13), r12
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -57,7 +57,7 @@ define i16 @am6(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am6:
-; CHECK:		bis.w	&baz+2, r12
+; CHECK:		bis	&baz+2, r12
 
 %T = type { i16, [2 x i8] }
 @duh = internal constant %T { i16 16, [2 x i8][i8 32, i8 64 ] }
diff --git a/test/CodeGen/MSP430/AddrMode-bis-xr.ll b/test/CodeGen/MSP430/AddrMode-bis-xr.ll
index 1e150f382062ccc32ed550377941ba0142e85737..6d3a497386d5d3f57fa2de17055d7686a5b6b9bc 100644
--- a/test/CodeGen/MSP430/AddrMode-bis-xr.ll
+++ b/test/CodeGen/MSP430/AddrMode-bis-xr.ll
@@ -9,7 +9,7 @@ define void @am1(i16* %a, i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am1:
-; CHECK:		bis.w	r13, 0(r12)
+; CHECK:		bis	r13, 0(r12)
 
 @foo = external global i16
 
@@ -20,7 +20,7 @@ define void @am2(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am2:
-; CHECK:		bis.w	r12, &foo
+; CHECK:		bis	r12, &foo
 
 @bar = external global [2 x i8]
 
@@ -41,7 +41,7 @@ define void @am4(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am4:
-; CHECK:		bis.w	r12, &32
+; CHECK:		bis	r12, &32
 
 define void @am5(i16* %a, i16 %x) readonly {
 	%1 = getelementptr inbounds i16, i16* %a, i16 2
@@ -51,7 +51,7 @@ define void @am5(i16* %a, i16 %x) readonly {
 	ret void
 }
 ; CHECK-LABEL: am5:
-; CHECK:		bis.w	r13, 4(r12)
+; CHECK:		bis	r13, 4(r12)
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer
@@ -63,7 +63,7 @@ define void @am6(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am6:
-; CHECK:		bis.w	r12, &baz+2
+; CHECK:		bis	r12, &baz+2
 
 %T = type { i16, [2 x i8] }
 @duh = external global %T
diff --git a/test/CodeGen/MSP430/AddrMode-mov-rx.ll b/test/CodeGen/MSP430/AddrMode-mov-rx.ll
index 808aca0ea10b549ce04cec2bac8cfeedbf1308fa..0605e8e86ce52ec48f48ca9433d901c50ae7fd55 100644
--- a/test/CodeGen/MSP430/AddrMode-mov-rx.ll
+++ b/test/CodeGen/MSP430/AddrMode-mov-rx.ll
@@ -7,7 +7,7 @@ define i16 @am1(i16* %a) nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am1:
-; CHECK:		mov.w	0(r12), r12
+; CHECK:		mov	0(r12), r12
 
 @foo = external global i16
 
@@ -16,7 +16,7 @@ define i16 @am2() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am2:
-; CHECK:		mov.w	&foo, r12
+; CHECK:		mov	&foo, r12
 
 @bar = internal constant [2 x i8] [ i8 32, i8 64 ]
 
@@ -33,7 +33,7 @@ define i16 @am4() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am4:
-; CHECK:		mov.w	&32, r12
+; CHECK:		mov	&32, r12
 
 define i16 @am5(i16* %a) nounwind {
 	%1 = getelementptr i16, i16* %a, i16 2
@@ -41,7 +41,7 @@ define i16 @am5(i16* %a) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am5:
-; CHECK:		mov.w	4(r12), r12
+; CHECK:		mov	4(r12), r12
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -51,7 +51,7 @@ define i16 @am6() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am6:
-; CHECK:		mov.w	&baz+2, r12
+; CHECK:		mov	&baz+2, r12
 
 %T = type { i16, [2 x i8] }
 @duh = internal constant %T { i16 16, [2 x i8][i8 32, i8 64 ] }
diff --git a/test/CodeGen/MSP430/AddrMode-mov-xr.ll b/test/CodeGen/MSP430/AddrMode-mov-xr.ll
index c336289a60d730350318b51019237914eec313ad..acc0b82571166958828deb5debf0ac9697d4b40c 100644
--- a/test/CodeGen/MSP430/AddrMode-mov-xr.ll
+++ b/test/CodeGen/MSP430/AddrMode-mov-xr.ll
@@ -7,7 +7,7 @@ define void @am1(i16* %a, i16 %b) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am1:
-; CHECK:		mov.w	r13, 0(r12)
+; CHECK:		mov	r13, 0(r12)
 
 @foo = external global i16
 
@@ -16,7 +16,7 @@ define void @am2(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am2:
-; CHECK:		mov.w	r12, &foo
+; CHECK:		mov	r12, &foo
 
 @bar = external global [2 x i8]
 
@@ -33,7 +33,7 @@ define void @am4(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am4:
-; CHECK:		mov.w	r12, &32
+; CHECK:		mov	r12, &32
 
 define void @am5(i16* nocapture %p, i16 %a) nounwind readonly {
 	%1 = getelementptr inbounds i16, i16* %p, i16 2
@@ -41,7 +41,7 @@ define void @am5(i16* nocapture %p, i16 %a) nounwind readonly {
 	ret void
 }
 ; CHECK-LABEL: am5:
-; CHECK:		mov.w	r13, 4(r12)
+; CHECK:		mov	r13, 4(r12)
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -51,7 +51,7 @@ define void @am6(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am6:
-; CHECK:		mov.w	r12, &baz+2
+; CHECK:		mov	r12, &baz+2
 
 %T = type { i16, [2 x i8] }
 @duh = external global %T
diff --git a/test/CodeGen/MSP430/Inst16mi.ll b/test/CodeGen/MSP430/Inst16mi.ll
index 38c16f2ba235d9edf98183892bef57926a4d71ee..bb99e28a1ba0c67f07545718e1eef6acd98f5871 100644
--- a/test/CodeGen/MSP430/Inst16mi.ll
+++ b/test/CodeGen/MSP430/Inst16mi.ll
@@ -6,14 +6,14 @@ target triple = "msp430-generic-generic"
 
 define void @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	#2, &foo
+; CHECK: mov	#2, &foo
 	store i16 2, i16 * @foo
 	ret void
 }
 
 define void @add() nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	#2, &foo
+; CHECK: incd	&foo
 	%1 = load i16, i16* @foo
 	%2 = add i16 %1, 2
 	store i16 %2, i16 * @foo
@@ -22,7 +22,7 @@ define void @add() nounwind {
 
 define void @and() nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	#2, &foo
+; CHECK: and	#2, &foo
 	%1 = load i16, i16* @foo
 	%2 = and i16 %1, 2
 	store i16 %2, i16 * @foo
@@ -31,7 +31,7 @@ define void @and() nounwind {
 
 define void @bis() nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	#2, &foo
+; CHECK: bis	#2, &foo
 	%1 = load i16, i16* @foo
 	%2 = or i16 %1, 2
 	store i16 %2, i16 * @foo
@@ -40,7 +40,7 @@ define void @bis() nounwind {
 
 define void @xor() nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	#2, &foo
+; CHECK: xor	#2, &foo
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %1, 2
 	store i16 %2, i16 * @foo
diff --git a/test/CodeGen/MSP430/Inst16mm.ll b/test/CodeGen/MSP430/Inst16mm.ll
index 14a799b91717dfdac46440d68a90dfe88e9305a7..21fab42fd591fc8d8ae69eb37dab005d3395e5d7 100644
--- a/test/CodeGen/MSP430/Inst16mm.ll
+++ b/test/CodeGen/MSP430/Inst16mm.ll
@@ -6,7 +6,7 @@ target triple = "msp430-generic-generic"
 
 define void @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	&bar, &foo
+; CHECK: mov	&bar, &foo
         %1 = load i16, i16* @bar
         store i16 %1, i16* @foo
         ret void
@@ -14,7 +14,7 @@ define void @mov() nounwind {
 
 define void @add() nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	&bar, &foo
+; CHECK: add	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = add i16 %2, %1
@@ -24,7 +24,7 @@ define void @add() nounwind {
 
 define void @and() nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	&bar, &foo
+; CHECK: and	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = and i16 %2, %1
@@ -34,7 +34,7 @@ define void @and() nounwind {
 
 define void @bis() nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	&bar, &foo
+; CHECK: bis	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = or i16 %2, %1
@@ -44,7 +44,7 @@ define void @bis() nounwind {
 
 define void @xor() nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	&bar, &foo
+; CHECK: xor	&bar, &foo
 	%1 = load i16, i16* @bar
 	%2 = load i16, i16* @foo
 	%3 = xor i16 %2, %1
@@ -64,6 +64,6 @@ entry:
  %0 = load i16, i16* %retval                          ; <i16> [#uses=1]
  ret i16 %0
 ; CHECK-LABEL: mov2:
-; CHECK-DAG:	mov.w	2(r1), 6(r1)
-; CHECK-DAG:	mov.w	0(r1), 4(r1)
+; CHECK-DAG:	mov	2(r1), 6(r1)
+; CHECK-DAG:	mov	0(r1), 4(r1)
 }
diff --git a/test/CodeGen/MSP430/Inst16mr.ll b/test/CodeGen/MSP430/Inst16mr.ll
index 847c093f4088c2dbcffa890f4920a310d2214a3c..e3f23d9c5624eaa22ba597f244f2e936301fbb4f 100644
--- a/test/CodeGen/MSP430/Inst16mr.ll
+++ b/test/CodeGen/MSP430/Inst16mr.ll
@@ -5,14 +5,14 @@ target triple = "msp430-generic-generic"
 
 define void @mov(i16 %a) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	r12, &foo
+; CHECK: mov	r12, &foo
 	store i16 %a, i16* @foo
 	ret void
 }
 
 define void @add(i16 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	r12, &foo
+; CHECK: add	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = add i16 %a, %1
 	store i16 %2, i16* @foo
@@ -21,7 +21,7 @@ define void @add(i16 %a) nounwind {
 
 define void @and(i16 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r12, &foo
+; CHECK: and	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = and i16 %a, %1
 	store i16 %2, i16* @foo
@@ -30,7 +30,7 @@ define void @and(i16 %a) nounwind {
 
 define void @bis(i16 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r12, &foo
+; CHECK: bis	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = or i16 %a, %1
 	store i16 %2, i16* @foo
@@ -39,7 +39,7 @@ define void @bis(i16 %a) nounwind {
 
 define void @bic(i16 zeroext %m) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w   r12, &foo
+; CHECK: bic   r12, &foo
         %1 = xor i16 %m, -1
         %2 = load i16, i16* @foo
         %3 = and i16 %2, %1
@@ -49,7 +49,7 @@ define void @bic(i16 zeroext %m) nounwind {
 
 define void @xor(i16 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r12, &foo
+; CHECK: xor	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %a, %1
 	store i16 %2, i16* @foo
diff --git a/test/CodeGen/MSP430/Inst16ri.ll b/test/CodeGen/MSP430/Inst16ri.ll
index 3a4bb6a93d995a8551e3706d97a2a89dba86675b..58b2791194acbdf3cf1352ba4837dee47c2719d3 100644
--- a/test/CodeGen/MSP430/Inst16ri.ll
+++ b/test/CodeGen/MSP430/Inst16ri.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i16 @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	#1, r12
+; CHECK: mov	#1, r12
 	ret i16 1
 }
 
 define i16 @add(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	#1, r12
+; CHECK: inc	r12
 	%1 = add i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @and(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	#1, r12
+; CHECK: and	#1, r12
 	%1 = and i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @bis(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	#1, r12
+; CHECK: bis	#1, r12
 	%1 = or i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @xor(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	#1, r12
+; CHECK: xor	#1, r12
 	%1 = xor i16 %a, 1
 	ret i16 %1
 }
diff --git a/test/CodeGen/MSP430/Inst16rm.ll b/test/CodeGen/MSP430/Inst16rm.ll
index 44b8f39d8fa625b808f5989d8a18effca5788494..8a3cd0a46fb36f03bd04b4c3a5c7c48991b8b0d7 100644
--- a/test/CodeGen/MSP430/Inst16rm.ll
+++ b/test/CodeGen/MSP430/Inst16rm.ll
@@ -5,7 +5,7 @@ target triple = "msp430-generic-generic"
 
 define i16 @add(i16 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	&foo, r12
+; CHECK: add	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = add i16 %a, %1
 	ret i16 %2
@@ -13,7 +13,7 @@ define i16 @add(i16 %a) nounwind {
 
 define i16 @and(i16 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	&foo, r12
+; CHECK: and	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = and i16 %a, %1
 	ret i16 %2
@@ -21,7 +21,7 @@ define i16 @and(i16 %a) nounwind {
 
 define i16 @bis(i16 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	&foo, r12
+; CHECK: bis	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = or i16 %a, %1
 	ret i16 %2
@@ -29,7 +29,7 @@ define i16 @bis(i16 %a) nounwind {
 
 define i16  @bic(i16 %a) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w	&foo, r12
+; CHECK: bic	&foo, r12
         %1 = load i16, i16* @foo
         %2 = xor i16 %1, -1
         %3 = and i16 %a, %2
@@ -38,7 +38,7 @@ define i16  @bic(i16 %a) nounwind {
 
 define i16 @xor(i16 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	&foo, r12
+; CHECK: xor	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %a, %1
 	ret i16 %2
diff --git a/test/CodeGen/MSP430/Inst16rr.ll b/test/CodeGen/MSP430/Inst16rr.ll
index 75440ca2b403a78b5114a3c6c5412d705899f7eb..124d42113a21cbd5085575c8d051f30916d3f4e9 100644
--- a/test/CodeGen/MSP430/Inst16rr.ll
+++ b/test/CodeGen/MSP430/Inst16rr.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i16 @mov(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	r13, r12
+; CHECK: mov	r13, r12
 	ret i16 %b
 }
 
 define i16 @add(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	r13, r12
+; CHECK: add	r13, r12
 	%1 = add i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @and(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r13, r12
+; CHECK: and	r13, r12
 	%1 = and i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @bis(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r13, r12
+; CHECK: bis	r13, r12
 	%1 = or i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @bic(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w	r13, r12
+; CHECK: bic	r13, r12
         %1 = xor i16 %b, -1
         %2 = and i16 %a, %1
         ret i16 %2
@@ -39,7 +39,7 @@ define i16 @bic(i16 %a, i16 %b) nounwind {
 
 define i16 @xor(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r13, r12
+; CHECK: xor	r13, r12
 	%1 = xor i16 %a, %b
 	ret i16 %1
 }
diff --git a/test/CodeGen/MSP430/Inst8mi.ll b/test/CodeGen/MSP430/Inst8mi.ll
index ff22d7e1eb3d678dfcb8f7eb3975002299a18b74..36eb3f91f840adfcccbb6360579b302b1b4a4dd0 100644
--- a/test/CodeGen/MSP430/Inst8mi.ll
+++ b/test/CodeGen/MSP430/Inst8mi.ll
@@ -12,7 +12,7 @@ define void @mov() nounwind {
 
 define void @add() nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	#2, &foo
+; CHECK: incd.b	&foo
 	%1 = load i8, i8* @foo
 	%2 = add i8 %1, 2
 	store i8 %2, i8 * @foo
diff --git a/test/CodeGen/MSP430/Inst8ri.ll b/test/CodeGen/MSP430/Inst8ri.ll
index 0e50f17f2a550b1e312ea8ed9c4f5fd7ba3d1f2e..ff3dee8bfb9484cf0d7af30aae9718dea9880a30 100644
--- a/test/CodeGen/MSP430/Inst8ri.ll
+++ b/test/CodeGen/MSP430/Inst8ri.ll
@@ -10,7 +10,7 @@ define i8 @mov() nounwind {
 
 define i8 @add(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	#1, r12
+; CHECK: inc.b	r12
 	%1 = add i8 %a, 1
 	ret i8 %1
 }
diff --git a/test/CodeGen/MSP430/Inst8rr.ll b/test/CodeGen/MSP430/Inst8rr.ll
index f37bc32a28fe1063ecf4b7179414414c459fac32..20c4fa5aacf5b2fbb723a54c2e0aeb7d7c12ce21 100644
--- a/test/CodeGen/MSP430/Inst8rr.ll
+++ b/test/CodeGen/MSP430/Inst8rr.ll
@@ -4,7 +4,7 @@ target triple = "msp430-generic-generic"
 
 define i8 @mov(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.{{[bw]}} r13, r12
+; CHECK: mov r13, r12
 	ret i8 %b
 }
 
@@ -17,14 +17,14 @@ define i8 @add(i8 %a, i8 %b) nounwind {
 
 define i8 @and(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r13, r12
+; CHECK: and	r13, r12
 	%1 = and i8 %a, %b
 	ret i8 %1
 }
 
 define i8 @bis(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r13, r12
+; CHECK: bis	r13, r12
 	%1 = or i8 %a, %b
 	ret i8 %1
 }
@@ -39,7 +39,7 @@ define i8 @bic(i8 %a, i8 %b) nounwind {
 
 define i8 @xor(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r13, r12
+; CHECK: xor	r13, r12
 	%1 = xor i8 %a, %b
 	ret i8 %1
 }
diff --git a/test/CodeGen/MSP430/asm-clobbers.ll b/test/CodeGen/MSP430/asm-clobbers.ll
index 216a3fe401898fdc858647fcb2d7226cdc620ac8..0a0335057f1bcfd898418475bc57c1cc11517291 100644
--- a/test/CodeGen/MSP430/asm-clobbers.ll
+++ b/test/CodeGen/MSP430/asm-clobbers.ll
@@ -6,8 +6,8 @@ target triple = "msp430---elf"
 define void @test() {
 entry:
 ; CHECK-LABEL: test:
-; CHECK: push.w r10
+; CHECK: push r10
   call void asm sideeffect "", "~{r10}"()
-; CHECK: pop.w r10
+; CHECK: pop r10
   ret void
 }
diff --git a/test/CodeGen/MSP430/bit.ll b/test/CodeGen/MSP430/bit.ll
index 172822fbb5fef3fc7ffd40e8853ee0a8cd6bb3b8..a4b781243b4aac35f245c0260e1be73a1ebbae56 100644
--- a/test/CodeGen/MSP430/bit.ll
+++ b/test/CodeGen/MSP430/bit.ll
@@ -93,7 +93,7 @@ define i16 @bitwrr(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwrr:
-; CHECK: bit.w	r13, r12
+; CHECK: bit	r13, r12
 
 define i16 @bitwri(i16 %a) nounwind {
 	%t1 = and i16 %a, 4080
@@ -102,7 +102,7 @@ define i16 @bitwri(i16 %a) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwri:
-; CHECK: bit.w	#4080, r12
+; CHECK: bit	#4080, r12
 
 define i16 @bitwir(i16 %a) nounwind {
 	%t1 = and i16 4080, %a
@@ -111,7 +111,7 @@ define i16 @bitwir(i16 %a) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwir:
-; CHECK: bit.w	#4080, r12
+; CHECK: bit	#4080, r12
 
 define i16 @bitwmi() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -121,7 +121,7 @@ define i16 @bitwmi() nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwmi:
-; CHECK: bit.w	#4080, &foo16
+; CHECK: bit	#4080, &foo16
 
 define i16 @bitwim() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -131,7 +131,7 @@ define i16 @bitwim() nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwim:
-; CHECK: bit.w	#4080, &foo16
+; CHECK: bit	#4080, &foo16
 
 define i16 @bitwrm(i16 %a) nounwind {
 	%t1 = load i16, i16* @foo16
@@ -141,7 +141,7 @@ define i16 @bitwrm(i16 %a) nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwrm:
-; CHECK: bit.w	&foo16, r12
+; CHECK: bit	&foo16, r12
 
 define i16 @bitwmr(i16 %a) nounwind {
 	%t1 = load i16, i16* @foo16
@@ -151,7 +151,7 @@ define i16 @bitwmr(i16 %a) nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwmr:
-; CHECK: bit.w	r12, &foo16
+; CHECK: bit	r12, &foo16
 
 define i16 @bitwmm() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -162,5 +162,5 @@ define i16 @bitwmm() nounwind {
 	ret i16 %t5
 }
 ; CHECK-LABEL: bitwmm:
-; CHECK: bit.w	&bar16, &foo16
+; CHECK: bit	&bar16, &foo16
 
diff --git a/test/CodeGen/MSP430/byval.ll b/test/CodeGen/MSP430/byval.ll
index 401896b43c20cc96a564711ec59b324281db3d9e..838e883d4bec35e2119860194b6b3621fefd7c21 100644
--- a/test/CodeGen/MSP430/byval.ll
+++ b/test/CodeGen/MSP430/byval.ll
@@ -9,7 +9,7 @@ target triple = "msp430---elf"
 define i16 @callee(%struct.Foo* byval %f) nounwind {
 entry:
 ; CHECK-LABEL: callee:
-; CHECK: mov.w 2(r1), r12
+; CHECK: mov 2(r1), r12
   %0 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i32 0, i32 0
   %1 = load i16, i16* %0, align 2
   ret i16 %1
@@ -18,9 +18,9 @@ entry:
 define void @caller() nounwind {
 entry:
 ; CHECK-LABEL: caller:
-; CHECK: mov.w &foo+4, 4(r1)
-; CHECK-NEXT: mov.w &foo+2, 2(r1)
-; CHECK-NEXT: mov.w &foo, 0(r1)
+; CHECK: mov &foo+4, 4(r1)
+; CHECK-NEXT: mov &foo+2, 2(r1)
+; CHECK-NEXT: mov &foo, 0(r1)
   %call = call i16 @callee(%struct.Foo* byval @foo)
   ret void
 }
diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll
index 70ac901f7e4e291fec47b67c13edf3c92ac6fe7c..eb7e470a9b61450348a1f1c66659ab3413d5d8d1 100644
--- a/test/CodeGen/MSP430/cc_args.ll
+++ b/test/CodeGen/MSP430/cc_args.ll
@@ -7,50 +7,50 @@ define void @test() #0 {
 entry:
 ; CHECK: test:
 
-; CHECK: mov.w #1, r12
+; CHECK: mov #1, r12
 ; CHECK: call #f_i16
   call void @f_i16(i16 1)
 
-; CHECK: mov.w #772, r12
-; CHECK: mov.w #258, r13
+; CHECK: mov #772, r12
+; CHECK: mov #258, r13
 ; CHECK: call #f_i32
   call void @f_i32(i32 16909060)
 
-; CHECK: mov.w #1800, r12
-; CHECK: mov.w #1286, r13
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
+; CHECK: mov #1800, r12
+; CHECK: mov #1286, r13
+; CHECK: mov #772, r14
+; CHECK: mov #258, r15
 ; CHECK: call #f_i64
   call void @f_i64(i64 72623859790382856)
 
-; CHECK: mov.w #772, r12
-; CHECK: mov.w #258, r13
-; CHECK: mov.w #1800, r14
-; CHECK: mov.w #1286, r15
+; CHECK: mov #772, r12
+; CHECK: mov #258, r13
+; CHECK: mov #1800, r14
+; CHECK: mov #1286, r15
 ; CHECK: call #f_i32_i32
   call void @f_i32_i32(i32 16909060, i32 84281096)
 
-; CHECK: mov.w #1, r12
-; CHECK: mov.w #772, r13
-; CHECK: mov.w #258, r14
-; CHECK: mov.w #2, r15
+; CHECK: mov #1, r12
+; CHECK: mov #772, r13
+; CHECK: mov #258, r14
+; CHECK: mov #2, r15
 ; CHECK: call #f_i16_i32_i16
   call void @f_i16_i32_i16(i16 1, i32 16909060, i16 2)
 
-; CHECK: mov.w #1286, 0(r1)
-; CHECK: mov.w #1, r12
-; CHECK: mov.w #772, r13
-; CHECK: mov.w #258, r14
-; CHECK: mov.w #1800, r15
+; CHECK: mov #1286, 0(r1)
+; CHECK: mov #1, r12
+; CHECK: mov #772, r13
+; CHECK: mov #258, r14
+; CHECK: mov #1800, r15
 ; CHECK: call #f_i16_i32_i32
   call void @f_i16_i32_i32(i16 1, i32 16909060, i32 84281096)
 
-; CHECK: mov.w #258, 6(r1)
-; CHECK: mov.w #772, 4(r1)
-; CHECK: mov.w #1286, 2(r1)
-; CHECK: mov.w #1800, 0(r1)
-; CHECK: mov.w #1, r12
-; CHECK: mov.w #2, r13
+; CHECK: mov #258, 6(r1)
+; CHECK: mov #772, 4(r1)
+; CHECK: mov #1286, 2(r1)
+; CHECK: mov #1800, 0(r1)
+; CHECK: mov #1, r12
+; CHECK: mov #2, r13
 ; CHECK: call #f_i16_i64_i16
   call void @f_i16_i64_i16(i16 1, i64 72623859790382856, i16 2)
 
@@ -63,75 +63,75 @@ entry:
 
 define void @f_i16(i16 %a) #0 {
 ; CHECK: f_i16:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
   ret void
 }
 
 define void @f_i32(i32 %a) #0 {
 ; CHECK: f_i32:
-; CHECK: mov.w r13, &g_i32+2
-; CHECK: mov.w r12, &g_i32
+; CHECK: mov r13, &g_i32+2
+; CHECK: mov r12, &g_i32
   store volatile i32 %a, i32* @g_i32, align 2
   ret void
 }
 
 define void @f_i64(i64 %a) #0 {
 ; CHECK: f_i64:
-; CHECK: mov.w r15, &g_i64+6
-; CHECK: mov.w r14, &g_i64+4
-; CHECK: mov.w r13, &g_i64+2
-; CHECK: mov.w r12, &g_i64
+; CHECK: mov r15, &g_i64+6
+; CHECK: mov r14, &g_i64+4
+; CHECK: mov r13, &g_i64+2
+; CHECK: mov r12, &g_i64
   store volatile i64 %a, i64* @g_i64, align 2
   ret void
 }
 
 define void @f_i32_i32(i32 %a, i32 %b) #0 {
 ; CHECK: f_i32_i32:
-; CHECK: mov.w r13, &g_i32+2
-; CHECK: mov.w r12, &g_i32
+; CHECK: mov r13, &g_i32+2
+; CHECK: mov r12, &g_i32
   store volatile i32 %a, i32* @g_i32, align 2
-; CHECK: mov.w r15, &g_i32+2
-; CHECK: mov.w r14, &g_i32
+; CHECK: mov r15, &g_i32+2
+; CHECK: mov r14, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
   ret void
 }
 
 define void @f_i16_i32_i32(i16 %a, i32 %b, i32 %c) #0 {
 ; CHECK: f_i16_i32_i32:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
-; CHECK: mov.w r14, &g_i32+2
-; CHECK: mov.w r13, &g_i32
+; CHECK: mov r14, &g_i32+2
+; CHECK: mov r13, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
-; CHECK: mov.w r15, &g_i32
-; CHECK: mov.w 4(r4), &g_i32+2
+; CHECK: mov r15, &g_i32
+; CHECK: mov 4(r4), &g_i32+2
   store volatile i32 %c, i32* @g_i32, align 2
   ret void
 }
 
 define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 {
 ; CHECK: f_i16_i32_i16:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
-; CHECK: mov.w r14, &g_i32+2
-; CHECK: mov.w r13, &g_i32
+; CHECK: mov r14, &g_i32+2
+; CHECK: mov r13, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
-; CHECK: mov.w r15, &g_i16
+; CHECK: mov r15, &g_i16
   store volatile i16 %c, i16* @g_i16, align 2
   ret void
 }
 
 define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
 ; CHECK: f_i16_i64_i16:
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
-;CHECK: mov.w 10(r4), &g_i64+6
-;CHECK: mov.w 8(r4), &g_i64+4
-;CHECK: mov.w 6(r4), &g_i64+2
-;CHECK: mov.w 4(r4), &g_i64
+;CHECK: mov 10(r4), &g_i64+6
+;CHECK: mov 8(r4), &g_i64+4
+;CHECK: mov 6(r4), &g_i64+2
+;CHECK: mov 4(r4), &g_i64
   store volatile i64 %b, i64* @g_i64, align 2
-;CHECK: mov.w r13, &g_i16
+;CHECK: mov r13, &g_i16
   store volatile i16 %c, i16* @g_i16, align 2
   ret void
 }
diff --git a/test/CodeGen/MSP430/cc_ret.ll b/test/CodeGen/MSP430/cc_ret.ll
index 937db6dbf3bf0a26f534e4281c24ced91eb972f3..b4bb0554208dae8e4b5a9d80433a9be6d0a15113 100644
--- a/test/CodeGen/MSP430/cc_ret.ll
+++ b/test/CodeGen/MSP430/cc_ret.ll
@@ -8,21 +8,21 @@ entry:
 ; CHECK: test:
 
 ; CHECK: call #f_i16
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov r12, &g_i16
   %0 = call i16 @f_i16()
   store volatile i16 %0, i16* @g_i16
 
 ; CHECK: call #f_i32
-; CHECK: mov.w r13, &g_i32+2
-; CHECK: mov.w r12, &g_i32
+; CHECK: mov r13, &g_i32+2
+; CHECK: mov r12, &g_i32
   %1 = call i32 @f_i32()
   store volatile i32 %1, i32* @g_i32
 
 ; CHECK: call #f_i64
-; CHECK: mov.w r15, &g_i64+6
-; CHECK: mov.w r14, &g_i64+4
-; CHECK: mov.w r13, &g_i64+2
-; CHECK: mov.w r12, &g_i64
+; CHECK: mov r15, &g_i64+6
+; CHECK: mov r14, &g_i64+4
+; CHECK: mov r13, &g_i64+2
+; CHECK: mov r12, &g_i64
   %2 = call i64 @f_i64()
   store volatile i64 %2, i64* @g_i64
 
@@ -35,25 +35,25 @@ entry:
 
 define i16 @f_i16() #0 {
 ; CHECK: f_i16:
-; CHECK: mov.w #1, r12
+; CHECK: mov #1, r12
 ; CHECK: ret
   ret i16 1
 }
 
 define i32 @f_i32() #0 {
 ; CHECK: f_i32:
-; CHECK: mov.w #772, r12
-; CHECK: mov.w #258, r13
+; CHECK: mov #772, r12
+; CHECK: mov #258, r13
 ; CHECK: ret
   ret i32 16909060
 }
 
 define i64 @f_i64() #0 {
 ; CHECK: f_i64:
-; CHECK: mov.w #1800, r12
-; CHECK: mov.w #1286, r13
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
+; CHECK: mov #1800, r12
+; CHECK: mov #1286, r13
+; CHECK: mov #772, r14
+; CHECK: mov #258, r15
 ; CHECK: ret
   ret i64 72623859790382856
 }
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
index 2559e23ae1f58269741f052bbaf5af43e3c5ba83..87c4055829c8801c269e3538775e5eeb6ea28643 100644
--- a/test/CodeGen/MSP430/fp.ll
+++ b/test/CodeGen/MSP430/fp.ll
@@ -6,13 +6,13 @@ target triple = "msp430---elf"
 define void @fp() nounwind {
 entry:
 ; CHECK-LABEL: fp:
-; CHECK: push.w r4
-; CHECK: mov.w r1, r4
-; CHECK: sub.w #2, r1
+; CHECK: push r4
+; CHECK: mov r1, r4
+; CHECK: sub #2, r1
   %i = alloca i16, align 2
-; CHECK: mov.w #0, -2(r4)
+; CHECK: clr -2(r4)
   store i16 0, i16* %i, align 2
-; CHECK: pop.w r4
+; CHECK: pop r4
   ret void
 }
 
diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll
index 49f23166a0a1ab309a19867b875a1b71d76aa001..6121f7ebed67cfcae932c654aa344f8aea1b4965 100644
--- a/test/CodeGen/MSP430/jumptable.ll
+++ b/test/CodeGen/MSP430/jumptable.ll
@@ -7,15 +7,15 @@ target triple = "msp430---elf"
 define i16 @test(i16 %i) #0 {
 entry:
 ; CHECK-LABEL: test:
-; CHECK:      sub.w   #4, r1
-; CHECK-NEXT: mov.w   r12, 0(r1)
-; CHECK-NEXT: cmp.w   #4, r12
+; CHECK:      sub   #4, r1
+; CHECK-NEXT: mov   r12, 0(r1)
+; CHECK-NEXT: cmp   #4, r12
 ; CHECK-NEXT: jhs     .LBB0_3
   %retval = alloca i16, align 2
   %i.addr = alloca i16, align 2
   store i16 %i, i16* %i.addr, align 2
   %0 = load i16, i16* %i.addr, align 2
-; CHECK:      rla.w r12
+; CHECK:      add   r12, r12
 ; CHECK-NEXT: br .LJTI0_0(r12)
   switch i16 %0, label %sw.default [
     i16 0, label %sw.bb
diff --git a/test/CodeGen/MSP430/memset.ll b/test/CodeGen/MSP430/memset.ll
index 10b506c60d9523200bc3243988a2650d5e3fe688..0f83b6078201f68e84aee732cf88f1a0c2031070 100644
--- a/test/CodeGen/MSP430/memset.ll
+++ b/test/CodeGen/MSP430/memset.ll
@@ -9,9 +9,9 @@ define void @test() nounwind {
 entry:
 ; CHECK-LABEL: test:
   %0 = load i8*, i8** @buf, align 2
-; CHECK: mov.w &buf, r12
-; CHECK-NEXT: mov.w #5, r13
-; CHECK-NEXT: mov.w #128, r14
+; CHECK: mov &buf, r12
+; CHECK-NEXT: mov #5, r13
+; CHECK-NEXT: mov #128, r14
 ; CHECK-NEXT: call #memset
   call void @llvm.memset.p0i8.i16(i8* %0, i8 5, i16 128, i1 false)
   ret void
diff --git a/test/CodeGen/MSP430/misched-msp430.ll b/test/CodeGen/MSP430/misched-msp430.ll
index 3d18fa005a6b58b0921d2be0a7d397be7c7426e8..f44f10ccd3ee9ae19625b40c6e501c371d353f27 100644
--- a/test/CodeGen/MSP430/misched-msp430.ll
+++ b/test/CodeGen/MSP430/misched-msp430.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
 ; only verifies that the code generator ran successfully.
 ;
 ; CHECK-LABEL: @f
-; CHECK: mov.w &y, &x
+; CHECK: mov &y, &x
 ; CHECK: ret
 define void @f() {
 entry:
diff --git a/test/CodeGen/MSP430/postinc.ll b/test/CodeGen/MSP430/postinc.ll
index 75a927f33fceade76d291396c6a33b5637cc9274..20ee8fb3c856258fa936308bd2beadfad58b4f11 100644
--- a/test/CodeGen/MSP430/postinc.ll
+++ b/test/CodeGen/MSP430/postinc.ll
@@ -12,7 +12,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: add:
-; CHECK: add.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: add @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = add i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -34,7 +34,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: sub:
-; CHECK: sub.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: sub @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = sub i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -56,7 +56,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: or:
-; CHECK: bis.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: bis @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = or i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -78,7 +78,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: xor:
-; CHECK: xor.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: xor @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = xor i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
@@ -100,7 +100,7 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.09 = phi i16 [ 0, %entry ], [ %add, %for.body ] ; <i16> [#uses=1]
   %arrayidx = getelementptr i16, i16* %a, i16 %i.010   ; <i16*> [#uses=1]
 ; CHECK-LABEL: and:
-; CHECK: and.w @r{{[0-9]+}}+, r{{[0-9]+}}
+; CHECK: and @r{{[0-9]+}}+, r{{[0-9]+}}
   %tmp4 = load i16, i16* %arrayidx                     ; <i16> [#uses=1]
   %add = and i16 %tmp4, %sum.09                   ; <i16> [#uses=2]
   %inc = add i16 %i.010, 1                        ; <i16> [#uses=2]
diff --git a/test/CodeGen/MSP430/select-use-sr.ll b/test/CodeGen/MSP430/select-use-sr.ll
index 3f67fb85f793fc36526fd986827d7e7d3889c783..159fc93db5aa19daee7d9c39a170562889c81d50 100644
--- a/test/CodeGen/MSP430/select-use-sr.ll
+++ b/test/CodeGen/MSP430/select-use-sr.ll
@@ -6,8 +6,8 @@ target triple = "msp430"
 ; Test that CMP instruction is not removed by MachineCSE.
 ;
 ; CHECK-LABEL: @f
-; CHECK: cmp.w r15, r13
-; CHECK: cmp.w r15, r13
+; CHECK: cmp r15, r13
+; CHECK: cmp r15, r13
 ; CHECK-NEXT: jeq .LBB0_2
 define i16 @f(i16, i16, i16, i16) {
 entry:
diff --git a/test/CodeGen/MSP430/setcc.ll b/test/CodeGen/MSP430/setcc.ll
index 6e2ec8ea3ea1dade9979e54b40fb3c78869b47d2..52baf64290336b7846146920e57080b07ea882f7 100644
--- a/test/CodeGen/MSP430/setcc.ll
+++ b/test/CodeGen/MSP430/setcc.ll
@@ -9,10 +9,10 @@ define i16 @sccweqand(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: sccweqand:
-; CHECK:	bit.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	rra.w   r12
-; CHECK:	and.w	#1, r12
+; CHECK:	bit	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	rra   r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwneand(i16 %a, i16 %b) nounwind {
 	%t1 = and i16 %a, %b
@@ -21,9 +21,9 @@ define i16 @sccwneand(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: sccwneand:
-; CHECK: 	bit.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	and.w	#1, r12
+; CHECK: 	bit	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwne(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ne i16 %a, %b
@@ -31,11 +31,11 @@ define i16 @sccwne(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwne:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	r2, r13
-; CHECK:	rra.w	r13
-; CHECK:	mov.w	#1, r12
-; CHECK:	bic.w	r13, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	r2, r13
+; CHECK:	rra	r13
+; CHECK:	mov	#1, r12
+; CHECK:	bic	r13, r12
 
 define i16 @sccweq(i16 %a, i16 %b) nounwind {
 	%t1 = icmp eq i16 %a, %b
@@ -43,10 +43,10 @@ define i16 @sccweq(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccweq:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	rra.w	r12
-; CHECK:	and.w	#1, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	rra	r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwugt(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ugt i16 %a, %b
@@ -54,9 +54,9 @@ define i16 @sccwugt(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwugt:
-; CHECK:	cmp.w	r12, r13
-; CHECK:	mov.w	#1, r12
-; CHECK:	bic.w	r2, r12
+; CHECK:	cmp	r12, r13
+; CHECK:	mov	#1, r12
+; CHECK:	bic	r2, r12
 
 define i16 @sccwuge(i16 %a, i16 %b) nounwind {
 	%t1 = icmp uge i16 %a, %b
@@ -64,9 +64,9 @@ define i16 @sccwuge(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwuge:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	r2, r12
-; CHECK:	and.w	#1, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	r2, r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwult(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ult i16 %a, %b
@@ -74,9 +74,9 @@ define i16 @sccwult(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwult:
-; CHECK:	cmp.w	r13, r12
-; CHECK:	mov.w	#1, r12
-; CHECK:	bic.w	r2, r12
+; CHECK:	cmp	r13, r12
+; CHECK:	mov	#1, r12
+; CHECK:	bic	r2, r12
 
 define i16 @sccwule(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ule i16 %a, %b
@@ -84,9 +84,9 @@ define i16 @sccwule(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwule:
-; CHECK:	cmp.w	r12, r13
-; CHECK:	mov.w	r2, r12
-; CHECK:	and.w	#1, r12
+; CHECK:	cmp	r12, r13
+; CHECK:	mov	r2, r12
+; CHECK:	and	#1, r12
 
 define i16 @sccwsgt(i16 %a, i16 %b) nounwind {
 	%t1 = icmp sgt i16 %a, %b
diff --git a/test/CodeGen/MSP430/shifts.ll b/test/CodeGen/MSP430/shifts.ll
index 22ae59ef4b0f713ac090dd8e06589377d57f971f..6d4050f42bef5e8de51ba36cc408ea079fa0c4a2 100644
--- a/test/CodeGen/MSP430/shifts.ll
+++ b/test/CodeGen/MSP430/shifts.ll
@@ -21,7 +21,7 @@ entry:
 define zeroext i8 @shl8(i8 zeroext %a, i8 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK: shl8
-; CHECK: rla.b
+; CHECK: add.b
   %shl = shl i8 %a, %cnt
   ret i8 %shl
 }
@@ -29,7 +29,7 @@ entry:
 define zeroext i16 @lshr16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: lshr16:
-; CHECK: rrc.w
+; CHECK: rrc
   %shr = lshr i16 %a, %cnt
   ret i16 %shr
 }
@@ -37,7 +37,7 @@ entry:
 define signext i16 @ashr16(i16 signext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: ashr16:
-; CHECK: rra.w
+; CHECK: rra
   %shr = ashr i16 %a, %cnt
   ret i16 %shr
 }
@@ -45,7 +45,7 @@ entry:
 define zeroext i16 @shl16(i16 zeroext %a, i16 zeroext %cnt) nounwind readnone {
 entry:
 ; CHECK-LABEL: shl16:
-; CHECK: rla.w
+; CHECK: add
   %shl = shl i16 %a, %cnt
   ret i16 %shl
 }
diff --git a/test/CodeGen/MSP430/struct-return.ll b/test/CodeGen/MSP430/struct-return.ll
index c28bf06af439d1258764238a77d3b1d1cfcda79e..a52ea1b702a3bc55c6e382bc7bd70dc058739099 100644
--- a/test/CodeGen/MSP430/struct-return.ll
+++ b/test/CodeGen/MSP430/struct-return.ll
@@ -9,14 +9,14 @@ target triple = "msp430---elf"
 
 define %s @fred() #0 {
 ; CHECK-LABEL: fred:
-; CHECK: mov.w	#2314, 14(r12)
-; CHECK: mov.w	#2828, 12(r12)
-; CHECK: mov.w	#3342, 10(r12)
-; CHECK: mov.w	#3840, 8(r12)
-; CHECK: mov.w	#258, 6(r12)
-; CHECK: mov.w	#772, 4(r12)
-; CHECK: mov.w	#1286, 2(r12)
-; CHECK: mov.w	#1800, 0(r12)
+; CHECK: mov	#2314, 14(r12)
+; CHECK: mov	#2828, 12(r12)
+; CHECK: mov	#3342, 10(r12)
+; CHECK: mov	#3840, 8(r12)
+; CHECK: mov	#258, 6(r12)
+; CHECK: mov	#772, 4(r12)
+; CHECK: mov	#1286, 2(r12)
+; CHECK: mov	#1800, 0(r12)
   ret %s {i64 72623859790382856, i64 651345242494996224} 
 }
 
diff --git a/test/CodeGen/MSP430/struct_layout.ll b/test/CodeGen/MSP430/struct_layout.ll
index 60ae9f09b4ede844ff2353cfe424714b7934e7e2..4c5a131acca6aadd68f1753318750829b1f9b3b2 100644
--- a/test/CodeGen/MSP430/struct_layout.ll
+++ b/test/CodeGen/MSP430/struct_layout.ll
@@ -5,7 +5,7 @@ target triple = "msp430"
 %struct.X = type { i8 }
 
 ; CHECK-LABEL: @foo
-; CHECK: sub.w   #4, r1
+; CHECK: sub   #4, r1
 ; CHECK: mov.b   #1, 3(r1)
 define void @foo() {
   %1 = alloca %struct.X
@@ -21,7 +21,7 @@ define void @foo() {
 }
 
 ; CHECK-LABEL: @bar
-; CHECK: sub.w   #4, r1
+; CHECK: sub   #4, r1
 ; CHECK: mov.b   #1, 3(r1)
 define void @bar() {
   %1 = alloca [3 x %struct.X]
@@ -40,8 +40,8 @@ define void @bar() {
 %struct.Y = type { i8, i16 }
 
 ; CHECK-LABEL: @baz
-; CHECK: sub.w   #8, r1
-; CHECK: mov.w   #2, 6(r1)
+; CHECK: sub   #8, r1
+; CHECK: mov   #2, 6(r1)
 define void @baz() {
   %1 = alloca %struct.Y, align 2
   %2 = alloca %struct.Y, align 2
diff --git a/test/CodeGen/MSP430/transient-stack-alignment.ll b/test/CodeGen/MSP430/transient-stack-alignment.ll
index cca83509cf4c4eb295fcdeb87e4e49b06913176a..a2ddf8a0b08668c46ae66a123ae0c6a8b71d5a7b 100644
--- a/test/CodeGen/MSP430/transient-stack-alignment.ll
+++ b/test/CodeGen/MSP430/transient-stack-alignment.ll
@@ -5,11 +5,11 @@ target triple = "msp430---elf"
 
 define void @test() #0 {
 ; CHECK-LABEL: test:
-; CHECK: sub.w #2, r1
+; CHECK: sub #2, r1
   %1 = alloca i8, align 1
-; CHECK-NEXT: mov.b #0, 1(r1)
+; CHECK-NEXT: clr.b 1(r1)
   store i8 0, i8* %1, align 1
-; CHECK-NEXT: add.w #2, r1
+; CHECK-NEXT: add #2, r1
 ; CHECK-NEXT: ret
   ret void
 }
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll
index 3501861f5757d0eb82b0db486f6049973f9f382d..edb61d2221ef9498f4906068ee52adf5f2510356 100644
--- a/test/CodeGen/MSP430/vararg.ll
+++ b/test/CodeGen/MSP430/vararg.ll
@@ -10,12 +10,12 @@ declare void @llvm.va_copy(i8*, i8*) nounwind
 define void @va_start(i16 %a, ...) nounwind {
 entry:
 ; CHECK-LABEL: va_start:
-; CHECK: sub.w #2, r1
+; CHECK: sub #2, r1
   %vl = alloca i8*, align 2
   %vl1 = bitcast i8** %vl to i8*
-; CHECK-NEXT: mov.w r1, [[REG:r[0-9]+]]
-; CHECK-NEXT: add.w #6, [[REG]]
-; CHECK-NEXT: mov.w [[REG]], 0(r1)
+; CHECK-NEXT: mov r1, [[REG:r[0-9]+]]
+; CHECK-NEXT: add #6, [[REG]]
+; CHECK-NEXT: mov [[REG]], 0(r1)
   call void @llvm.va_start(i8* %vl1)
   call void @llvm.va_end(i8* %vl1)
   ret void
@@ -26,11 +26,11 @@ entry:
 ; CHECK-LABEL: va_arg:
   %vl.addr = alloca i8*, align 2
   store i8* %vl, i8** %vl.addr, align 2
-; CHECK: mov.w r12, [[REG:r[0-9]+]]
-; CHECK-NEXT: add.w #2, [[REG]]
-; CHECK-NEXT: mov.w [[REG]], 0(r1)
+; CHECK: mov r12, [[REG:r[0-9]+]]
+; CHECK-NEXT: incd [[REG]]
+; CHECK-NEXT: mov [[REG]], 0(r1)
   %0 = va_arg i8** %vl.addr, i16
-; CHECK-NEXT: mov.w 0(r12), r12
+; CHECK-NEXT: mov 0(r12), r12
   ret i16 %0
 }
 
@@ -39,11 +39,11 @@ entry:
 ; CHECK-LABEL: va_copy:
   %vl.addr = alloca i8*, align 2
   %vl2 = alloca i8*, align 2
-; CHECK-DAG: mov.w r12, 2(r1)
+; CHECK-DAG: mov r12, 2(r1)
   store i8* %vl, i8** %vl.addr, align 2
   %0 = bitcast i8** %vl2 to i8*
   %1 = bitcast i8** %vl.addr to i8*
-; CHECK-DAG: mov.w r12, 0(r1)
+; CHECK-DAG: mov r12, 0(r1)
   call void @llvm.va_copy(i8* %0, i8* %1)
   ret void
 }
diff --git a/test/CodeGen/Mips/2008-07-07-Float2Int.ll b/test/CodeGen/Mips/2008-07-07-Float2Int.ll
index 4c552361d9da6080b360af8e2297045bc41594cf..1b2ac19cba0056622f6c1cff670950dbc316279b 100644
--- a/test/CodeGen/Mips/2008-07-07-Float2Int.ll
+++ b/test/CodeGen/Mips/2008-07-07-Float2Int.ll
@@ -1,17 +1,33 @@
-; RUN: llc -march=mips < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips-- | FileCheck %s
 
 define i32 @fptoint(float %a) nounwind {
+; CHECK-LABEL: fptoint:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    trunc.w.s $f0, $f12
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    mfc1 $2, $f0
 entry:
-; CHECK: trunc.w.s 
   fptosi float %a to i32		; <i32>:0 [#uses=1]
   ret i32 %0
 }
 
 define i32 @fptouint(float %a) nounwind {
+; CHECK-LABEL: fptouint:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui $1, %hi($CPI1_0)
+; CHECK-NEXT:    lwc1 $f0, %lo($CPI1_0)($1)
+; CHECK-NEXT:    sub.s $f1, $f12, $f0
+; CHECK-NEXT:    trunc.w.s $f1, $f1
+; CHECK-NEXT:    mfc1 $1, $f1
+; CHECK-NEXT:    lui $2, 32768
+; CHECK-NEXT:    xor $2, $1, $2
+; CHECK-NEXT:    trunc.w.s $f1, $f12
+; CHECK-NEXT:    mfc1 $1, $f1
+; CHECK-NEXT:    c.olt.s $f12, $f0
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    movt $2, $1, $fcc0
 entry:
-; CHECK: fptouint
-; CHECK: trunc.w.s 
-; CHECK: trunc.w.s 
   fptoui float %a to i32		; <i32>:0 [#uses=1]
   ret i32 %0
 }
diff --git a/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll b/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll
index 47b3c92203d1b96d00a0e0f3d7da7e2ecca56313..b5cf2a2030d29e99c671c982d64ce1b945562584 100644
--- a/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/bricmpi1.ll
@@ -109,11 +109,11 @@ end:
 define void @testsgt(i32, i32) {
 ; CHECK-LABEL: testsgt:
 ; CHECK:       andi $[[REG0:[0-9]+]], $4, 1
-; CHECK:       negu $[[REG0]], $[[REG0]]
+; CHECK:       negu $[[REG2:[0-9]+]], $[[REG0]]
 ; CHECK:       andi $[[REG1:[0-9]+]], $5, 1
-; CHECK:       negu $[[REG1]], $[[REG1]]
-; CHECK:       slt $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]]
-; CHECK:       bnez $[[REG2]],
+; CHECK:       negu $[[REG3:[0-9]+]], $[[REG1]]
+; CHECK:       slt $[[REG4:[0-9]+]], $[[REG3]], $[[REG2]]
+; CHECK:       bnez $[[REG4]],
   %3 = trunc i32 %0 to i1
   %4 = trunc i32 %1 to i1
   %5 = icmp sgt i1 %3, %4
@@ -169,11 +169,11 @@ end:
 define void @testsle(i32, i32) {
 ; CHECK-LABEL: testsle:
 ; CHECK:       andi $[[REG0:[0-9]+]], $4, 1
-; CHECK:       negu $[[REG0]], $[[REG0]]
+; CHECK:       negu $[[REG2:[0-9]+]], $[[REG0]]
 ; CHECK:       andi $[[REG1:[0-9]+]], $5, 1
-; CHECK:       negu $[[REG1]], $[[REG1]]
-; CHECK:       slt $[[REG2:[0-9]+]], $[[REG1]], $[[REG0]]
-; CHECK:       beqz $[[REG2]],
+; CHECK:       negu $[[REG3:[0-9]+]], $[[REG1]]
+; CHECK:       slt $[[REG4:[0-9]+]], $[[REG3]], $[[REG2]]
+; CHECK:       beqz $[[REG4]],
   %3 = trunc i32 %0 to i1
   %4 = trunc i32 %1 to i1
   %5 = icmp sle i1 %3, %4
diff --git a/test/CodeGen/Mips/Fast-ISel/callabi.ll b/test/CodeGen/Mips/Fast-ISel/callabi.ll
index 485a1986b26b4355fc96b31c0646d5b101a03144..f22fbcc7b73eebe842c57f888895bbcb3e6fe5b2 100644
--- a/test/CodeGen/Mips/Fast-ISel/callabi.ll
+++ b/test/CodeGen/Mips/Fast-ISel/callabi.ll
@@ -180,7 +180,7 @@ define void @cxcccc() {
   ; 32R1:       sra     $7, $[[R]], 24
   ; 32R2:       seb     $7, $[[R]]
 
-  ; ALL:        lw      $25, %got(xcccc)($2)
+  ; ALL:        lw      $25, %got(xcccc)(${{[0-9]+}})
   ; ALL:        jalr    $25
   ; ALL:        jr      $ra
   call void @xcccc(i8 88, i8 44, i8 11, i8 33)
@@ -209,7 +209,7 @@ define void @cxhhhh() {
   ; 32R1:       sra     $7, $[[R]], 16
   ; 32R2:       seh     $7, $[[R]]
 
-  ; ALL:        lw      $25, %got(xhhhh)($2)
+  ; ALL:        lw      $25, %got(xhhhh)(${{[0-9]+}})
   ; ALL:        jalr    $25
   ; ALL:        jr      $ra
 
diff --git a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
index c420a04457862b722d7b70c81334fddac44037dd..ad2a0f8f2a841aff83dd1bee83c2e233616ffccf 100644
--- a/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
+++ b/test/CodeGen/Mips/Fast-ISel/fastalloca.ll
@@ -22,8 +22,8 @@ entry:
   %2 = load i32, i32* %x2, align 4
   store i32 %2, i32* @i, align 4
   %3 = load i32, i32* %retval
-; CHECK-DAG:    lw      $[[I_ADDR:[0-9]+]], %got(i)($[[REG_GP:[0-9]+]])
-; CHECK-DAG:    addiu   $[[A_ADDR:[0-9]+]], $sp, 8
+; CHECK:        lw      $[[I_ADDR:[0-9]+]], %got(i)($[[REG_GP:[0-9]+]])
+; CHECK:        addiu   $[[A_ADDR:[0-9]+]], $sp, 8
 ; CHECK-DAG:    sw      $[[A_ADDR]], [[A_ADDR_FI:[0-9]+]]($sp)
 ; CHECK-DAG:    lw      $[[A_ADDR2:[0-9]+]], [[A_ADDR_FI]]($sp)
 ; CHECK-DAG:    lw      $[[A_X:[0-9]+]], 0($[[A_ADDR2]])
diff --git a/test/CodeGen/Mips/Fast-ISel/logopm.ll b/test/CodeGen/Mips/Fast-ISel/logopm.ll
index 0519c07682ed0609925db5561aac1921e311e53f..ef6b5182a7a56868c7beb848132c02138b9029c6 100644
--- a/test/CodeGen/Mips/Fast-ISel/logopm.ll
+++ b/test/CodeGen/Mips/Fast-ISel/logopm.ll
@@ -245,7 +245,7 @@ entry:
 ; CHECK-DAG:    lw      $[[UC1_ADDR:[0-9]+]], %got(uc1)($[[REG_GP]])
 ; CHECK-DAG:    lbu     $[[UC1:[0-9]+]], 0($[[UC1_ADDR]])
 ; CHECK-DAG:    lbu     $[[UC2:[0-9]+]], 0($[[UC2_ADDR]])
-; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[UC2]], $[[UB1]]
+; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[UC2]], $[[UC1]]
 ; CHECK:        sb      $[[RES]], 0($[[UC_ADDR]])
   ret void
 }
@@ -430,7 +430,7 @@ entry:
 ; CHECK-DAG:    lw      $[[US1_ADDR:[0-9]+]], %got(us1)($[[REG_GP]])
 ; CHECK-DAG:    lhu     $[[US1:[0-9]+]], 0($[[US1_ADDR]])
 ; CHECK-DAG:    lhu     $[[US2:[0-9]+]], 0($[[US2_ADDR]])
-; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[US2]], $[[UB1]]
+; CHECK-DAG:    and     $[[RES:[0-9]+]], $[[US2]], $[[US1]]
 ; CHECK:        sh      $[[RES]], 0($[[US_ADDR]])
 ; CHECK:        .end andUs
   ret void
diff --git a/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll b/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll
index f51b72060de978b06764ada9d6d49aa60b92ea18..13ffd24bcb96e4442f43ef507381d523c0a4d7ec 100644
--- a/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll
+++ b/test/CodeGen/Mips/GlobalISel/irtranslator/split_args.ll
@@ -6,10 +6,10 @@ define i64 @i64_reg(i64 %a) {
   ; MIPS32:   liveins: $a0, $a1
   ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
   ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -30,10 +30,10 @@ define i64 @i64_stack(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i64 %a) {
   ; MIPS32:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load 4 from %fixed-stack.[[STACK1]], align 0)
   ; MIPS32:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; MIPS32:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.[[STACK0]], align 0)
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -46,10 +46,10 @@ define i64 @i64_reg_allign(i32 %a0, i64 %a) {
   ; MIPS32:   [[COPY:%[0-9]+]]:_(s32) = COPY $a0
   ; MIPS32:   [[COPY1:%[0-9]+]]:_(s32) = COPY $a2
   ; MIPS32:   [[COPY2:%[0-9]+]]:_(s32) = COPY $a3
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY1]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -73,10 +73,10 @@ define i64 @i64_stack_allign(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %s16, i64 %
   ; MIPS32:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.[[STACK1]], align 0)
   ; MIPS32:   [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; MIPS32:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (load 4 from %fixed-stack.[[STACK0]], align 0)
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD1]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD2]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
@@ -96,10 +96,10 @@ define i64 @i64_reg_stack(i32 %a0, i32 %a1, i32 %a2, i64 %a) {
   ; MIPS32:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load 4 from %fixed-stack.[[STACK1]], align 0)
   ; MIPS32:   [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.0
   ; MIPS32:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load 4 from %fixed-stack.[[STACK0]], align 0)
-  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD1]](s32), [[LOAD]](s32)
+  ; MIPS32:   [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
   ; MIPS32:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
-  ; MIPS32:   $v0 = COPY [[UV1]](s32)
-  ; MIPS32:   $v1 = COPY [[UV]](s32)
+  ; MIPS32:   $v0 = COPY [[UV]](s32)
+  ; MIPS32:   $v1 = COPY [[UV1]](s32)
   ; MIPS32:   RetRA implicit $v0, implicit $v1
 entry:
   ret i64 %a
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/add.mir b/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
index efd071636b55cf9f184b6e4cef09a5aebe4f5adb..ff9ae06a9374cdfb6b485971e0f334aca283be5d 100644
--- a/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/add.mir
@@ -226,12 +226,12 @@ body:             |
     ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1
     ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2
     ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3
-    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]]
-    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]]
-    ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[COPY2]]
+    ; MIPS32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY]]
+    ; MIPS32: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY3]], [[COPY1]]
+    ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD1]](s32), [[COPY3]]
     ; MIPS32: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[ICMP]]
-    ; MIPS32: $v0 = COPY [[ADD1]](s32)
-    ; MIPS32: $v1 = COPY [[ADD2]](s32)
+    ; MIPS32: $v0 = COPY [[ADD2]](s32)
+    ; MIPS32: $v1 = COPY [[ADD1]](s32)
     ; MIPS32: RetRA implicit $v0, implicit $v1
     %2:_(s32) = COPY $a0
     %3:_(s32) = COPY $a1
diff --git a/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir b/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d223411c58a5f0a473bc2c1c66b7849bd39435d7
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir
@@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=mipsel-linux-gnu -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=MIPS32
+--- |
+
+  define void @any_i64() {entry: ret void}
+  define void @any_i32() {entry: ret void}
+  define void @signed_i16() {entry: ret void}
+  define void @signed_i8() {entry: ret void}
+  define void @unsigned_i16() {entry: ret void}
+  define void @unsigned_i8() {entry: ret void}
+  define void @i1_true() {entry: ret void}
+  define void @i1_false() {entry: ret void}
+
+...
+---
+name:            any_i64
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: any_i64
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+    ; MIPS32: $v0 = COPY [[C1]](s32)
+    ; MIPS32: $v1 = COPY [[C]](s32)
+    ; MIPS32: RetRA implicit $v0, implicit $v1
+    %0:_(s64) = G_CONSTANT i64 -9223372036854775808
+    %1:_(s32), %2:_(s32) = G_UNMERGE_VALUES %0(s64)
+    $v0 = COPY %2(s32)
+    $v1 = COPY %1(s32)
+    RetRA implicit $v0, implicit $v1
+
+...
+---
+name:            any_i32
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: any_i32
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+    ; MIPS32: $v0 = COPY [[C]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s32) = G_CONSTANT i32 -2147483648
+    $v0 = COPY %0(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            signed_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: signed_i16
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C1]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s16) = G_CONSTANT i16 -32768
+    %1:_(s32) = G_SEXT %0(s16)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            signed_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: signed_i8
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C1]]
+    ; MIPS32: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]]
+    ; MIPS32: $v0 = COPY [[ASHR]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s8) = G_CONSTANT i8 -128
+    %1:_(s32) = G_SEXT %0(s8)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            unsigned_i16
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: unsigned_i16
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -32768
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s16) = G_CONSTANT i16 -32768
+    %1:_(s32) = G_ZEXT %0(s16)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            unsigned_i8
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: unsigned_i8
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -128
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s8) = G_CONSTANT i8 -128
+    %1:_(s32) = G_ZEXT %0(s8)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            i1_true
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: i1_true
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s1) = G_CONSTANT i1 true
+    %1:_(s32) = G_ZEXT %0(s1)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
+---
+name:            i1_false
+alignment:       2
+tracksRegLiveness: true
+body:             |
+  bb.1.entry:
+    ; MIPS32-LABEL: name: i1_false
+    ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32)
+    ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; MIPS32: $v0 = COPY [[AND]](s32)
+    ; MIPS32: RetRA implicit $v0
+    %0:_(s1) = G_CONSTANT i1 false
+    %1:_(s32) = G_ZEXT %0(s1)
+    $v0 = COPY %1(s32)
+    RetRA implicit $v0
+
+...
diff --git a/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll b/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ef7600402e03116868f19a188d07c6b446711d7f
--- /dev/null
+++ b/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc  -O0 -mtriple=mipsel-linux-gnu -global-isel  -verify-machineinstrs %s -o -| FileCheck %s -check-prefixes=MIPS32
+
+define i64 @any_i64() {
+; MIPS32-LABEL: any_i64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $2, $1, 0
+; MIPS32-NEXT:    lui $1, 32768
+; MIPS32-NEXT:    ori $3, $1, 0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i64 -9223372036854775808
+}
+
+define i32 @any_i32() {
+; MIPS32-LABEL: any_i32:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 32768
+; MIPS32-NEXT:    ori $2, $1, 0
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i32 -2147483648
+}
+
+define signext i16 @signed_i16() {
+; MIPS32-LABEL: signed_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 32768
+; MIPS32-NEXT:    sll $1, $1, 16
+; MIPS32-NEXT:    sra $2, $1, 16
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i16 -32768
+}
+
+define signext i8 @signed_i8() {
+; MIPS32-LABEL: signed_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 65408
+; MIPS32-NEXT:    sll $1, $1, 24
+; MIPS32-NEXT:    sra $2, $1, 24
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i8 -128
+}
+
+define zeroext i16 @unsigned_i16() {
+; MIPS32-LABEL: unsigned_i16:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 32768
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 65535
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i16 -32768
+}
+
+define zeroext i8 @unsigned_i8() {
+; MIPS32-LABEL: unsigned_i8:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 65408
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 255
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i8 -128
+}
+
+define zeroext i1 @i1_true() {
+; MIPS32-LABEL: i1_true:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 65535
+; MIPS32-NEXT:    ori $1, $1, 65535
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 1
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i1 true
+}
+
+define zeroext i1 @i1_false() {
+; MIPS32-LABEL: i1_false:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $1, 0
+; MIPS32-NEXT:    ori $1, $1, 0
+; MIPS32-NEXT:    lui $2, 0
+; MIPS32-NEXT:    ori $2, $2, 1
+; MIPS32-NEXT:    and $2, $1, $2
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    nop
+entry:
+  ret i1 false
+}
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index b58338aa6fd0fd10d6102cc4af310c621aa12488..3d516ea263825553e39befaf2a5ecb06c9f4d516 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -2038,10 +2038,10 @@ define i32 @AtomicCmpSwap32(i32 signext %oldval, i32 signext %newval) nounwind {
 ; MIPS32R6O0-NEXT:    beqzc $7, $BB7_1
 ; MIPS32R6O0-NEXT:  $BB7_3: # %entry
 ; MIPS32R6O0-NEXT:    move $2, $6
+; MIPS32R6O0-NEXT:    sw $25, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    sw $6, 16($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $1, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $25, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 4($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -4550,11 +4550,11 @@ define signext i8 @AtomicCmpSwap8(i8 signext %oldval, i8 signext %newval) nounwi
 ; MIPS32R6O0-NEXT:    srlv $8, $10, $2
 ; MIPS32R6O0-NEXT:    seb $8, $8
 ; MIPS32R6O0-NEXT:  # %bb.4: # %entry
-; MIPS32R6O0-NEXT:    sw $1, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $8, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $25, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $25, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $8, 4($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:  # %bb.5: # %entry
-; MIPS32R6O0-NEXT:    lw $2, 8($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $2, 4($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 16
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -5127,14 +5127,14 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n
 ; MIPS32R6O0-NEXT:    srlv $11, $13, $4
 ; MIPS32R6O0-NEXT:    seb $11, $11
 ; MIPS32R6O0-NEXT:  # %bb.4: # %entry
-; MIPS32R6O0-NEXT:    sw $11, 20($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $5, 16($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $3, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $5, 20($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 16($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $2, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $11, 4($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:  # %bb.5: # %entry
-; MIPS32R6O0-NEXT:    lw $1, 20($sp) # 4-byte Folded Reload
-; MIPS32R6O0-NEXT:    lw $2, 16($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $2, 20($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    xor $1, $1, $2
 ; MIPS32R6O0-NEXT:    sltiu $2, $1, 1
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 24
@@ -5282,7 +5282,7 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n
 ;
 ; MIPS64R6O0-LABEL: AtomicCmpSwapRes8:
 ; MIPS64R6O0:       # %bb.0: # %entry
-; MIPS64R6O0-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R6O0-NEXT:    daddiu $sp, $sp, -32
 ; MIPS64R6O0-NEXT:    move $1, $6
 ; MIPS64R6O0-NEXT:    move $2, $5
 ; MIPS64R6O0-NEXT:    move $5, $4
@@ -5313,15 +5313,15 @@ define i1 @AtomicCmpSwapRes8(i8* %ptr, i8 signext %oldval, i8 signext %newval) n
 ; MIPS64R6O0-NEXT:    srlv $10, $12, $3
 ; MIPS64R6O0-NEXT:    seb $10, $10
 ; MIPS64R6O0-NEXT:  # %bb.4: # %entry
-; MIPS64R6O0-NEXT:    sd $5, 8($sp) # 8-byte Folded Spill
-; MIPS64R6O0-NEXT:    sw $10, 4($sp) # 4-byte Folded Spill
-; MIPS64R6O0-NEXT:    sw $2, 0($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $2, 28($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sd $5, 16($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $10, 12($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:  # %bb.5: # %entry
-; MIPS64R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
-; MIPS64R6O0-NEXT:    lw $2, 0($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $1, 12($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $2, 28($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    xor $1, $1, $2
 ; MIPS64R6O0-NEXT:    sltiu $2, $1, 1
-; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 16
+; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 32
 ; MIPS64R6O0-NEXT:    jrc $ra
 ;
 ; MM32-LABEL: AtomicCmpSwapRes8:
@@ -6233,20 +6233,20 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) {
 ; MIPS32R6O0-NEXT:    srlv $12, $14, $4
 ; MIPS32R6O0-NEXT:    seh $12, $12
 ; MIPS32R6O0-NEXT:  # %bb.4:
-; MIPS32R6O0-NEXT:    sw $12, 20($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $3, 16($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $8, 12($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $5, 8($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $1, 4($sp) # 4-byte Folded Spill
-; MIPS32R6O0-NEXT:    sw $2, 0($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $1, 20($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $2, 16($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 12($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $8, 8($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $5, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $12, 0($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:  # %bb.5:
-; MIPS32R6O0-NEXT:    lw $1, 8($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    seh $2, $1
-; MIPS32R6O0-NEXT:    lw $3, 20($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $3, 0($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    xor $2, $3, $2
 ; MIPS32R6O0-NEXT:    sltiu $3, $2, 1
 ; MIPS32R6O0-NEXT:    sync
-; MIPS32R6O0-NEXT:    lw $2, 20($sp) # 4-byte Folded Reload
+; MIPS32R6O0-NEXT:    lw $2, 0($sp) # 4-byte Folded Reload
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 24
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -6449,17 +6449,17 @@ define {i16, i1} @foo(i16* %addr, i16 %l, i16 %r, i16 %new) {
 ; MIPS64R6O0-NEXT:    srlv $11, $13, $3
 ; MIPS64R6O0-NEXT:    seh $11, $11
 ; MIPS64R6O0-NEXT:  # %bb.4:
-; MIPS64R6O0-NEXT:    sw $2, 12($sp) # 4-byte Folded Spill
-; MIPS64R6O0-NEXT:    sw $11, 8($sp) # 4-byte Folded Spill
-; MIPS64R6O0-NEXT:    sd $5, 0($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sd $5, 8($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $11, 0($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:  # %bb.5:
-; MIPS64R6O0-NEXT:    lw $1, 12($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $1, 4($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    seh $2, $1
-; MIPS64R6O0-NEXT:    lw $3, 8($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $3, 0($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    xor $2, $3, $2
 ; MIPS64R6O0-NEXT:    sltiu $3, $2, 1
 ; MIPS64R6O0-NEXT:    sync
-; MIPS64R6O0-NEXT:    lw $2, 8($sp) # 4-byte Folded Reload
+; MIPS64R6O0-NEXT:    lw $2, 0($sp) # 4-byte Folded Reload
 ; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R6O0-NEXT:    jrc $ra
 ;
@@ -7016,8 +7016,8 @@ define i32 @zeroreg() nounwind {
 ; MIPS32O0-NEXT:    xor $2, $5, $2
 ; MIPS32O0-NEXT:    sltiu $2, $2, 1
 ; MIPS32O0-NEXT:    andi $2, $2, 1
-; MIPS32O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
 ; MIPS32O0-NEXT:    sw $5, 12($sp) # 4-byte Folded Spill
+; MIPS32O0-NEXT:    sw $3, 8($sp) # 4-byte Folded Spill
 ; MIPS32O0-NEXT:    sw $1, 4($sp) # 4-byte Folded Spill
 ; MIPS32O0-NEXT:    addiu $sp, $sp, 16
 ; MIPS32O0-NEXT:    jr $ra
@@ -7099,8 +7099,8 @@ define i32 @zeroreg() nounwind {
 ; MIPS32R6O0-NEXT:    xor $1, $5, $1
 ; MIPS32R6O0-NEXT:    sltiu $2, $1, 1
 ; MIPS32R6O0-NEXT:    sync
-; MIPS32R6O0-NEXT:    sw $3, 0($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    sw $5, 4($sp) # 4-byte Folded Spill
+; MIPS32R6O0-NEXT:    sw $3, 0($sp) # 4-byte Folded Spill
 ; MIPS32R6O0-NEXT:    addiu $sp, $sp, 8
 ; MIPS32R6O0-NEXT:    jrc $ra
 ;
@@ -7234,8 +7234,8 @@ define i32 @zeroreg() nounwind {
 ; MIPS64R6O0-NEXT:    xor $2, $6, $3
 ; MIPS64R6O0-NEXT:    sltiu $2, $2, 1
 ; MIPS64R6O0-NEXT:    sync
-; MIPS64R6O0-NEXT:    sw $4, 8($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:    sw $6, 12($sp) # 4-byte Folded Spill
+; MIPS64R6O0-NEXT:    sw $4, 8($sp) # 4-byte Folded Spill
 ; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R6O0-NEXT:    jrc $ra
 ;
diff --git a/test/CodeGen/Mips/atomic64.ll b/test/CodeGen/Mips/atomic64.ll
index aa8442d488b1059a4f8789abe4cb62e938a03cd6..8e5002b38b0849a3785964f2c34c92cb9f7de309 100644
--- a/test/CodeGen/Mips/atomic64.ll
+++ b/test/CodeGen/Mips/atomic64.ll
@@ -1289,8 +1289,8 @@ define i64 @AtomicCmpSwap64(i64 signext %oldval, i64 signext %newval) nounwind {
 ; MIPS64R6O0-NEXT:  .LBB7_3: # %entry
 ; MIPS64R6O0-NEXT:    sd $2, 24($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    move $2, $6
-; MIPS64R6O0-NEXT:    sd $6, 32($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    sd $25, 16($sp) # 8-byte Folded Spill
+; MIPS64R6O0-NEXT:    sd $6, 32($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    sd $3, 8($sp) # 8-byte Folded Spill
 ; MIPS64R6O0-NEXT:    daddiu $sp, $sp, 48
 ; MIPS64R6O0-NEXT:    jrc $ra
diff --git a/test/CodeGen/Mips/atomicCmpSwapPW.ll b/test/CodeGen/Mips/atomicCmpSwapPW.ll
index 10610e34e71613492d988c6f3a4aec270650bc5b..973f3a5bf0b296e4c4260d6649c307ae419f0cc2 100644
--- a/test/CodeGen/Mips/atomicCmpSwapPW.ll
+++ b/test/CodeGen/Mips/atomicCmpSwapPW.ll
@@ -32,10 +32,10 @@ define void @foo(i32 %new, i32 %old) {
 ; O32-NEXT:    nop
 ; O32-NEXT:  $BB0_3: # %entry
 ; O32-NEXT:    sync
+; O32-NEXT:    sw $1, 8($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $2, 4($sp) # 4-byte Folded Spill
 ; O32-NEXT:    sw $7, 12($sp) # 4-byte Folded Spill
-; O32-NEXT:    sw $6, 8($sp) # 4-byte Folded Spill
-; O32-NEXT:    sw $1, 4($sp) # 4-byte Folded Spill
-; O32-NEXT:    sw $2, 0($sp) # 4-byte Folded Spill
+; O32-NEXT:    sw $6, 0($sp) # 4-byte Folded Spill
 ; O32-NEXT:    addiu $sp, $sp, 16
 ; O32-NEXT:    jr $ra
 ; O32-NEXT:    nop
diff --git a/test/CodeGen/Mips/cconv/fmaxl_call.ll b/test/CodeGen/Mips/cconv/fmaxl_call.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0e3078edae45db06194dcd6153575ad0b01f65d2
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/fmaxl_call.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=mips64el-unknown-linux-gnu -mcpu=mips64 < %s | FileCheck %s
+
+define fp128 @call_fmaxl(fp128 %a, fp128 %b) {
+; CHECK-LABEL: call_fmaxl:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    daddiu $sp, $sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    sd $ra, 8($sp) # 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset 31, -8
+; CHECK-NEXT:    jal fmaxl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    mov.d $f12, $f0
+; CHECK-NEXT:    jal f
+; CHECK-NEXT:    mov.d $f13, $f2
+; CHECK-NEXT:    ld $ra, 8($sp) # 8-byte Folded Reload
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    daddiu $sp, $sp, 16
+    %1 = call fp128 @llvm.maxnum.f128(fp128 %a, fp128 %b)
+    %2 = call fp128 @f(fp128 %1)
+    ret fp128 %2
+}
+
+declare fp128 @llvm.maxnum.f128(fp128, fp128)
+declare fp128 @f(fp128)
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
index b580d2a338c915feedc87a1dcd516469ab9c661a..9a55285feae207000197642c02f556d827aa0060 100644
--- a/test/CodeGen/Mips/cconv/vector.ll
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -2045,31 +2045,29 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS32R5EB-NEXT:    jr $ra
 ; MIPS32R5EB-NEXT:    nop
 ;
-; MIPS64R5-LABEL: i32_2:
-; MIPS64R5:       # %bb.0:
-; MIPS64R5-NEXT:    daddiu $sp, $sp, -32
-; MIPS64R5-NEXT:    .cfi_def_cfa_offset 32
-; MIPS64R5-NEXT:    sd $5, 16($sp)
-; MIPS64R5-NEXT:    sd $4, 24($sp)
-; MIPS64R5-NEXT:    ldi.b $w0, 0
-; MIPS64R5-NEXT:    lw $1, 20($sp)
-; MIPS64R5-NEXT:    lw $2, 16($sp)
-; MIPS64R5-NEXT:    move.v $w1, $w0
-; MIPS64R5-NEXT:    insert.d $w1[0], $2
-; MIPS64R5-NEXT:    insert.d $w1[1], $1
-; MIPS64R5-NEXT:    lw $1, 24($sp)
-; MIPS64R5-NEXT:    insert.d $w0[0], $1
-; MIPS64R5-NEXT:    lw $1, 28($sp)
-; MIPS64R5-NEXT:    insert.d $w0[1], $1
-; MIPS64R5-NEXT:    addv.d $w0, $w0, $w1
-; MIPS64R5-NEXT:    copy_s.d $1, $w0[0]
-; MIPS64R5-NEXT:    copy_s.d $2, $w0[1]
-; MIPS64R5-NEXT:    sw $2, 12($sp)
-; MIPS64R5-NEXT:    sw $1, 8($sp)
-; MIPS64R5-NEXT:    ld $2, 8($sp)
-; MIPS64R5-NEXT:    daddiu $sp, $sp, 32
-; MIPS64R5-NEXT:    jr $ra
-; MIPS64R5-NEXT:    nop
+; MIPS64R5EB-LABEL: i32_2:
+; MIPS64R5EB:       # %bb.0:
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 24($sp)
+; MIPS64R5EB-NEXT:    ldi.b $w0, 0
+; MIPS64R5EB-NEXT:    lw $1, 16($sp)
+; MIPS64R5EB-NEXT:    move.v $w1, $w0
+; MIPS64R5EB-NEXT:    insert.d $w1[0], $1
+; MIPS64R5EB-NEXT:    insert.d $w1[1], $5
+; MIPS64R5EB-NEXT:    lw $1, 24($sp)
+; MIPS64R5EB-NEXT:    insert.d $w0[0], $1
+; MIPS64R5EB-NEXT:    insert.d $w0[1], $4
+; MIPS64R5EB-NEXT:    addv.d $w0, $w0, $w1
+; MIPS64R5EB-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[1]
+; MIPS64R5EB-NEXT:    sw $2, 12($sp)
+; MIPS64R5EB-NEXT:    sw $1, 8($sp)
+; MIPS64R5EB-NEXT:    ld $2, 8($sp)
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EB-NEXT:    jr $ra
+; MIPS64R5EB-NEXT:    nop
 ;
 ; MIPS32R5EL-LABEL: i32_2:
 ; MIPS32R5EL:       # %bb.0:
@@ -2095,6 +2093,30 @@ define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
 ; MIPS32R5EL-NEXT:    addiu $sp, $sp, 48
 ; MIPS32R5EL-NEXT:    jr $ra
 ; MIPS32R5EL-NEXT:    nop
+;
+; MIPS64R5EL-LABEL: i32_2:
+; MIPS64R5EL:       # %bb.0:
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -32
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64R5EL-NEXT:    sd $5, 16($sp)
+; MIPS64R5EL-NEXT:    sd $4, 24($sp)
+; MIPS64R5EL-NEXT:    ldi.b $w0, 0
+; MIPS64R5EL-NEXT:    lw $1, 20($sp)
+; MIPS64R5EL-NEXT:    move.v $w1, $w0
+; MIPS64R5EL-NEXT:    insert.d $w1[0], $5
+; MIPS64R5EL-NEXT:    insert.d $w1[1], $1
+; MIPS64R5EL-NEXT:    insert.d $w0[0], $4
+; MIPS64R5EL-NEXT:    lw $1, 28($sp)
+; MIPS64R5EL-NEXT:    insert.d $w0[1], $1
+; MIPS64R5EL-NEXT:    addv.d $w0, $w0, $w1
+; MIPS64R5EL-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[1]
+; MIPS64R5EL-NEXT:    sw $2, 12($sp)
+; MIPS64R5EL-NEXT:    sw $1, 8($sp)
+; MIPS64R5EL-NEXT:    ld $2, 8($sp)
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 32
+; MIPS64R5EL-NEXT:    jr $ra
+; MIPS64R5EL-NEXT:    nop
   %1 = add <2 x i32> %a, %b
   ret <2 x i32> %1
 }
@@ -2398,10 +2420,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) {
 ; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(float_2)))
 ; MIPS64R5EB-NEXT:    daddu $1, $1, $25
 ; MIPS64R5EB-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(float_2)))
-; MIPS64R5EB-NEXT:    sd $5, 0($sp)
-; MIPS64R5EB-NEXT:    sd $4, 16($sp)
-; MIPS64R5EB-NEXT:    ld.w $w0, 0($sp)
-; MIPS64R5EB-NEXT:    ld.w $w1, 16($sp)
+; MIPS64R5EB-NEXT:    sd $5, 16($sp)
+; MIPS64R5EB-NEXT:    sd $4, 0($sp)
+; MIPS64R5EB-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EB-NEXT:    ld.w $w1, 0($sp)
 ; MIPS64R5EB-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
 ; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
@@ -2441,10 +2463,10 @@ define void @float_2(<2 x float> %a, <2 x float> %b) {
 ; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(float_2)))
 ; MIPS64R5EL-NEXT:    daddu $1, $1, $25
 ; MIPS64R5EL-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(float_2)))
-; MIPS64R5EL-NEXT:    sd $5, 0($sp)
-; MIPS64R5EL-NEXT:    sd $4, 16($sp)
-; MIPS64R5EL-NEXT:    ld.w $w0, 0($sp)
-; MIPS64R5EL-NEXT:    ld.w $w1, 16($sp)
+; MIPS64R5EL-NEXT:    sd $5, 16($sp)
+; MIPS64R5EL-NEXT:    sd $4, 0($sp)
+; MIPS64R5EL-NEXT:    ld.w $w0, 16($sp)
+; MIPS64R5EL-NEXT:    ld.w $w1, 0($sp)
 ; MIPS64R5EL-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EL-NEXT:    ld $1, %got_disp(float_res_v2f32)($1)
@@ -3533,12 +3555,8 @@ define void @call_i8_2() {
 ; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R5EB-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
 ; MIPS32R5EB-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EB-NEXT:    addiu $1, $zero, 1543
-; MIPS32R5EB-NEXT:    sh $1, 20($sp)
-; MIPS32R5EB-NEXT:    addiu $1, $zero, 3080
-; MIPS32R5EB-NEXT:    sh $1, 24($sp)
-; MIPS32R5EB-NEXT:    lhu $4, 20($sp)
-; MIPS32R5EB-NEXT:    lhu $5, 24($sp)
+; MIPS32R5EB-NEXT:    addiu $4, $zero, 1543
+; MIPS32R5EB-NEXT:    addiu $5, $zero, 3080
 ; MIPS32R5EB-NEXT:    jal i8_2
 ; MIPS32R5EB-NEXT:    nop
 ; MIPS32R5EB-NEXT:    sw $2, 16($sp)
@@ -3645,12 +3663,8 @@ define void @call_i8_2() {
 ; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 32
 ; MIPS32R5EL-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
 ; MIPS32R5EL-NEXT:    .cfi_offset 31, -4
-; MIPS32R5EL-NEXT:    addiu $1, $zero, 1798
-; MIPS32R5EL-NEXT:    sh $1, 20($sp)
-; MIPS32R5EL-NEXT:    addiu $1, $zero, 2060
-; MIPS32R5EL-NEXT:    sh $1, 24($sp)
-; MIPS32R5EL-NEXT:    lhu $4, 20($sp)
-; MIPS32R5EL-NEXT:    lhu $5, 24($sp)
+; MIPS32R5EL-NEXT:    addiu $4, $zero, 1798
+; MIPS32R5EL-NEXT:    addiu $5, $zero, 2060
 ; MIPS32R5EL-NEXT:    jal i8_2
 ; MIPS32R5EL-NEXT:    nop
 ; MIPS32R5EL-NEXT:    sw $2, 16($sp)
@@ -6167,14 +6181,15 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS32R5-NEXT:    addiu $1, $zero, -16
 ; MIPS32R5-NEXT:    and $sp, $sp, $1
 ; MIPS32R5-NEXT:    andi $1, $6, 255
-; MIPS32R5-NEXT:    sw $1, 36($sp)
-; MIPS32R5-NEXT:    sw $1, 32($sp)
+; MIPS32R5-NEXT:    mtc1 $1, $f0
+; MIPS32R5-NEXT:    cvt.s.w $f0, $f0
+; MIPS32R5-NEXT:    swc1 $f0, 36($sp)
+; MIPS32R5-NEXT:    swc1 $f0, 32($sp)
 ; MIPS32R5-NEXT:    sw $5, 4($sp)
 ; MIPS32R5-NEXT:    sw $4, 0($sp)
-; MIPS32R5-NEXT:    ld.w $w0, 32($sp)
-; MIPS32R5-NEXT:    ffint_s.w $w0, $w0
-; MIPS32R5-NEXT:    ld.w $w1, 0($sp)
-; MIPS32R5-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS32R5-NEXT:    ld.w $w0, 0($sp)
+; MIPS32R5-NEXT:    ld.w $w1, 32($sp)
+; MIPS32R5-NEXT:    fadd.w $w0, $w1, $w0
 ; MIPS32R5-NEXT:    lw $1, 84($fp)
 ; MIPS32R5-NEXT:    sw $1, 20($sp)
 ; MIPS32R5-NEXT:    lw $1, 80($fp)
@@ -6195,16 +6210,17 @@ define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
 ; MIPS64R5-NEXT:    .cfi_def_cfa_offset 48
 ; MIPS64R5-NEXT:    sll $1, $5, 0
 ; MIPS64R5-NEXT:    andi $1, $1, 255
-; MIPS64R5-NEXT:    sw $1, 36($sp)
-; MIPS64R5-NEXT:    sw $1, 32($sp)
-; MIPS64R5-NEXT:    sd $4, 16($sp)
-; MIPS64R5-NEXT:    ld.w $w0, 32($sp)
-; MIPS64R5-NEXT:    ffint_s.w $w0, $w0
+; MIPS64R5-NEXT:    mtc1 $1, $f0
+; MIPS64R5-NEXT:    cvt.s.w $f0, $f0
+; MIPS64R5-NEXT:    swc1 $f0, 36($sp)
+; MIPS64R5-NEXT:    swc1 $f0, 32($sp)
+; MIPS64R5-NEXT:    sd $4, 0($sp)
+; MIPS64R5-NEXT:    ld.w $w0, 0($sp)
+; MIPS64R5-NEXT:    ld.w $w1, 32($sp)
+; MIPS64R5-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5-NEXT:    sd $6, 16($sp)
 ; MIPS64R5-NEXT:    ld.w $w1, 16($sp)
 ; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
-; MIPS64R5-NEXT:    sd $6, 0($sp)
-; MIPS64R5-NEXT:    ld.w $w1, 0($sp)
-; MIPS64R5-NEXT:    fadd.w $w0, $w0, $w1
 ; MIPS64R5-NEXT:    splati.w $w1, $w0[1]
 ; MIPS64R5-NEXT:    add.s $f0, $f0, $f1
 ; MIPS64R5-NEXT:    daddiu $sp, $sp, 48
@@ -6323,36 +6339,59 @@ define <4 x float> @mixed_32(<4 x float> %a, i32 %b) {
 ; MIPS64EB-NEXT:    jr $ra
 ; MIPS64EB-NEXT:    nop
 ;
-; MIPS32R5-LABEL: mixed_32:
-; MIPS32R5:       # %bb.0: # %entry
-; MIPS32R5-NEXT:    ldi.b $w0, 0
-; MIPS32R5-NEXT:    insert.w $w0[0], $6
-; MIPS32R5-NEXT:    insert.w $w0[1], $7
-; MIPS32R5-NEXT:    lw $1, 16($sp)
-; MIPS32R5-NEXT:    insert.w $w0[2], $1
-; MIPS32R5-NEXT:    lw $1, 20($sp)
-; MIPS32R5-NEXT:    insert.w $w0[3], $1
-; MIPS32R5-NEXT:    lw $1, 24($sp)
-; MIPS32R5-NEXT:    fill.w $w1, $1
-; MIPS32R5-NEXT:    ffint_u.w $w1, $w1
-; MIPS32R5-NEXT:    fadd.w $w0, $w1, $w0
-; MIPS32R5-NEXT:    st.w $w0, 0($4)
-; MIPS32R5-NEXT:    jr $ra
-; MIPS32R5-NEXT:    nop
+; MIPS32R5EB-LABEL: mixed_32:
+; MIPS32R5EB:       # %bb.0: # %entry
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, -8
+; MIPS32R5EB-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32R5EB-NEXT:    lui $1, 17200
+; MIPS32R5EB-NEXT:    sw $1, 0($sp)
+; MIPS32R5EB-NEXT:    lw $1, 32($sp)
+; MIPS32R5EB-NEXT:    sw $1, 4($sp)
+; MIPS32R5EB-NEXT:    lui $1, %hi($CPI41_0)
+; MIPS32R5EB-NEXT:    ldc1 $f0, %lo($CPI41_0)($1)
+; MIPS32R5EB-NEXT:    ldc1 $f1, 0($sp)
+; MIPS32R5EB-NEXT:    sub.d $f0, $f1, $f0
+; MIPS32R5EB-NEXT:    cvt.s.d $f0, $f0
+; MIPS32R5EB-NEXT:    ldi.b $w1, 0
+; MIPS32R5EB-NEXT:    splati.w $w0, $w0[0]
+; MIPS32R5EB-NEXT:    insert.w $w1[0], $6
+; MIPS32R5EB-NEXT:    insert.w $w1[1], $7
+; MIPS32R5EB-NEXT:    lw $1, 24($sp)
+; MIPS32R5EB-NEXT:    insert.w $w1[2], $1
+; MIPS32R5EB-NEXT:    lw $1, 28($sp)
+; MIPS32R5EB-NEXT:    insert.w $w1[3], $1
+; MIPS32R5EB-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS32R5EB-NEXT:    st.w $w0, 0($4)
+; MIPS32R5EB-NEXT:    addiu $sp, $sp, 8
+; MIPS32R5EB-NEXT:    jr $ra
+; MIPS32R5EB-NEXT:    nop
 ;
 ; MIPS64R5EB-LABEL: mixed_32:
 ; MIPS64R5EB:       # %bb.0: # %entry
-; MIPS64R5EB-NEXT:    ldi.b $w0, 0
-; MIPS64R5EB-NEXT:    insert.d $w0[0], $4
-; MIPS64R5EB-NEXT:    insert.d $w0[1], $5
-; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
-; MIPS64R5EB-NEXT:    sll $1, $6, 0
-; MIPS64R5EB-NEXT:    fill.w $w1, $1
-; MIPS64R5EB-NEXT:    ffint_u.w $w1, $w1
-; MIPS64R5EB-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EB-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EB-NEXT:    lui $1, %hi(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EB-NEXT:    daddu $1, $1, $25
+; MIPS64R5EB-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EB-NEXT:    lui $2, 17200
+; MIPS64R5EB-NEXT:    sw $2, 8($sp)
+; MIPS64R5EB-NEXT:    sll $2, $6, 0
+; MIPS64R5EB-NEXT:    sw $2, 12($sp)
+; MIPS64R5EB-NEXT:    ld $1, %got_page(.LCPI41_0)($1)
+; MIPS64R5EB-NEXT:    ldc1 $f0, %got_ofst(.LCPI41_0)($1)
+; MIPS64R5EB-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64R5EB-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64R5EB-NEXT:    ldi.b $w1, 0
+; MIPS64R5EB-NEXT:    insert.d $w1[0], $4
+; MIPS64R5EB-NEXT:    insert.d $w1[1], $5
+; MIPS64R5EB-NEXT:    shf.w $w1, $w1, 177
+; MIPS64R5EB-NEXT:    cvt.s.d $f0, $f0
+; MIPS64R5EB-NEXT:    splati.w $w0, $w0[0]
+; MIPS64R5EB-NEXT:    fadd.w $w0, $w0, $w1
 ; MIPS64R5EB-NEXT:    shf.w $w0, $w0, 177
 ; MIPS64R5EB-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EB-NEXT:    copy_s.d $3, $w0[1]
+; MIPS64R5EB-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EB-NEXT:    jr $ra
 ; MIPS64R5EB-NEXT:    nop
 ;
@@ -6431,17 +6470,57 @@ define <4 x float> @mixed_32(<4 x float> %a, i32 %b) {
 ; MIPS64EL-NEXT:    jr $ra
 ; MIPS64EL-NEXT:    nop
 ;
+; MIPS32R5EL-LABEL: mixed_32:
+; MIPS32R5EL:       # %bb.0: # %entry
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, -8
+; MIPS32R5EL-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32R5EL-NEXT:    lui $1, 17200
+; MIPS32R5EL-NEXT:    sw $1, 4($sp)
+; MIPS32R5EL-NEXT:    lw $1, 32($sp)
+; MIPS32R5EL-NEXT:    sw $1, 0($sp)
+; MIPS32R5EL-NEXT:    lui $1, %hi($CPI41_0)
+; MIPS32R5EL-NEXT:    ldc1 $f0, %lo($CPI41_0)($1)
+; MIPS32R5EL-NEXT:    ldc1 $f1, 0($sp)
+; MIPS32R5EL-NEXT:    sub.d $f0, $f1, $f0
+; MIPS32R5EL-NEXT:    cvt.s.d $f0, $f0
+; MIPS32R5EL-NEXT:    ldi.b $w1, 0
+; MIPS32R5EL-NEXT:    splati.w $w0, $w0[0]
+; MIPS32R5EL-NEXT:    insert.w $w1[0], $6
+; MIPS32R5EL-NEXT:    insert.w $w1[1], $7
+; MIPS32R5EL-NEXT:    lw $1, 24($sp)
+; MIPS32R5EL-NEXT:    insert.w $w1[2], $1
+; MIPS32R5EL-NEXT:    lw $1, 28($sp)
+; MIPS32R5EL-NEXT:    insert.w $w1[3], $1
+; MIPS32R5EL-NEXT:    fadd.w $w0, $w0, $w1
+; MIPS32R5EL-NEXT:    st.w $w0, 0($4)
+; MIPS32R5EL-NEXT:    addiu $sp, $sp, 8
+; MIPS32R5EL-NEXT:    jr $ra
+; MIPS32R5EL-NEXT:    nop
+;
 ; MIPS64R5EL-LABEL: mixed_32:
 ; MIPS64R5EL:       # %bb.0: # %entry
-; MIPS64R5EL-NEXT:    ldi.b $w0, 0
-; MIPS64R5EL-NEXT:    insert.d $w0[0], $4
-; MIPS64R5EL-NEXT:    insert.d $w0[1], $5
-; MIPS64R5EL-NEXT:    sll $1, $6, 0
-; MIPS64R5EL-NEXT:    fill.w $w1, $1
-; MIPS64R5EL-NEXT:    ffint_u.w $w1, $w1
-; MIPS64R5EL-NEXT:    fadd.w $w0, $w1, $w0
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, -16
+; MIPS64R5EL-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64R5EL-NEXT:    lui $1, %hi(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EL-NEXT:    daddu $1, $1, $25
+; MIPS64R5EL-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(mixed_32)))
+; MIPS64R5EL-NEXT:    lui $2, 17200
+; MIPS64R5EL-NEXT:    sw $2, 12($sp)
+; MIPS64R5EL-NEXT:    sll $2, $6, 0
+; MIPS64R5EL-NEXT:    sw $2, 8($sp)
+; MIPS64R5EL-NEXT:    ld $1, %got_page(.LCPI41_0)($1)
+; MIPS64R5EL-NEXT:    ldc1 $f0, %got_ofst(.LCPI41_0)($1)
+; MIPS64R5EL-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64R5EL-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64R5EL-NEXT:    ldi.b $w1, 0
+; MIPS64R5EL-NEXT:    insert.d $w1[0], $4
+; MIPS64R5EL-NEXT:    insert.d $w1[1], $5
+; MIPS64R5EL-NEXT:    cvt.s.d $f0, $f0
+; MIPS64R5EL-NEXT:    splati.w $w0, $w0[0]
+; MIPS64R5EL-NEXT:    fadd.w $w0, $w0, $w1
 ; MIPS64R5EL-NEXT:    copy_s.d $2, $w0[0]
 ; MIPS64R5EL-NEXT:    copy_s.d $3, $w0[1]
+; MIPS64R5EL-NEXT:    daddiu $sp, $sp, 16
 ; MIPS64R5EL-NEXT:    jr $ra
 ; MIPS64R5EL-NEXT:    nop
 entry:
diff --git a/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll b/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
index 4f2339d18c304284a5fcbf9a211e49582cc6b7d2..efa0759090006ccf6eebfb6700d5cb514ef636b5 100644
--- a/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
+++ b/test/CodeGen/Mips/indirect-jump-hazard/jumptables.ll
@@ -155,11 +155,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R2:       # %bb.0: # %entry
 ; MIPS64R2-NEXT:    daddiu $sp, $sp, -16
 ; MIPS64R2-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R2-NEXT:    sw $4, 4($sp)
-; MIPS64R2-NEXT:    lwu $2, 4($sp)
+; MIPS64R2-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R2-NEXT:    sltiu $1, $2, 7
 ; MIPS64R2-NEXT:    beqz $1, .LBB0_3
-; MIPS64R2-NEXT:    nop
+; MIPS64R2-NEXT:    sw $4, 4($sp)
 ; MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; MIPS64R2-NEXT:    dsll $1, $2, 3
 ; MIPS64R2-NEXT:    lui $2, %highest(.LJTI0_0)
@@ -251,10 +250,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; MIPS64R6:       # %bb.0: # %entry
 ; MIPS64R6-NEXT:    daddiu $sp, $sp, -16
 ; MIPS64R6-NEXT:    .cfi_def_cfa_offset 16
-; MIPS64R6-NEXT:    sw $4, 4($sp)
-; MIPS64R6-NEXT:    lwu $2, 4($sp)
+; MIPS64R6-NEXT:    dext $2, $4, 0, 32
 ; MIPS64R6-NEXT:    sltiu $1, $2, 7
-; MIPS64R6-NEXT:    beqzc $1, .LBB0_3
+; MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; MIPS64R6-NEXT:    sw $4, 4($sp)
 ; MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; MIPS64R6-NEXT:    dsll $1, $2, 3
 ; MIPS64R6-NEXT:    lui $2, %highest(.LJTI0_0)
@@ -473,11 +472,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R2-NEXT:    lui $1, %hi(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R2-NEXT:    daddu $1, $1, $25
 ; PIC-MIPS64R2-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
-; PIC-MIPS64R2-NEXT:    sw $4, 4($sp)
-; PIC-MIPS64R2-NEXT:    lwu $3, 4($sp)
+; PIC-MIPS64R2-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R2-NEXT:    sltiu $1, $3, 7
 ; PIC-MIPS64R2-NEXT:    beqz $1, .LBB0_3
-; PIC-MIPS64R2-NEXT:    nop
+; PIC-MIPS64R2-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R2-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R2-NEXT:    dsll $1, $3, 3
 ; PIC-MIPS64R2-NEXT:    ld $3, %got_page(.LJTI0_0)($2)
@@ -537,10 +535,10 @@ define i8* @_Z3fooi(i32 signext %Letter) {
 ; PIC-MIPS64R6-NEXT:    lui $1, %hi(%neg(%gp_rel(_Z3fooi)))
 ; PIC-MIPS64R6-NEXT:    daddu $1, $1, $25
 ; PIC-MIPS64R6-NEXT:    daddiu $2, $1, %lo(%neg(%gp_rel(_Z3fooi)))
-; PIC-MIPS64R6-NEXT:    sw $4, 4($sp)
-; PIC-MIPS64R6-NEXT:    lwu $3, 4($sp)
+; PIC-MIPS64R6-NEXT:    dext $3, $4, 0, 32
 ; PIC-MIPS64R6-NEXT:    sltiu $1, $3, 7
-; PIC-MIPS64R6-NEXT:    beqzc $1, .LBB0_3
+; PIC-MIPS64R6-NEXT:    beqz $1, .LBB0_3
+; PIC-MIPS64R6-NEXT:    sw $4, 4($sp)
 ; PIC-MIPS64R6-NEXT:  .LBB0_1: # %entry
 ; PIC-MIPS64R6-NEXT:    dsll $1, $3, 3
 ; PIC-MIPS64R6-NEXT:    ld $3, %got_page(.LJTI0_0)($2)
diff --git a/test/CodeGen/Mips/llvm-ir/sdiv.ll b/test/CodeGen/Mips/llvm-ir/sdiv.ll
index 03b831191a8d78b006c81cdd918cc299e6a5c69f..e54eaa63222a05484bcff47656a6b3203650432a 100644
--- a/test/CodeGen/Mips/llvm-ir/sdiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/sdiv.ll
@@ -35,55 +35,32 @@
 define signext i1 @sdiv_i1(i1 signext %a, i1 signext %b) {
 ; GP32-LABEL: sdiv_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    div $zero, $4, $5
-; GP32-NEXT:    teq $5, $zero, 7
-; GP32-NEXT:    mflo $1
-; GP32-NEXT:    andi $1, $1, 1
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    negu $2, $1
+; GP32-NEXT:    move $2, $4
 ;
 ; GP32R6-LABEL: sdiv_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    div $1, $4, $5
-; GP32R6-NEXT:    teq $5, $zero, 7
-; GP32R6-NEXT:    andi $1, $1, 1
 ; GP32R6-NEXT:    jr $ra
-; GP32R6-NEXT:    negu $2, $1
+; GP32R6-NEXT:    move $2, $4
 ;
 ; GP64-LABEL: sdiv_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    div $zero, $4, $5
-; GP64-NEXT:    teq $5, $zero, 7
-; GP64-NEXT:    mflo $1
-; GP64-NEXT:    andi $1, $1, 1
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    negu $2, $1
+; GP64-NEXT:    move $2, $4
 ;
 ; GP64R6-LABEL: sdiv_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    div $1, $4, $5
-; GP64R6-NEXT:    teq $5, $zero, 7
-; GP64R6-NEXT:    andi $1, $1, 1
 ; GP64R6-NEXT:    jr $ra
-; GP64R6-NEXT:    negu $2, $1
+; GP64R6-NEXT:    move $2, $4
 ;
 ; MMR3-LABEL: sdiv_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    div $zero, $4, $5
-; MMR3-NEXT:    teq $5, $zero, 7
-; MMR3-NEXT:    mflo16 $2
-; MMR3-NEXT:    andi16 $2, $2, 1
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    subu16 $2, $3, $2
+; MMR3-NEXT:    move $2, $4
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: sdiv_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    div $2, $4, $5
-; MMR6-NEXT:    teq $5, $zero, 7
-; MMR6-NEXT:    andi16 $2, $2, 1
-; MMR6-NEXT:    li16 $3, 0
-; MMR6-NEXT:    subu16 $2, $3, $2
+; MMR6-NEXT:    move $2, $4
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = sdiv i1 %a, %b
diff --git a/test/CodeGen/Mips/llvm-ir/srem.ll b/test/CodeGen/Mips/llvm-ir/srem.ll
index 66ee6c01bd24314bb9a8d28bac14b0c0a48f922e..ef0502c85d59bb017262fece51229fe6278548da 100644
--- a/test/CodeGen/Mips/llvm-ir/srem.ll
+++ b/test/CodeGen/Mips/llvm-ir/srem.ll
@@ -35,55 +35,32 @@
 define signext i1 @srem_i1(i1 signext %a, i1 signext %b) {
 ; GP32-LABEL: srem_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    div $zero, $4, $5
-; GP32-NEXT:    teq $5, $zero, 7
-; GP32-NEXT:    mfhi $1
-; GP32-NEXT:    andi $1, $1, 1
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    negu $2, $1
+; GP32-NEXT:    addiu $2, $zero, 0
 ;
 ; GP32R6-LABEL: srem_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    mod $1, $4, $5
-; GP32R6-NEXT:    teq $5, $zero, 7
-; GP32R6-NEXT:    andi $1, $1, 1
 ; GP32R6-NEXT:    jr $ra
-; GP32R6-NEXT:    negu $2, $1
+; GP32R6-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64-LABEL: srem_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    div $zero, $4, $5
-; GP64-NEXT:    teq $5, $zero, 7
-; GP64-NEXT:    mfhi $1
-; GP64-NEXT:    andi $1, $1, 1
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    negu $2, $1
+; GP64-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64R6-LABEL: srem_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    mod $1, $4, $5
-; GP64R6-NEXT:    teq $5, $zero, 7
-; GP64R6-NEXT:    andi $1, $1, 1
 ; GP64R6-NEXT:    jr $ra
-; GP64R6-NEXT:    negu $2, $1
+; GP64R6-NEXT:    addiu $2, $zero, 0
 ;
 ; MMR3-LABEL: srem_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    div $zero, $4, $5
-; MMR3-NEXT:    teq $5, $zero, 7
-; MMR3-NEXT:    mfhi16 $2
-; MMR3-NEXT:    andi16 $2, $2, 1
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    subu16 $2, $3, $2
+; MMR3-NEXT:    li16 $2, 0
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: srem_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    mod $2, $4, $5
-; MMR6-NEXT:    teq $5, $zero, 7
-; MMR6-NEXT:    andi16 $2, $2, 1
-; MMR6-NEXT:    li16 $3, 0
-; MMR6-NEXT:    subu16 $2, $3, $2
+; MMR6-NEXT:    li16 $2, 0
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = srem i1 %a, %b
diff --git a/test/CodeGen/Mips/llvm-ir/udiv.ll b/test/CodeGen/Mips/llvm-ir/udiv.ll
index e0ba7bc770eceaefa47d984eb14bf847bbe42192..8694a9f92b65ab204ada07b2a5ed1ee445ef96a9 100644
--- a/test/CodeGen/Mips/llvm-ir/udiv.ll
+++ b/test/CodeGen/Mips/llvm-ir/udiv.ll
@@ -35,41 +35,32 @@
 define zeroext i1 @udiv_i1(i1 zeroext %a, i1 zeroext %b) {
 ; GP32-LABEL: udiv_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    divu $zero, $4, $5
-; GP32-NEXT:    teq $5, $zero, 7
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    mflo $2
+; GP32-NEXT:    move $2, $4
 ;
 ; GP32R6-LABEL: udiv_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    divu $2, $4, $5
-; GP32R6-NEXT:    teq $5, $zero, 7
-; GP32R6-NEXT:    jrc $ra
+; GP32R6-NEXT:    jr $ra
+; GP32R6-NEXT:    move $2, $4
 ;
 ; GP64-LABEL: udiv_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    divu $zero, $4, $5
-; GP64-NEXT:    teq $5, $zero, 7
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    mflo $2
+; GP64-NEXT:    move $2, $4
 ;
 ; GP64R6-LABEL: udiv_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    divu $2, $4, $5
-; GP64R6-NEXT:    teq $5, $zero, 7
-; GP64R6-NEXT:    jrc $ra
+; GP64R6-NEXT:    jr $ra
+; GP64R6-NEXT:    move $2, $4
 ;
 ; MMR3-LABEL: udiv_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    divu $zero, $4, $5
-; MMR3-NEXT:    teq $5, $zero, 7
-; MMR3-NEXT:    mflo16 $2
+; MMR3-NEXT:    move $2, $4
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: udiv_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    divu $2, $4, $5
-; MMR6-NEXT:    teq $5, $zero, 7
+; MMR6-NEXT:    move $2, $4
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = udiv i1 %a, %b
diff --git a/test/CodeGen/Mips/llvm-ir/urem.ll b/test/CodeGen/Mips/llvm-ir/urem.ll
index 83830a3689b9faef13eb95c4442f6d8bcc5f7263..b744f706cbf9ceb86ae2edc0b106cce8785ae6e9 100644
--- a/test/CodeGen/Mips/llvm-ir/urem.ll
+++ b/test/CodeGen/Mips/llvm-ir/urem.ll
@@ -35,64 +35,32 @@
 define signext i1 @urem_i1(i1 signext %a, i1 signext %b) {
 ; GP32-LABEL: urem_i1:
 ; GP32:       # %bb.0: # %entry
-; GP32-NEXT:    andi $1, $5, 1
-; GP32-NEXT:    andi $2, $4, 1
-; GP32-NEXT:    divu $zero, $2, $1
-; GP32-NEXT:    teq $1, $zero, 7
-; GP32-NEXT:    mfhi $1
-; GP32-NEXT:    andi $1, $1, 1
 ; GP32-NEXT:    jr $ra
-; GP32-NEXT:    negu $2, $1
+; GP32-NEXT:    addiu $2, $zero, 0
 ;
 ; GP32R6-LABEL: urem_i1:
 ; GP32R6:       # %bb.0: # %entry
-; GP32R6-NEXT:    andi $1, $5, 1
-; GP32R6-NEXT:    andi $2, $4, 1
-; GP32R6-NEXT:    modu $2, $2, $1
-; GP32R6-NEXT:    teq $1, $zero, 7
 ; GP32R6-NEXT:    jr $ra
-; GP32R6-NEXT:    negu $2, $2
+; GP32R6-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64-LABEL: urem_i1:
 ; GP64:       # %bb.0: # %entry
-; GP64-NEXT:    andi $1, $5, 1
-; GP64-NEXT:    andi $2, $4, 1
-; GP64-NEXT:    divu $zero, $2, $1
-; GP64-NEXT:    teq $1, $zero, 7
-; GP64-NEXT:    mfhi $1
-; GP64-NEXT:    andi $1, $1, 1
 ; GP64-NEXT:    jr $ra
-; GP64-NEXT:    negu $2, $1
+; GP64-NEXT:    addiu $2, $zero, 0
 ;
 ; GP64R6-LABEL: urem_i1:
 ; GP64R6:       # %bb.0: # %entry
-; GP64R6-NEXT:    andi $1, $5, 1
-; GP64R6-NEXT:    andi $2, $4, 1
-; GP64R6-NEXT:    modu $2, $2, $1
-; GP64R6-NEXT:    teq $1, $zero, 7
 ; GP64R6-NEXT:    jr $ra
-; GP64R6-NEXT:    negu $2, $2
+; GP64R6-NEXT:    addiu $2, $zero, 0
 ;
 ; MMR3-LABEL: urem_i1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    andi16 $2, $5, 1
-; MMR3-NEXT:    andi16 $3, $4, 1
-; MMR3-NEXT:    divu $zero, $3, $2
-; MMR3-NEXT:    teq $2, $zero, 7
-; MMR3-NEXT:    mfhi16 $2
-; MMR3-NEXT:    andi16 $2, $2, 1
-; MMR3-NEXT:    li16 $3, 0
-; MMR3-NEXT:    subu16 $2, $3, $2
+; MMR3-NEXT:    li16 $2, 0
 ; MMR3-NEXT:    jrc $ra
 ;
 ; MMR6-LABEL: urem_i1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    andi16 $2, $5, 1
-; MMR6-NEXT:    andi16 $3, $4, 1
-; MMR6-NEXT:    modu $3, $3, $2
-; MMR6-NEXT:    teq $2, $zero, 7
 ; MMR6-NEXT:    li16 $2, 0
-; MMR6-NEXT:    subu16 $2, $2, $3
 ; MMR6-NEXT:    jrc $ra
 entry:
   %r = urem i1 %a, %b
diff --git a/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll b/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll
index 6fa4d4d072ac50f124e74445140f9733bf088add..1fa78942af4197d396cb60006df0b69d86d01dea 100644
--- a/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll
+++ b/test/CodeGen/Mips/longbranch/long-branch-expansion-3.ll
@@ -1,12 +1,12 @@
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 < %s -o - | FileCheck %s --check-prefixes=CHECK32R2
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 < %s -o - | FileCheck %s --check-prefixes=CHECK32R6
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
-; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK32R2
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK32R6
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r2 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
+; RUN: llc -O0 -mtriple=mips-img-linux-gnu -mcpu=mips32r6 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK32-IJH
 
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 < %s -o - | FileCheck %s --check-prefixes=CHECK64R2
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 < %s -o - | FileCheck %s --check-prefixes=CHECK64R6
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
-; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK64R2
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefixes=CHECK64R6
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r2 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
+; RUN: llc -O0 -mtriple=mips64-img-linux-gnu -mcpu=mips64r6 -verify-machineinstrs -mattr=+use-indirect-jump-hazard < %s -o - | FileCheck %s --check-prefixes=CHECK64-IJH
 
 declare i32 @foo(...)
 
diff --git a/test/CodeGen/Mips/micromips-gcc-except-table.ll b/test/CodeGen/Mips/micromips-gcc-except-table.ll
new file mode 100644
index 0000000000000000000000000000000000000000..38a76927e2a8a8067054f02755613e014b8936db
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-gcc-except-table.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips -O3 -filetype=obj < %s | llvm-objdump -s -j .gcc_except_table - | FileCheck %s
+
+; CHECK: Contents of section .gcc_except_table:
+; CHECK-NEXT: 0000 ff9b1501 0c011100 00110e1f 011f1800
+; CHECK-NEXT: 0010 00010000 00000000
+
+@_ZTIi = external constant i8*
+
+define dso_local i32 @main() local_unnamed_addr norecurse personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception.i to i32*
+  store i32 5, i32* %0, align 16
+  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %.noexc unwind label %return
+
+.noexc:
+  unreachable
+
+return:
+  %1 = landingpad { i8*, i32 }
+          catch i8* null
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
+  tail call void @__cxa_end_catch()
+  ret i32 0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+
+declare void @__cxa_end_catch() local_unnamed_addr
+
+declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+
+declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
diff --git a/test/CodeGen/Mips/micromips-mtc-mfc.ll b/test/CodeGen/Mips/micromips-mtc-mfc.ll
index 1db9337a982666fbcf6d47daea792dc68dfe8308..c60b006752256217cc889c2e10c9e1fce9dd75d3 100644
--- a/test/CodeGen/Mips/micromips-mtc-mfc.ll
+++ b/test/CodeGen/Mips/micromips-mtc-mfc.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=mips -mcpu=mips32r2 -mattr=+micromips \
+; RUN: llc -mtriple=mips -mcpu=mips32r2 -mattr=+micromips -verify-machineinstrs \
 ; RUN:     -show-mc-encoding < %s | FileCheck --check-prefix=MM2 %s
-; RUN: llc -mtriple=mips -mcpu=mips32r6 -mattr=+micromips \
+; RUN: llc -mtriple=mips -mcpu=mips32r6 -mattr=+micromips -verify-machineinstrs \
 ; RUN:     -show-mc-encoding < %s | FileCheck --check-prefix=MM6 %s
 
 define double @foo(double %a, double %b) {
diff --git a/test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll b/test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..df592c49cc683451d29da4abdffc2f5110980931
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-target-external-symbol-reloc.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=mips-mti-linux-gnu -mcpu=mips32r2 -mattr=+micromips -stop-after=expand-isel-pseudos < %s | FileCheck %s
+
+; CHECK: JAL_MM
+; CHECK-NOT: JALR16_MM
+
+define dso_local void @foo(i32* nocapture %ar) local_unnamed_addr {
+entry:
+  %0 = bitcast i32* %ar to i8*
+  tail call void @llvm.memset.p0i8.i32(i8* align 4 %0, i8 0, i32 100, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1)
diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index b3ed8bdd3b9a96093e3430a74655e0b324bc2617..4618c96d879900c9b82918ba285bd7bf9ea2ed26 100644
--- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -1,20 +1,21 @@
-; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r5 \
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -relocation-model=pic -mtriple=mipsel-- -mcpu=mips32r5 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS32,MIPSR5,MIPS32-O32,MIPS32R5-O32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r5 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N32,MIPS64R5-N32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r5 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r5 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR5,MIPS64-N64,MIPS64R5-N64
 
-; RUN: llc -relocation-model=pic -march=mipsel -mcpu=mips32r6 \
+; RUN: llc -relocation-model=pic -mtriple=mipsel-- -mcpu=mips32r6 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS32,MIPSR6,MIPSR6-O32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r6 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n32 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N32,MIPSR6-N32
-; RUN: llc -relocation-model=pic -march=mips64el -mcpu=mips64r6 \
+; RUN: llc -relocation-model=pic -mtriple=mips64el-- -mcpu=mips64r6 \
 ; RUN:     -mattr=+fp64,+msa -verify-machineinstrs -target-abi n64 < %s | FileCheck %s \
 ; RUN:     --check-prefixes=ALL,MIPS64,MIPSR6,MIPS64-N64,MIPSR6-N64
 
@@ -26,16 +27,73 @@
 declare float @k2(half *)
 
 define void @f3(i16 %b) {
+; MIPS32-LABEL: f3:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -32
+; MIPS32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS32-NEXT:    sw $ra, 28($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 24($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $16, $2, $25
+; MIPS32-NEXT:    sh $4, 22($sp)
+; MIPS32-NEXT:    addiu $4, $sp, 22
+; MIPS32-NEXT:    lw $25, %call16(k2)($16)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    move $gp, $16
+; MIPS32-NEXT:    lw $1, %got(k)($16)
+; MIPS32-NEXT:    swc1 $f0, 0($1)
+; MIPS32-NEXT:    lw $16, 24($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 28($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N32-LABEL: f3:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(f3)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(f3)))
+; MIPS64-N32-NEXT:    sh $4, 14($sp)
+; MIPS64-N32-NEXT:    lw $25, %call16(k2)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    addiu $4, $sp, 14
+; MIPS64-N32-NEXT:    lw $1, %got_disp(k)($gp)
+; MIPS64-N32-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: f3:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(f3)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(f3)))
+; MIPS64-N64-NEXT:    sh $4, 14($sp)
+; MIPS64-N64-NEXT:    ld $25, %call16(k2)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    daddiu $4, $sp, 14
+; MIPS64-N64-NEXT:    ld $1, %got_disp(k)($gp)
+; MIPS64-N64-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: f3:
-
-; ALL: sh $4, [[O0:[0-9]+]]($sp)
-; ALL-DAG: jalr $25
-; MIPS32-DAG: addiu $4, $sp, [[O0]]
-; MIPS64-N32: addiu $4, $sp, [[O0]]
-; MIPS64-N64: daddiu $4, $sp, [[O0]]
-; ALL: swc1 $f0
-
   %0 = alloca half
   %1 = bitcast i16 %b to half
   store half %1, half * %0
@@ -45,16 +103,59 @@ entry:
 }
 
 define void  @f(i16 %b) {
-; ALL-LABEL: f:
-
-; ALL: sh $4, [[O0:[0-9]+]]($sp)
-; ALL: lh $[[R0:[0-9]+]], [[O0]]($sp)
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; ALL: swc1 $f[[F0]]
-
+; MIPS32-LABEL: f:
+; MIPS32:       # %bb.0:
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -8
+; MIPS32-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    sh $4, 4($sp)
+; MIPS32-NEXT:    lh $2, 4($sp)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    lw $1, %got(k)($1)
+; MIPS32-NEXT:    swc1 $f0, 0($1)
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 8
+;
+; MIPS64-N32-LABEL: f:
+; MIPS64-N32:       # %bb.0:
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -16
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(f)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(f)))
+; MIPS64-N32-NEXT:    sh $4, 12($sp)
+; MIPS64-N32-NEXT:    lh $2, 12($sp)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    lw $1, %got_disp(k)($1)
+; MIPS64-N32-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 16
+;
+; MIPS64-N64-LABEL: f:
+; MIPS64-N64:       # %bb.0:
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(f)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(f)))
+; MIPS64-N64-NEXT:    sh $4, 12($sp)
+; MIPS64-N64-NEXT:    lh $2, 12($sp)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    ld $1, %got_disp(k)($1)
+; MIPS64-N64-NEXT:    swc1 $f0, 0($1)
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 16
   %1 = bitcast i16 %b to half
   %2 = fpext half %1 to float
   store float %2, float * @k
@@ -72,180 +173,488 @@ define void  @f(i16 %b) {
 ; MIPSR5. Additionally, fp64 mode / FR=1 is required to use MSA.
 
 define void @fadd_f64() {
+; MIPS32-LABEL: fadd_f64:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f1
+; MIPS32-NEXT:    copy_s.w $2, $w0[1]
+; MIPS32-NEXT:    mthc1 $2, $f1
+; MIPS32-NEXT:    add.d $f0, $f1, $f1
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w1, $2
+; MIPS32-NEXT:    mfhc1 $2, $f0
+; MIPS32-NEXT:    insert.w $w1[1], $2
+; MIPS32-NEXT:    insert.w $w1[3], $2
+; MIPS32-NEXT:    fexdo.w $w0, $w1, $w1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fadd_f64:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64-N32-NEXT:    dmtc1 $2, $f0
+; MIPS64-N32-NEXT:    add.d $f0, $f0, $f0
+; MIPS64-N32-NEXT:    dmfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.d $w0, $2
+; MIPS64-N32-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fadd_f64:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fadd_f64)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64-N64-NEXT:    dmtc1 $2, $f0
+; MIPS64-N64-NEXT:    add.d $f0, $f0, $f0
+; MIPS64-N64-NEXT:    dmfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.d $w0, $2
+; MIPS64-N64-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fadd_f64:
   %0 = load half, half * @h, align 2
   %1 = fpext half %0 to double
-; ALL:    lh $[[R0:[0-9]+]]
-; ALL:    fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:    fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:    fexupr.d $w[[W2:[0-9]+]], $w[[W1]]
-; MIPS32: copy_s.w $[[R1:[0-9]+]], $w[[W2]][0]
-; MIPS32: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32: copy_s.w $[[R2:[0-9]+]], $w[[W2]][1]
-; MIPS32: mthc1 $[[R2]], $f[[F0]]
-; MIPS64: copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64: dmtc1 $[[R2]], $f[[F0:[0-9]+]]
-
   %2 = load half, half * @h, align 2
   %3 = fpext half %2 to double
   %add = fadd double %1, %3
-
-; ALL: add.d $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
-
   %4 = fptrunc double %add to half
-
-; MIPS32: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; MIPS32: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; MIPS32: mfhc1 $[[R3:[0-9]+]], $f[[F1]]
-; MIPS32: insert.w $w[[W2]][1], $[[R3]]
-; MIPS32: insert.w $w[[W2]][3], $[[R3]]
-
-; MIPS64: dmfc1 $[[R2:[0-9]+]], $f[[F1]]
-; MIPS64: fill.d $w[[W2:[0-9]+]], $[[R2]]
-
-; ALL:    fexdo.w $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:    fexdo.h $w[[W4:[0-9]+]], $w[[W3]], $w[[W3]]
-; ALL:    copy_u.h $[[R3:[0-9]+]], $w[[W4]][0]
-; ALL:    sh $[[R3]]
    store half %4, half * @h, align 2
   ret void
 }
 
 define i32 @ffptoui() {
+; MIPS32-O32-LABEL: ffptoui:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addu $1, $2, $25
+; MIPS32-O32-NEXT:    lw $2, %got(h)($1)
+; MIPS32-O32-NEXT:    lw $3, %got($CPI3_0)($1)
+; MIPS32-O32-NEXT:    lwc1 $f0, %lo($CPI3_0)($3)
+; MIPS32-O32-NEXT:    lh $2, 0($2)
+; MIPS32-O32-NEXT:    fill.h $w1, $2
+; MIPS32-O32-NEXT:    fexupr.w $w1, $w1
+; MIPS32-O32-NEXT:    copy_s.w $2, $w1[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f2
+; MIPS32-O32-NEXT:    sub.s $f0, $f2, $f0
+; MIPS32-O32-NEXT:    mfc1 $2, $f0
+; MIPS32-O32-NEXT:    fill.w $w0, $2
+; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f3
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[1]
+; MIPS32-O32-NEXT:    mthc1 $2, $f3
+; MIPS32-O32-NEXT:    trunc.w.d $f0, $f3
+; MIPS32-O32-NEXT:    mfc1 $2, $f0
+; MIPS32-O32-NEXT:    fexupr.d $w0, $w1
+; MIPS32-O32-NEXT:    copy_s.w $3, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $3, $f1
+; MIPS32-O32-NEXT:    copy_s.w $3, $w0[1]
+; MIPS32-O32-NEXT:    mthc1 $3, $f1
+; MIPS32-O32-NEXT:    trunc.w.d $f0, $f1
+; MIPS32-O32-NEXT:    mfc1 $3, $f0
+; MIPS32-O32-NEXT:    lw $1, %got($CPI3_1)($1)
+; MIPS32-O32-NEXT:    addiu $1, $1, %lo($CPI3_1)
+; MIPS32-O32-NEXT:    lui $4, 32768
+; MIPS32-O32-NEXT:    xor $2, $2, $4
+; MIPS32-O32-NEXT:    lh $1, 0($1)
+; MIPS32-O32-NEXT:    fill.h $w0, $1
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $1, $f0
+; MIPS32-O32-NEXT:    c.olt.s $f2, $f0
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    movt $2, $3, $fcc0
+;
+; MIPS64R5-N32-LABEL: ffptoui:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N32-NEXT:    lw $2, %got_disp(h)($1)
+; MIPS64R5-N32-NEXT:    lw $3, %got_page(.LCPI3_0)($1)
+; MIPS64R5-N32-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($3)
+; MIPS64R5-N32-NEXT:    lh $2, 0($2)
+; MIPS64R5-N32-NEXT:    fill.h $w1, $2
+; MIPS64R5-N32-NEXT:    fexupr.w $w1, $w1
+; MIPS64R5-N32-NEXT:    copy_s.w $2, $w1[0]
+; MIPS64R5-N32-NEXT:    mtc1 $2, $f2
+; MIPS64R5-N32-NEXT:    sub.s $f0, $f2, $f0
+; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N32-NEXT:    fill.w $w0, $2
+; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5-N32-NEXT:    dmtc1 $2, $f0
+; MIPS64R5-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N32-NEXT:    fexupr.d $w0, $w1
+; MIPS64R5-N32-NEXT:    copy_s.d $3, $w0[0]
+; MIPS64R5-N32-NEXT:    dmtc1 $3, $f0
+; MIPS64R5-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N32-NEXT:    mfc1 $3, $f0
+; MIPS64R5-N32-NEXT:    lw $1, %got_page(.LCPI3_1)($1)
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %got_ofst(.LCPI3_1)
+; MIPS64R5-N32-NEXT:    lui $4, 32768
+; MIPS64R5-N32-NEXT:    xor $2, $2, $4
+; MIPS64R5-N32-NEXT:    lh $1, 0($1)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $1
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N32-NEXT:    mtc1 $1, $f0
+; MIPS64R5-N32-NEXT:    c.olt.s $f2, $f0
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    movt $2, $3, $fcc0
+;
+; MIPS64R5-N64-LABEL: ffptoui:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPS64R5-N64-NEXT:    ld $2, %got_disp(h)($1)
+; MIPS64R5-N64-NEXT:    ld $3, %got_page(.LCPI3_0)($1)
+; MIPS64R5-N64-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($3)
+; MIPS64R5-N64-NEXT:    lh $2, 0($2)
+; MIPS64R5-N64-NEXT:    fill.h $w1, $2
+; MIPS64R5-N64-NEXT:    fexupr.w $w1, $w1
+; MIPS64R5-N64-NEXT:    copy_s.w $2, $w1[0]
+; MIPS64R5-N64-NEXT:    mtc1 $2, $f2
+; MIPS64R5-N64-NEXT:    sub.s $f0, $f2, $f0
+; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N64-NEXT:    fill.w $w0, $2
+; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.d $2, $w0[0]
+; MIPS64R5-N64-NEXT:    dmtc1 $2, $f0
+; MIPS64R5-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N64-NEXT:    fexupr.d $w0, $w1
+; MIPS64R5-N64-NEXT:    copy_s.d $3, $w0[0]
+; MIPS64R5-N64-NEXT:    dmtc1 $3, $f0
+; MIPS64R5-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64R5-N64-NEXT:    mfc1 $3, $f0
+; MIPS64R5-N64-NEXT:    ld $1, %got_page(.LCPI3_1)($1)
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %got_ofst(.LCPI3_1)
+; MIPS64R5-N64-NEXT:    lui $4, 32768
+; MIPS64R5-N64-NEXT:    xor $2, $2, $4
+; MIPS64R5-N64-NEXT:    lh $1, 0($1)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $1
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64R5-N64-NEXT:    mtc1 $1, $f0
+; MIPS64R5-N64-NEXT:    c.olt.s $f2, $f0
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    movt $2, $3, $fcc0
+;
+; MIPSR6-O32-LABEL: ffptoui:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $2, %got(h)($1)
+; MIPSR6-O32-NEXT:    lw $1, %got($CPI3_0)($1)
+; MIPSR6-O32-NEXT:    lwc1 $f0, %lo($CPI3_0)($1)
+; MIPSR6-O32-NEXT:    lh $1, 0($2)
+; MIPSR6-O32-NEXT:    fill.h $w1, $1
+; MIPSR6-O32-NEXT:    fexupr.w $w1, $w1
+; MIPSR6-O32-NEXT:    copy_s.w $1, $w1[0]
+; MIPSR6-O32-NEXT:    mtc1 $1, $f2
+; MIPSR6-O32-NEXT:    cmp.lt.s $f3, $f2, $f0
+; MIPSR6-O32-NEXT:    sub.s $f0, $f2, $f0
+; MIPSR6-O32-NEXT:    mfc1 $1, $f0
+; MIPSR6-O32-NEXT:    fill.w $w0, $1
+; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    fexupr.d $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $1, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $1, $f2
+; MIPSR6-O32-NEXT:    copy_s.w $1, $w0[1]
+; MIPSR6-O32-NEXT:    mthc1 $1, $f2
+; MIPSR6-O32-NEXT:    trunc.w.d $f0, $f2
+; MIPSR6-O32-NEXT:    mfc1 $1, $f0
+; MIPSR6-O32-NEXT:    fexupr.d $w0, $w1
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f1
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[1]
+; MIPSR6-O32-NEXT:    mthc1 $2, $f1
+; MIPSR6-O32-NEXT:    trunc.w.d $f0, $f1
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    lui $3, 32768
+; MIPSR6-O32-NEXT:    xor $1, $1, $3
+; MIPSR6-O32-NEXT:    mfc1 $3, $f3
+; MIPSR6-O32-NEXT:    seleqz $1, $1, $3
+; MIPSR6-O32-NEXT:    selnez $2, $2, $3
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    or $2, $2, $1
+;
+; MIPSR6-N32-LABEL: ffptoui:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N32-NEXT:    lw $2, %got_disp(h)($1)
+; MIPSR6-N32-NEXT:    lw $1, %got_page(.LCPI3_0)($1)
+; MIPSR6-N32-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($1)
+; MIPSR6-N32-NEXT:    lh $1, 0($2)
+; MIPSR6-N32-NEXT:    fill.h $w1, $1
+; MIPSR6-N32-NEXT:    fexupr.w $w1, $w1
+; MIPSR6-N32-NEXT:    copy_s.w $1, $w1[0]
+; MIPSR6-N32-NEXT:    mtc1 $1, $f2
+; MIPSR6-N32-NEXT:    cmp.lt.s $f3, $f2, $f0
+; MIPSR6-N32-NEXT:    sub.s $f0, $f2, $f0
+; MIPSR6-N32-NEXT:    mfc1 $1, $f0
+; MIPSR6-N32-NEXT:    fill.w $w0, $1
+; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    fexupr.d $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.d $1, $w0[0]
+; MIPSR6-N32-NEXT:    dmtc1 $1, $f0
+; MIPSR6-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N32-NEXT:    mfc1 $1, $f0
+; MIPSR6-N32-NEXT:    fexupr.d $w0, $w1
+; MIPSR6-N32-NEXT:    copy_s.d $2, $w0[0]
+; MIPSR6-N32-NEXT:    dmtc1 $2, $f0
+; MIPSR6-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    lui $3, 32768
+; MIPSR6-N32-NEXT:    xor $1, $1, $3
+; MIPSR6-N32-NEXT:    mfc1 $3, $f3
+; MIPSR6-N32-NEXT:    seleqz $1, $1, $3
+; MIPSR6-N32-NEXT:    selnez $2, $2, $3
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    or $2, $2, $1
+;
+; MIPSR6-N64-LABEL: ffptoui:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptoui)))
+; MIPSR6-N64-NEXT:    ld $2, %got_disp(h)($1)
+; MIPSR6-N64-NEXT:    ld $1, %got_page(.LCPI3_0)($1)
+; MIPSR6-N64-NEXT:    lwc1 $f0, %got_ofst(.LCPI3_0)($1)
+; MIPSR6-N64-NEXT:    lh $1, 0($2)
+; MIPSR6-N64-NEXT:    fill.h $w1, $1
+; MIPSR6-N64-NEXT:    fexupr.w $w1, $w1
+; MIPSR6-N64-NEXT:    copy_s.w $1, $w1[0]
+; MIPSR6-N64-NEXT:    mtc1 $1, $f2
+; MIPSR6-N64-NEXT:    cmp.lt.s $f3, $f2, $f0
+; MIPSR6-N64-NEXT:    sub.s $f0, $f2, $f0
+; MIPSR6-N64-NEXT:    mfc1 $1, $f0
+; MIPSR6-N64-NEXT:    fill.w $w0, $1
+; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    fexupr.d $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.d $1, $w0[0]
+; MIPSR6-N64-NEXT:    dmtc1 $1, $f0
+; MIPSR6-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N64-NEXT:    mfc1 $1, $f0
+; MIPSR6-N64-NEXT:    fexupr.d $w0, $w1
+; MIPSR6-N64-NEXT:    copy_s.d $2, $w0[0]
+; MIPSR6-N64-NEXT:    dmtc1 $2, $f0
+; MIPSR6-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    lui $3, 32768
+; MIPSR6-N64-NEXT:    xor $1, $1, $3
+; MIPSR6-N64-NEXT:    mfc1 $3, $f3
+; MIPSR6-N64-NEXT:    seleqz $1, $1, $3
+; MIPSR6-N64-NEXT:    selnez $2, $2, $3
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    or $2, $2, $1
 entry:
-; ALL-LABEL: ffptoui:
   %0 = load half, half * @h, align 2
   %1 = fptoui half %0 to i32
 
-; MIPS32:       lwc1 $f[[FC:[0-9]+]], %lo($CPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N32:   lwc1 $f[[FC:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N64:   lwc1 $f[[FC:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-
-; ALL:          lh $[[R0:[0-9]+]]
-; ALL:          fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:          fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:          copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:          mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPSR6:       cmp.lt.s  $f[[F1:[0-9]+]], $f[[F0]], $f[[FC]]
-; ALL:          sub.s $f[[F2:[0-9]+]], $f[[F0]], $f[[FC]]
-; ALL:          mfc1 $[[R2:[0-9]]], $f[[F2]]
-; ALL:          fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:          fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:          fexupr.w $w[[W4:[0-9]+]], $w[[W3]]
-; ALL:          fexupr.d $w[[W5:[0-9]+]], $w[[W4]]
-
-; MIPS32:       copy_s.w $[[R3:[0-9]+]], $w[[W5]][0]
-; MIPS32:       mtc1 $[[R3]], $f[[F3:[0-9]+]]
-; MIPS32:       copy_s.w $[[R4:[0-9]+]], $w[[W5]][1]
-; MIPS32:       mthc1 $[[R3]], $f[[F3]]
-
-; MIPS64:       copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64:       dmtc1 $[[R2]], $f[[F3:[0-9]+]]
-
-; ALL:          trunc.w.d $f[[F4:[0-9]+]], $f[[F3]]
-; ALL:          mfc1 $[[R4:[0-9]+]], $f[[F4]]
-; ALL:          fexupr.d $w[[W6:[0-9]+]], $w[[W1]]
-
-; MIPS32:       copy_s.w $[[R5:[0-9]+]], $w[[W6]][0]
-; MIPS32:       mtc1 $[[R5]], $f[[F5:[0-9]+]]
-; MIPS32:       copy_s.w $[[R6:[0-9]+]], $w[[W6]][1]
-; MIPS32:       mthc1 $[[R6]], $f[[F5]]
-
-; MIPS64:       copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64:       dmtc1 $[[R2]], $f[[F5:[0-9]+]]
-
-; ALL:          trunc.w.d $f[[F6:[0-9]]], $f[[F5]]
-; ALL:          mfc1 $[[R7:[0-9]]], $f[[F6]]
-
-; MIPS32R5-O32: lw $[[R13:[0-9]+]], %got($CPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS32R5-O32: addiu $[[R14:[0-9]+]], $[[R13]], %lo($CPI{{[0-9]+}}_{{[0-9]+}})
-
-; MIPS64R5-N32: lw $[[R13:[0-9]+]], %got_page(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64R5-N32: addiu $[[R14:[0-9]+]], $[[R13]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-
-; MIPS64R5-N64: ld $[[R13:[0-9]+]], %got_page(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64R5-N64: daddiu $[[R14:[0-9]+]], $[[R13]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-
-; ALL:          lui $[[R8:[0-9]+]], 32768
-; ALL:          xor $[[R9:[0-9]+]], $[[R4]], $[[R8]]
-
-; MIPSR5:       lh $[[R15:[0-9]+]], 0($[[R14]])
-; MIPSR5:       fill.h $w[[W7:[0-9]+]], $[[R15]]
-; MIPSR5:       fexupr.w $w[[W8:[0-9]+]], $w[[W7]]
-; MIPSR5:       copy_s.w $[[R16:[0-9]+]], $w[[W8]][0]
-; MIPSR5:       mtc1 $[[R16]], $f[[F7:[0-9]+]]
-; MIPSR5:       c.olt.s $f[[F0]], $f[[F7]]
-; MIPSR5:       movt $[[R9]], $[[R7]], $fcc0
-
-; MIPSR6:       mfc1 $[[R10:[0-9]+]], $f[[F1]]
-; MIPSR6:       seleqz $[[R11:[0-9]]], $[[R9]], $[[R10]]
-; MIPSR6:       selnez $[[R12:[0-9]]], $[[R7]], $[[R10]]
-; MIPSR6:       or $2, $[[R12]], $[[R11]]
+
+
+
+
+
+
+
 
   ret i32 %1
 }
 
 define i32 @ffptosi() {
+; MIPS32-LABEL: ffptosi:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    lh $1, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    fexupr.d $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f1
+; MIPS32-NEXT:    copy_s.w $1, $w0[1]
+; MIPS32-NEXT:    mthc1 $1, $f1
+; MIPS32-NEXT:    trunc.w.d $f0, $f1
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    mfc1 $2, $f0
+;
+; MIPS64-N32-LABEL: ffptosi:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptosi)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffptosi)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    lh $1, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64-N32-NEXT:    dmtc1 $1, $f0
+; MIPS64-N32-NEXT:    trunc.w.d $f0, $f0
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+;
+; MIPS64-N64-LABEL: ffptosi:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffptosi)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffptosi)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    lh $1, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    fexupr.d $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.d $1, $w0[0]
+; MIPS64-N64-NEXT:    dmtc1 $1, $f0
+; MIPS64-N64-NEXT:    trunc.w.d $f0, $f0
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
 entry:
-; ALL-LABEL: ffptosi:
   %0 = load half, half * @h, align 2
   %1 = fptosi half %0 to i32
   ret i32 %1
 
-; ALL:    lh $[[R0:[0-9]+]]
-; ALL:    fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:    fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:    fexupr.d $w[[W2:[0-9]+]], $w[[W1]]
 
-; MIPS32: copy_s.w $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS32: mtc1 $[[R2]], $f[[F0:[0-9]+]]
-; MIPS32: copy_s.w $[[R3:[0-9]+]], $w[[W2]][1]
-; MIPS32: mthc1 $[[R3]], $f[[F0]]
 
-; MIPS64: copy_s.d $[[R2:[0-9]+]], $w[[W2]][0]
-; MIPS64: dmtc1 $[[R2]], $f[[F0:[0-9]+]]
 
-; ALL:    trunc.w.d $f[[F1:[0-9]+]], $f[[F0]]
-; ALL:    mfc1 $2, $f[[F1]]
 }
 
 define void @uitofp(i32 %a) {
+; MIPS32-LABEL: uitofp:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -8
+; MIPS32-NEXT:    .cfi_def_cfa_offset 8
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lui $2, 17200
+; MIPS32-NEXT:    sw $2, 4($sp)
+; MIPS32-NEXT:    sw $4, 0($sp)
+; MIPS32-NEXT:    lw $2, %got($CPI5_0)($1)
+; MIPS32-NEXT:    ldc1 $f0, %lo($CPI5_0)($2)
+; MIPS32-NEXT:    ldc1 $f1, 0($sp)
+; MIPS32-NEXT:    sub.d $f0, $f1, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w1, $2
+; MIPS32-NEXT:    mfhc1 $2, $f0
+; MIPS32-NEXT:    insert.w $w1[1], $2
+; MIPS32-NEXT:    insert.w $w1[3], $2
+; MIPS32-NEXT:    fexdo.w $w0, $w1, $w1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    lw $1, %got(h)($1)
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    sh $2, 0($1)
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 8
+;
+; MIPS64-N32-LABEL: uitofp:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -16
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(uitofp)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
+; MIPS64-N32-NEXT:    lui $2, 17200
+; MIPS64-N32-NEXT:    sw $2, 12($sp)
+; MIPS64-N32-NEXT:    sll $2, $4, 0
+; MIPS64-N32-NEXT:    sw $2, 8($sp)
+; MIPS64-N32-NEXT:    lw $2, %got_page(.LCPI5_0)($1)
+; MIPS64-N32-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
+; MIPS64-N32-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64-N32-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64-N32-NEXT:    dmfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.d $w0, $2
+; MIPS64-N32-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    lw $1, %got_disp(h)($1)
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 16
+;
+; MIPS64-N64-LABEL: uitofp:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -16
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 16
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(uitofp)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(uitofp)))
+; MIPS64-N64-NEXT:    lui $2, 17200
+; MIPS64-N64-NEXT:    sw $2, 12($sp)
+; MIPS64-N64-NEXT:    sll $2, $4, 0
+; MIPS64-N64-NEXT:    sw $2, 8($sp)
+; MIPS64-N64-NEXT:    ld $2, %got_page(.LCPI5_0)($1)
+; MIPS64-N64-NEXT:    ldc1 $f0, %got_ofst(.LCPI5_0)($2)
+; MIPS64-N64-NEXT:    ldc1 $f1, 8($sp)
+; MIPS64-N64-NEXT:    sub.d $f0, $f1, $f0
+; MIPS64-N64-NEXT:    dmfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.d $w0, $2
+; MIPS64-N64-NEXT:    fexdo.w $w0, $w0, $w0
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    ld $1, %got_disp(h)($1)
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    sh $2, 0($1)
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 16
 entry:
-; ALL-LABEL: uitofp:
 
-; MIPS32-O32: ldc1 $f[[F0:[0-9]+]], %lo($CPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS32-O32: ldc1 $f[[F1:[0-9]+]], 0($sp)
 
-; MIPS64-N32: ldc1 $f[[F0:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N32: ldc1 $f[[F1:[0-9]+]], 8($sp)
 
-; MIPS64-N64: ldc1 $f[[F0:[0-9]+]], %got_ofst(.LCPI{{[0-9]+}}_{{[0-9]+}})
-; MIPS64-N64: ldc1 $f[[F1:[0-9]+]], 8($sp)
 
-; MIPSR5:     sub.d $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
-; MIPSR6-O32: sub.d $f[[F2:[0-9]+]], $f[[F0]], $f[[F1]]
-; MIPSR6-N32: sub.d $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
-; MIPSR6-N64: sub.d $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
 
-; MIPS32:     mfc1 $[[R0:[0-9]+]], $f[[F2]]
-; MIPS32:     fill.w $w[[W0:[0-9]+]], $[[R0]]
-; MIPS32:     mfhc1 $[[R1:[0-9]+]], $f[[F2]]
-; MIPS32:     insert.w $w[[W0]][1], $[[R1]]
-; MIPS32:     insert.w $w[[W0]][3], $[[R1]]
 
-; MIPS64-N64-DAG: ld $[[R3:[0-9]+]], %got_disp(h)
-; MIPS64-N32-DAG: lw $[[R3:[0-9]+]], %got_disp(h)
-; MIPS64-DAG:     dmfc1 $[[R1:[0-9]+]], $f[[F2]]
-; MIPS64-DAG:     fill.d $w[[W0:[0-9]+]], $[[R1]]
 
-; ALL-DAG:        fexdo.w $w[[W1:[0-9]+]], $w[[W0]], $w[[W0]]
-; ALL-DAG:        fexdo.h $w[[W2:[0-9]+]], $w[[W1]], $w[[W1]]
 
-; MIPS32-DAG:     lw $[[R3:[0-9]+]], %got(h)
 
-; ALL:        copy_u.h $[[R2:[0-9]+]], $w[[W2]]
-; ALL:        sh $[[R2]], 0($[[R3]])
   %0 = uitofp i32 %a to half
   store half %0, half * @h, align 2
   ret void
@@ -256,30 +665,74 @@ entry:
 ; We don't check f16 -> f64 expansion occurs, as we expand f16 to f32.
 
 define void @fadd() {
+; MIPS32-LABEL: fadd:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    add.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fadd:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fadd)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    add.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fadd:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fadd)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fadd)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    add.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fadd:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %add = fadd float %1, %3
 
-; ALL: add.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
  %4 = call i16 @llvm.convert.to.fp16.f32(float %add)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
-; ALL: sh $[[R3]]
    store i16 %4, i16* @g, align 2
   ret void
 }
@@ -292,126 +745,338 @@ declare i16 @llvm.convert.to.fp16.f32(float)
 
 ; Function Attrs: nounwind
 define void @fsub() {
+; MIPS32-LABEL: fsub:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    sub.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fsub:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fsub)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fsub)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    sub.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fsub:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fsub)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fsub)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    sub.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fsub:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %sub = fsub float %1, %3
 
-; ALL: sub.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %sub)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %4, i16* @g, align 2
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @fmult() {
+; MIPS32-LABEL: fmult:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fmult:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmult)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fmult)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fmult:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmult)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fmult)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fmult:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %mul = fmul float %1, %3
 
-; ALL: mul.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %mul)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %4, i16* @g, align 2
 
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @fdiv() {
+; MIPS32-LABEL: fdiv:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    div.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fdiv:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fdiv)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fdiv)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    div.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fdiv:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fdiv)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fdiv)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    div.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fdiv:
 
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %div = fdiv float %1, %3
 
-; ALL: div.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %div)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
   store i16 %4, i16* @g, align 2
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @frem() {
+; MIPS32-LABEL: frem:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    lw $25, %call16(fmodf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: frem:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(frem)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(frem)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    lw $25, %call16(fmodf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: frem:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(frem)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(frem)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    ld $25, %call16(fmodf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: frem:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:        lh $[[R0:[0-9]+]]
-; ALL:        fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:        fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:        copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %rem = frem float %1, %3
 
-; MIPS32:     lw $25, %call16(fmodf)($gp)
-; MIPS64-N32: lw $25, %call16(fmodf)($gp)
-; MIPS64-N64: ld $25, %call16(fmodf)($gp)
-; ALL:        jalr $25
 
   %4 = call i16 @llvm.convert.to.fp16.f32(float %rem)
 
-; ALL:        mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:        fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:        fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:        copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %4, i16* @g, align 2
-; ALL:        sh $[[R3]]
 
   ret void
 }
@@ -419,31 +1084,127 @@ entry:
 @i1 = external global i16, align 1
 
 define void @fcmp() {
+; MIPS32-O32-LABEL: fcmp:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addu $1, $2, $25
+; MIPS32-O32-NEXT:    lw $2, %got(g)($1)
+; MIPS32-O32-NEXT:    lh $2, 0($2)
+; MIPS32-O32-NEXT:    fill.h $w0, $2
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f0
+; MIPS32-O32-NEXT:    addiu $2, $zero, 1
+; MIPS32-O32-NEXT:    c.un.s $f0, $f0
+; MIPS32-O32-NEXT:    movt $2, $zero, $fcc0
+; MIPS32-O32-NEXT:    lw $1, %got(i1)($1)
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N32-LABEL: fcmp:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N32-NEXT:    lw $2, %got_disp(g)($1)
+; MIPS64R5-N32-NEXT:    lh $2, 0($2)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $2
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N32-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N32-NEXT:    addiu $2, $zero, 1
+; MIPS64R5-N32-NEXT:    c.un.s $f0, $f0
+; MIPS64R5-N32-NEXT:    movt $2, $zero, $fcc0
+; MIPS64R5-N32-NEXT:    lw $1, %got_disp(i1)($1)
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N64-LABEL: fcmp:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPS64R5-N64-NEXT:    ld $2, %got_disp(g)($1)
+; MIPS64R5-N64-NEXT:    lh $2, 0($2)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $2
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N64-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N64-NEXT:    addiu $2, $zero, 1
+; MIPS64R5-N64-NEXT:    c.un.s $f0, $f0
+; MIPS64R5-N64-NEXT:    movt $2, $zero, $fcc0
+; MIPS64R5-N64-NEXT:    ld $1, %got_disp(i1)($1)
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-O32-LABEL: fcmp:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $2, %got(g)($1)
+; MIPSR6-O32-NEXT:    lh $2, 0($2)
+; MIPSR6-O32-NEXT:    fill.h $w0, $2
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f0
+; MIPSR6-O32-NEXT:    cmp.un.s $f0, $f0, $f0
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    not $2, $2
+; MIPSR6-O32-NEXT:    andi $2, $2, 1
+; MIPSR6-O32-NEXT:    lw $1, %got(i1)($1)
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: fcmp:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPSR6-N32-NEXT:    lw $2, %got_disp(g)($1)
+; MIPSR6-N32-NEXT:    lh $2, 0($2)
+; MIPSR6-N32-NEXT:    fill.h $w0, $2
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT:    mtc1 $2, $f0
+; MIPSR6-N32-NEXT:    cmp.un.s $f0, $f0, $f0
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    not $2, $2
+; MIPSR6-N32-NEXT:    andi $2, $2, 1
+; MIPSR6-N32-NEXT:    lw $1, %got_disp(i1)($1)
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: fcmp:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcmp)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fcmp)))
+; MIPSR6-N64-NEXT:    ld $2, %got_disp(g)($1)
+; MIPSR6-N64-NEXT:    lh $2, 0($2)
+; MIPSR6-N64-NEXT:    fill.h $w0, $2
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT:    mtc1 $2, $f0
+; MIPSR6-N64-NEXT:    cmp.un.s $f0, $f0, $f0
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    not $2, $2
+; MIPSR6-N64-NEXT:    andi $2, $2, 1
+; MIPSR6-N64-NEXT:    ld $1, %got_disp(i1)($1)
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fcmp:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
-; ALL:        lh $[[R0:[0-9]+]]
-; ALL:        fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:        fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:        copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %2 = load i16, i16* @g, align 2
   %3 = call float @llvm.convert.from.fp16.f32(i16 %2)
   %fcmp = fcmp oeq float %1, %3
 
-; MIPSR5: addiu $[[R2:[0-9]+]], $zero, 1
-; MIPSR5: c.un.s $f[[F0]], $f[[F0]]
-; MIPSR5: movt $[[R2]], $zero, $fcc0
-; MIPSR6: cmp.un.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
-; MIPSR6: mfc1 $[[R3:[0-9]]], $f[[F1]]
-; MIPSR6: not $[[R4:[0-9]+]], $[[R3]]
-; MIPSR6: andi $[[R2:[0-9]+]], $[[R4]], 1
 
   %4 = zext i1 %fcmp to i16
   store i16 %4, i16* @i1, align 2
-; ALL:        sh $[[R2]]
 
   ret void
 }
@@ -451,125 +1212,406 @@ entry:
 declare float @llvm.powi.f32(float, i32)
 
 define void @fpowi() {
+; MIPS32-LABEL: fpowi:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fpowi:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fpowi)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fpowi:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fpowi)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    mul.s $f0, $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fpowi:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
 
   %powi = call float @llvm.powi.f32(float %1, i32 2)
 
-; ALL: mul.s $f[[F1:[0-9]+]], $f[[F0]], $f[[F0]]
 
   %2 = call i16 @llvm.convert.to.fp16.f32(float %powi)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL: sh $[[R3]]
   ret void
 }
 
 define void @fpowi_var(i32 %var) {
+; MIPS32-LABEL: fpowi_var:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    lw $25, %call16(__powisf2)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    move $5, $4
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fpowi_var:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N32-NEXT:    sll $5, $4, 0
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(__powisf2)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fpowi_var:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fpowi_var)))
+; MIPS64-N64-NEXT:    sll $5, $4, 0
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(__powisf2)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fpowi_var:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
 
   %powi = call float @llvm.powi.f32(float %1, i32 %var)
 
-; ALL-DAG: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(__powisf2)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(__powisf2)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(__powisf2)($gp)
-; ALL-DAG:        jalr $25
 
   %2 = call i16 @llvm.convert.to.fp16.f32(float %powi)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.pow.f32(float %Val, float %power)
 
 define void @fpow(float %var) {
+; MIPS32-LABEL: fpow:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(powf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fpow:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fpow)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fpow)))
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(powf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fpow:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fpow)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fpow)))
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(powf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fpow:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
 
   %powi = call float @llvm.pow.f32(float %1, float %var)
 
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(powf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(powf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(powf)($gp)
-; ALL-DAG:        jalr $25
 
   %2 = call i16 @llvm.convert.to.fp16.f32(float %powi)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.log2.f32(float %Val)
 
 define void @flog2() {
+; MIPS32-LABEL: flog2:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(log2f)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: flog2:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(flog2)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(flog2)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(log2f)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: flog2:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(flog2)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(flog2)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(log2f)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: flog2:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(log2f)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(log2f)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(log2f)($gp)
-; ALL-DAG:        jalr $25
 
   %log2 = call float @llvm.log2.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %log2)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -577,31 +1619,108 @@ entry:
 declare float @llvm.log10.f32(float %Val)
 
 define void @flog10() {
+; MIPS32-LABEL: flog10:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(log10f)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: flog10:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(flog10)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(flog10)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(log10f)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: flog10:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(flog10)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(flog10)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(log10f)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: flog10:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(log10f)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(log10f)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(log10f)($gp)
-; ALL-DAG:        jalr $25
 
   %log10 = call float @llvm.log10.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %log10)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -609,28 +1728,72 @@ entry:
 declare float @llvm.sqrt.f32(float %Val)
 
 define void @fsqrt() {
+; MIPS32-LABEL: fsqrt:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    sqrt.s $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fsqrt:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fsqrt)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fsqrt)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    sqrt.s $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fsqrt:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fsqrt)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fsqrt)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    sqrt.s $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fsqrt:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL: lh $[[R0:[0-9]+]]
-; ALL: fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL: fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL: copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL: mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; ALL: sqrt.s $f[[F1:[0-9]+]], $f[[F0]]
 
   %sqrt = call float @llvm.sqrt.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %sqrt)
 
-; ALL: mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL: fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL: fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL: copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL: sh $[[R3]]
 
   ret void
 }
@@ -638,31 +1801,108 @@ entry:
 declare float @llvm.sin.f32(float %Val)
 
 define void @fsin() {
+; MIPS32-LABEL: fsin:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(sinf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fsin:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fsin)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fsin)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(sinf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fsin:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fsin)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fsin)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(sinf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fsin:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(sinf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(sinf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(sinf)($gp)
-; ALL-DAG:        jalr $25
 
   %sin = call float @llvm.sin.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %sin)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -670,31 +1910,108 @@ entry:
 declare float @llvm.cos.f32(float %Val)
 
 define void @fcos() {
+; MIPS32-LABEL: fcos:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(cosf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fcos:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcos)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fcos)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(cosf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fcos:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcos)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fcos)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(cosf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fcos:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(cosf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(cosf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(cosf)($gp)
-; ALL-DAG:        jalr $25
 
   %cos = call float @llvm.cos.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %cos)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -702,30 +2019,107 @@ entry:
 declare float @llvm.exp.f32(float %Val)
 
 define void @fexp() {
+; MIPS32-LABEL: fexp:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(expf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fexp:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fexp)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(expf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fexp:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fexp)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(expf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fexp:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(expf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(expf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(expf)($gp)
-; ALL-DAG:        jalr $25
 
   %exp = call float @llvm.exp.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %exp)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -733,31 +2127,108 @@ entry:
 declare float @llvm.exp2.f32(float %Val)
 
 define void @fexp2() {
+; MIPS32-LABEL: fexp2:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(exp2f)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fexp2:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp2)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fexp2)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(exp2f)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fexp2:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fexp2)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fexp2)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(exp2f)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fexp2:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(exp2f)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(exp2f)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(exp2f)($gp)
-; ALL-DAG:        jalr $25
 
   %exp2 = call float @llvm.exp2.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %exp2)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -765,31 +2236,115 @@ entry:
 declare float @llvm.fma.f32(float, float, float)
 
 define void @ffma(float %b, float %c) {
+; MIPS32-LABEL: ffma:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f0, $f12
+; MIPS32-NEXT:    mfc1 $6, $f14
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w1, $1
+; MIPS32-NEXT:    fexupr.w $w1, $w1
+; MIPS32-NEXT:    copy_s.w $1, $w1[0]
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    lw $25, %call16(fmaf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mov.s $f14, $f0
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: ffma:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffma)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(ffma)))
+; MIPS64-N32-NEXT:    mov.s $f14, $f13
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(fmaf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: ffma:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffma)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(ffma)))
+; MIPS64-N64-NEXT:    mov.s $f14, $f13
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(fmaf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: ffma:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(fmaf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(fmaf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(fmaf)($gp)
-; ALL-DAG:        jalr $25
 
   %fma = call float @llvm.fma.f32(float %1, float %b, float %c)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %fma)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -800,34 +2355,134 @@ entry:
 declare float @llvm.fmuladd.f32(float, float, float)
 
 define void @ffmuladd(float %b, float %c) {
+; MIPS32-O32-LABEL: ffmuladd:
+; MIPS32-O32:       # %bb.0: # %entry
+; MIPS32-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-O32-NEXT:    addu $1, $2, $25
+; MIPS32-O32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-O32-NEXT:    lh $2, 0($1)
+; MIPS32-O32-NEXT:    fill.h $w0, $2
+; MIPS32-O32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-O32-NEXT:    mtc1 $2, $f0
+; MIPS32-O32-NEXT:    madd.s $f0, $f14, $f0, $f12
+; MIPS32-O32-NEXT:    mfc1 $2, $f0
+; MIPS32-O32-NEXT:    fill.w $w0, $2
+; MIPS32-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-O32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-O32-NEXT:    jr $ra
+; MIPS32-O32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N32-LABEL: ffmuladd:
+; MIPS64R5-N32:       # %bb.0: # %entry
+; MIPS64R5-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N32-NEXT:    addu $1, $1, $25
+; MIPS64R5-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64R5-N32-NEXT:    lh $2, 0($1)
+; MIPS64R5-N32-NEXT:    fill.h $w0, $2
+; MIPS64R5-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N32-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N32-NEXT:    madd.s $f0, $f13, $f0, $f12
+; MIPS64R5-N32-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N32-NEXT:    fill.w $w0, $2
+; MIPS64R5-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64R5-N32-NEXT:    jr $ra
+; MIPS64R5-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64R5-N64-LABEL: ffmuladd:
+; MIPS64R5-N64:       # %bb.0: # %entry
+; MIPS64R5-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N64-NEXT:    daddu $1, $1, $25
+; MIPS64R5-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPS64R5-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64R5-N64-NEXT:    lh $2, 0($1)
+; MIPS64R5-N64-NEXT:    fill.h $w0, $2
+; MIPS64R5-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64R5-N64-NEXT:    mtc1 $2, $f0
+; MIPS64R5-N64-NEXT:    madd.s $f0, $f13, $f0, $f12
+; MIPS64R5-N64-NEXT:    mfc1 $2, $f0
+; MIPS64R5-N64-NEXT:    fill.w $w0, $2
+; MIPS64R5-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64R5-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64R5-N64-NEXT:    jr $ra
+; MIPS64R5-N64-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-O32-LABEL: ffmuladd:
+; MIPSR6-O32:       # %bb.0: # %entry
+; MIPSR6-O32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPSR6-O32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPSR6-O32-NEXT:    addu $1, $2, $25
+; MIPSR6-O32-NEXT:    lw $1, %got(g)($1)
+; MIPSR6-O32-NEXT:    lh $2, 0($1)
+; MIPSR6-O32-NEXT:    fill.h $w0, $2
+; MIPSR6-O32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-O32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-O32-NEXT:    mtc1 $2, $f0
+; MIPSR6-O32-NEXT:    mul.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT:    add.s $f0, $f0, $f14
+; MIPSR6-O32-NEXT:    mfc1 $2, $f0
+; MIPSR6-O32-NEXT:    fill.w $w0, $2
+; MIPSR6-O32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-O32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-O32-NEXT:    jr $ra
+; MIPSR6-O32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N32-LABEL: ffmuladd:
+; MIPSR6-N32:       # %bb.0: # %entry
+; MIPSR6-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N32-NEXT:    addu $1, $1, $25
+; MIPSR6-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPSR6-N32-NEXT:    lh $2, 0($1)
+; MIPSR6-N32-NEXT:    fill.h $w0, $2
+; MIPSR6-N32-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N32-NEXT:    mtc1 $2, $f0
+; MIPSR6-N32-NEXT:    mul.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT:    add.s $f0, $f0, $f13
+; MIPSR6-N32-NEXT:    mfc1 $2, $f0
+; MIPSR6-N32-NEXT:    fill.w $w0, $2
+; MIPSR6-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N32-NEXT:    jr $ra
+; MIPSR6-N32-NEXT:    sh $2, 0($1)
+;
+; MIPSR6-N64-LABEL: ffmuladd:
+; MIPSR6-N64:       # %bb.0: # %entry
+; MIPSR6-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N64-NEXT:    daddu $1, $1, $25
+; MIPSR6-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffmuladd)))
+; MIPSR6-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPSR6-N64-NEXT:    lh $2, 0($1)
+; MIPSR6-N64-NEXT:    fill.h $w0, $2
+; MIPSR6-N64-NEXT:    fexupr.w $w0, $w0
+; MIPSR6-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPSR6-N64-NEXT:    mtc1 $2, $f0
+; MIPSR6-N64-NEXT:    mul.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT:    add.s $f0, $f0, $f13
+; MIPSR6-N64-NEXT:    mfc1 $2, $f0
+; MIPSR6-N64-NEXT:    fill.w $w0, $2
+; MIPSR6-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPSR6-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPSR6-N64-NEXT:    jr $ra
+; MIPSR6-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: ffmuladd:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:            mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-O32:     madd.s $f[[F1:[0-9]]], $f14, $f[[F0]], $f12
 ; MIPS32-N32:     madd.s $f[[F1:[0-9]]], $f13, $f[[F0]], $f12
 ; MIPS32-N64:     madd.s $f[[F1:[0-9]]], $f13, $f[[F0]], $f12
-; MIPSR6:         mul.s $f[[F2:[0-9]+]], $f[[F0]], $f12
-; MIPSR6-O32:     add.s $f[[F1:[0-9]+]], $f[[F2]], $f14
-; MIPSR6-N32:     add.s $f[[F1:[0-9]+]], $f[[F2]], $f13
-; MIPSR6-N64:     add.s $f[[F1:[0-9]+]], $f[[F2]], $f13
 
   %fmuladd = call float @llvm.fmuladd.f32(float %1, float %b, float %c)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %fmuladd)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -835,60 +2490,184 @@ entry:
 declare float @llvm.fabs.f32(float %Val)
 
 define void @ffabs() {
+; MIPS32-LABEL: ffabs:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mtc1 $2, $f0
+; MIPS32-NEXT:    abs.s $f0, $f0
+; MIPS32-NEXT:    mfc1 $2, $f0
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: ffabs:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffabs)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(ffabs)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mtc1 $2, $f0
+; MIPS64-N32-NEXT:    abs.s $f0, $f0
+; MIPS64-N32-NEXT:    mfc1 $2, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: ffabs:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffabs)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(ffabs)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mtc1 $2, $f0
+; MIPS64-N64-NEXT:    abs.s $f0, $f0
+; MIPS64-N64-NEXT:    mfc1 $2, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: ffabs:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL:            mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; ALL:            abs.s $f[[F1:[0-9]+]], $f[[F0]]
 
   %fabs = call float @llvm.fabs.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %fabs)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
 
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.minnum.f32(float %Val, float %b)
 
 define void @fminnum(float %b) {
+; MIPS32-LABEL: fminnum:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(fminf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fminnum:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(fminf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fminnum:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fminnum)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(fminf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fminnum:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(fminf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(fminf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(fminf)($gp)
-; ALL-DAG:        jalr $25
 
   %minnum = call float @llvm.minnum.f32(float %1, float %b)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %minnum)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -896,31 +2675,111 @@ entry:
 declare float @llvm.maxnum.f32(float %Val, float %b)
 
 define void @fmaxnum(float %b) {
+; MIPS32-LABEL: fmaxnum:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    mov.s $f14, $f12
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(fmaxf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fmaxnum:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N32-NEXT:    mov.s $f13, $f12
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(fmaxf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fmaxnum:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPS64-N64-NEXT:    mov.s $f13, $f12
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(fmaxf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fmaxnum:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(fmaxf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(fmaxf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(fmaxf)($gp)
-; ALL-DAG:        jalr $25
 
   %maxnum = call float @llvm.maxnum.f32(float %1, float %b)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %maxnum)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:             sh $[[R3]]
 
   ret void
 }
@@ -930,28 +2789,72 @@ entry:
 declare float @llvm.copysign.f32(float %Val, float %b)
 
 define void @fcopysign(float %b) {
+; MIPS32-LABEL: fcopysign:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addu $1, $2, $25
+; MIPS32-NEXT:    lw $1, %got(g)($1)
+; MIPS32-NEXT:    lh $2, 0($1)
+; MIPS32-NEXT:    fill.h $w0, $2
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS32-NEXT:    mfc1 $3, $f12
+; MIPS32-NEXT:    ext $3, $3, 31, 1
+; MIPS32-NEXT:    ins $2, $3, 31, 1
+; MIPS32-NEXT:    fill.w $w0, $2
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N32-LABEL: fcopysign:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fcopysign)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $1, $1, %lo(%neg(%gp_rel(fcopysign)))
+; MIPS64-N32-NEXT:    lw $1, %got_disp(g)($1)
+; MIPS64-N32-NEXT:    lh $2, 0($1)
+; MIPS64-N32-NEXT:    fill.h $w0, $2
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N32-NEXT:    mfc1 $3, $f12
+; MIPS64-N32-NEXT:    ext $3, $3, 31, 1
+; MIPS64-N32-NEXT:    ins $2, $3, 31, 1
+; MIPS64-N32-NEXT:    fill.w $w0, $2
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    sh $2, 0($1)
+;
+; MIPS64-N64-LABEL: fcopysign:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fcopysign)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $1, $1, %lo(%neg(%gp_rel(fcopysign)))
+; MIPS64-N64-NEXT:    ld $1, %got_disp(g)($1)
+; MIPS64-N64-NEXT:    lh $2, 0($1)
+; MIPS64-N64-NEXT:    fill.h $w0, $2
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $2, $w0[0]
+; MIPS64-N64-NEXT:    mfc1 $3, $f12
+; MIPS64-N64-NEXT:    ext $3, $3, 31, 1
+; MIPS64-N64-NEXT:    ins $2, $3, 31, 1
+; MIPS64-N64-NEXT:    fill.w $w0, $2
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $2, $w0[0]
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    sh $2, 0($1)
 entry:
-; ALL-LABEL: fcopysign:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
 
   %copysign = call float @llvm.copysign.f32(float %1, float %b)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %copysign)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f12
-; ALL:            ext $[[R3:[0-9]+]], $3, 31, 1
-; ALL:            ins $[[R1]], $[[R3]], 31, 1
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R1]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -959,31 +2862,108 @@ entry:
 declare float @llvm.floor.f32(float %Val)
 
 define void @ffloor() {
+; MIPS32-LABEL: ffloor:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(floorf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: ffloor:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ffloor)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(ffloor)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(floorf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: ffloor:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ffloor)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(ffloor)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(floorf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: ffloor:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(floorf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(floorf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(floorf)($gp)
-; ALL-DAG:        jalr $25
 
   %floor = call float @llvm.floor.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %floor)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -991,31 +2971,108 @@ entry:
 declare float @llvm.ceil.f32(float %Val)
 
 define void @fceil() {
+; MIPS32-LABEL: fceil:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(ceilf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fceil:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fceil)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fceil)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(ceilf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fceil:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fceil)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fceil)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(ceilf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fceil:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(ceilf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(ceilf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(ceilf)($gp)
-; ALL-DAG:        jalr $25
 
   %ceil = call float @llvm.ceil.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %ceil)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -1023,31 +3080,108 @@ entry:
 declare float @llvm.trunc.f32(float %Val)
 
 define void @ftrunc() {
+; MIPS32-LABEL: ftrunc:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(truncf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: ftrunc:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(ftrunc)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(ftrunc)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(truncf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: ftrunc:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(ftrunc)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(ftrunc)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(truncf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: ftrunc:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(truncf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(truncf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(truncf)($gp)
-; ALL-DAG:        jalr $25
 
   %trunc = call float @llvm.trunc.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %trunc)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -1055,61 +3189,215 @@ entry:
 declare float @llvm.rint.f32(float %Val)
 
 define void @frint() {
+; MIPS32-LABEL: frint:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(rintf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: frint:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(frint)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(frint)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(rintf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: frint:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(frint)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(frint)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(rintf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: frint:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(rintf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(rintf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(rintf)($gp)
-; ALL-DAG:        jalr $25
   %rint = call float @llvm.rint.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %rint)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
   store i16 %2, i16* @g, align 2
 
-; ALL:            sh $[[R3]]
   ret void
 }
 
 declare float @llvm.nearbyint.f32(float %Val)
 
 define void @fnearbyint() {
+; MIPS32-LABEL: fnearbyint:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(nearbyintf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fnearbyint:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(nearbyintf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fnearbyint:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fnearbyint)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(nearbyintf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fnearbyint:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(nearbyintf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(nearbyintf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(nearbyintf)($gp)
-; ALL-DAG:        jalr $25
 
   %nearbyint = call float @llvm.nearbyint.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %nearbyint)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
@@ -1117,31 +3405,108 @@ entry:
 declare float @llvm.round.f32(float %Val)
 
 define void @fround() {
+; MIPS32-LABEL: fround:
+; MIPS32:       # %bb.0: # %entry
+; MIPS32-NEXT:    lui $2, %hi(_gp_disp)
+; MIPS32-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; MIPS32-NEXT:    addiu $sp, $sp, -24
+; MIPS32-NEXT:    .cfi_def_cfa_offset 24
+; MIPS32-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    sw $16, 16($sp) # 4-byte Folded Spill
+; MIPS32-NEXT:    .cfi_offset 31, -4
+; MIPS32-NEXT:    .cfi_offset 16, -8
+; MIPS32-NEXT:    addu $gp, $2, $25
+; MIPS32-NEXT:    lw $16, %got(g)($gp)
+; MIPS32-NEXT:    lh $1, 0($16)
+; MIPS32-NEXT:    fill.h $w0, $1
+; MIPS32-NEXT:    fexupr.w $w0, $w0
+; MIPS32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS32-NEXT:    lw $25, %call16(roundf)($gp)
+; MIPS32-NEXT:    jalr $25
+; MIPS32-NEXT:    mtc1 $1, $f12
+; MIPS32-NEXT:    mfc1 $1, $f0
+; MIPS32-NEXT:    fill.w $w0, $1
+; MIPS32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS32-NEXT:    sh $1, 0($16)
+; MIPS32-NEXT:    lw $16, 16($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MIPS32-NEXT:    jr $ra
+; MIPS32-NEXT:    addiu $sp, $sp, 24
+;
+; MIPS64-N32-LABEL: fround:
+; MIPS64-N32:       # %bb.0: # %entry
+; MIPS64-N32-NEXT:    addiu $sp, $sp, -32
+; MIPS64-N32-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N32-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N32-NEXT:    .cfi_offset 31, -8
+; MIPS64-N32-NEXT:    .cfi_offset 28, -16
+; MIPS64-N32-NEXT:    .cfi_offset 16, -24
+; MIPS64-N32-NEXT:    lui $1, %hi(%neg(%gp_rel(fround)))
+; MIPS64-N32-NEXT:    addu $1, $1, $25
+; MIPS64-N32-NEXT:    addiu $gp, $1, %lo(%neg(%gp_rel(fround)))
+; MIPS64-N32-NEXT:    lw $16, %got_disp(g)($gp)
+; MIPS64-N32-NEXT:    lh $1, 0($16)
+; MIPS64-N32-NEXT:    fill.h $w0, $1
+; MIPS64-N32-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N32-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N32-NEXT:    lw $25, %call16(roundf)($gp)
+; MIPS64-N32-NEXT:    jalr $25
+; MIPS64-N32-NEXT:    mtc1 $1, $f12
+; MIPS64-N32-NEXT:    mfc1 $1, $f0
+; MIPS64-N32-NEXT:    fill.w $w0, $1
+; MIPS64-N32-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N32-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N32-NEXT:    sh $1, 0($16)
+; MIPS64-N32-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N32-NEXT:    jr $ra
+; MIPS64-N32-NEXT:    addiu $sp, $sp, 32
+;
+; MIPS64-N64-LABEL: fround:
+; MIPS64-N64:       # %bb.0: # %entry
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, -32
+; MIPS64-N64-NEXT:    .cfi_def_cfa_offset 32
+; MIPS64-N64-NEXT:    sd $ra, 24($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $gp, 16($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    sd $16, 8($sp) # 8-byte Folded Spill
+; MIPS64-N64-NEXT:    .cfi_offset 31, -8
+; MIPS64-N64-NEXT:    .cfi_offset 28, -16
+; MIPS64-N64-NEXT:    .cfi_offset 16, -24
+; MIPS64-N64-NEXT:    lui $1, %hi(%neg(%gp_rel(fround)))
+; MIPS64-N64-NEXT:    daddu $1, $1, $25
+; MIPS64-N64-NEXT:    daddiu $gp, $1, %lo(%neg(%gp_rel(fround)))
+; MIPS64-N64-NEXT:    ld $16, %got_disp(g)($gp)
+; MIPS64-N64-NEXT:    lh $1, 0($16)
+; MIPS64-N64-NEXT:    fill.h $w0, $1
+; MIPS64-N64-NEXT:    fexupr.w $w0, $w0
+; MIPS64-N64-NEXT:    copy_s.w $1, $w0[0]
+; MIPS64-N64-NEXT:    ld $25, %call16(roundf)($gp)
+; MIPS64-N64-NEXT:    jalr $25
+; MIPS64-N64-NEXT:    mtc1 $1, $f12
+; MIPS64-N64-NEXT:    mfc1 $1, $f0
+; MIPS64-N64-NEXT:    fill.w $w0, $1
+; MIPS64-N64-NEXT:    fexdo.h $w0, $w0, $w0
+; MIPS64-N64-NEXT:    copy_u.h $1, $w0[0]
+; MIPS64-N64-NEXT:    sh $1, 0($16)
+; MIPS64-N64-NEXT:    ld $16, 8($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $gp, 16($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    ld $ra, 24($sp) # 8-byte Folded Reload
+; MIPS64-N64-NEXT:    jr $ra
+; MIPS64-N64-NEXT:    daddiu $sp, $sp, 32
 entry:
-; ALL-LABEL: fround:
   %0 = load i16, i16* @g, align 2
   %1 = call float @llvm.convert.from.fp16.f32(i16 %0)
 
-; ALL:            lh $[[R0:[0-9]+]]
-; ALL:            fill.h $w[[W0:[0-9]+]], $[[R0]]
-; ALL:            fexupr.w $w[[W1:[0-9]+]], $w[[W0]]
-; ALL:            copy_s.w $[[R1:[0-9]+]], $w[[W1]][0]
-; ALL-DAG:        mtc1 $[[R1]], $f[[F0:[0-9]+]]
-; MIPS32-DAG:     lw $25, %call16(roundf)($gp)
-; MIPS64-N32-DAG: lw $25, %call16(roundf)($gp)
-; MIPS64-N64-DAG: ld $25, %call16(roundf)($gp)
-; ALL-DAG:        jalr $25
 
   %round = call float @llvm.round.f32(float %1)
   %2 = call i16 @llvm.convert.to.fp16.f32(float %round)
 
-; ALL:            mfc1 $[[R2:[0-9]+]], $f[[F1]]
-; ALL:            fill.w $w[[W2:[0-9]+]], $[[R2]]
-; ALL:            fexdo.h $w[[W3:[0-9]+]], $w[[W2]], $w[[W2]]
-; ALL:            copy_u.h $[[R3:[0-9]+]], $w[[W3]][0]
 
   store i16 %2, i16* @g, align 2
-; ALL:            sh $[[R3]]
 
   ret void
 }
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index d61f05dc8688440657dcbe07b0d9ca284ead86dc..19eb80b79bafe88b6da98002d8e9ac0b9b2bb6eb 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -109,7 +109,8 @@ define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind {
 ; CHECK-NEXT:    lw $1, 64($sp)
 ; CHECK-NEXT:    lw $2, 68($sp)
 ; CHECK-NEXT:    lh $3, 58($sp)
-; CHECK-NEXT:    lb $5, 56($sp)
+; CHECK-NEXT:    sll $5, $6, 24
+; CHECK-NEXT:    sra $5, $5, 24
 ; CHECK-NEXT:    swc1 $f12, 36($sp)
 ; CHECK-NEXT:    sw $5, 32($sp)
 ; CHECK-NEXT:    sw $3, 28($sp)
@@ -191,11 +192,12 @@ define void @f4(float %f, %struct.S3* nocapture byval %s3, %struct.S1* nocapture
 ; CHECK-NEXT:    sw $ra, 44($sp) # 4-byte Folded Spill
 ; CHECK-NEXT:    addu $gp, $2, $25
 ; CHECK-NEXT:    move $4, $7
-; CHECK-NEXT:    sw $5, 52($sp)
 ; CHECK-NEXT:    sw $6, 56($sp)
+; CHECK-NEXT:    sw $5, 52($sp)
 ; CHECK-NEXT:    sw $7, 60($sp)
 ; CHECK-NEXT:    lw $1, 80($sp)
-; CHECK-NEXT:    lb $2, 52($sp)
+; CHECK-NEXT:    sll $2, $5, 24
+; CHECK-NEXT:    sra $2, $2, 24
 ; CHECK-NEXT:    addiu $3, $zero, 4
 ; CHECK-NEXT:    lui $5, 16576
 ; CHECK-NEXT:    sw $5, 36($sp)
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll
index 73aad48b73e68cc0162a48fe2ad7c006ba23ceb7..27d454f31d98c60509729a517ad97d94f79a93cf 100644
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -29,10 +29,10 @@ entry:
 
 ; CHECK-LABEL: va1:
 ; CHECK: addiu   $sp, $sp, -16
-; CHECK: sw      $5, 20($sp)
 ; CHECK: sw      $7, 28($sp)
 ; CHECK: sw      $6, 24($sp)
-; CHECK: lw      $2, 20($sp)
+; CHECK: sw      $5, 20($sp)
+; CHECK: move    $2, $5
 }
 
 ; check whether the variable double argument will be accessed from the 8-byte
@@ -83,9 +83,9 @@ entry:
 
 ; CHECK-LABEL: va3:
 ; CHECK: addiu   $sp, $sp, -16
-; CHECK: sw      $6, 24($sp)
 ; CHECK: sw      $7, 28($sp)
-; CHECK: lw      $2, 24($sp)
+; CHECK: sw      $6, 24($sp)
+; CHECK: move    $2, $6
 }
 
 ; double
@@ -135,7 +135,7 @@ entry:
 ; CHECK-LABEL: va5:
 ; CHECK: addiu   $sp, $sp, -24
 ; CHECK: sw      $7, 36($sp)
-; CHECK: lw      $2, 36($sp)
+; CHECK: move    $2, $7
 }
 
 ; double
diff --git a/test/CodeGen/PowerPC/addi-offset-fold.ll b/test/CodeGen/PowerPC/addi-offset-fold.ll
index ab00a4dab3a90615a3d9583264ee05f4430b9121..db2fb0eee7cb15a2c43e7d8e360fd4b7ac30e571 100644
--- a/test/CodeGen/PowerPC/addi-offset-fold.ll
+++ b/test/CodeGen/PowerPC/addi-offset-fold.ll
@@ -24,14 +24,12 @@ entry:
   ret i32 %bf.cast
 
 ; CHECK-LABEL: @foo
-; FIXME: We don't need to do these stores/loads at all.
+; FIXME: We don't need to do these stores at all.
 ; CHECK-DAG: std 3, -24(1)
 ; CHECK-DAG: stb 4, -16(1)
-; CHECK-DAG: lbz [[REG1:[0-9]+]], -16(1)
 ; CHECK-DAG: lwz [[REG2:[0-9]+]], -20(1)
-; CHECK-DAG: sldi [[REG3:[0-9]+]], [[REG1]], 32
-; CHECK-DAG: or [[REG4:[0-9]+]], [[REG2]], [[REG3]]
-; CHECK: rldicl 3, [[REG4]], 33, 57
+; CHECK-DAG: rlwinm 3, [[REG2]], 1, 31, 31
+; CHECK: rlwimi 3, 4, 1, 25, 30
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/big-endian-store-forward.ll b/test/CodeGen/PowerPC/big-endian-store-forward.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1125a229005ffc141548c17e2921d3e5e1860831
--- /dev/null
+++ b/test/CodeGen/PowerPC/big-endian-store-forward.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; The load is to the high byte of the 2-byte store
+@g = global i8 -75
+
+define void @f(i16 %v) {
+; CHECK-LABEL: f
+; CHECK: sth 3, -2(1)
+; CHECK: lbz 3, -2(1)
+  %p32 = alloca i16
+  store i16 %v, i16* %p32
+  %p16 = bitcast i16* %p32 to i8*
+  %tmp = load i8, i8* %p16
+  store i8 %tmp, i8* @g
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/bitfieldinsert.ll b/test/CodeGen/PowerPC/bitfieldinsert.ll
index e654c7d8a0cbbe85d36dd9033ff7d0c9b3f95d6a..76a648b6f13f4e354446577e4b277505fd6bec42 100644
--- a/test/CodeGen/PowerPC/bitfieldinsert.ll
+++ b/test/CodeGen/PowerPC/bitfieldinsert.ll
@@ -1,6 +1,35 @@
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
 
+; equivalent C code
+;   struct s64 {
+;   	int a:5;
+;   	int b:16;
+;   	long c:42;
+;   };
+;   void bitfieldinsert64(struct s *p, unsigned short v) {
+;   	p->b = v;
+;   }
+
+%struct.s64 = type { i64 }
+
+define void @bitfieldinsert64(%struct.s64* nocapture %p, i16 zeroext %v) {
+; CHECK-LABEL: @bitfieldinsert64
+; CHECK: ld [[REG1:[0-9]+]], 0(3)
+; CHECK-NEXT: rlwimi [[REG1]], 4, 5, 11, 26
+; CHECK-NEXT: std [[REG1]], 0(3)
+; CHECK-NEXT: blr
+entry:
+  %0 = getelementptr inbounds %struct.s64, %struct.s64* %p, i64 0, i32 0
+  %1 = zext i16 %v to i64
+  %bf.load = load i64, i64* %0, align 8
+  %bf.shl = shl nuw nsw i64 %1, 5
+  %bf.clear = and i64 %bf.load, -2097121
+  %bf.set = or i64 %bf.clear, %bf.shl
+  store i64 %bf.set, i64* %0, align 8
+  ret void
+}
+
 ; bitfieldinsert32: Test for rlwimi
 ; equivalent C code
 ;   struct s32 {
@@ -17,9 +46,9 @@
 define void @bitfieldinsert32(%struct.s32* nocapture %p, i32 zeroext %v) {
 ; CHECK-LABEL: @bitfieldinsert32
 ; CHECK: lwz [[REG1:[0-9]+]], 0(3)
-; CHECK: rlwimi [[REG1]], 4, 8, 8, 23
-; CHECK: stw [[REG1]], 0(3)
-; CHECK: blr
+; CHECK-NEXT: rlwimi [[REG1]], 4, 8, 8, 23
+; CHECK-NEXT: stw [[REG1]], 0(3)
+; CHECK-NEXT: blr
 entry:
   %0 = getelementptr inbounds %struct.s32, %struct.s32* %p, i64 0, i32 0
   %bf.load = load i32, i32* %0, align 4
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
index d192bafca23553ae71d7dcaf0590f9afc1372134..6f65b189b75459458ee40490a354740554b27ed2 100644
--- a/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -838,28 +838,26 @@ entry:
 ; P9LE-LABEL: fromRegsi
 ; P8BE-LABEL: fromRegsi
 ; P8LE-LABEL: fromRegsi
-; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5
-; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6
-; P9BE: vmrgow v2, [[REG1]], [[REG2]]
+; P9BE-DAG: rldimi r6, r5, 32, 0
+; P9BE-DAG: rldimi r4, r3, 32, 0
+; P9BE: mtvsrdd v2, r4, r6
 ; P9BE: blr
-; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3
-; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4
-; P9LE: vmrgow v2, [[REG2]], [[REG1]]
+; P9LE-DAG: rldimi r3, r4, 32, 0
+; P9LE-DAG: rldimi r5, r6, 32, 0
+; P9LE: mtvsrdd v2, r5, r3
 ; P9LE: blr
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]]
-; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]]
-; P8BE: vmrgow v2, [[REG5]], [[REG6]]
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]]
-; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]]
-; P8LE: vmrgow v2, [[REG6]], [[REG5]]
+; P8BE-DAG: rldimi r6, r5, 32, 0
+; P8BE-DAG: rldimi r4, r3, 32, 0
+; P8BE-DAG: mtvsrd f[[REG1:[0-9]+]], r6
+; P8BE-DAG: mtvsrd f[[REG2:[0-9]+]], r4
+; P8BE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8BE: blr
+; P8LE-DAG: rldimi r3, r4, 32, 0
+; P8LE-DAG: rldimi r5, r6, 32, 0
+; P8LE-DAG: mtvsrd f[[REG1:[0-9]+]], r3
+; P8LE-DAG: mtvsrd f[[REG2:[0-9]+]], r5
+; P8LE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8LE: blr
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -1065,38 +1063,34 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -1132,41 +1126,37 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: sldi r4, r4, 2
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: sldi r4, r4, 2
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: sldi r4, r4, 2
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -1978,28 +1968,26 @@ entry:
 ; P9LE-LABEL: fromRegsui
 ; P8BE-LABEL: fromRegsui
 ; P8LE-LABEL: fromRegsui
-; P9BE-DAG: mtvsrdd [[REG1:v[0-9]+]], r3, r5
-; P9BE-DAG: mtvsrdd [[REG2:v[0-9]+]], r4, r6
-; P9BE: vmrgow v2, [[REG1]], [[REG2]]
+; P9BE-DAG: rldimi r6, r5, 32, 0
+; P9BE-DAG: rldimi r4, r3, 32, 0
+; P9BE: mtvsrdd v2, r4, r6
 ; P9BE: blr
-; P9LE-DAG: mtvsrdd [[REG1:v[0-9]+]], r5, r3
-; P9LE-DAG: mtvsrdd [[REG2:v[0-9]+]], r6, r4
-; P9LE: vmrgow v2, [[REG2]], [[REG1]]
+; P9LE-DAG: rldimi r3, r4, 32, 0
+; P9LE-DAG: rldimi r5, r6, 32, 0
+; P9LE: mtvsrdd v2, r5, r3
 ; P9LE: blr
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8BE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8BE-DAG: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG1]], {{[v][s]*}}[[REG3]]
-; P8BE-DAG: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG2]], {{[v][s]*}}[[REG4]]
-; P8BE: vmrgow v2, [[REG5]], [[REG6]]
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG1:[0-9]+]], r3
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG2:[0-9]+]], r4
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG3:[0-9]+]], r5
-; P8LE-DAG: mtvsrwz {{[vf]}}[[REG4:[0-9]+]], r6
-; P8LE: xxmrghd [[REG5:v[0-9]+]], {{[v][s]*}}[[REG3]], {{[v][s]*}}[[REG1]]
-; P8LE: xxmrghd [[REG6:v[0-9]+]], {{[v][s]*}}[[REG4]], {{[v][s]*}}[[REG2]]
-; P8LE: vmrgow v2, [[REG6]], [[REG5]]
+; P8BE-DAG: rldimi r6, r5, 32, 0
+; P8BE-DAG: rldimi r4, r3, 32, 0
+; P8BE-DAG: mtvsrd f[[REG1:[0-9]+]], r6
+; P8BE-DAG: mtvsrd f[[REG2:[0-9]+]], r4
+; P8BE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8BE: blr
+; P8LE-DAG: rldimi r3, r4, 32, 0
+; P8LE-DAG: rldimi r5, r6, 32, 0
+; P8LE-DAG: mtvsrd f[[REG1:[0-9]+]], r3
+; P8LE-DAG: mtvsrd f[[REG2:[0-9]+]], r5
+; P8LE-DAG: xxmrghd v2, vs[[REG2]], vs[[REG1]]
+; P8LE: blr
 }
 
 ; Function Attrs: norecurse nounwind readnone
@@ -2207,38 +2195,34 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readonly
@@ -2274,41 +2258,37 @@ entry:
 ; P9BE: lwz
 ; P9BE: lwz
 ; P9BE: lwz
+; P9BE: rldimi
+; P9BE: rldimi
 ; P9BE: mtvsrdd
-; P9BE: mtvsrdd
-; P9BE: vmrgow
 ; P9LE: sldi r4, r4, 2
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
 ; P9LE: lwz
+; P9LE: rldimi
+; P9LE: rldimi
 ; P9LE: mtvsrdd
-; P9LE: mtvsrdd
-; P9LE: vmrgow
 ; P8BE: sldi r4, r4, 2
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
 ; P8BE: lwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: mtvsrwz
-; P8BE: xxmrghd
+; P8BE: rldimi
+; P8BE: rldimi
+; P8BE: mtvsrd
+; P8BE: mtvsrd
 ; P8BE: xxmrghd
-; P8BE: vmrgow
 ; P8LE: sldi r4, r4, 2
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
 ; P8LE: lwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: mtvsrwz
-; P8LE: xxmrghd
+; P8LE: rldimi
+; P8LE: rldimi
+; P8LE: mtvsrd
+; P8LE: mtvsrd
 ; P8LE: xxmrghd
-; P8LE: vmrgow
 }
 
 ; Function Attrs: norecurse nounwind readnone
diff --git a/test/CodeGen/PowerPC/combine-setcc.ll b/test/CodeGen/PowerPC/combine-setcc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a86de8296f82a3c2eb3ee84787f22647e2e4b03b
--- /dev/null
+++ b/test/CodeGen/PowerPC/combine-setcc.ll
@@ -0,0 +1,408 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-unknown \
+; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown \
+; RUN:  -ppc-asm-full-reg-names < %s  | FileCheck %s
+
+define zeroext i1 @eq1(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: eq1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sext i1 %x to i32
+  %conv3 = zext i1 %y to i32
+  %cmp = icmp eq i32 %sub, %conv3
+  ret i1 %cmp
+}
+
+define zeroext i8 @eq2(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: eq2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i8 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @eq3(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: eq3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i16 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @eq4(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: eq4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i16 %y to i32
+  %cmp = icmp eq i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @eq5(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: eq5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp eq i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @eq6(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: eq6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %x
+  %cmp = icmp eq i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @eq7(i64 %x, i64 %y) {
+; CHECK-LABEL: eq7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzd r3, r3
+; CHECK-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp eq i64 %sub, %y
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
+
+define zeroext i1 @eq8(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: eq8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i1 %y to i32
+  %sub = sext i1 %x to i32
+  %cmp = icmp eq i32 %conv, %sub
+  ret i1 %cmp
+}
+
+define zeroext i8 @eq9(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: eq9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %conv1 = zext i8 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @eq10(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: eq10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %conv1 = sext i16 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @eq11(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: eq11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %conv1 = zext i16 %y to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp eq i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @eq12(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: eq12:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @eq13(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: eq13:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @eq14(i64 %x, i64 %y) {
+; CHECK-LABEL: eq14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r3, r4
+; CHECK-NEXT:    cntlzd r3, r3
+; CHECK-NEXT:    rldicl r3, r3, 58, 63
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %y
+  %cmp = icmp eq i64 %sub, %x
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+define zeroext i1 @neq1(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: neq1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sext i1 %x to i32
+  %conv3 = zext i1 %y to i32
+  %cmp = icmp ne i32 %sub, %conv3
+  ret i1 %cmp
+}
+
+define zeroext i8 @neq2(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: neq2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i8 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @neq3(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: neq3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i16 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @neq4(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: neq4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = zext i16 %y to i32
+  %cmp = icmp ne i32 %sub, %conv1
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @neq5(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: neq5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @neq6(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: neq6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %sub = sub i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @neq7(i64 %x, i64 %y) {
+; CHECK-LABEL: neq7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp ne i64 %sub, %y
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
+
+define zeroext i1 @neq8(i1 zeroext %x, i1 zeroext %y) {
+; CHECK-LABEL: neq8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i1 %y to i32
+  %sub = sext i1 %x to i32
+  %cmp = icmp ne i32 %conv, %sub
+  ret i1 %cmp
+}
+
+define zeroext i8 @neq9(i8 zeroext %x, i8 zeroext %y) {
+; CHECK-LABEL: neq9:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = zext i8 %y to i32
+  %conv1 = zext i8 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i8
+  ret i8 %conv3
+}
+
+define signext i16 @neq10(i16 signext %x, i16 signext %y) {
+; CHECK-LABEL: neq10:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+  %conv = sext i16 %y to i32
+  %conv1 = sext i16 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define zeroext i16 @neq11(i16 zeroext %x, i16 zeroext %y) {
+; CHECK-LABEL: neq11:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %conv = zext i16 %y to i32
+  %conv1 = zext i16 %x to i32
+  %sub = sub nsw i32 0, %conv1
+  %cmp = icmp ne i32 %conv, %sub
+  %conv3 = zext i1 %cmp to i16
+  ret i16 %conv3
+}
+
+define signext i32 @neq12(i32 signext %x, i32 signext %y) {
+; CHECK-LABEL: neq12:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %sub = sub nsw i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define zeroext i32 @neq13(i32 zeroext %x, i32 zeroext %y) {
+; CHECK-LABEL: neq13:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    cntlzw r3, r3
+; CHECK-NEXT:    srwi r3, r3, 5
+; CHECK-NEXT:    xori r3, r3, 1
+; CHECK-NEXT:    blr
+entry:
+  %sub = sub i32 0, %x
+  %cmp = icmp ne i32 %sub, %y
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i64 @neq14(i64 %x, i64 %y) {
+; CHECK-LABEL: neq14:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    add r3, r4, r3
+; CHECK-NEXT:    addic r4, r3, -1
+; CHECK-NEXT:    subfe r3, r4, r3
+; CHECK-NEXT:    blr
+  %sub = sub nsw i64 0, %x
+  %cmp = icmp ne i64 %y, %sub
+  %zext = zext i1 %cmp to i64
+  ret i64 %zext
+}
diff --git a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir
index c9038e87af5abde647ad62be0ba10054a9daa4b3..e210ec5c523eb0ecc4e9b09e8f48a33286f66fde 100644
--- a/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir
+++ b/test/CodeGen/PowerPC/convert-rr-to-ri-instrs.mir
@@ -3265,15 +3265,15 @@ body:             |
     %4 = INSERT_SUBREG %5, killed %3, 1
     %6 = LI8 100
     %7 = LXSDX %0, killed %6, implicit $rm :: (load 8 from %ir.arrayidx, !tbaa !12)
-    ; CHECK: LXSD 100, %0
-    ; CHECK-LATE: lxsd 0, 100(3)
+    ; CHECK: DFLOADf64 100, %0
+    ; CHECK-LATE: lfd 0, 100(3)
     %8 = ADDI %2, 2
     %10 = IMPLICIT_DEF
     %9 = INSERT_SUBREG %10, killed %8, 1
     %11 = LI8 -120
     %12 = LXSDX %0, killed %11, implicit $rm :: (load 8 from %ir.arrayidx3, !tbaa !12)
-    ; CHECK: LXSD -120, %0
-    ; CHECK-LATE: lxsd 1, -120(3)
+    ; CHECK: DFLOADf64 -120, %0
+    ; CHECK-LATE: lfd 1, -120(3)
     %13 = XSADDDP killed %7, killed %12, implicit $rm
     $f1 = COPY %13
     BLR8 implicit $lr8, implicit $rm, implicit $f1
@@ -3338,15 +3338,15 @@ body:             |
     %4 = INSERT_SUBREG %5, killed %3, 1
     %6 = LI8 96
     %7 = LXSSPX %0, killed %6 :: (load 4 from %ir.arrayidx, !tbaa !14)
-    ; CHECK: LXSSP 96, %0
-    ; CHECK-LATE: lxssp 0, 96(3)
+    ; CHECK: DFLOADf32 96, %0
+    ; CHECK-LATE: lfs 0, 96(3)
     %8 = ADDI %2, 2
     %10 = IMPLICIT_DEF
     %9 = INSERT_SUBREG %10, killed %8, 1
     %11 = LI8 -92
     %12 = LXSSPX %0, killed %11 :: (load 4 from %ir.arrayidx3, !tbaa !14)
-    ; CHECK: LXSSP -92, %0
-    ; CHECK-LATE: lxssp 1, -92(3)
+    ; CHECK: DFLOADf32 -92, %0
+    ; CHECK-LATE: lfs 1, -92(3)
     %13 = XSADDSP killed %7, killed %12
     $f1 = COPY %13
     BLR8 implicit $lr8, implicit $rm, implicit $f1
@@ -6031,8 +6031,8 @@ body:             |
     %0 = COPY $x3
     %3 = LI8 444
     STXSSPX %1, %0, killed %3 :: (store 4 into %ir.arrayidx, !tbaa !14)
-    ; CHECK: STXSSP %1, 444, %0
-    ; CHECK-LATE: stxssp 1, 444(3)
+    ; CHECK: DFSTOREf32 %1, 444, %0
+    ; CHECK-LATE: stfs 1, 444(3)
     BLR8 implicit $lr8, implicit $rm
 
 ...
@@ -6083,8 +6083,8 @@ body:             |
     %0 = COPY $x3
     %3 = LI8 4
     STXSDX %1, %0, killed %3, implicit $rm :: (store 8 into %ir.arrayidx, !tbaa !12)
-    ; CHECK: STXSD %1, 4, %0
-    ; CHECK-LATE: stxsd 1, 4(3)
+    ; CHECK: DFSTOREf64 %1, 4, %0
+    ; CHECK-LATE: stfd 1, 4(3)
     BLR8 implicit $lr8, implicit $rm
 
 ...
diff --git a/test/CodeGen/PowerPC/debuginfo-split-int.ll b/test/CodeGen/PowerPC/debuginfo-split-int.ll
index 5a1e409441b6a8d8fa69398ce0b8c92667868905..e12d5e5d220e59a12885eec980ac666f26716e4c 100644
--- a/test/CodeGen/PowerPC/debuginfo-split-int.ll
+++ b/test/CodeGen/PowerPC/debuginfo-split-int.ll
@@ -27,9 +27,9 @@ target triple = "ppc32"
 ;
 ; High 32 bits in R3, low 32 bits in R4
 ; CHECK: %0:gprc = COPY $r3
-; CHECK: DBG_VALUE debug-use %0, debug-use $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK: DBG_VALUE %0, $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 0, 32)
 ; CHECK: %1:gprc = COPY $r4
-; CHECK: DBG_VALUE debug-use %1, debug-use $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK: DBG_VALUE %1, $noreg, [[DL]], !DIExpression(DW_OP_LLVM_fragment, 32, 32)
 define void @bar() local_unnamed_addr #0 !dbg !6 {
   %1 = alloca i64, align 8
   %2 = tail call i64 @foo()
diff --git a/test/CodeGen/PowerPC/debuginfo-stackarg.ll b/test/CodeGen/PowerPC/debuginfo-stackarg.ll
index 3830589b4cb5ce7e23090b50dc26b18137db7116..b49f363ed803068c8ed685b64f2201c0848e7cf3 100644
--- a/test/CodeGen/PowerPC/debuginfo-stackarg.ll
+++ b/test/CodeGen/PowerPC/debuginfo-stackarg.ll
@@ -34,7 +34,7 @@ define i64 @foo(i64 %bar1, i64 %bar2, i64 %bar3, i64 %bar4, i64 %bar5) local_unn
 ; We expect to find a DBG_VALUE refering to the metadata id for bar5, using the lowest
 ; of the two fixed stack offsets found earlier.
 ; CHECK-LABEL: body:
-; CHECK: DBG_VALUE debug-use $r1, 0, !17, !DIExpression(DW_OP_plus_uconst, 8)
+; CHECK: DBG_VALUE $r1, 0, !17, !DIExpression(DW_OP_plus_uconst, 8)
 entry:
   tail call void @llvm.dbg.value(metadata i64 %bar1, metadata !13, metadata !DIExpression()), !dbg !18
   tail call void @llvm.dbg.value(metadata i64 %bar2, metadata !14, metadata !DIExpression()), !dbg !19
diff --git a/test/CodeGen/PowerPC/f128-bitcast.ll b/test/CodeGen/PowerPC/f128-bitcast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..68069e542ffd61d800e0402fcefd95f39119ba60
--- /dev/null
+++ b/test/CodeGen/PowerPC/f128-bitcast.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -enable-ppc-quad-precision -verify-machineinstrs \
+; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s
+; RUN: llc -mcpu=pwr9 -mtriple=powerpc64-unknown-unknown \
+; RUN:   -enable-ppc-quad-precision -verify-machineinstrs \
+; RUN:   -ppc-asm-full-reg-names \
+; RUN:   -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @getPart1(fp128 %in) local_unnamed_addr {
+entry:
+  %0 = bitcast fp128 %in to i128
+  %a.sroa.0.0.extract.trunc = trunc i128 %0 to i64
+  ret i64 %a.sroa.0.0.extract.trunc
+; CHECK-LABEL: getPart1
+; CHECK:       mfvsrld r3, v2
+; CHECK-NEXT:  blr
+; CHECK-BE-LABEL: getPart1
+; CHECK-BE:       mfvsrld r3, v2
+; CHECK-BE-NEXT:  blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @getPart2(fp128 %in) local_unnamed_addr {
+entry:
+  %0 = bitcast fp128 %in to i128
+  %a.sroa.0.8.extract.shift = lshr i128 %0, 64
+  %a.sroa.0.8.extract.trunc = trunc i128 %a.sroa.0.8.extract.shift to i64
+  ret i64 %a.sroa.0.8.extract.trunc
+; CHECK-LABEL: getPart2
+; CHECK:       mfvsrd r3, v2
+; CHECK-NEXT:  blr
+; CHECK-BE-LABEL: getPart2
+; CHECK-BE:       mfvsrd r3, v2
+; CHECK-BE-NEXT:  blr
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @checkBitcast(fp128 %in, <2 x i64> %in2, <2 x i64> *%out) local_unnamed_addr {
+entry:
+  %0 = bitcast fp128 %in to <2 x i64>
+  %1 = extractelement <2 x i64> %0, i64 0
+  %2 = add <2 x i64> %0, %in2
+  store <2 x i64> %2, <2 x i64> *%out, align 16
+  ret i64 %1
+; CHECK-LABEL: checkBitcast
+; CHECK:       mfvsrld r3, v2
+; CHECK:       blr
+; CHECK-BE-LABEL: checkBitcast
+; CHECK-BE:       mfvsrd r3, v2
+; CHECK-BE:       blr
+}
+
diff --git a/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll b/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll
index 9de6358427d1c0a469d66efa39f381b7468e6ed2..0ebb44930658eec09068eba2df7274b799a37e37 100644
--- a/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll
+++ b/test/CodeGen/PowerPC/inlineasm-vsx-reg.ll
@@ -12,6 +12,21 @@ entry:
 ; CHECK: #NO_APP
 }
 
+define signext i32 @foo1(<4 x float> %__A) {
+entry:
+  %0 = tail call { i32, <4 x float> } asm "xxsldwi ${1:x},${2:x},${2:x},3;\0Axscvspdp ${1:x},${1:x};\0Afctiw  $1,$1;\0Amfvsrd  $0,${1:x};\0A", "=r,=&^wi,^wa"(<4 x float> %__A)
+  %asmresult = extractvalue { i32, <4 x float> } %0, 0
+  ret i32 %asmresult
+
+; CHECK: #APP
+; CHECK: xxsldwi vs0, v2, v2, 3
+; CHECK: xscvspdp f0, f0
+; CEHCK: fctiw f0, f0
+; CHECK: mffprd r3, f0
+; CEHCK: extsw r3, r3
+; CHECK: #NO_APP
+}
+
 define double @test() {
   entry:
     %0 = tail call double asm "mtvsrd ${0:x}, 1", "=^ws,~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14}"()
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
index 91119786b1f54ef0881f0225ae35fe6dc275d069..a35250526c7cb62bb18cdef1a1b4f2b30ba119c6 100644
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -236,14 +236,12 @@ entry:
 ; CHECK-DAG: stfs 6, [[OFF1:[0-9]+]](1)
 ; CHECK-DAG: stfs 7, [[OFF2:[0-9]+]](1)
 ; CHECK-DAG: stfs 8, [[OFF3:[0-9]+]](1)
-; CHECK-DAG: lwz [[REG0:[0-9]+]], [[OFF0]](1)
+; CHECK-DAG: lwz 9, [[OFF0]](1)
 ; CHECK-DAG: lwz [[REG1:[0-9]+]], [[OFF1]](1)
-; CHECK-DAG: lwz [[REG2:[0-9]+]], [[OFF2]](1)
+; CHECK-DAG: lwz 10, [[OFF2]](1)
 ; CHECK-DAG: lwz [[REG3:[0-9]+]], [[OFF3]](1)
-; CHECK-DAG: sldi [[REG1]], [[REG1]], 32
-; CHECK-DAG: sldi [[REG3]], [[REG3]], 32
-; CHECK-DAG: or 9, [[REG0]], [[REG1]]
-; CHECK-DAG: or 10, [[REG2]], [[REG3]]
+; CHECK-DAG: rldimi 9, [[REG1]], 32, 0
+; CHECK-DAG: rldimi 10, [[REG3]], 32, 0
 ; CHECK: bl test1
 
 declare void @test1([8 x float], [8 x float])
diff --git a/test/CodeGen/PowerPC/pr26180.ll b/test/CodeGen/PowerPC/pr26180.ll
index e4cbcb8725d5bc8ef799cda4632d7aff502cdce2..d4b05dfeed6111051056e8d68787acde61efe28f 100644
--- a/test/CodeGen/PowerPC/pr26180.ll
+++ b/test/CodeGen/PowerPC/pr26180.ll
@@ -6,9 +6,9 @@ define i32 @bad(double %x) {
   ret i32 %1
 }
 
-; CHECK: fctidz 1, 1
-; CHECK: stfd 1, [[OFF:.*]](1)
+; CHECK: fctidz [[REG0:[0-9]+]], 1
+; CHECK: stfd [[REG0]], [[OFF:.*]](1)
 ; CHECK: lwz {{[0-9]*}}, [[OFF]](1)
-; GENERIC: fctiwuz 1, 1
-; GENERIC: stfd 1, [[OFF:.*]](1)
+; GENERIC: fctiwuz [[REG0:[0-9]+]], 1
+; GENERIC: stfd [[REG0]], [[OFF:.*]](1)
 ; GENERIC: lwz {{[0-9]*}}, [[OFF]](1)
diff --git a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
index 0d7501afc276265f8accf5aa6b8f2b9d6f52cc85..6e2802f6ff9fefb3153e0539f2aba760baccf787 100644
--- a/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
+++ b/test/CodeGen/PowerPC/rlwimi-dyn-and.ll
@@ -39,7 +39,7 @@ next:
   ret i32 %conv174
 
 ; CHECK-LABEL: @test2
-; CHECK: slwi 3, {{[0-9]+}}, 7
+; CHECK: rlwinm 3, {{[0-9]+}}, 7, 17, 24
 ; CHECK: rlwimi 3, {{[0-9]+}}, 15, 16, 16
 ; CHECK: blr
 }
diff --git a/test/CodeGen/PowerPC/tls.ll b/test/CodeGen/PowerPC/tls.ll
index 8410e9885dec2ea9374ab71f84c4c4f8e8cf8c73..3ad93986bd45b4ad7316a147840dbc85a8a91d86 100644
--- a/test/CodeGen/PowerPC/tls.ll
+++ b/test/CodeGen/PowerPC/tls.ll
@@ -11,12 +11,12 @@ target triple = "powerpc64-unknown-linux-gnu"
 define i32 @localexec() nounwind {
 entry:
 ;OPT0:          addis [[REG1:[0-9]+]], 13, a@tprel@ha
-;OPT0-NEXT:     addi [[REG1]], [[REG1]], a@tprel@l
-;OPT0-NEXT:     li [[REG2:[0-9]+]], 42
-;OPT0:          stw [[REG2]], 0([[REG1]])
+;OPT0-NEXT:     addi [[REG2:[0-9]+]], [[REG1]], a@tprel@l
+;OPT0-NEXT:     li [[REG3:[0-9]+]], 42
+;OPT0:          stw [[REG3]], 0([[REG2]])
 ;OPT1:          addis [[REG1:[0-9]+]], 13, a@tprel@ha
-;OPT1-NEXT:     li [[REG2:[0-9]+]], 42
-;OPT1:     stw [[REG2]], a@tprel@l([[REG1]])
+;OPT1-NEXT:     li [[REG3:[0-9]+]], 42
+;OPT1:     stw [[REG3]], a@tprel@l([[REG1]])
   store i32 42, i32* @a, align 4
   ret i32 0
 }
diff --git a/test/CodeGen/PowerPC/vec-asm-disabled.ll b/test/CodeGen/PowerPC/vec-asm-disabled.ll
index 333ccce6b89fa1da141a106968f69b6df6ffca2f..614f3e3f03a03e4fb0ce73f53e459fb07d4386b2 100644
--- a/test/CodeGen/PowerPC/vec-asm-disabled.ll
+++ b/test/CodeGen/PowerPC/vec-asm-disabled.ll
@@ -10,5 +10,14 @@ entry:
 ; CHECK: error: couldn't allocate output register for constraint 'wd'
 }
 
+define signext i32 @testi2(<4 x float> %__A) #0 {
+entry:
+  %0 = tail call { i32, <4 x float> } asm "xxsldwi ${1:x},${2:x},${2:x},3", "=^wi,=&^wi,^wi"(<4 x float> %__A) #0
+  %asmresult = extractvalue { i32, <4 x float> } %0, 0
+  ret i32 %asmresult
+
+; CHECK: error: couldn't allocate output register for constraint 'wi'
+}
+
 attributes #0 = { nounwind "target-features"="-vsx" }
 
diff --git a/test/CodeGen/PowerPC/vec-itofp.ll b/test/CodeGen/PowerPC/vec-itofp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..852b7c822ad8fcd63898847463f85b4243836de3
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec-itofp.ll
@@ -0,0 +1,192 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P8
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-P9
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN: FileCheck %s --check-prefix=CHECK-BE
+
+define void @test8(<8 x double>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
+  %1 = uitofp <8 x i16> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @test8
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P8-LABEL: @test8
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+}
+
+define void @test4(<4 x double>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
+  %1 = uitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @test4
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: xvcvuxddp
+; CHECK-P9: xvcvuxddp
+; CHECK-P8-LABEL: @test4
+; CHECK-P8: vperm
+; CHECK-P8: vperm
+; CHECK-P8: xvcvuxddp
+; CHECK-P8: xvcvuxddp
+}
+
+define void @test2(<2 x double>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16
+  %1 = uitofp <2 x i16> %0 to <2 x double>
+  store <2 x double> %1, <2 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: .LCPI2_0:
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 30
+; CHECK-P9-NEXT: .byte 13
+; CHECK-P9-NEXT: .byte 12
+; CHECK-P9-NEXT: .byte 11
+; CHECK-P9-NEXT: .byte 10
+; CHECK-P9-NEXT: .byte 9
+; CHECK-P9-NEXT: .byte 8
+; CHECK-P9-NEXT: .byte 29
+; CHECK-P9-NEXT: .byte 28
+; CHECK-P9-NEXT: .byte 5
+; CHECK-P9-NEXT: .byte 4
+; CHECK-P9-NEXT: .byte 3
+; CHECK-P9-NEXT: .byte 2
+; CHECK-P9-NEXT: .byte 1
+; CHECK-P9-NEXT: .byte 0
+; CHECK-P9: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI2_0@toc@l
+; CHECK-P9: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-P9: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-P9: xvcvuxddp {{vs[0-9]+}}, [[REG3]]
+; CHECK-P8-LABEL: @test2
+; CHECK-P8: vperm [[REG1:v[0-9]+]]
+; CHECK-P8: xvcvuxddp {{vs[0-9]+}}, [[REG1]]
+; CHECK-BE-LABEL: .LCPI2_0:
+; CHECK-BE-NEXT: .byte 16
+; CHECK-BE-NEXT: .byte 17
+; CHECK-BE-NEXT: .byte 18
+; CHECK-BE-NEXT: .byte 19
+; CHECK-BE-NEXT: .byte 20
+; CHECK-BE-NEXT: .byte 21
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 24
+; CHECK-BE-NEXT: .byte 25
+; CHECK-BE-NEXT: .byte 26
+; CHECK-BE-NEXT: .byte 27
+; CHECK-BE-NEXT: .byte 28
+; CHECK-BE-NEXT: .byte 29
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 3
+; CHECK-BE: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI2_0@toc@l
+; CHECK-BE: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-BE: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-BE: xvcvuxddp {{vs[0-9]+}}, [[REG3]]
+}
+
+define void @stest8(<8 x double>* nocapture %Sink, <8 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <8 x i16>, <8 x i16>* %SrcPtr, align 16
+  %1 = sitofp <8 x i16> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @stest8
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+}
+
+define void @stest4(<4 x double>* nocapture %Sink, <4 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <4 x i16>, <4 x i16>* %SrcPtr, align 16
+  %1 = sitofp <4 x i16> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: @stest4
+; CHECK-P9: vperm
+; CHECK-P9: vperm
+; CHECK-P9: vextsh2d
+; CHECK-P9: vextsh2d
+; CHECK-P9: xvcvsxddp
+; CHECK-P9: xvcvsxddp
+}
+
+define void @stest2(<2 x double>* nocapture %Sink, <2 x i16>* nocapture readonly %SrcPtr) {
+entry:
+  %0 = load <2 x i16>, <2 x i16>* %SrcPtr, align 16
+  %1 = sitofp <2 x i16> %0 to <2 x double>
+  store <2 x double> %1, <2 x double>* %Sink, align 16
+  ret void
+; CHECK-P9-LABEL: .LCPI5_0:
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 30
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 29
+; CHECK-P9-NEXT: .byte 28
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9-NEXT: .byte 31
+; CHECK-P9: vperm [[REG1:v[0-9]+]]
+; CHECK-P9: vextsh2d [[REG2:v[0-9]+]], [[REG1]]
+; CHECK-P9: xvcvsxddp {{vs[0-9]+}}, [[REG2]]
+; CHECK-BE-LABEL: .LCPI5_0:
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 1
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 0
+; CHECK-BE-NEXT: .byte 2
+; CHECK-BE-NEXT: .byte 3
+; CHECK-BE: addi [[REG1:r[0-9]+]], {{r[0-9]+}}, .LCPI5_0@toc@l
+; CHECK-BE: lxvx [[REG2:v[0-9]+]], 0, [[REG1]]
+; CHECK-BE: vperm [[REG3:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[REG2]]
+; CHECK-BE: vextsh2d [[REG4:v[0-9]+]], [[REG3]]
+; CHECK-BE: xvcvsxddp {{vs[0-9]+}}, [[REG4]]
+}
diff --git a/test/CodeGen/PowerPC/vsx-spill.ll b/test/CodeGen/PowerPC/vsx-spill.ll
index 3bea07f3b8da3796bc20d366a881d72df945e29a..d46664ba98d97d36d02c8e69d49a900995e0910e 100644
--- a/test/CodeGen/PowerPC/vsx-spill.ll
+++ b/test/CodeGen/PowerPC/vsx-spill.ll
@@ -60,8 +60,8 @@ entry:
 ; CHECK-REG: blr
 
 ; CHECK-FISL: @foo2
-; CHECK-FISL: xsadddp f1, f1, f1
-; CHECK-FISL: stxsdx f1, r1, r3
+; CHECK-FISL: xsadddp [[REG0:f[0-9]+]], f1, f1
+; CHECK-FISL: stxsdx [[REG0]], r1, r3
 ; CHECK-FISL: lxsdx f1, r1, r3
 ; CHECK-FISL: blr
 
@@ -71,8 +71,8 @@ entry:
 ; CHECK-P9-REG: blr
 
 ; CHECK-P9-FISL: @foo2
-; CHECK-P9-FISL: xsadddp f1, f1, f1
-; CHECK-P9-FISL: stfd f1, -152(r1)
+; CHECK-P9-FISL: xsadddp [[REG0:f[0-9]+]], f1, f1
+; CHECK-P9-FISL: stfd [[REG0]], -152(r1)
 ; CHECK-P9-FISL: lfd f1, -152(r1)
 ; CHECK-P9-FISL: blr
 
diff --git a/test/CodeGen/PowerPC/vsx.ll b/test/CodeGen/PowerPC/vsx.ll
index 3df501db41f1a1cf793c9f25371b5e67ec09cc20..d6a5ed37040f5f2da50be591401f9820bd308593 100644
--- a/test/CodeGen/PowerPC/vsx.ll
+++ b/test/CodeGen/PowerPC/vsx.ll
@@ -1211,51 +1211,3 @@ entry:
 ; CHECK-LE: xscmpudp cr0, f3, f4
 ; CHECK-LE: beqlr cr0
 }
-
-; Function Attrs: nounwind readnone
-define <4 x i32> @test83(i8* %a) {
-  entry:
-    %0 = tail call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %a)
-      ret <4 x i32> %0
-; CHECK-LABEL: test83
-; CHECK: lxvw4x v2, 0, r3
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8*)
-
-; Function Attrs: nounwind readnone
-define <2 x double> @test84(i8* %a) {
-  entry:
-    %0 = tail call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %a)
-      ret <2 x double> %0
-; CHECK-LABEL: test84
-; CHECK: lxvd2x v2, 0, r3
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8*)
-
-; Function Attrs: nounwind readnone
-define void @test85(<4 x i32> %a, i8* %b) {
-  entry:
-    tail call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %a, i8* %b)
-    ret void
-; CHECK-LABEL: test85
-; CHECK: stxvw4x v2, 0, r5
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare void @llvm.ppc.vsx.stxvw4x.be(<4 x i32>, i8*)
-
-; Function Attrs: nounwind readnone
-define void @test86(<2 x double> %a, i8* %b) {
-  entry:
-    tail call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %a, i8* %b)
-    ret void
-; CHECK-LABEL: test86
-; CHECK: stxvd2x v2, 0, r5
-; CHECK: blr
-}
-; Function Attrs: nounwind readnone
-declare void @llvm.ppc.vsx.stxvd2x.be(<2 x double>, i8*)
diff --git a/test/CodeGen/PowerPC/vsx_builtins.ll b/test/CodeGen/PowerPC/vsx_builtins.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b386565500f62ad8ca0e233c1451d41c59f2ee09
--- /dev/null
+++ b/test/CodeGen/PowerPC/vsx_builtins.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -relocation-model=static -verify-machineinstrs -mcpu=pwr9 \
+; RUN:     -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s
+
+; Function Attrs: nounwind readnone
+define <4 x i32> @test1(i8* %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvw4x v2, 0, r3
+; CHECK-NEXT:    blr
+  entry:
+    %0 = tail call <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8* %a)
+      ret <4 x i32> %0
+}
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.ppc.vsx.lxvw4x.be(i8*)
+
+; Function Attrs: nounwind readnone
+define <2 x double> @test2(i8* %a) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lxvd2x v2, 0, r3
+; CHECK-NEXT:    blr
+  entry:
+    %0 = tail call <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8* %a)
+      ret <2 x double> %0
+}
+; Function Attrs: nounwind readnone
+declare <2 x double> @llvm.ppc.vsx.lxvd2x.be(i8*)
+
+; Function Attrs: nounwind readnone
+define void @test3(<4 x i32> %a, i8* %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stxvw4x v2, 0, r5
+; CHECK-NEXT:    blr
+  entry:
+    tail call void @llvm.ppc.vsx.stxvw4x.be(<4 x i32> %a, i8* %b)
+    ret void
+}
+; Function Attrs: nounwind readnone
+declare void @llvm.ppc.vsx.stxvw4x.be(<4 x i32>, i8*)
+
+; Function Attrs: nounwind readnone
+define void @test4(<2 x double> %a, i8* %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    stxvd2x v2, 0, r5
+; CHECK-NEXT:    blr
+  entry:
+    tail call void @llvm.ppc.vsx.stxvd2x.be(<2 x double> %a, i8* %b)
+    ret void
+}
+; Function Attrs: nounwind readnone
+declare void @llvm.ppc.vsx.stxvd2x.be(<2 x double>, i8*)
diff --git a/test/CodeGen/RISCV/alu16.ll b/test/CodeGen/RISCV/alu16.ll
index 20b79a987f690139f0954890c099404097dbb3a5..79e74ffc8a5c21df6c6fd8107a4be2f39d440170 100644
--- a/test/CodeGen/RISCV/alu16.ll
+++ b/test/CodeGen/RISCV/alu16.ll
@@ -6,8 +6,6 @@
 ; that legalisation of these non-native types doesn't introduce unnecessary
 ; inefficiencies.
 
-; TODO: it's unnecessary to mask (zero-extend) the shift amount.
-
 define i16 @addi(i16 %a) nounwind {
 ; RV32I-LABEL: addi:
 ; RV32I:       # %bb.0:
@@ -122,9 +120,6 @@ define i16 @sub(i16 %a, i16 %b) nounwind {
 define i16 @sll(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: sll:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
   %1 = shl i16 %a, %b
@@ -173,7 +168,6 @@ define i16 @srl(i16 %a, i16 %b) nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a2, 16
 ; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    and a0, a0, a2
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -184,9 +178,6 @@ define i16 @srl(i16 %a, i16 %b) nounwind {
 define i16 @sra(i16 %a, i16 %b) nounwind {
 ; RV32I-LABEL: sra:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a2, 16
-; RV32I-NEXT:    addi a2, a2, -1
-; RV32I-NEXT:    and a1, a1, a2
 ; RV32I-NEXT:    slli a0, a0, 16
 ; RV32I-NEXT:    srai a0, a0, 16
 ; RV32I-NEXT:    sra a0, a0, a1
diff --git a/test/CodeGen/RISCV/alu8.ll b/test/CodeGen/RISCV/alu8.ll
index f7d0e8beef34a89f1d9a18a59176ea2c975c7ed0..ad97e6203196a8865b2dbea2dc45f12cb2a41ba3 100644
--- a/test/CodeGen/RISCV/alu8.ll
+++ b/test/CodeGen/RISCV/alu8.ll
@@ -6,8 +6,6 @@
 ; that legalisation of these non-native types doesn't introduce unnecessary
 ; inefficiencies.
 
-; TODO: it's unnecessary to mask (zero-extend) the shift amount.
-
 define i8 @addi(i8 %a) nounwind {
 ; RV32I-LABEL: addi:
 ; RV32I:       # %bb.0:
@@ -118,7 +116,6 @@ define i8 @sub(i8 %a, i8 %b) nounwind {
 define i8 @sll(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: sll:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a1, a1, 255
 ; RV32I-NEXT:    sll a0, a0, a1
 ; RV32I-NEXT:    ret
   %1 = shl i8 %a, %b
@@ -163,7 +160,6 @@ define i8 @xor(i8 %a, i8 %b) nounwind {
 define i8 @srl(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: srl:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a1, a1, 255
 ; RV32I-NEXT:    andi a0, a0, 255
 ; RV32I-NEXT:    srl a0, a0, a1
 ; RV32I-NEXT:    ret
@@ -174,7 +170,6 @@ define i8 @srl(i8 %a, i8 %b) nounwind {
 define i8 @sra(i8 %a, i8 %b) nounwind {
 ; RV32I-LABEL: sra:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    andi a1, a1, 255
 ; RV32I-NEXT:    slli a0, a0, 24
 ; RV32I-NEXT:    srai a0, a0, 24
 ; RV32I-NEXT:    sra a0, a0, a1
diff --git a/test/CodeGen/RISCV/double-intrinsics.ll b/test/CodeGen/RISCV/double-intrinsics.ll
index 7d80d2cc8e037d96a935cac0d4f87cecf3199332..4a5239f4f01e2a46f91207b6a36ace931a00e753 100644
--- a/test/CodeGen/RISCV/double-intrinsics.ll
+++ b/test/CodeGen/RISCV/double-intrinsics.ll
@@ -2,14 +2,323 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV32IFD %s
 
-declare double @llvm.floor.f64(double)
+declare double @llvm.sqrt.f64(double)
+
+define double @sqrt_f64(double %a) {
+; RV32IFD-LABEL: sqrt_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    fsqrt.d ft0, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.sqrt.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.powi.f64(double, i32)
+
+define double @powi_f64(double %a, i32 %b) {
+; RV32IFD-LABEL: powi_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call __powidf2
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.powi.f64(double %a, i32 %b)
+	ret double %1
+}
+
+declare double @llvm.sin.f64(double)
+
+define double @sin_f64(double %a) {
+; RV32IFD-LABEL: sin_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call sin
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.sin.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.cos.f64(double)
+
+define double @cos_f64(double %a) {
+; RV32IFD-LABEL: cos_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call cos
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.cos.f64(double %a)
+	ret double %1
+}
+
+; The sin+cos combination results in an FSINCOS SelectionDAG node.
+define double @sincos_f64(double %a) {
+; RV32IFD-LABEL: sincos_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -32
+; RV32IFD-NEXT:    sw ra, 28(sp)
+; RV32IFD-NEXT:    sw s1, 24(sp)
+; RV32IFD-NEXT:    sw s2, 20(sp)
+; RV32IFD-NEXT:    sw s3, 16(sp)
+; RV32IFD-NEXT:    sw s4, 12(sp)
+; RV32IFD-NEXT:    mv s2, a1
+; RV32IFD-NEXT:    mv s1, a0
+; RV32IFD-NEXT:    call sin
+; RV32IFD-NEXT:    mv s3, a0
+; RV32IFD-NEXT:    mv s4, a1
+; RV32IFD-NEXT:    mv a0, s1
+; RV32IFD-NEXT:    mv a1, s2
+; RV32IFD-NEXT:    call cos
+; RV32IFD-NEXT:    sw a0, 0(sp)
+; RV32IFD-NEXT:    sw a1, 4(sp)
+; RV32IFD-NEXT:    fld ft0, 0(sp)
+; RV32IFD-NEXT:    sw s3, 0(sp)
+; RV32IFD-NEXT:    sw s4, 4(sp)
+; RV32IFD-NEXT:    fld ft1, 0(sp)
+; RV32IFD-NEXT:    fadd.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 0(sp)
+; RV32IFD-NEXT:    lw a0, 0(sp)
+; RV32IFD-NEXT:    lw a1, 4(sp)
+; RV32IFD-NEXT:    lw s4, 12(sp)
+; RV32IFD-NEXT:    lw s3, 16(sp)
+; RV32IFD-NEXT:    lw s2, 20(sp)
+; RV32IFD-NEXT:    lw s1, 24(sp)
+; RV32IFD-NEXT:    lw ra, 28(sp)
+; RV32IFD-NEXT:    addi sp, sp, 32
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.sin.f64(double %a)
+  %2 = call double @llvm.cos.f64(double %a)
+  %3 = fadd double %1, %2
+	ret double %3
+}
+
+declare double @llvm.pow.f64(double, double)
+
+define double @pow_f64(double %a, double %b) {
+; RV32IFD-LABEL: pow_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call pow
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.pow.f64(double %a, double %b)
+	ret double %1
+}
+
+declare double @llvm.exp.f64(double)
+
+define double @exp_f64(double %a) {
+; RV32IFD-LABEL: exp_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call exp
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.exp.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.exp2.f64(double)
+
+define double @exp2_f64(double %a) {
+; RV32IFD-LABEL: exp2_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call exp2
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.exp2.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.log.f64(double)
+
+define double @log_f64(double %a) {
+; RV32IFD-LABEL: log_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call log
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.log.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.log10.f64(double)
+
+define double @log10_f64(double %a) {
+; RV32IFD-LABEL: log10_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call log10
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.log10.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.log2.f64(double)
+
+define double @log2_f64(double %a) {
+; RV32IFD-LABEL: log2_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call log2
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.log2.f64(double %a)
+	ret double %1
+}
 
-; The call to ffloor is introduced very late, meaning this test case covers
-; aspects of passing f64 on RV32D soft-float that double-calling-conv.ll
-; doesn't.
+declare double @llvm.fma.f64(double, double, double)
 
-define double @foo(double %a) nounwind {
-; RV32IFD-LABEL: foo:
+; TODO: Select RISC-V FMA instruction.
+define double @fma_f64(double %a, double %b, double %c) {
+; RV32IFD-LABEL: fma_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call fma
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.fma.f64(double %a, double %b, double %c)
+	ret double %1
+}
+
+declare double @llvm.fabs.f64(double)
+
+define double @fabs_f64(double %a) {
+; RV32IFD-LABEL: fabs_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    fabs.d ft0, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.fabs.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.minnum.f64(double, double)
+
+define double @minnum_f64(double %a, double %b) nounwind {
+; RV32IFD-LABEL: minnum_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fmin.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.minnum.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double @llvm.maxnum.f64(double, double)
+
+define double @maxnum_f64(double %a, double %b) nounwind {
+; RV32IFD-LABEL: maxnum_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fmax.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.maxnum.f64(double %a, double %b)
+  ret double %1
+}
+
+; TODO: FMINNAN and FMAXNAN aren't handled in
+; SelectionDAGLegalize::ExpandNode.
+
+; declare double @llvm.minimum.f64(double, double)
+
+; define double @fminimum_f64(double %a, double %b) nounwind {
+;   %1 = call double @llvm.minimum.f64(double %a, double %b)
+;   ret double %1
+; }
+
+; declare double @llvm.maximum.f64(double, double)
+
+; define double @fmaximum_f64(double %a, double %b) nounwind {
+;   %1 = call double @llvm.maximum.f64(double %a, double %b)
+;   ret double %1
+; }
+
+declare double @llvm.copysign.f64(double, double)
+
+define double @copysign_f64(double %a, double %b) nounwind {
+; RV32IFD-LABEL: copysign_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw a2, 8(sp)
+; RV32IFD-NEXT:    sw a3, 12(sp)
+; RV32IFD-NEXT:    fld ft0, 8(sp)
+; RV32IFD-NEXT:    sw a0, 8(sp)
+; RV32IFD-NEXT:    sw a1, 12(sp)
+; RV32IFD-NEXT:    fld ft1, 8(sp)
+; RV32IFD-NEXT:    fsgnj.d ft0, ft1, ft0
+; RV32IFD-NEXT:    fsd ft0, 8(sp)
+; RV32IFD-NEXT:    lw a0, 8(sp)
+; RV32IFD-NEXT:    lw a1, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.copysign.f64(double %a, double %b)
+  ret double %1
+}
+
+declare double @llvm.floor.f64(double)
+
+define double @floor_f64(double %a) {
+; RV32IFD-LABEL: floor_f64:
 ; RV32IFD:       # %bb.0:
 ; RV32IFD-NEXT:    addi sp, sp, -16
 ; RV32IFD-NEXT:    sw ra, 12(sp)
@@ -18,5 +327,80 @@ define double @foo(double %a) nounwind {
 ; RV32IFD-NEXT:    addi sp, sp, 16
 ; RV32IFD-NEXT:    ret
   %1 = call double @llvm.floor.f64(double %a)
-  ret double %1
+	ret double %1
+}
+
+declare double @llvm.ceil.f64(double)
+
+define double @ceil_f64(double %a) {
+; RV32IFD-LABEL: ceil_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call ceil
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.ceil.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.trunc.f64(double)
+
+define double @trunc_f64(double %a) {
+; RV32IFD-LABEL: trunc_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call trunc
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.trunc.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.rint.f64(double)
+
+define double @rint_f64(double %a) {
+; RV32IFD-LABEL: rint_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call rint
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.rint.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.nearbyint.f64(double)
+
+define double @nearbyint_f64(double %a) {
+; RV32IFD-LABEL: nearbyint_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call nearbyint
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.nearbyint.f64(double %a)
+	ret double %1
+}
+
+declare double @llvm.round.f64(double)
+
+define double @round_f64(double %a) {
+; RV32IFD-LABEL: round_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp)
+; RV32IFD-NEXT:    call round
+; RV32IFD-NEXT:    lw ra, 12(sp)
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+  %1 = call double @llvm.round.f64(double %a)
+	ret double %1
 }
diff --git a/test/CodeGen/RISCV/float-intrinsics.ll b/test/CodeGen/RISCV/float-intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1da644f5f9e1508fa43d376d31d9cf826c128edb
--- /dev/null
+++ b/test/CodeGen/RISCV/float-intrinsics.ll
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32IF %s
+; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV32IF %s
+
+declare float @llvm.sqrt.f32(float)
+
+define float @sqrt_f32(float %a) {
+; RV32IF-LABEL: sqrt_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a0
+; RV32IF-NEXT:    fsqrt.s ft0, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.sqrt.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.powi.f32(float, i32)
+
+define float @powi_f32(float %a, i32 %b) {
+; RV32IF-LABEL: powi_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call __powisf2
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.powi.f32(float %a, i32 %b)
+	ret float %1
+}
+
+declare float @llvm.sin.f32(float)
+
+define float @sin_f32(float %a) {
+; RV32IF-LABEL: sin_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call sinf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.sin.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.cos.f32(float)
+
+define float @cos_f32(float %a) {
+; RV32IF-LABEL: cos_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call cosf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.cos.f32(float %a)
+	ret float %1
+}
+
+; The sin+cos combination results in an FSINCOS SelectionDAG node.
+define float @sincos_f32(float %a) {
+; RV32IF-LABEL: sincos_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    sw s1, 8(sp)
+; RV32IF-NEXT:    sw s2, 4(sp)
+; RV32IF-NEXT:    mv s1, a0
+; RV32IF-NEXT:    call sinf
+; RV32IF-NEXT:    mv s2, a0
+; RV32IF-NEXT:    mv a0, s1
+; RV32IF-NEXT:    call cosf
+; RV32IF-NEXT:    fmv.w.x ft0, a0
+; RV32IF-NEXT:    fmv.w.x ft1, s2
+; RV32IF-NEXT:    fadd.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    lw s2, 4(sp)
+; RV32IF-NEXT:    lw s1, 8(sp)
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.sin.f32(float %a)
+  %2 = call float @llvm.cos.f32(float %a)
+  %3 = fadd float %1, %2
+	ret float %3
+}
+
+declare float @llvm.pow.f32(float, float)
+
+define float @pow_f32(float %a, float %b) {
+; RV32IF-LABEL: pow_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call powf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.pow.f32(float %a, float %b)
+	ret float %1
+}
+
+declare float @llvm.exp.f32(float)
+
+define float @exp_f32(float %a) {
+; RV32IF-LABEL: exp_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call expf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.exp.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.exp2.f32(float)
+
+define float @exp2_f32(float %a) {
+; RV32IF-LABEL: exp2_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call exp2f
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.exp2.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.log.f32(float)
+
+define float @log_f32(float %a) {
+; RV32IF-LABEL: log_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call logf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.log.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.log10.f32(float)
+
+define float @log10_f32(float %a) {
+; RV32IF-LABEL: log10_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call log10f
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.log10.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.log2.f32(float)
+
+define float @log2_f32(float %a) {
+; RV32IF-LABEL: log2_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call log2f
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.log2.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+; TODO: Select RISC-V FMA instruction.
+define float @fma_f32(float %a, float %b, float %c) {
+; RV32IF-LABEL: fma_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call fmaf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.fma.f32(float %a, float %b, float %c)
+	ret float %1
+}
+
+declare float @llvm.fabs.f32(float)
+
+define float @fabs_f32(float %a) {
+; RV32IF-LABEL: fabs_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    lui a1, 524288
+; RV32IF-NEXT:    addi a1, a1, -1
+; RV32IF-NEXT:    and a0, a0, a1
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.fabs.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.minnum.f32(float, float)
+
+define float @minnum_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: minnum_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fmin.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.minnum.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float @llvm.maxnum.f32(float, float)
+
+define float @maxnum_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: maxnum_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fmax.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.maxnum.f32(float %a, float %b)
+  ret float %1
+}
+
+; TODO: FMINNAN and FMAXNAN aren't handled in
+; SelectionDAGLegalize::ExpandNode.
+
+; declare float @llvm.minimum.f32(float, float)
+
+; define float @fminimum_f32(float %a, float %b) nounwind {
+;   %1 = call float @llvm.minimum.f32(float %a, float %b)
+;   ret float %1
+; }
+
+; declare float @llvm.maximum.f32(float, float)
+
+; define float @fmaximum_f32(float %a, float %b) nounwind {
+;   %1 = call float @llvm.maximum.f32(float %a, float %b)
+;   ret float %1
+; }
+
+declare float @llvm.copysign.f32(float, float)
+
+define float @copysign_f32(float %a, float %b) nounwind {
+; RV32IF-LABEL: copysign_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    fmv.w.x ft0, a1
+; RV32IF-NEXT:    fmv.w.x ft1, a0
+; RV32IF-NEXT:    fsgnj.s ft0, ft1, ft0
+; RV32IF-NEXT:    fmv.x.w a0, ft0
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.copysign.f32(float %a, float %b)
+  ret float %1
+}
+
+declare float @llvm.floor.f32(float)
+
+define float @floor_f32(float %a) {
+; RV32IF-LABEL: floor_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call floorf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.floor.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.ceil.f32(float)
+
+define float @ceil_f32(float %a) {
+; RV32IF-LABEL: ceil_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call ceilf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.ceil.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.trunc.f32(float)
+
+define float @trunc_f32(float %a) {
+; RV32IF-LABEL: trunc_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call truncf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.trunc.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.rint.f32(float)
+
+define float @rint_f32(float %a) {
+; RV32IF-LABEL: rint_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call rintf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.rint.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.nearbyint.f32(float)
+
+define float @nearbyint_f32(float %a) {
+; RV32IF-LABEL: nearbyint_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call nearbyintf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.nearbyint.f32(float %a)
+	ret float %1
+}
+
+declare float @llvm.round.f32(float)
+
+define float @round_f32(float %a) {
+; RV32IF-LABEL: round_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp)
+; RV32IF-NEXT:    call roundf
+; RV32IF-NEXT:    lw ra, 12(sp)
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+  %1 = call float @llvm.round.f32(float %a)
+	ret float %1
+}
diff --git a/test/CodeGen/RISCV/shift-masked-shamt.ll b/test/CodeGen/RISCV/shift-masked-shamt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5c77aa2d77f8d689e59571ae7ad9b825268bae8e
--- /dev/null
+++ b/test/CodeGen/RISCV/shift-masked-shamt.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32I
+
+; This test checks that unnecessary masking of shift amount operands is
+; eliminated during instruction selection. The test needs to ensure that the
+; masking is not removed if it may affect the shift amount.
+
+define i32 @sll_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sll_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 31
+  %2 = shl i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @sll_non_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sll_non_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a1, 15
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 15
+  %2 = shl i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @srl_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: srl_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 4095
+  %2 = lshr i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @srl_non_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: srl_non_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a1, 7
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 7
+  %2 = lshr i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @sra_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sra_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 65535
+  %2 = ashr i32 %a, %1
+  ret i32 %2
+}
+
+define i32 @sra_non_redundant_mask(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: sra_non_redundant_mask:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a1, a1, 32
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    ret
+  %1 = and i32 %b, 32
+  %2 = ashr i32 %a, %1
+  ret i32 %2
+}
diff --git a/test/CodeGen/RISCV/vararg.ll b/test/CodeGen/RISCV/vararg.ll
index ac08f346fbb85f4a6278bf60ab7c2d9a5629d741..77f8f300956af0821fd57f54b618bda37702c78e 100644
--- a/test/CodeGen/RISCV/vararg.ll
+++ b/test/CodeGen/RISCV/vararg.ll
@@ -17,16 +17,16 @@ define i32 @va1(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-LABEL: va1:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    mv a0, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    lw a0, 20(sp)
+; RV32I-FPELIM-NEXT:    addi a1, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 20(sp)
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -36,16 +36,16 @@ define i32 @va1(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv a0, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32I-WITHFP-NEXT:    sw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s0, 8
-; RV32I-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, 8
+; RV32I-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32I-WITHFP-NEXT:    sw a0, 4(s0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 48
@@ -66,16 +66,16 @@ define i32 @va1_va_arg(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-LABEL: va1_va_arg:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    mv a0, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
-; RV32I-FPELIM-NEXT:    addi a0, sp, 24
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    lw a0, 20(sp)
+; RV32I-FPELIM-NEXT:    addi a1, sp, 24
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 20(sp)
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -85,16 +85,16 @@ define i32 @va1_va_arg(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv a0, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32I-WITHFP-NEXT:    sw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s0, 8
-; RV32I-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32I-WITHFP-NEXT:    lw a0, 4(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, 8
+; RV32I-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32I-WITHFP-NEXT:    sw a0, 4(s0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 48
@@ -117,7 +117,7 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-NEXT:    sw s0, 8(sp)
 ; RV32I-FPELIM-NEXT:    sw s1, 4(sp)
 ; RV32I-FPELIM-NEXT:    addi s0, sp, 16
-; RV32I-FPELIM-NEXT:    sw a1, 4(s0)
+; RV32I-FPELIM-NEXT:    mv s1, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 28(s0)
 ; RV32I-FPELIM-NEXT:    sw a6, 24(s0)
 ; RV32I-FPELIM-NEXT:    sw a5, 20(s0)
@@ -126,8 +126,8 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-FPELIM-NEXT:    sw a2, 8(s0)
 ; RV32I-FPELIM-NEXT:    addi a0, s0, 8
 ; RV32I-FPELIM-NEXT:    sw a0, -16(s0)
-; RV32I-FPELIM-NEXT:    lw s1, 4(s0)
-; RV32I-FPELIM-NEXT:    addi a0, s1, 15
+; RV32I-FPELIM-NEXT:    sw a1, 4(s0)
+; RV32I-FPELIM-NEXT:    addi a0, a1, 15
 ; RV32I-FPELIM-NEXT:    andi a0, a0, -16
 ; RV32I-FPELIM-NEXT:    sub a0, sp, a0
 ; RV32I-FPELIM-NEXT:    mv sp, a0
@@ -147,7 +147,7 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    sw s1, 4(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv s1, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
@@ -156,8 +156,8 @@ define i32 @va1_va_arg_alloca(i8* %fmt, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32I-WITHFP-NEXT:    addi a0, s0, 8
 ; RV32I-WITHFP-NEXT:    sw a0, -16(s0)
-; RV32I-WITHFP-NEXT:    lw s1, 4(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s1, 15
+; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    addi a0, a1, 15
 ; RV32I-WITHFP-NEXT:    andi a0, a0, -16
 ; RV32I-WITHFP-NEXT:    sub a0, sp, a0
 ; RV32I-WITHFP-NEXT:    mv sp, a0
@@ -535,17 +535,17 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
 ; RV32I-FPELIM-NEXT:    sw ra, 12(sp)
 ; RV32I-FPELIM-NEXT:    sw s1, 8(sp)
-; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
+; RV32I-FPELIM-NEXT:    mv s1, a1
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
 ; RV32I-FPELIM-NEXT:    sw a4, 32(sp)
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
+; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
 ; RV32I-FPELIM-NEXT:    addi a0, sp, 24
 ; RV32I-FPELIM-NEXT:    sw a0, 4(sp)
 ; RV32I-FPELIM-NEXT:    sw a0, 0(sp)
-; RV32I-FPELIM-NEXT:    lw s1, 20(sp)
 ; RV32I-FPELIM-NEXT:    call notdead
 ; RV32I-FPELIM-NEXT:    lw a0, 4(sp)
 ; RV32I-FPELIM-NEXT:    addi a0, a0, 3
@@ -578,17 +578,17 @@ define i32 @va4_va_copy(i32 %argno, ...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw s0, 24(sp)
 ; RV32I-WITHFP-NEXT:    sw s1, 20(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 32
-; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
+; RV32I-WITHFP-NEXT:    mv s1, a1
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
 ; RV32I-WITHFP-NEXT:    sw a4, 16(s0)
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
+; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
 ; RV32I-WITHFP-NEXT:    addi a0, s0, 8
 ; RV32I-WITHFP-NEXT:    sw a0, -16(s0)
 ; RV32I-WITHFP-NEXT:    sw a0, -20(s0)
-; RV32I-WITHFP-NEXT:    lw s1, 4(s0)
 ; RV32I-WITHFP-NEXT:    call notdead
 ; RV32I-WITHFP-NEXT:    lw a0, -16(s0)
 ; RV32I-WITHFP-NEXT:    addi a0, a0, 3
@@ -777,7 +777,6 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-FPELIM-LABEL: va6_no_fixed_args:
 ; RV32I-FPELIM:       # %bb.0:
 ; RV32I-FPELIM-NEXT:    addi sp, sp, -48
-; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
 ; RV32I-FPELIM-NEXT:    sw a7, 44(sp)
 ; RV32I-FPELIM-NEXT:    sw a6, 40(sp)
 ; RV32I-FPELIM-NEXT:    sw a5, 36(sp)
@@ -785,9 +784,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-FPELIM-NEXT:    sw a3, 28(sp)
 ; RV32I-FPELIM-NEXT:    sw a2, 24(sp)
 ; RV32I-FPELIM-NEXT:    sw a1, 20(sp)
-; RV32I-FPELIM-NEXT:    addi a0, sp, 20
-; RV32I-FPELIM-NEXT:    sw a0, 12(sp)
-; RV32I-FPELIM-NEXT:    lw a0, 16(sp)
+; RV32I-FPELIM-NEXT:    addi a1, sp, 20
+; RV32I-FPELIM-NEXT:    sw a1, 12(sp)
+; RV32I-FPELIM-NEXT:    sw a0, 16(sp)
 ; RV32I-FPELIM-NEXT:    addi sp, sp, 48
 ; RV32I-FPELIM-NEXT:    ret
 ;
@@ -797,7 +796,6 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    sw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    addi s0, sp, 16
-; RV32I-WITHFP-NEXT:    sw a0, 0(s0)
 ; RV32I-WITHFP-NEXT:    sw a7, 28(s0)
 ; RV32I-WITHFP-NEXT:    sw a6, 24(s0)
 ; RV32I-WITHFP-NEXT:    sw a5, 20(s0)
@@ -805,9 +803,9 @@ define i32 @va6_no_fixed_args(...) nounwind {
 ; RV32I-WITHFP-NEXT:    sw a3, 12(s0)
 ; RV32I-WITHFP-NEXT:    sw a2, 8(s0)
 ; RV32I-WITHFP-NEXT:    sw a1, 4(s0)
-; RV32I-WITHFP-NEXT:    addi a0, s0, 4
-; RV32I-WITHFP-NEXT:    sw a0, -12(s0)
-; RV32I-WITHFP-NEXT:    lw a0, 0(s0)
+; RV32I-WITHFP-NEXT:    addi a1, s0, 4
+; RV32I-WITHFP-NEXT:    sw a1, -12(s0)
+; RV32I-WITHFP-NEXT:    sw a0, 0(s0)
 ; RV32I-WITHFP-NEXT:    lw s0, 8(sp)
 ; RV32I-WITHFP-NEXT:    lw ra, 12(sp)
 ; RV32I-WITHFP-NEXT:    addi sp, sp, 48
diff --git a/test/CodeGen/SPARC/LeonCASAInstructionUT.ll b/test/CodeGen/SPARC/LeonCASAInstructionUT.ll
index fa2fdd1c9b104c6c3cd0cb1bf872ee628388570a..18c98091da7c83b40ebcf4947be0dfa8166f48c9 100644
--- a/test/CodeGen/SPARC/LeonCASAInstructionUT.ll
+++ b/test/CodeGen/SPARC/LeonCASAInstructionUT.ll
@@ -19,7 +19,9 @@
 ; RUN: llc %s -O0 -march=sparc -mcpu=ma2x8x -o - | FileCheck %s
 
 ; CHECK-LABEL: casa_test
-; CHECK:       casa [%o0] 10, %o3, %o2
+; CHECK-DAG:   mov 1, [[R0:%[a-z0-9]+]]
+; CHECK-DAG:   mov %g0, [[R1:%[a-z0-9]+]]
+; CHECK:       casa [{{%[a-z0-9]+}}] 10, [[R1]], [[R0]]
 define void @casa_test(i32* %ptr) {
   %pair = cmpxchg i32* %ptr, i32 0, i32 1 monotonic monotonic
   %r = extractvalue { i32, i1 } %pair, 0
diff --git a/test/CodeGen/SystemZ/fp-conv-10.ll b/test/CodeGen/SystemZ/fp-conv-10.ll
index dc5178985d93e1f964e8f5e22b7776d35e2bb55c..f897743ef11aa35cb6a1c2c8de678613a23c2ddf 100644
--- a/test/CodeGen/SystemZ/fp-conv-10.ll
+++ b/test/CodeGen/SystemZ/fp-conv-10.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Test conversion of floating-point values to unsigned i32s (z10 only).
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
@@ -10,11 +11,19 @@
 ; Test f32->i32.
 define i32 @f1(float %f) {
 ; CHECK-LABEL: f1:
-; CHECK: cebr
-; CHECK: sebr
-; CHECK: cfebr
-; CHECK: xilf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI0_0
+; CHECK-NEXT:    le %f1, 0(%r1)
+; CHECK-NEXT:    cebr %f0, %f1
+; CHECK-NEXT:    jnl .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cfebr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    cfebr %r2, 5, %f0
+; CHECK-NEXT:    xilf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui float %f to i32
   ret i32 %conv
 }
@@ -22,11 +31,19 @@ define i32 @f1(float %f) {
 ; Test f64->i32.
 define i32 @f2(double %f) {
 ; CHECK-LABEL: f2:
-; CHECK: cdbr
-; CHECK: sdbr
-; CHECK: cfdbr
-; CHECK: xilf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    ldeb %f1, 0(%r1)
+; CHECK-NEXT:    cdbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB1_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cfdbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    cfdbr %r2, 5, %f0
+; CHECK-NEXT:    xilf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui double %f to i32
   ret i32 %conv
 }
@@ -34,11 +51,21 @@ define i32 @f2(double %f) {
 ; Test f128->i32.
 define i32 @f3(fp128 *%src) {
 ; CHECK-LABEL: f3:
-; CHECK: cxbr
-; CHECK: sxbr
-; CHECK: cfxbr
-; CHECK: xilf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    ld %f2, 8(%r2)
+; CHECK-NEXT:    larl %r1, .LCPI2_0
+; CHECK-NEXT:    lxeb %f1, 0(%r1)
+; CHECK-NEXT:    cxbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cfxbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sxbr %f0, %f1
+; CHECK-NEXT:    cfxbr %r2, 5, %f0
+; CHECK-NEXT:    xilf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %f = load fp128, fp128 *%src
   %conv = fptoui fp128 %f to i32
   ret i32 %conv
diff --git a/test/CodeGen/SystemZ/fp-conv-12.ll b/test/CodeGen/SystemZ/fp-conv-12.ll
index d37a443c482cd83f403cdf18a2c4cb03db2b1874..91c377fa3e270347a5e1195ad70c230508973b19 100644
--- a/test/CodeGen/SystemZ/fp-conv-12.ll
+++ b/test/CodeGen/SystemZ/fp-conv-12.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Test conversion of floating-point values to unsigned i64s (z10 only).
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
@@ -9,11 +10,19 @@
 ; Test f32->i64.
 define i64 @f1(float %f) {
 ; CHECK-LABEL: f1:
-; CHECK: cebr
-; CHECK: sebr
-; CHECK: cgebr
-; CHECK: xihf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI0_0
+; CHECK-NEXT:    le %f1, 0(%r1)
+; CHECK-NEXT:    cebr %f0, %f1
+; CHECK-NEXT:    jnl .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cgebr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    cgebr %r2, 5, %f0
+; CHECK-NEXT:    xihf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui float %f to i64
   ret i64 %conv
 }
@@ -21,11 +30,19 @@ define i64 @f1(float %f) {
 ; Test f64->i64.
 define i64 @f2(double %f) {
 ; CHECK-LABEL: f2:
-; CHECK: cdbr
-; CHECK: sdbr
-; CHECK: cgdbr
-; CHECK: xihf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, .LCPI1_0
+; CHECK-NEXT:    ldeb %f1, 0(%r1)
+; CHECK-NEXT:    cdbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB1_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cgdbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    cgdbr %r2, 5, %f0
+; CHECK-NEXT:    xihf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %conv = fptoui double %f to i64
   ret i64 %conv
 }
@@ -33,11 +50,21 @@ define i64 @f2(double %f) {
 ; Test f128->i64.
 define i64 @f3(fp128 *%src) {
 ; CHECK-LABEL: f3:
-; CHECK: cxbr
-; CHECK: sxbr
-; CHECK: cgxbr
-; CHECK: xihf
-; CHECK: br %r14
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    ld %f2, 8(%r2)
+; CHECK-NEXT:    larl %r1, .LCPI2_0
+; CHECK-NEXT:    lxeb %f1, 0(%r1)
+; CHECK-NEXT:    cxbr %f0, %f1
+; CHECK-NEXT:    jnl .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    cgxbr %r2, 5, %f0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sxbr %f0, %f1
+; CHECK-NEXT:    cgxbr %r2, 5, %f0
+; CHECK-NEXT:    xihf %r2, 2147483648
+; CHECK-NEXT:    br %r14
   %f = load fp128, fp128 *%src
   %conv = fptoui fp128 %f to i64
   ret i64 %conv
diff --git a/test/CodeGen/SystemZ/isel-debug.ll b/test/CodeGen/SystemZ/isel-debug.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0e48210e9b6988cb47df9f441a90742051a90e61
--- /dev/null
+++ b/test/CodeGen/SystemZ/isel-debug.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -debug-only=systemz-isel -o - 2>&1 | \
+; RUN:   FileCheck %s
+
+; REQUIRES: asserts
+;
+; Check that some debug output is printed without problems.
+; CHECK: SystemZAddressingMode
+; CHECK: Base t5: i64,ch = load<(load 8 from %ir.0)>
+; CHECK: Index
+; CHECK: Disp
+
+define void @fun(i64* %ptr) {
+entry:
+  %0 = bitcast i64* %ptr to i32**
+  %1 = load i32*, i32** %0, align 8
+  %xpv_pv = getelementptr inbounds i32, i32* %1
+  store i32 0, i32* %xpv_pv
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/misched-readadvances.mir b/test/CodeGen/SystemZ/misched-readadvances.mir
new file mode 100644
index 0000000000000000000000000000000000000000..df8ca2f5f95edc2f2374975b199982ad23368bc7
--- /dev/null
+++ b/test/CodeGen/SystemZ/misched-readadvances.mir
@@ -0,0 +1,31 @@
+# Check that the extra operand for the full register added by RegAlloc does
+# not have a latency that interferes with the latency adjustment
+# (ReadAdvance) for the MSY register operand.
+
+# RUN: llc %s -mtriple=s390x-linux-gnu -mcpu=z13 -start-before=machine-scheduler \
+# RUN:  -debug-only=machine-scheduler -o - 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# CHECK: ScheduleDAGMI::schedule starting
+# CHECK: SU(4): renamable $r2l = MSR renamable $r2l(tied-def 0), renamable $r2l
+# CHECK:   Latency : 6
+# CHECK: SU(5): renamable $r2l = MSY renamable $r2l(tied-def 0), renamable $r1d, -4, $noreg, implicit $r2d
+# CHECK:   Predecessors:
+# CHECK:     SU(4): Data Latency=2 Reg=$r2l
+# CHECK:     SU(4): Data Latency=0 Reg=$r2d
+
+---
+name:            Perl_do_sv_dump
+alignment:       4
+tracksRegLiveness: true
+body:             |
+    bb.0 :
+    %1:addr64bit = IMPLICIT_DEF
+    %2:addr64bit = IMPLICIT_DEF
+    %3:vr64bit = IMPLICIT_DEF
+
+    bb.1 :
+    %2:addr64bit = ALGFI %2, 4294967291, implicit-def dead $cc
+    %2.subreg_l32:addr64bit = MSR %2.subreg_l32, %2.subreg_l32
+    %2.subreg_l32:addr64bit = MSY %2.subreg_l32, %1, -4, $noreg
+...
diff --git a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
index 3956ce9962394974aae08c784eb7a1f46fd877ea..195dbb996ef3ab11fb4f3edb8987faf768d611f0 100644
--- a/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
+++ b/test/CodeGen/SystemZ/regalloc-fast-invalid-kill-flag.mir
@@ -13,22 +13,18 @@
 name:            main
 alignment:       2
 tracksRegLiveness: true
-registers:       
-  - { id: 0, class: gr128bit }
-  - { id: 1, class: gr64bit }
-  - { id: 2, class: addr64bit }
-# CHECK: $r0q = L128
-# CHECK-NEXT: $r0l = COPY renamable $r1l
+# CHECK: $r0l = COPY renamable $r1l
 # Although R0L partially redefines R0Q, it must not mark R0Q as kill
 # because R1D is still live through that instruction.
 # CHECK-NOT: implicit killed $r0q
-# CHECK-NEXT: $r2d = COPY renamable $r1d
+# CHECK-NEXT: {{\$r[0-9]+d}} = COPY renamable $r1d
 # CHECK-NEXT: LARL
 body:             |
   bb.0:
+    %0 : gr128bit = IMPLICIT_DEF
     %0.subreg_hl32 = COPY %0.subreg_l32
-    %1 = COPY %0.subreg_l64
-    %2 = LARL @g_167
+    %1 : gr64bit = COPY %0.subreg_l64
+    %2 : addr64bit = LARL @g_167
     STC %1.subreg_l32, %2, 8, $noreg
 
 ...
diff --git a/test/CodeGen/SystemZ/rosbg-02.ll b/test/CodeGen/SystemZ/rosbg-02.ll
index fa1ac6e75ea2394612d2144ded6d7d7115e36981..8a7357a5318fa96f08cda9f165db25e685a24c7f 100644
--- a/test/CodeGen/SystemZ/rosbg-02.ll
+++ b/test/CodeGen/SystemZ/rosbg-02.ll
@@ -18,7 +18,7 @@ define void @main() {
   %7 = zext i1 %6 to i32
   %8 = load i32, i32* @g_999, align 4
   %9 = or i32 %8, %7
-; CHECK: rosbg   %r1, %r3, 63, 63, 33
+; CHECK: rosbg   {{%r[0-9]+}}, {{%r[0-9]+}}, 63, 63, 33
   store i32 %9, i32* @g_999, align 4
   ret void
 }
diff --git a/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll b/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
index 8b7184f38e871dff668636d53513f78642769ad1..60a6a180467cd8202c8e4d145e35a6152fd4bd57 100644
--- a/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ b/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -60,8 +60,7 @@ define i16 @fun1(<16 x i1> %src)
 ; CHECK-NEXT:    rosbg %r0, %r1, 62, 62, 1
 ; CHECK-NEXT:    vlgvb %r1, %v24, 15
 ; CHECK-NEXT:    rosbg %r0, %r1, 63, 63, 0
-; CHECK-NEXT:    sth %r0, 160(%r15)
-; CHECK-NEXT:    lh %r2, 160(%r15)
+; CHECK-NEXT:    llhr %r2, %r0
 ; CHECK-NEXT:    aghi %r15, 168
 ; CHECK-NEXT:    br %r14
 {
diff --git a/test/CodeGen/SystemZ/vec-max-05.ll b/test/CodeGen/SystemZ/vec-max-05.ll
index 591d3bf36f168959cf0ba85667ae788d2f3b1b5d..1fe0db350b74b79b78437762b7c92cb9aff50296 100644
--- a/test/CodeGen/SystemZ/vec-max-05.ll
+++ b/test/CodeGen/SystemZ/vec-max-05.ll
@@ -42,7 +42,7 @@ define double @f3(double %dummy, double %val) {
   ret double %ret
 }
 
-; Test a f64 constant compare/select resulting in maxnan.
+; Test a f64 constant compare/select resulting in maximum.
 define double @f4(double %dummy, double %val) {
 ; CHECK-LABEL: f4:
 ; CHECK: lzdr [[REG:%f[0-9]+]]
@@ -92,7 +92,7 @@ define float @f13(float %dummy, float %val) {
   ret float %ret
 }
 
-; Test a f32 constant compare/select resulting in maxnan.
+; Test a f32 constant compare/select resulting in maximum.
 define float @f14(float %dummy, float %val) {
 ; CHECK-LABEL: f14:
 ; CHECK: lzer [[REG:%f[0-9]+]]
@@ -158,7 +158,7 @@ define void @f23(fp128 *%ptr, fp128 *%dst) {
   ret void
 }
 
-; Test a f128 constant compare/select resulting in maxnan.
+; Test a f128 constant compare/select resulting in maximum.
 define void @f24(fp128 *%ptr, fp128 *%dst) {
 ; CHECK-LABEL: f24:
 ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
diff --git a/test/CodeGen/SystemZ/vec-min-05.ll b/test/CodeGen/SystemZ/vec-min-05.ll
index 3eef9016cd08769b0787835d3b02ded04bb58023..6417e5ed7508a07853627857a2dd644416b30d4d 100644
--- a/test/CodeGen/SystemZ/vec-min-05.ll
+++ b/test/CodeGen/SystemZ/vec-min-05.ll
@@ -42,7 +42,7 @@ define double @f3(double %dummy, double %val) {
   ret double %ret
 }
 
-; Test a f64 constant compare/select resulting in minnan.
+; Test a f64 constant compare/select resulting in minimum.
 define double @f4(double %dummy, double %val) {
 ; CHECK-LABEL: f4:
 ; CHECK: lzdr [[REG:%f[0-9]+]]
@@ -92,7 +92,7 @@ define float @f13(float %dummy, float %val) {
   ret float %ret
 }
 
-; Test a f32 constant compare/select resulting in minnan.
+; Test a f32 constant compare/select resulting in minimum.
 define float @f14(float %dummy, float %val) {
 ; CHECK-LABEL: f14:
 ; CHECK: lzer [[REG:%f[0-9]+]]
@@ -158,7 +158,7 @@ define void @f23(fp128 *%ptr, fp128 *%dst) {
   ret void
 }
 
-; Test a f128 constant compare/select resulting in minnan.
+; Test a f128 constant compare/select resulting in minimum.
 define void @f24(fp128 *%ptr, fp128 *%dst) {
 ; CHECK-LABEL: f24:
 ; CHECK-DAG: vl [[REG1:%v[0-9]+]], 0(%r2)
diff --git a/test/CodeGen/Thumb/branchless-cmp.ll b/test/CodeGen/Thumb/branchless-cmp.ll
index 6c6c905c5d3b804843dfeeef7214be390e4f6218..ed34d630733c7f367f9f0b74f3cf0b1bf958899a 100644
--- a/test/CodeGen/Thumb/branchless-cmp.ll
+++ b/test/CodeGen/Thumb/branchless-cmp.ll
@@ -20,8 +20,7 @@ entry:
 ; CHECK-LABEL: test1b:
 ; CHECK-NOT: b{{(ne)|(eq)}}
 ; CHECK:       subs    r1, r0, r1
-; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  subs    r0, r0, r1
+; CHECK-NEXT:  rsbs    r0, r1, #0
 ; CHECK-NEXT:  adcs    r0, r1
 }
 
@@ -33,8 +32,7 @@ entry:
 ; CHECK-LABEL: test2a:
 ; CHECK-NOT: b{{(ne)|(eq)}}
 ; CHECK:       subs    r1, r0, r1
-; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  subs    r0, r0, r1
+; CHECK-NEXT:  rsbs    r0, r1, #0
 ; CHECK-NEXT:  adcs    r0, r1
 }
 
@@ -71,8 +69,7 @@ entry:
 ; CHECK-LABEL: test3b:
 ; CHECK-NOT: b{{(ne)|(eq)}}
 ; CHECK:      subs	r0, r0, r1
-; CHECK-NEXT: movs	r1, #0
-; CHECK-NEXT: subs	r1, r1, r0
+; CHECK-NEXT: rsbs	r1, r0, #0
 ; CHECK-NEXT: adcs	r1, r0
 ; CHECK-NEXT: lsls	r0, r1, #2
 }
@@ -85,14 +82,15 @@ entry:
   %cond = select i1 %cmp, i32 0, i32 4
   ret i32 %cond
 ; CHECK-LABEL: test4a:
-; CHECK-NOT: b{{(ne)|(eq)}}
-; CHECK:       mov     r2, r0
+; CHECK: bb.0:
+; CHECK-NEXT:  cmp     r0, r1
+; CHECK-NEXT:  bne     .LBB6_2
+; CHECK-NEXT: bb.1:
+; CHECK-NEXT:  movs    r0, #4
+; CHECK-NEXT:  bx      lr
+; CHECK-NEXT: .LBB6_2:
 ; CHECK-NEXT:  movs    r0, #0
-; CHECK-NEXT:  movs    r3, #4
-; CHECK-NEXT:  cmp     r2, r1
-; CHECK-NEXT:  bne     .[[BRANCH:[A-Z0-9_]+]]
-; CHECK:       mov     r0, r3
-; CHECK:       .[[BRANCH]]:
+; CHECK-NEXT:  bx      lr
 }
 
 define i32 @test4b(i32 %a, i32 %b) {
diff --git a/test/CodeGen/Thumb/consthoist-few-dependents.ll b/test/CodeGen/Thumb/consthoist-few-dependents.ll
index 4141cf38a93e5342852edb89e172d20f29a5f98c..72f085afdff9e77096eaa538e22e5388f2bea2f5 100644
--- a/test/CodeGen/Thumb/consthoist-few-dependents.ll
+++ b/test/CodeGen/Thumb/consthoist-few-dependents.ll
@@ -23,7 +23,6 @@ target triple = "thumbv6m-none-unknown-musleabi"
 
 ; LLC-LABEL: avalon
 ; LLC-DAG: movs r{{[0-9]+}}, #0
-; LLC-DAG: movs r{{[0-9]+}}, #0
 ; LLC-DAG: movs r{{[0-9]+}}, #1
 ; LLC-NOT: add
 
diff --git a/test/CodeGen/Thumb/long-setcc.ll b/test/CodeGen/Thumb/long-setcc.ll
index f077d0e4cf4629a6366f524d2dfc5e8b712a2225..b8b9cff7b36759cd4094bb4e0ae5fb2127177ade 100644
--- a/test/CodeGen/Thumb/long-setcc.ll
+++ b/test/CodeGen/Thumb/long-setcc.ll
@@ -9,8 +9,7 @@ define i1 @t1(i64 %x) {
 
 define i1 @t2(i64 %x) {
 ; CHECK-LABEL: t2:
-; CHECK: movs  r0, #0
-; CHECK: subs  r0, r0, r1
+; CHECK: rsbs  r0, r1, #0
 ; CHECK: adcs  r0, r1
   %tmp = icmp ult i64 %x, 4294967296
   ret i1 %tmp
diff --git a/test/CodeGen/Thumb/select.ll b/test/CodeGen/Thumb/select.ll
index 75dbeab5ad0f71127520a9fbab8d09598737b7c2..36f16ad44a007d9d47318358d1753d656634f775 100644
--- a/test/CodeGen/Thumb/select.ll
+++ b/test/CodeGen/Thumb/select.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumb-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=thumb-pc-linux-gnueabi | FileCheck -check-prefix=CHECK-EABI %s
+; RUN: llc < %s -mtriple=thumb-apple-darwin -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=thumb-pc-linux-gnueabi -verify-machineinstrs | FileCheck -check-prefix=CHECK-EABI %s
 
 define i32 @f1(i32 %a.s) {
 entry:
@@ -73,10 +73,31 @@ define double @f7(double %a, double %b) {
     ret double %tmp1
 }
 ; CHECK-LABEL: f7:
-; CHECK: blt
+; CHECK: {{blt|bge}}
 ; CHECK: {{blt|bge}}
 ; CHECK: __ltdf2
 ; CHECK-EABI-LABEL: f7:
 ; CHECK-EABI: __aeabi_dcmplt
-; CHECK-EABI: bne
 ; CHECK-EABI: {{bne|beq}}
+; CHECK-EABI: {{bne|beq}}
+
+define {i32, i32} @f8(i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+    %cmp = icmp slt i32 %a, %b
+    %r1 = select i1 %cmp, i32 %c, i32 %a
+    %r2 = select i1 %cmp, i32 %d, i32 %b
+    %z = insertvalue { i32, i32 } undef, i32 %r1, 0
+    %z2 = insertvalue { i32, i32 } %z, i32 %r2, 1
+    ret { i32, i32 } %z2
+}
+
+; CHECK-LABEL: f8:
+; CHECK: cmp r0, r1
+; CHECK: blt
+; CHECK: movs
+; CHECK: cmp r0, r1
+; CHECK: blt
+; CHECK: movs
+; CHECK: movs
+; CHECK: movs
+; CHECK: bx lr
diff --git a/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index f57f46f68cf0aefd5ea8fbbd9909957495817e73..5445cd8e7434172cb3b76fece20fed1fefa76126 100644
--- a/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -3,168 +3,200 @@
 
 define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV6-LABEL: muloti_test:
-; THUMBV6: push    {r4, r5, r6, r7, lr}
-; THUMBV6: sub     sp, #84
-; THUMBV6-NEXT: mov     r6, r3
-; THUMBV6-NEXT: mov     r7, r2
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: movs    r5, #0
-; THUMBV6-NEXT: mov     r0, sp
-; THUMBV6-NEXT: str     r5, [r0, #12]
-; THUMBV6-NEXT: str     r5, [r0, #8]
-; THUMBV6-NEXT: ldr     r1, [sp, #116]
-; THUMBV6-NEXT: str     r1, [sp, #68]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r1, [r0, #4]
-; THUMBV6-NEXT: ldr     r1, [sp, #112]
-; THUMBV6-NEXT: str     r1, [sp, #32]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r1, [r0]
-; THUMBV6-NEXT: mov     r0, r2
-; THUMBV6-NEXT: mov     r1, r3
-; THUMBV6-NEXT: mov     r2, r5
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __multi3
-; THUMBV6-NEXT: str     r2, [sp, #40]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r3, [sp, #44]           @ 4-byte Spill
-; THUMBV6-NEXT: str     r4, [sp, #72]           @ 4-byte Spill
-; THUMBV6-NEXT: stm     r4!, {r0, r1}
-; THUMBV6-NEXT: ldr     r4, [sp, #120]
-; THUMBV6-NEXT: str     r6, [sp, #60]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r0, r6
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r4
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: mov     r6, r0
-; THUMBV6-NEXT: str     r1, [sp, #52]           @ 4-byte Spill
-; THUMBV6-NEXT: ldr     r0, [sp, #124]
-; THUMBV6-NEXT: str     r0, [sp, #80]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r7
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: str     r1, [sp, #28]           @ 4-byte Spill
-; THUMBV6-NEXT: adds    r6, r0, r6
-; THUMBV6-NEXT: str     r4, [sp, #64]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r0, r4
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r7
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: str     r0, [sp, #24]           @ 4-byte Spill
-; THUMBV6-NEXT: adds    r0, r1, r6
-; THUMBV6-NEXT: str     r0, [sp, #20]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r0, r5
-; THUMBV6-NEXT: adcs    r0, r5
-; THUMBV6-NEXT: str     r0, [sp, #48]           @ 4-byte Spill
-; THUMBV6-NEXT: ldr     r7, [sp, #104]
-; THUMBV6-NEXT: ldr     r0, [sp, #68]           @ 4-byte Reload
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r7
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: mov     r6, r0
-; THUMBV6-NEXT: str     r1, [sp, #56]           @ 4-byte Spill
-; THUMBV6-NEXT: ldr     r0, [sp, #108]
-; THUMBV6-NEXT: str     r0, [sp, #76]           @ 4-byte Spill
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: ldr     r4, [sp, #32]           @ 4-byte Reload
-; THUMBV6-NEXT: mov     r2, r4
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: str     r1, [sp, #36]           @ 4-byte Spill
-; THUMBV6-NEXT: adds    r6, r0, r6
-; THUMBV6-NEXT: mov     r0, r7
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: mov     r2, r4
-; THUMBV6-NEXT: mov     r3, r5
-; THUMBV6-NEXT: bl      __aeabi_lmul
-; THUMBV6-NEXT: adds    r2, r1, r6
-; THUMBV6-NEXT: mov     r1, r5
-; THUMBV6-NEXT: adcs    r1, r5
-; THUMBV6-NEXT: ldr     r3, [sp, #24]           @ 4-byte Reload
-; THUMBV6-NEXT: adds    r0, r0, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #20]           @ 4-byte Reload
-; THUMBV6-NEXT: adcs    r2, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #40]           @ 4-byte Reload
-; THUMBV6-NEXT: adds    r0, r3, r0
-; THUMBV6-NEXT: ldr     r3, [sp, #72]           @ 4-byte Reload
-; THUMBV6-NEXT: str     r0, [r3, #8]
-; THUMBV6-NEXT: ldr     r0, [sp, #44]           @ 4-byte Reload
-; THUMBV6-NEXT: adcs    r2, r0
-; THUMBV6-NEXT: str     r2, [r3, #12]
-; THUMBV6-NEXT: ldr     r2, [sp, #28]           @ 4-byte Reload
-; THUMBV6-NEXT: adcs    r5, r5
-; THUMBV6-NEXT: movs    r0, #1
-; THUMBV6-NEXT: cmp     r2, #0
-; THUMBV6-NEXT: mov     r3, r0
-; THUMBV6-NEXT: bne     .LBB0_2
-; THUMBV6: mov     r3, r2
-; THUMBV6: ldr     r2, [sp, #60]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r2, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_4
-; THUMBV6: mov     r4, r2
-; THUMBV6: ldr     r2, [sp, #80]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r2, #0
-; THUMBV6-NEXT: mov     r2, r0
-; THUMBV6-NEXT: bne     .LBB0_6
-; THUMBV6: ldr     r2, [sp, #80]           @ 4-byte Reload
-; THUMBV6: ands    r2, r4
-; THUMBV6-NEXT: orrs    r2, r3
-; THUMBV6-NEXT: ldr     r4, [sp, #52]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r4, #0
-; THUMBV6-NEXT: mov     r3, r0
-; THUMBV6-NEXT: bne     .LBB0_8
-; THUMBV6: mov     r3, r4
-; THUMBV6: orrs    r2, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #48]           @ 4-byte Reload
-; THUMBV6-NEXT: orrs    r2, r3
-; THUMBV6-NEXT: ldr     r3, [sp, #36]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r3, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_10
-; THUMBV6: mov     r4, r3
-; THUMBV6: ldr     r3, [sp, #68]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r3, #0
-; THUMBV6-NEXT: mov     r6, r0
-; THUMBV6-NEXT: bne     .LBB0_12
-; THUMBV6: mov     r6, r3
-; THUMBV6: ldr     r3, [sp, #76]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r3, #0
-; THUMBV6-NEXT: mov     r3, r0
-; THUMBV6-NEXT: bne     .LBB0_14
-; THUMBV6: ldr     r3, [sp, #76]           @ 4-byte Reload
-; THUMBV6: ands    r3, r6
-; THUMBV6-NEXT: orrs    r3, r4
-; THUMBV6-NEXT: ldr     r6, [sp, #56]           @ 4-byte Reload
-; THUMBV6-NEXT: cmp     r6, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_16
-; THUMBV6: mov     r4, r6
-; THUMBV6: orrs    r3, r4
-; THUMBV6-NEXT: orrs    r3, r1
-; THUMBV6-NEXT: ldr     r4, [sp, #64]           @ 4-byte Reload
-; THUMBV6-NEXT: ldr     r1, [sp, #80]           @ 4-byte Reload
-; THUMBV6-NEXT: orrs    r4, r1
-; THUMBV6-NEXT: cmp     r4, #0
-; THUMBV6-NEXT: mov     r1, r0
-; THUMBV6-NEXT: bne     .LBB0_18
-; THUMBV6: mov     r1, r4
-; THUMBV6: ldr     r4, [sp, #76]           @ 4-byte Reload
-; THUMBV6-NEXT: orrs    r7, r4
-; THUMBV6-NEXT: cmp     r7, #0
-; THUMBV6-NEXT: mov     r4, r0
-; THUMBV6-NEXT: bne     .LBB0_20
-; THUMBV6: mov     r4, r7
-; THUMBV6: ands    r4, r1
-; THUMBV6-NEXT: orrs    r4, r3
-; THUMBV6-NEXT: orrs    r4, r2
-; THUMBV6-NEXT: orrs    r4, r5
-; THUMBV6-NEXT: ands    r4, r0
-; THUMBV6-NEXT: ldr     r0, [sp, #72]           @ 4-byte Reload
-; THUMBV6-NEXT: strb    r4, [r0, #16]
-; THUMBV6-NEXT: add     sp, #84
-; THUMBV6-NEXT: pop     {r4, r5, r6, r7, pc}
+; THUMBV6:       @ %bb.0: @ %start
+; THUMBV6-NEXT:    .save {r4, r5, r6, r7, lr}
+; THUMBV6-NEXT:    push {r4, r5, r6, r7, lr}
+; THUMBV6-NEXT:    .pad #84
+; THUMBV6-NEXT:    sub sp, #84
+; THUMBV6-NEXT:    mov r6, r3
+; THUMBV6-NEXT:    mov r7, r2
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    movs r5, #0
+; THUMBV6-NEXT:    mov r0, sp
+; THUMBV6-NEXT:    str r5, [r0, #12]
+; THUMBV6-NEXT:    str r5, [r0, #8]
+; THUMBV6-NEXT:    ldr r1, [sp, #116]
+; THUMBV6-NEXT:    str r1, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [r0, #4]
+; THUMBV6-NEXT:    ldr r1, [sp, #112]
+; THUMBV6-NEXT:    str r1, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    str r1, [r0]
+; THUMBV6-NEXT:    mov r0, r2
+; THUMBV6-NEXT:    mov r1, r3
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __multi3
+; THUMBV6-NEXT:    str r2, [sp, #36] @ 4-byte Spill
+; THUMBV6-NEXT:    str r3, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    str r4, [sp, #76] @ 4-byte Spill
+; THUMBV6-NEXT:    stm r4!, {r0, r1}
+; THUMBV6-NEXT:    ldr r4, [sp, #120]
+; THUMBV6-NEXT:    str r6, [sp, #56] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r6
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r0
+; THUMBV6-NEXT:    str r1, [sp, #48] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #124]
+; THUMBV6-NEXT:    str r0, [sp, #80] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r1, [sp, #28] @ 4-byte Spill
+; THUMBV6-NEXT:    adds r6, r0, r6
+; THUMBV6-NEXT:    str r4, [sp, #68] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r4
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; THUMBV6-NEXT:    adds r0, r1, r6
+; THUMBV6-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r5
+; THUMBV6-NEXT:    adcs r0, r5
+; THUMBV6-NEXT:    str r0, [sp, #64] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r7, [sp, #104]
+; THUMBV6-NEXT:    ldr r0, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r7
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    mov r6, r0
+; THUMBV6-NEXT:    str r1, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r0, [sp, #108]
+; THUMBV6-NEXT:    str r0, [sp, #60] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    ldr r4, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    str r1, [sp, #32] @ 4-byte Spill
+; THUMBV6-NEXT:    adds r6, r0, r6
+; THUMBV6-NEXT:    str r7, [sp, #24] @ 4-byte Spill
+; THUMBV6-NEXT:    mov r0, r7
+; THUMBV6-NEXT:    mov r1, r5
+; THUMBV6-NEXT:    mov r2, r4
+; THUMBV6-NEXT:    mov r3, r5
+; THUMBV6-NEXT:    bl __aeabi_lmul
+; THUMBV6-NEXT:    adds r1, r1, r6
+; THUMBV6-NEXT:    mov r2, r5
+; THUMBV6-NEXT:    adcs r2, r5
+; THUMBV6-NEXT:    str r2, [sp, #44] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r2, [sp, #20] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r0, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #16] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r2
+; THUMBV6-NEXT:    ldr r2, [sp, #36] @ 4-byte Reload
+; THUMBV6-NEXT:    adds r0, r2, r0
+; THUMBV6-NEXT:    ldr r2, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    str r0, [r2, #8]
+; THUMBV6-NEXT:    ldr r0, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r1, r0
+; THUMBV6-NEXT:    str r1, [r2, #12]
+; THUMBV6-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; THUMBV6-NEXT:    adcs r5, r5
+; THUMBV6-NEXT:    movs r0, #1
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r2, r0
+; THUMBV6-NEXT:    bne .LBB0_2
+; THUMBV6-NEXT:  @ %bb.1: @ %start
+; THUMBV6-NEXT:    mov r2, r1
+; THUMBV6-NEXT:  .LBB0_2: @ %start
+; THUMBV6-NEXT:    str r2, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT:    ldr r1, [sp, #56] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    bne .LBB0_4
+; THUMBV6-NEXT:  @ %bb.3: @ %start
+; THUMBV6-NEXT:    mov r4, r1
+; THUMBV6-NEXT:  .LBB0_4: @ %start
+; THUMBV6-NEXT:    ldr r1, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r2, r0
+; THUMBV6-NEXT:    ldr r3, [sp, #48] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r6, [sp, #32] @ 4-byte Reload
+; THUMBV6-NEXT:    bne .LBB0_6
+; THUMBV6-NEXT:  @ %bb.5: @ %start
+; THUMBV6-NEXT:    ldr r2, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:  .LBB0_6: @ %start
+; THUMBV6-NEXT:    cmp r3, #0
+; THUMBV6-NEXT:    mov r7, r0
+; THUMBV6-NEXT:    ldr r1, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    bne .LBB0_8
+; THUMBV6-NEXT:  @ %bb.7: @ %start
+; THUMBV6-NEXT:    mov r7, r3
+; THUMBV6-NEXT:  .LBB0_8: @ %start
+; THUMBV6-NEXT:    cmp r6, #0
+; THUMBV6-NEXT:    mov r3, r0
+; THUMBV6-NEXT:    bne .LBB0_10
+; THUMBV6-NEXT:  @ %bb.9: @ %start
+; THUMBV6-NEXT:    mov r3, r6
+; THUMBV6-NEXT:  .LBB0_10: @ %start
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    mov r1, r0
+; THUMBV6-NEXT:    bne .LBB0_12
+; THUMBV6-NEXT:  @ %bb.11: @ %start
+; THUMBV6-NEXT:    mov r1, r6
+; THUMBV6-NEXT:  .LBB0_12: @ %start
+; THUMBV6-NEXT:    str r7, [sp, #72] @ 4-byte Spill
+; THUMBV6-NEXT:    ands r2, r4
+; THUMBV6-NEXT:    ldr r6, [sp, #60] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r6, #0
+; THUMBV6-NEXT:    mov r4, r0
+; THUMBV6-NEXT:    bne .LBB0_14
+; THUMBV6-NEXT:  @ %bb.13: @ %start
+; THUMBV6-NEXT:    mov r4, r6
+; THUMBV6-NEXT:  .LBB0_14: @ %start
+; THUMBV6-NEXT:    ldr r7, [sp, #40] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r7
+; THUMBV6-NEXT:    ands r4, r1
+; THUMBV6-NEXT:    orrs r4, r3
+; THUMBV6-NEXT:    ldr r3, [sp, #52] @ 4-byte Reload
+; THUMBV6-NEXT:    cmp r3, #0
+; THUMBV6-NEXT:    mov r1, r0
+; THUMBV6-NEXT:    bne .LBB0_16
+; THUMBV6-NEXT:  @ %bb.15: @ %start
+; THUMBV6-NEXT:    mov r1, r3
+; THUMBV6-NEXT:  .LBB0_16: @ %start
+; THUMBV6-NEXT:    ldr r3, [sp, #72] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r3
+; THUMBV6-NEXT:    orrs r4, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
+; THUMBV6-NEXT:    ldr r3, [sp, #80] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r1, r3
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r3, r0
+; THUMBV6-NEXT:    bne .LBB0_18
+; THUMBV6-NEXT:  @ %bb.17: @ %start
+; THUMBV6-NEXT:    mov r3, r1
+; THUMBV6-NEXT:  .LBB0_18: @ %start
+; THUMBV6-NEXT:    ldr r1, [sp, #64] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r2, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r4, r1
+; THUMBV6-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; THUMBV6-NEXT:    orrs r1, r6
+; THUMBV6-NEXT:    mov r6, r1
+; THUMBV6-NEXT:    cmp r1, #0
+; THUMBV6-NEXT:    mov r1, r0
+; THUMBV6-NEXT:    bne .LBB0_20
+; THUMBV6-NEXT:  @ %bb.19: @ %start
+; THUMBV6-NEXT:    mov r1, r6
+; THUMBV6-NEXT:  .LBB0_20: @ %start
+; THUMBV6-NEXT:    ands r1, r3
+; THUMBV6-NEXT:    orrs r1, r4
+; THUMBV6-NEXT:    orrs r1, r2
+; THUMBV6-NEXT:    orrs r1, r5
+; THUMBV6-NEXT:    ands r1, r0
+; THUMBV6-NEXT:    ldr r0, [sp, #76] @ 4-byte Reload
+; THUMBV6-NEXT:    strb r1, [r0, #16]
+; THUMBV6-NEXT:    add sp, #84
+; THUMBV6-NEXT:    pop {r4, r5, r6, r7, pc}
 start:
   %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2
   %1 = extractvalue { i128, i1 } %0, 0
diff --git a/test/CodeGen/Thumb2/thumb2-tbh.ll b/test/CodeGen/Thumb2/thumb2-tbh.ll
index c67efa09b90655bd9b16c8efb204274042d1a15d..fd8070a8b583a39c030e445031b4e04f9e922da2 100644
--- a/test/CodeGen/Thumb2/thumb2-tbh.ll
+++ b/test/CodeGen/Thumb2/thumb2-tbh.ll
@@ -1,6 +1,11 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T2
-; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T1
-; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=static | FileCheck %s --check-prefix=CHECK --check-prefix=T1
+; RUN: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=T1DISABLED
+; FIXME: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=pic | FileCheck %s --check-prefix=CHECK --check-prefix=T1
+; FIXME: llc < %s -mtriple=thumbv6m-apple-darwin -relocation-model=static | FileCheck %s --check-prefix=CHECK --check-prefix=T1
+
+; FIXME: Thumb1 tests temporarily disabled; MachineLICM is now hoisting the
+; subs, so the jump table can't be formed.
+; T1DISABLED: .data_region jt32
 
 ; Thumb2 target should reorder the bb's in order to use tbb / tbh.
 
diff --git a/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
index d639b7acbbc5405713caa77113e92e76a4c5f8f4..5300bed0de88fe365402a0ad721415a01c9c4c92 100644
--- a/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
+++ b/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll
@@ -88,15 +88,15 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV7-NEXT:    orrs r3, r2
 ; THUMBV7-NEXT:    ldr r2, [sp, #80]
 ; THUMBV7-NEXT:    orr.w r1, r1, r4
+; THUMBV7-NEXT:    orr.w r1, r1, r10
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r3, #1
-; THUMBV7-NEXT:    orr.w r1, r1, r10
 ; THUMBV7-NEXT:    orrs.w r7, r2, r11
 ; THUMBV7-NEXT:    orr.w r1, r1, r9
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r7, #1
-; THUMBV7-NEXT:    orr.w r0, r0, r12
 ; THUMBV7-NEXT:    ands r3, r7
+; THUMBV7-NEXT:    orr.w r0, r0, r12
 ; THUMBV7-NEXT:    orrs r1, r3
 ; THUMBV7-NEXT:    orrs r0, r1
 ; THUMBV7-NEXT:    orr.w r0, r0, r8
diff --git a/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll b/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
index e47e88a6832b9e4d59b8b682d1d2e7d099227602..161adf7e7d7639696450e0621ab8d01f32e18452 100644
--- a/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
+++ b/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll
@@ -20,11 +20,11 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r1, #1
 ; THUMBV7-NEXT:    cmp r5, #0
+; THUMBV7-NEXT:    and.w r1, r1, r3
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne r5, #1
-; THUMBV7-NEXT:    ands r1, r3
+; THUMBV7-NEXT:    orrs r1, r5
 ; THUMBV7-NEXT:    cmp.w lr, #0
-; THUMBV7-NEXT:    orr.w r1, r1, r5
 ; THUMBV7-NEXT:    it ne
 ; THUMBV7-NEXT:    movne.w lr, #1
 ; THUMBV7-NEXT:    orr.w r1, r1, lr
diff --git a/test/CodeGen/Thumb2/unreachable-large-offset-gep.ll b/test/CodeGen/Thumb2/unreachable-large-offset-gep.ll
new file mode 100644
index 0000000000000000000000000000000000000000..641787b0d7f24cfc7130330a1b2ed63339b61833
--- /dev/null
+++ b/test/CodeGen/Thumb2/unreachable-large-offset-gep.ll
@@ -0,0 +1,22 @@
+; RUN: llc -o - %s | FileCheck %s
+
+; CHECK: .LBB0_1:
+; CHECK: b .LBB0_1
+
+target triple = "thumbv8m-unknown-linux-android"
+
+define void @d(i32* %c) {
+entry:
+  br i1 false, label %f.exit, label %i.d
+
+i.d:
+  br label %i.d
+
+f.exit:
+  %0 = getelementptr i32, i32* %c, i32 57
+  br label %if.g
+
+if.g:
+  store i32 0, i32* %0
+  ret void
+}
diff --git a/test/CodeGen/WebAssembly/annotations.mir b/test/CodeGen/WebAssembly/annotations.mir
new file mode 100644
index 0000000000000000000000000000000000000000..1ae2db8248471b9caa60d758b72d9f305765eab8
--- /dev/null
+++ b/test/CodeGen/WebAssembly/annotations.mir
@@ -0,0 +1,94 @@
+# RUN: llc -mtriple=wasm32-unknown-unknown -start-after xray-instrumentation -wasm-keep-registers %s -o - | FileCheck %s
+
+---
+# Tests if block/loop/try/catch/end instructions are correctly printed with
+# their annotations.
+
+# CHECK: test0:
+# CHECK:   block
+# CHECK:   try
+# CHECK:   br        0               # 0: down to label1
+# CHECK:   catch_all                 # catch0:
+# CHECK:   block
+# CHECK:   br_if     0, 1            # 0: down to label2
+# CHECK:   loop                      # label3:
+# CHECK:   br_if     0, 1            # 0: up to label3
+# CHECK:   end_loop
+# CHECK:   end_block                 # label2:
+# CHECK:   try
+# CHECK:   rethrow   0               # 0: down to catch1
+# CHECK:   catch_all                 # catch1:
+# CHECK:   block
+# CHECK:   try
+# CHECK:   br        0               # 0: down to label6
+# CHECK:   catch_all                 # catch2:
+# CHECK:   unreachable
+# CHECK:   end_try                   # label6:
+# CHECK:   end_block                 # label5:
+# CHECK:   rethrow   0               # 0: to caller
+# CHECK:   end_try                   # label4:
+# CHECK:   end_try                   # label1:
+# CHECK:   end_block                 # label0:
+
+name: test0
+liveins:
+  - { reg: '$arguments', reg: '$value_stack' }
+body: |
+  bb.0:
+    successors: %bb.7, %bb.1
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    TRY 64, implicit-def $value_stack, implicit $value_stack
+    BR 0, implicit-def $arguments
+
+  bb.1 (landing-pad):
+  ; predecessors: %bb.0
+    successors: %bb.2, %bb.3
+
+    CATCH_ALL implicit-def $arguments
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    BR_IF 0, 1, implicit-def $arguments, implicit-def $value_stack, implicit $value_stack
+
+  bb.2:
+  ; predecessors: %bb.1, %bb.2
+    successors: %bb.2, %bb.3
+
+    LOOP 64, implicit-def $value_stack, implicit $value_stack
+    BR_IF 0, 1, implicit-def $arguments
+
+  bb.3:
+  ; predecessors: %bb.1, %bb.2
+    successors: %bb.4
+
+    END_LOOP implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    TRY 64, implicit-def $value_stack, implicit $value_stack
+    RETHROW 0, implicit-def $arguments
+
+  bb.4 (landing-pad):
+  ; predecessors: %bb.3
+    successors: %bb.6, %bb.5
+
+    CATCH_ALL implicit-def $arguments
+    BLOCK 64, implicit-def $value_stack, implicit $value_stack
+    TRY 64, implicit-def $value_stack, implicit $value_stack
+    BR 0, implicit-def $arguments
+
+  bb.5 (landing-pad):
+  ; predecessors: %bb.4
+    CATCH_ALL implicit-def $arguments
+    UNREACHABLE implicit-def dead $arguments
+
+  bb.6:
+  ; predecessors: %bb.4
+    END_TRY implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    RETHROW 0, implicit-def $arguments
+
+  bb.7:
+  ; predecessors: %bb.0
+    END_TRY implicit-def $value_stack, implicit $value_stack
+    END_TRY implicit-def $value_stack, implicit $value_stack
+    END_BLOCK implicit-def $value_stack, implicit $value_stack
+    FALLTHROUGH_RETURN_VOID implicit-def dead $arguments
+    END_FUNCTION implicit-def $value_stack, implicit $value_stack
+...
diff --git a/test/CodeGen/WebAssembly/cfg-stackify-eh.mir b/test/CodeGen/WebAssembly/cfg-stackify-eh.mir
index 9038f68966b30b8a3caa3bcecccc77366d98a155..b67579087faefbd00df9f76463858d7bedf357bf 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify-eh.mir
+++ b/test/CodeGen/WebAssembly/cfg-stackify-eh.mir
@@ -180,7 +180,7 @@ body: |
     RETHROW_TO_CALLER implicit-def $arguments
   ; CHECK-LABEL: bb.7:
     ; CHECK-NEXT: END_TRY
-    ; CHECK: RETHROW 3
+    ; CHECK: RETHROW 0
 
   bb.8:
   ; predecessors: %bb.2, %bb.4
diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll
index bd3ae29e28e62f952cc5cc3003bed8b72e18c28c..ea1ef9737c0c050eb0e22ba68d5ba4ae6579a135 100644
--- a/test/CodeGen/WebAssembly/conv.ll
+++ b/test/CodeGen/WebAssembly/conv.ll
@@ -45,6 +45,17 @@ define i32 @i32_trunc_s_f32(float %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_s_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float)
+define i32 @i32_trunc_sat_s_f32(float %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f32(float %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i32_trunc_u_f32:
 ; CHECK-NEXT: .param f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
@@ -55,6 +66,17 @@ define i32 @i32_trunc_u_f32(float %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_u_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float)
+define i32 @i32_trunc_sat_u_f32(float %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f32(float %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i32_trunc_s_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
@@ -65,6 +87,17 @@ define i32 @i32_trunc_s_f64(double %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_s_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double)
+define i32 @i32_trunc_sat_s_f64(double %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.signed.i32.f64(double %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i32_trunc_u_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
@@ -75,6 +108,17 @@ define i32 @i32_trunc_u_f64(double %x) {
   ret i32 %a
 }
 
+; CHECK-LABEL: i32_trunc_sat_u_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i32{{$}}
+; CHECK-NEXT: i32.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double)
+define i32 @i32_trunc_sat_u_f64(double %x) {
+  %a = call i32 @llvm.wasm.trunc.saturate.unsigned.i32.f64(double %x)
+  ret i32 %a
+}
+
 ; CHECK-LABEL: i64_trunc_s_f32:
 ; CHECK-NEXT: .param f32{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -85,6 +129,17 @@ define i64 @i64_trunc_s_f32(float %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_s_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_s:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float)
+define i64 @i64_trunc_sat_s_f32(float %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f32(float %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: i64_trunc_u_f32:
 ; CHECK-NEXT: .param f32{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -95,6 +150,17 @@ define i64 @i64_trunc_u_f32(float %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_u_f32:
+; CHECK-NEXT: .param f32{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_u:sat/f32 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float)
+define i64 @i64_trunc_sat_u_f32(float %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f32(float %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: i64_trunc_s_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -105,6 +171,17 @@ define i64 @i64_trunc_s_f64(double %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_s_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_s:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double)
+define i64 @i64_trunc_sat_s_f64(double %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.signed.i64.f64(double %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: i64_trunc_u_f64:
 ; CHECK-NEXT: .param f64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
@@ -115,6 +192,17 @@ define i64 @i64_trunc_u_f64(double %x) {
   ret i64 %a
 }
 
+; CHECK-LABEL: i64_trunc_sat_u_f64:
+; CHECK-NEXT: .param f64{{$}}
+; CHECK-NEXT: .result i64{{$}}
+; CHECK-NEXT: i64.trunc_u:sat/f64 $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[NUM]]{{$}}
+declare i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double)
+define i64 @i64_trunc_sat_u_f64(double %x) {
+  %a = call i64 @llvm.wasm.trunc.saturate.unsigned.i64.f64(double %x)
+  ret i64 %a
+}
+
 ; CHECK-LABEL: f32_convert_s_i32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result f32{{$}}
diff --git a/test/CodeGen/WebAssembly/eh-lsda.ll b/test/CodeGen/WebAssembly/eh-lsda.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fd550938c422ca15bc8a317cd99b4cca2d972fa6
--- /dev/null
+++ b/test/CodeGen/WebAssembly/eh-lsda.ll
@@ -0,0 +1,239 @@
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+@_ZTIi = external constant i8*
+@_ZTIf = external constant i8*
+@_ZTId = external constant i8*
+
+; Single catch (...) does not need an exception table.
+;
+; try {
+;   may_throw();
+; } catch (...) {
+; }
+; CHECK-LABEL: test0:
+; CHECK-NOT: GCC_except_table
+define void @test0() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch.start
+  ret void
+}
+
+; Exception table generation + shared action test.
+;
+; try {
+;   may_throw();
+; } catch (int) {
+; } catch (float) {
+; } catch (double) {
+; } catch (...) {
+; }
+;
+; try {
+;   may_throw();
+; } catch (double) {
+; } catch (...) {
+; }
+;
+; try {
+;   may_throw();
+; } catch (int) {
+; } catch (float) {
+; }
+;
+; There are three landing pads. The second landing pad should share action table
+; entries with the first landing pad because they end with the same sequence
+; (double -> ...). But the third landing table cannot share action table entries
+; with others, so it should create its own entries.
+; CHECK-LABEL: test1:
+; CHECK: .section  .rodata.gcc_except_table,"",@
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT: GCC_except_table[[START:[0-9]+]]:
+; CHECK-NEXT: .Lexception0:
+; CHECK-NEXT:   .int8  255                     # @LPStart Encoding = omit
+; CHECK-NEXT:   .int8  0                       # @TType Encoding = absptr
+; CHECK-NEXT:   .uleb128 .Lttbase0-.Lttbaseref0
+; CHECK-NEXT: .Lttbaseref0:
+; CHECK-NEXT:   .int8  1                       # Call site Encoding = uleb128
+; CHECK-NEXT:   .uleb128 .Lcst_end0-.Lcst_begin0
+; CHECK-NEXT: .Lcst_begin0:
+; CHECK-NEXT:   .int8  0                       # >> Call Site 0 <<
+; CHECK-NEXT:                                  #   On exception at call site 0
+; CHECK-NEXT:   .int8  7                       #   Action: 4
+; CHECK-NEXT:   .int8  1                       # >> Call Site 1 <<
+; CHECK-NEXT:                                  #   On exception at call site 1
+; CHECK-NEXT:   .int8  3                       #   Action: 2
+; CHECK-NEXT:   .int8  2                       # >> Call Site 2 <<
+; CHECK-NEXT:                                  #   On exception at call site 2
+; CHECK-NEXT:   .int8  11                      #   Action: 6
+; CHECK-NEXT: .Lcst_end0:
+; CHECK-NEXT:   .int8  1                       # >> Action Record 1 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 1
+; CHECK-NEXT:   .int8  0                       #   No further actions
+; CHECK-NEXT:   .int8  2                       # >> Action Record 2 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 2
+; CHECK-NEXT:   .int8  125                     #   Continue to action 1
+; CHECK-NEXT:   .int8  3                       # >> Action Record 3 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 3
+; CHECK-NEXT:   .int8  125                     #   Continue to action 2
+; CHECK-NEXT:   .int8  4                       # >> Action Record 4 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 4
+; CHECK-NEXT:   .int8  125                     #   Continue to action 3
+; CHECK-NEXT:   .int8  3                       # >> Action Record 5 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 3
+; CHECK-NEXT:   .int8  0                       #   No further actions
+; CHECK-NEXT:   .int8  4                       # >> Action Record 6 <<
+; CHECK-NEXT:                                  #   Catch TypeInfo 4
+; CHECK-NEXT:   .int8  125                     #   Continue to action 5
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT:                                  # >> Catch TypeInfos <<
+; CHECK-NEXT:   .int32  _ZTIi                  # TypeInfo 4
+; CHECK-NEXT:   .int32  _ZTIf                  # TypeInfo 3
+; CHECK-NEXT:   .int32  _ZTId                  # TypeInfo 2
+; CHECK-NEXT:   .int32  0                      # TypeInfo 1
+; CHECK-NEXT: .Lttbase0:
+; CHECK-NEXT:   .p2align  2
+; CHECK-NEXT: .LGCC_except_table_end[[END:[0-9]+]]:
+; CHECK-NEXT:   .size  GCC_except_table[[START]], .LGCC_except_table_end[[END]]-GCC_except_table[[START]]
+define void @test1() personality i8* bitcast (i32 (...)* @__gxx_wasm_personality_v0 to i8*) {
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*), i8* bitcast (i8** @_ZTId to i8*), i8* null]
+  %2 = call i8* @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch10, label %catch.fallthrough
+
+catch10:                                          ; preds = %catch.start
+  %5 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  %6 = bitcast i8* %5 to i32*
+  %7 = load i32, i32* %6, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %entry, %catch, %catch4, %catch7, %catch10
+  invoke void @may_throw()
+          to label %try.cont23 unwind label %catch.dispatch14
+
+catch.dispatch14:                                 ; preds = %try.cont
+  %8 = catchswitch within none [label %catch.start15] unwind to caller
+
+catch.start15:                                    ; preds = %catch.dispatch14
+  %9 = catchpad within %8 [i8* bitcast (i8** @_ZTId to i8*), i8* null]
+  %10 = call i8* @llvm.wasm.get.exception(token %9)
+  %11 = call i32 @llvm.wasm.get.ehselector(token %9)
+  %12 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
+  %matches16 = icmp eq i32 %11, %12
+  %13 = call i8* @__cxa_begin_catch(i8* %10) [ "funclet"(token %9) ]
+  br i1 %matches16, label %catch20, label %catch17
+
+catch20:                                          ; preds = %catch.start15
+  %14 = bitcast i8* %13 to double*
+  %15 = load double, double* %14, align 8
+  call void @__cxa_end_catch() [ "funclet"(token %9) ]
+  catchret from %9 to label %try.cont23
+
+try.cont23:                                       ; preds = %try.cont, %catch17, %catch20
+  invoke void @may_throw()
+          to label %try.cont36 unwind label %catch.dispatch25
+
+catch.dispatch25:                                 ; preds = %try.cont23
+  %16 = catchswitch within none [label %catch.start26] unwind to caller
+
+catch.start26:                                    ; preds = %catch.dispatch25
+  %17 = catchpad within %16 [i8* bitcast (i8** @_ZTIi to i8*), i8* bitcast (i8** @_ZTIf to i8*)]
+  %18 = call i8* @llvm.wasm.get.exception(token %17)
+  %19 = call i32 @llvm.wasm.get.ehselector(token %17)
+  %20 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  %matches27 = icmp eq i32 %19, %20
+  br i1 %matches27, label %catch33, label %catch.fallthrough28
+
+catch33:                                          ; preds = %catch.start26
+  %21 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
+  %22 = bitcast i8* %21 to i32*
+  %23 = load i32, i32* %22, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %17) ]
+  catchret from %17 to label %try.cont36
+
+catch.fallthrough28:                              ; preds = %catch.start26
+  %24 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
+  %matches29 = icmp eq i32 %19, %24
+  br i1 %matches29, label %catch30, label %rethrow
+
+catch30:                                          ; preds = %catch.fallthrough28
+  %25 = call i8* @__cxa_begin_catch(i8* %18) [ "funclet"(token %17) ]
+  %26 = bitcast i8* %25 to float*
+  %27 = load float, float* %26, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %17) ]
+  catchret from %17 to label %try.cont36
+
+rethrow:                                          ; preds = %catch.fallthrough28
+  call void @__cxa_rethrow() [ "funclet"(token %17) ]
+  unreachable
+
+try.cont36:                                       ; preds = %try.cont23, %catch30, %catch33
+  ret void
+
+catch17:                                          ; preds = %catch.start15
+  call void @__cxa_end_catch() [ "funclet"(token %9) ]
+  catchret from %9 to label %try.cont23
+
+catch.fallthrough:                                ; preds = %catch.start
+  %28 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIf to i8*))
+  %matches1 = icmp eq i32 %3, %28
+  br i1 %matches1, label %catch7, label %catch.fallthrough2
+
+catch7:                                           ; preds = %catch.fallthrough
+  %29 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  %30 = bitcast i8* %29 to float*
+  %31 = load float, float* %30, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+catch.fallthrough2:                               ; preds = %catch.fallthrough
+  %32 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTId to i8*))
+  %matches3 = icmp eq i32 %3, %32
+  %33 = call i8* @__cxa_begin_catch(i8* %2) [ "funclet"(token %1) ]
+  br i1 %matches3, label %catch4, label %catch
+
+catch4:                                           ; preds = %catch.fallthrough2
+  %34 = bitcast i8* %33 to double*
+  %35 = load double, double* %34, align 8
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+catch:                                            ; preds = %catch.fallthrough2
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+}
+
+declare void @may_throw()
+declare i32 @llvm.eh.typeid.for(i8*)
+declare i8* @llvm.wasm.get.exception(token)
+declare i32 @llvm.wasm.get.ehselector(token)
+declare void @__cxa_rethrow()
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare i32 @__gxx_wasm_personality_v0(...)
diff --git a/test/CodeGen/WebAssembly/exception.ll b/test/CodeGen/WebAssembly/exception.ll
index 1714ad6dc4022cff25d993128992c85145a0e1cd..bd7935c3684c14c6e3c032edffc1e728dc591da5 100644
--- a/test/CodeGen/WebAssembly/exception.ll
+++ b/test/CodeGen/WebAssembly/exception.ll
@@ -1,5 +1,6 @@
 ; RUN: not llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers -exception-model=wasm -mattr=+exception-handling
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll
index 9314b2e6e5f231c4fac9a8c5d29618cb1810f1d8..27520d035c9ebfa8e8c858669783ed0d40530de1 100644
--- a/test/CodeGen/WebAssembly/f32.ll
+++ b/test/CodeGen/WebAssembly/f32.ll
@@ -123,12 +123,6 @@ define float @nearest32_via_rint(float %x) {
   ret float %a
 }
 
-; Min and max tests. LLVM currently only forms fminnan and fmaxnan nodes in
-; cases where there's a single fcmp with a select and it can prove that one
-; of the arms is never NaN, so we only test that case. In the future if LLVM
-; learns to form fminnan/fmaxnan in more cases, we can write more general
-; tests.
-
 ; CHECK-LABEL: fmin32:
 ; CHECK: f32.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -147,6 +141,24 @@ define float @fmax32(float %x) {
   ret float %b
 }
 
+; CHECK-LABEL: fmin32_intrinsic:
+; CHECK: f32.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare float @llvm.minimum.f32(float, float)
+define float @fmin32_intrinsic(float %x, float %y) {
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  ret float %a
+}
+
+; CHECK-LABEL: fmax32_intrinsic:
+; CHECK: f32.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare float @llvm.maximum.f32(float, float)
+define float @fmax32_intrinsic(float %x, float %y) {
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  ret float %a
+}
+
 ; CHECK-LABEL: fma32:
 ; CHECK: {{^}} f32.call $push[[LR:[0-9]+]]=, fmaf@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[LR]]{{$}}
diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll
index 5635e326561ecaeb6f873241ba65ca06d373a885..d02767fa3a15cfc6128bec58e7da1f80b926f626 100644
--- a/test/CodeGen/WebAssembly/f64.ll
+++ b/test/CodeGen/WebAssembly/f64.ll
@@ -123,12 +123,6 @@ define double @nearest64_via_rint(double %x) {
   ret double %a
 }
 
-; Min and max tests. LLVM currently only forms fminnan and fmaxnan nodes in
-; cases where there's a single fcmp with a select and it can prove that one
-; of the arms is never NaN, so we only test that case. In the future if LLVM
-; learns to form fminnan/fmaxnan in more cases, we can write more general
-; tests.
-
 ; CHECK-LABEL: fmin64:
 ; CHECK: f64.min $push1=, $pop{{[0-9]+}}, $pop[[LR]]{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
@@ -147,6 +141,24 @@ define double @fmax64(double %x) {
   ret double %b
 }
 
+; CHECK-LABEL: fmin64_intrinsic:
+; CHECK: f64.min $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare double @llvm.minimum.f64(double, double)
+define double @fmin64_intrinsic(double %x, double %y) {
+  %a = call double @llvm.minimum.f64(double %x, double %y)
+  ret double %a
+}
+
+; CHECK-LABEL: fmax64_intrinsic:
+; CHECK: f64.max $push0=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop0{{$}}
+declare double @llvm.maximum.f64(double, double)
+define double @fmax64_intrinsic(double %x, double %y) {
+  %a = call double @llvm.maximum.f64(double %x, double %y)
+  ret double %a
+}
+
 ; CHECK-LABEL: fma64:
 ; CHECK: {{^}} f64.call $push[[LR:[0-9]+]]=, fma@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[LR]]{{$}}
diff --git a/test/CodeGen/WebAssembly/implicit-def.ll b/test/CodeGen/WebAssembly/implicit-def.ll
index 16b4031c96b05ed945fa775a3bd5e796078daaec..8f7dcc8cee3e4ac8f4212aab8f197689c1f89ce3 100644
--- a/test/CodeGen/WebAssembly/implicit-def.ll
+++ b/test/CodeGen/WebAssembly/implicit-def.ll
@@ -1,50 +1,133 @@
-; RUN: llc -o - %s -asm-verbose=false -wasm-keep-registers | FileCheck %s
+; RUN: llc -o - %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -mattr=+simd128 | FileCheck %s
+
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
 target triple = "wasm32-unknown-unknown"
 
 ; Test that stackified IMPLICIT_DEF instructions are converted into
-; CONST_I32 to provide an explicit push.
-
-; CHECK:      br_if 2,
-; CHECK:      i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; CHECK-NEXT: return $pop[[L0]]{{$}}
-define i1 @f() {
-  %a = xor i1 0, 0
-  switch i1 %a, label %C [
-    i1 0, label %A
-    i1 1, label %B
-  ]
-
-A:
-  %b = xor i1 0, 0
+; CONST_XXX instructions to provide an explicit push.
+
+; CHECK-LABEL: implicit_def_i32:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define i32 @implicit_def_i32() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
+
+C:                                                ; preds = %B, %A
+  %h = phi i32 [ undef, %A ], [ 0, %B ]
+  br label %X
+
+X:                                                ; preds = %0, C
+  %i = phi i32 [ 1, %0 ], [ %h, %C ]
+  ret i32 %i
+}
+
+; CHECK-LABEL: implicit_def_i64:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: i64.const $push[[R:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define i64 @implicit_def_i64() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
+
+C:                                                ; preds = %B, %A
+  %h = phi i64 [ undef, %A ], [ 0, %B ]
   br label %X
 
-B:
-  %c = xor i1 0, 0
-  br i1 %c, label %D, label %X
+X:                                                ; preds = %0, C
+  %i = phi i64 [ 1, %0 ], [ %h, %C ]
+  ret i64 %i
+}
+
+; CHECK-LABEL: implicit_def_f32:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: f32.const $push[[R:[0-9]+]]=, 0x0p0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define float @implicit_def_f32() {
+  br i1 undef, label %A, label %X
 
-C:
-  %d = icmp slt i32 0, 0
-  br i1 %d, label %G, label %F
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
 
-D:
-  %e = xor i1 0, 0
-  br i1 %e, label %E, label %X
+B:                                                ; preds = %A
+  br label %C
 
-E:
-  %f = xor i1 0, 0
+C:                                                ; preds = %B, %A
+  %h = phi float [ undef, %A ], [ 0.0, %B ]
   br label %X
 
-F:
-  %g = xor i1 0, 0
-  br label %G
+X:                                                ; preds = %0, C
+  %i = phi float [ 1.0, %0 ], [ %h, %C ]
+  ret float %i
+}
+
+; CHECK-LABEL: implicit_def_f64:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: f64.const $push[[R:[0-9]+]]=, 0x0p0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define double @implicit_def_f64() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
 
-G:
-  %h = phi i1 [ undef, %C ], [ false, %F ]
+C:                                                ; preds = %B, %A
+  %h = phi double [ undef, %A ], [ 0.0, %B ]
   br label %X
 
-X:
-  %i = phi i1 [ true, %A ], [ true, %B ], [ true, %D ], [ true, %E ], [ %h, %G ]
-  ret i1 %i
+X:                                                ; preds = %0, C
+  %i = phi double [ 1.0, %0 ], [ %h, %C ]
+  ret double %i
 }
 
+; CHECK-LABEL: implicit_def_v4i32:
+; CHECK: .LBB{{[0-9]+}}_4:{{$}}
+; CHECK-NEXT: end_block{{$}}
+; CHECK-NEXT: v128.const $push[[R:[0-9]+]]=, 0, 0, 0, 0, 0, 0, 0, 0,
+; CHECK-SAME:                                0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: end_function{{$}}
+define <4 x i32> @implicit_def_v4i32() {
+  br i1 undef, label %A, label %X
+
+A:                                                ; preds = %0
+  %d = icmp slt i1 0, 0
+  br i1 %d, label %C, label %B
+
+B:                                                ; preds = %A
+  br label %C
+
+C:                                                ; preds = %B, %A
+  %h = phi <4 x i32> [ undef, %A ], [ <i32 0, i32 0, i32 0, i32 0>, %B ]
+  br label %X
+
+X:                                                ; preds = %0, C
+  %i = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %0 ], [ %h, %C ]
+  ret <4 x i32> %i
+}
diff --git a/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll b/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7fcc3cf276a1f6e9884def468a08275dbff374a3
--- /dev/null
+++ b/test/CodeGen/WebAssembly/inline-asm-roundtrip.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s | llvm-mc -triple=wasm32-unknown-unknown | FileCheck --match-full-lines %s
+
+; Test basic inline assembly can actually be assembled by the assembler.
+
+; .ll code below is the result of this code run thru
+; clang -target wasm32-unknown-unknown-wasm -O2 -S -emit-llvm test.c
+
+; int main(int argc, const char *argv[]) {
+;   int src = 1;
+;   int dst;
+;   asm ("i32.const\t2\n"
+;        "\tget_local\t%1\n"
+;        "\ti32.add\n"
+;        "\tset_local\t%0"
+;        : "=r" (dst)
+;        : "r" (src));
+;   return dst != 3;
+; }
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: main:
+; CHECK-NEXT:	.param  	i32, i32
+; CHECK-NEXT:	.local  	i32
+; CHECK-NEXT:	i32.const	1
+; CHECK-NEXT:	set_local	[[SRC:[0-9]+]]
+; CHECK-NEXT:	i32.const	2
+; CHECK-NEXT:	get_local	[[SRC]]
+; CHECK-NEXT:	i32.add
+; CHECK-NEXT:	set_local	[[DST:[0-9]+]]
+; CHECK-NEXT:	get_local	[[DST]]
+; CHECK-NEXT:	i32.const	3
+; CHECK-NEXT:	i32.ne
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
+entry:
+  %0 = tail call i32 asm "i32.const\092\0A\09get_local\09$1\0A\09i32.add\0A\09set_local\09$0", "=r,r"(i32 1) #1
+  %cmp = icmp ne i32 %0, 3
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
diff --git a/test/CodeGen/WebAssembly/regcopy.mir b/test/CodeGen/WebAssembly/regcopy.mir
new file mode 100644
index 0000000000000000000000000000000000000000..5115cde6d240b62cf1c677a913f93844b26ba67e
--- /dev/null
+++ b/test/CodeGen/WebAssembly/regcopy.mir
@@ -0,0 +1,80 @@
+# RUN: llc %s -o - -run-pass=postrapseudos | FileCheck %s
+--- |
+  target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+  target triple = "wasm32-unknown-unknown"
+
+  define void @copy_i32() {
+    ret void
+  }
+
+  define void @copy_i64() {
+    ret void
+  }
+
+  define void @copy_f32() {
+    ret void
+  }
+
+  define void @copy_f64() {
+    ret void
+  }
+
+  define void @copy_v128() {
+    ret void
+  }
+...
+---
+name: copy_i32
+# CHECK-LABEL: copy_i32
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:i32 = COPY_I32 %1:i32
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:i32 = COPY %1:i32
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_i64
+# CHECK-LABEL: copy_i64
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:i64 = COPY_I64 %1:i64
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:i64 = COPY %1:i64
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_f32
+# CHECK-LABEL: copy_f32
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:f32 = COPY_F32 %1:f32
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:f32 = COPY %1:f32
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_f64
+# CHECK-LABEL: copy_f64
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:f64 = COPY_F64 %1:f64
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:f64 = COPY %1:f64
+    RETURN_VOID implicit-def $arguments
+...
+---
+name: copy_v128
+# CHECK-LABEL: copy_v128
+body:             |
+  ; CHECK-LABEL: bb.0:
+  ; CHECK-NEXT: %0:v128 = COPY_V128 %1:v128
+  ; CHECK-NEXT: RETURN_VOID
+  bb.0:
+    %0:v128 = COPY %1:v128
+    RETURN_VOID implicit-def $arguments
+...
diff --git a/test/CodeGen/WebAssembly/select.ll b/test/CodeGen/WebAssembly/select.ll
index 6f6e95f84188561d92adf8bb993579e1fc21ac85..99b8d45d8e2e31de288f7751f4d65a7d000d254e 100644
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,SLOW
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s
 
 ; Test that wasm select instruction is selected from LLVM select instruction.
@@ -16,6 +16,16 @@ define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) {
   ret i32 %cond
 }
 
+; CHECK-LABEL: select_i32_bool_nozext:
+; CHECK-NEXT: .param     i32, i32, i32{{$}}
+; CHECK-NEXT: .result    i32{{$}}
+; SLOW-NEXT: i32.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) {
+  %cond = select i1 %a, i32 %b, i32 %c
+  ret i32 %cond
+}
+
 ; CHECK-LABEL: select_i32_eq:
 ; CHECK-NEXT: .param     i32, i32, i32{{$}}
 ; CHECK-NEXT: .result    i32{{$}}
@@ -48,6 +58,16 @@ define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) {
   ret i64 %cond
 }
 
+; CHECK-LABEL: select_i64_bool_nozext:
+; CHECK-NEXT: .param     i32, i64, i64{{$}}
+; CHECK-NEXT: .result    i64{{$}}
+; SLOW-NEXT: i64.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) {
+  %cond = select i1 %a, i64 %b, i64 %c
+  ret i64 %cond
+}
+
 ; CHECK-LABEL: select_i64_eq:
 ; CHECK-NEXT: .param     i32, i64, i64{{$}}
 ; CHECK-NEXT: .result    i64{{$}}
@@ -80,6 +100,16 @@ define float @select_f32_bool(i1 zeroext %a, float %b, float %c) {
   ret float %cond
 }
 
+; CHECK-LABEL: select_f32_bool_nozext:
+; CHECK-NEXT: .param     i32, f32, f32{{$}}
+; CHECK-NEXT: .result    f32{{$}}
+; SLOW-NEXT: f32.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define float @select_f32_bool_nozext(i1 %a, float %b, float %c) {
+  %cond = select i1 %a, float %b, float %c
+  ret float %cond
+}
+
 ; CHECK-LABEL: select_f32_eq:
 ; CHECK-NEXT: .param     i32, f32, f32{{$}}
 ; CHECK-NEXT: .result    f32{{$}}
@@ -112,6 +142,16 @@ define double @select_f64_bool(i1 zeroext %a, double %b, double %c) {
   ret double %cond
 }
 
+; CHECK-LABEL: select_f64_bool_nozext:
+; CHECK-NEXT: .param     i32, f64, f64{{$}}
+; CHECK-NEXT: .result    f64{{$}}
+; SLOW-NEXT: f64.select $push0=, $1, $2, $0{{$}}
+; SLOW-NEXT: return     $pop0{{$}}
+define double @select_f64_bool_nozext(i1 %a, double %b, double %c) {
+  %cond = select i1 %a, double %b, double %c
+  ret double %cond
+}
+
 ; CHECK-LABEL: select_f64_eq:
 ; CHECK-NEXT: .param     i32, f64, f64{{$}}
 ; CHECK-NEXT: .result    f64{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index f3e70156d8bbc1af4e8b8b0045ed43b55926e801..e092cd98ecb7575bb98c4e9c8b4991343b18a145 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -92,6 +92,25 @@ define <16 x i8> @shl_const_v16i8(<16 x i8> %v) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shl_vec_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 14 lanes
+; SIMD128:      i8x16.extract_lane_u $push[[L4:[0-9]+]]=, $0, 15{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
+; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 15, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shl_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+  %a = shl <16 x i8> %v, %x
+  ret <16 x i8> %a
+}
+
 ; CHECK-LABEL: shr_s_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -107,6 +126,33 @@ define <16 x i8> @shr_s_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L3:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i8x16.splat $push[[L7:[0-9]+]]=, $pop[[L6]]{{$}}
+; Skip 14 lanes
+; SIMD128:      i8x16.extract_lane_u $push[[L7:[0-9]+]]=, $0, 15{{$}}
+; SIMD128-NEXT: i32.const $push[[L8:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shl $push[[L9:[0-9]+]]=, $pop[[L7]], $pop[[L8]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L10:[0-9]+]]=, 24{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L11:[0-9]+]]=, $pop[[L9]], $pop[[L10]]{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L12:[0-9]+]]=, $1, 15{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L13:[0-9]+]]=, $pop[[L11]], $pop[[L12]]{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L14:[0-9]+]], 15, $pop[[L13]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shr_s_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+  %a = ashr <16 x i8> %v, %x
+  ret <16 x i8> %a
+}
+
 ; CHECK-LABEL: shr_u_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -122,6 +168,25 @@ define <16 x i8> @shr_u_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 14 lanes
+; SIMD128:      i8x16.extract_lane_u $push[[L4:[0-9]+]]=, $0, 15{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[L5:[0-9]+]]=, $1, 15{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 15, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shr_u_vec_v16i8(<16 x i8> %v, <16 x i8> %x) {
+  %a = lshr <16 x i8> %v, %x
+  ret <16 x i8> %a
+}
+
 ; CHECK-LABEL: and_v16i8:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -265,6 +330,25 @@ define <8 x i16> @shl_const_v8i16(<8 x i16> %v) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: shl_vec_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 6 lanes
+; SIMD128:      i16x8.extract_lane_u $push[[L4:[0-9]+]]=, $0, 7{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
+; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 7, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shl_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+  %a = shl <8 x i16> %v, %x
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: shr_s_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -279,6 +363,33 @@ define <8 x i16> @shr_s_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L3:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i16x8.splat $push[[L7:[0-9]+]]=, $pop[[L6]]{{$}}
+; Skip 6 lanes
+; SIMD128:      i16x8.extract_lane_u $push[[L7:[0-9]+]]=, $0, 7{{$}}
+; SIMD128-NEXT: i32.const $push[[L8:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shl $push[[L9:[0-9]+]]=, $pop[[L7]], $pop[[L8]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L10:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L11:[0-9]+]]=, $pop[[L9]], $pop[[L10]]{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L12:[0-9]+]]=, $1, 7{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L13:[0-9]+]]=, $pop[[L11]], $pop[[L12]]{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L14:[0-9]+]], 7, $pop[[L13]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shr_s_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+  %a = ashr <8 x i16> %v, %x
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: shr_u_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -293,6 +404,25 @@ define <8 x i16> @shr_u_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 6 lanes
+; SIMD128:      i16x8.extract_lane_u $push[[L4:[0-9]+]]=, $0, 7{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[L5:[0-9]+]]=, $1, 7{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 7, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shr_u_vec_v8i16(<8 x i16> %v, <8 x i16> %x) {
+  %a = lshr <8 x i16> %v, %x
+  ret <8 x i16> %a
+}
+
 ; CHECK-LABEL: and_v8i16:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -432,6 +562,25 @@ define <4 x i32> @shl_const_v4i32(<4 x i32> %v) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: shl_vec_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 2 lanes
+; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shl_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+  %a = shl <4 x i32> %v, %x
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: shr_s_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -446,6 +595,25 @@ define <4 x i32> @shr_s_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: shr_s_vec_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 2 lanes
+; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
+; SIMD128-NEXT: i32.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shr_s_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+  %a = ashr <4 x i32> %v, %x
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: shr_u_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -460,6 +628,25 @@ define <4 x i32> @shr_u_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: shr_u_vec_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; Skip 2 lanes
+; SIMD128:      i32x4.extract_lane $push[[L4:[0-9]+]]=, $0, 3{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[L5:[0-9]+]]=, $1, 3{{$}}
+; SIMD128-NEXT: i32.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $pop[[L7:[0-9]+]], 3, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shr_u_vec_v4i32(<4 x i32> %v, <4 x i32> %x) {
+  %a = lshr <4 x i32> %v, %x
+  ret <4 x i32> %a
+}
+
 ; CHECK-LABEL: and_v4i32:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -605,15 +792,32 @@ define <2 x i64> @shl_nozext_v2i64(<2 x i64> %v, i64 %x) {
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: i64.const $push[[L0:[0-9]+]]=, 5{{$}}
-; SIMD128-NEXT: i32.wrap/i64 $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
-; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
+; SIMD128-NEXT: i64x2.shl $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @shl_const_v2i64(<2 x i64> %v) {
   %a = shl <2 x i64> %v, <i64 5, i64 5>
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shl_vec_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i64.shl $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
+; SIMD128-NEXT: i64.shl $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shl_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+  %a = shl <2 x i64> %v, %x
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: shr_s_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -642,6 +846,36 @@ define <2 x i64> @shr_s_nozext_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shr_s_const_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
+; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_s_const_v2i64(<2 x i64> %v) {
+  %a = ashr <2 x i64> %v, <i64 5, i64 5>
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: shr_s_vec_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i64.shr_s $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
+; SIMD128-NEXT: i64.shr_s $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_s_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+  %a = ashr <2 x i64> %v, %x
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: shr_u_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -670,6 +904,36 @@ define <2 x i64> @shr_u_nozext_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: shr_u_const_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 5{{$}}
+; SIMD128-NEXT: i64x2.shr_u $push[[R:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_u_const_v2i64(<2 x i64> %v) {
+  %a = lshr <2 x i64> %v, <i64 5, i64 5>
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: shr_u_vec_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0{{$}}
+; SIMD128-NEXT: i64.shr_u $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L4:[0-9]+]]=, $0, 1{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[L5:[0-9]+]]=, $1, 1{{$}}
+; SIMD128-NEXT: i64.shr_u $push[[L6:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L3]], 1, $pop[[L6]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shr_u_vec_v2i64(<2 x i64> %v, <2 x i64> %x) {
+  %a = lshr <2 x i64> %v, %x
+  ret <2 x i64> %a
+}
+
 ; CHECK-LABEL: and_v2i64:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
@@ -765,6 +1029,118 @@ define <4 x float> @abs_v4f32(<4 x float> %x) {
   ret <4 x float> %a
 }
 
+; CHECK-LABEL: min_unordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @min_unordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp ule <4 x float> %x, <float 5., float 5., float 5., float 5.>
+  %a = select <4 x i1> %cmps, <4 x float> %x,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_unordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @max_unordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp uge <4 x float> %x, <float 5., float 5., float 5., float 5.>
+  %a = select <4 x i1> %cmps, <4 x float> %x,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: min_ordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @min_ordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp ole <4 x float> <float 5., float 5., float 5., float 5.>, %x
+  %a = select <4 x i1> %cmps,
+    <4 x float> <float 5., float 5., float 5., float 5.>, <4 x float> %x
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_ordered_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f32x4.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @max_ordered_v4f32(<4 x float> %x) {
+  %cmps = fcmp oge <4 x float> <float 5., float 5., float 5., float 5.>, %x
+  %a = select <4 x i1> %cmps,
+    <4 x float> <float 5., float 5., float 5., float 5.>, <4 x float> %x
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: min_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.min $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+define <4 x float> @min_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.max $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+define <4 x float> @max_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) {
+  %a = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: min_const_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}}
+; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @min_const_intrinsic_v4f32() {
+  %a = call <4 x float> @llvm.minimum.v4f32(
+    <4 x float> <float 42., float 42., float 42., float 42.>,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  )
+  ret <4 x float> %a
+}
+
+; CHECK-LABEL: max_const_intrinsic_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}}
+; SIMD128-NEXT: f32x4.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @max_const_intrinsic_v4f32() {
+  %a = call <4 x float> @llvm.maximum.v4f32(
+    <4 x float> <float 42., float 42., float 42., float 42.>,
+    <4 x float> <float 5., float 5., float 5., float 5.>
+  )
+  ret <4 x float> %a
+}
+
 ; CHECK-LABEL: add_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -848,6 +1224,118 @@ define <2 x double> @abs_v2f64(<2 x double> %x) {
   ret <2 x double> %a
 }
 
+; CHECK-LABEL: min_unordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @min_unordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp ule <2 x double> %x, <double 5., double 5.>
+  %a = select <2 x i1> %cmps, <2 x double> %x,
+    <2 x double> <double 5., double 5.>
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_unordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @max_unordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp uge <2 x double> %x, <double 5., double 5.>
+  %a = select <2 x i1> %cmps, <2 x double> %x,
+    <2 x double> <double 5., double 5.>
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: min_ordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @min_ordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp ole <2 x double> <double 5., double 5.>, %x
+  %a = select <2 x i1> %cmps, <2 x double> <double 5., double 5.>,
+    <2 x double> %x
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_ordered_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L0:[0-9]+]]=, 0x1.4p2
+; SIMD128-NEXT: f64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $pop[[L1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @max_ordered_v2f64(<2 x double> %x) {
+  %cmps = fcmp oge <2 x double> <double 5., double 5.>, %x
+  %a = select <2 x i1> %cmps, <2 x double> <double 5., double 5.>,
+    <2 x double> %x
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: min_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.min $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+define <2 x double> @min_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
+  %a = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.max $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+define <2 x double> @max_intrinsic_v2f64(<2 x double> %x, <2 x double> %y) {
+  %a = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: min_const_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.4p2{{$}}
+; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @min_const_intrinsic_v2f64() {
+  %a = call <2 x double> @llvm.minimum.v2f64(
+    <2 x double> <double 42., double 42.>,
+    <2 x double> <double 5., double 5.>
+  )
+  ret <2 x double> %a
+}
+
+; CHECK-LABEL: max_const_intrinsic_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64.const $push[[L:[0-9]+]]=, 0x1.5p5{{$}}
+; SIMD128-NEXT: f64x2.splat $push[[R:[0-9]+]]=, $pop[[L]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @max_const_intrinsic_v2f64() {
+  %a = call <2 x double> @llvm.maximum.v2f64(
+    <2 x double> <double 42., double 42.>,
+    <2 x double> <double 5., double 5.>
+  )
+  ret <2 x double> %a
+}
+
 ; CHECK-LABEL: add_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f62x2
diff --git a/test/CodeGen/WebAssembly/simd-comparisons.ll b/test/CodeGen/WebAssembly/simd-comparisons.ll
index 790bbb7064627dbf2e5ae958b19284f6719fa472..5f0a1e93b45bf04de735eb3a61be8a8bfb5572a6 100644
--- a/test/CodeGen/WebAssembly/simd-comparisons.ll
+++ b/test/CodeGen/WebAssembly/simd-comparisons.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext --show-mc-encoding | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128,+sign-ext --show-mc-encoding | FileCheck %s --check-prefixes CHECK,SIMD128-VM
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=-simd128,+sign-ext --show-mc-encoding | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128-VM
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=-simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,NO-SIMD128
 
 ; Test SIMD comparison operators
 
@@ -18,6 +18,18 @@ define <16 x i1> @compare_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_eq_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_eq_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp eq <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ne_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -29,6 +41,18 @@ define <16 x i1> @compare_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ne_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ne_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ne <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_slt_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -40,6 +64,18 @@ define <16 x i1> @compare_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_slt_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_slt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp slt <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ult_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -51,6 +87,18 @@ define <16 x i1> @compare_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ult_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ult <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_sle_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -62,6 +110,18 @@ define <16 x i1> @compare_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sle_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_sle_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp sle <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ule_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -73,6 +133,18 @@ define <16 x i1> @compare_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ule_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ule <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_sgt_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -84,6 +156,18 @@ define <16 x i1> @compare_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sgt_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_sgt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp sgt <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -95,6 +179,18 @@ define <16 x i1> @compare_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_ugt_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp ugt <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_sge_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -106,6 +202,18 @@ define <16 x i1> @compare_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sge_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_sge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp sge <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_uge_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -117,6 +225,18 @@ define <16 x i1> @compare_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @compare_sext_uge_v16i8 (<16 x i8> %x, <16 x i8> %y) {
+  %cmp = icmp uge <16 x i8> %x, %y
+  %res = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: compare_eq_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -128,6 +248,18 @@ define <8 x i1> @compare_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_eq_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_eq_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp eq <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ne_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -139,6 +271,18 @@ define <8 x i1> @compare_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ne_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ne_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ne <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_slt_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -150,6 +294,18 @@ define <8 x i1> @compare_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_slt_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_slt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp slt <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ult_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -161,6 +317,18 @@ define <8 x i1> @compare_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ult_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ult <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_sle_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -172,6 +340,18 @@ define <8 x i1> @compare_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sle_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_sle_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp sle <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ule_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -183,6 +363,18 @@ define <8 x i1> @compare_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ule_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ule <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_sgt_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -194,6 +386,18 @@ define <8 x i1> @compare_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sgt_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_sgt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp sgt <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -205,6 +409,18 @@ define <8 x i1> @compare_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_ugt_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp ugt <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_sge_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -216,6 +432,18 @@ define <8 x i1> @compare_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sge_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_sge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp sge <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_uge_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -227,6 +455,18 @@ define <8 x i1> @compare_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @compare_sext_uge_v8i16 (<8 x i16> %x, <8 x i16> %y) {
+  %cmp = icmp uge <8 x i16> %x, %y
+  %res = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: compare_eq_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -238,6 +478,18 @@ define <4 x i1> @compare_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_eq_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_eq_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp eq <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ne_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -249,6 +501,18 @@ define <4 x i1> @compare_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ne_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ne_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ne <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_slt_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -260,6 +524,18 @@ define <4 x i1> @compare_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_slt_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.lt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_slt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp slt <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ult_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -271,6 +547,18 @@ define <4 x i1> @compare_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.lt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ult_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ult <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_sle_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -282,6 +570,18 @@ define <4 x i1> @compare_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sle_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.le_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_sle_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp sle <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ule_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -293,6 +593,18 @@ define <4 x i1> @compare_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.le_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ule_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ule <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_sgt_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -304,6 +616,18 @@ define <4 x i1> @compare_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sgt_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.gt_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_sgt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp sgt <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -315,6 +639,18 @@ define <4 x i1> @compare_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.gt_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ugt_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp ugt <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_sge_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -326,6 +662,18 @@ define <4 x i1> @compare_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_sge_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.ge_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_sge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp sge <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_uge_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -337,6 +685,18 @@ define <4 x i1> @compare_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.ge_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_uge_v4i32 (<4 x i32> %x, <4 x i32> %y) {
+  %cmp = icmp uge <4 x i32> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_oeq_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -348,6 +708,18 @@ define <4 x i1> @compare_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oeq_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_oeq_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp oeq <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ogt_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -359,6 +731,18 @@ define <4 x i1> @compare_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ogt_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ogt_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ogt <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_oge_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -370,6 +754,18 @@ define <4 x i1> @compare_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oge_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_oge_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp oge <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_olt_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -381,6 +777,18 @@ define <4 x i1> @compare_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_olt_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_olt_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp olt <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ole_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -392,97 +800,257 @@ define <4 x i1> @compare_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ole_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.le $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ole_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ole <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_one_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ne
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_one_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp one <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_one_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_one_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp one <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ord_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.eq
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
 define <4 x i1> @compare_ord_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ord <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ord_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ord_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ord <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ueq_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.eq
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ueq_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ueq <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ueq_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]
+define <4 x i32> @compare_sext_ueq_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ueq <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.le
+; SIMD128-NEXT: f32x4.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ugt_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ugt <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ugt_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ugt <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_uge_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.lt
+; SIMD128-NEXT: f32x4.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_uge_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp uge <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_uge_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp uge <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ult_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ge
+; SIMD128-NEXT: f32x4.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ult_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ult <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ult_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ult <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_ule_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.gt
+; SIMD128-NEXT: f32x4.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_ule_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp ule <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_ule_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp ule <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_une_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_une_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp une <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_une_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_une_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp une <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_uno_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f32x4.ne
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i1> @compare_uno_v4f32 (<4 x float> %x, <4 x float> %y) {
   %res = fcmp uno <4 x float> %x, %y
   ret <4 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uno_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f32x4.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @compare_sext_uno_v4f32 (<4 x float> %x, <4 x float> %y) {
+  %cmp = fcmp uno <4 x float> %x, %y
+  %res = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: compare_oeq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -495,6 +1063,19 @@ define <2 x i1> @compare_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oeq_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_oeq_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp oeq <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ogt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -507,6 +1088,19 @@ define <2 x i1> @compare_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ogt_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.gt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ogt_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ogt <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_oge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -519,6 +1113,19 @@ define <2 x i1> @compare_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_oge_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ge $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_oge_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp oge <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_olt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -531,6 +1138,19 @@ define <2 x i1> @compare_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_olt_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.lt $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_olt_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp olt <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ole_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -543,83 +1163,222 @@ define <2 x i1> @compare_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ole_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.le $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ole_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ole <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_one_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.ne
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_one_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp one <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_one_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_one_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp one <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ord_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.eq
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
 define <2 x i1> @compare_ord_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ord <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ord_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.and $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return   $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ord_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ord <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ueq_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.eq
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ueq_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ueq <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ueq_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.eq $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T2:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[T3:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T3]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ueq_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ueq <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ugt_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.le
+; SIMD128-NEXT: f64x2.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ugt_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ugt <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ugt_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.le $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ugt_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ugt <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_uge_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.lt
+; SIMD128-NEXT: f64x2.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_uge_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp uge <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_uge_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.lt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_uge_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp uge <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ult_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.ge
+; SIMD128-NEXT: f64x2.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ult_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ult <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ult_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ge $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ult_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ult <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_ule_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.gt
+; SIMD128-NEXT: f64x2.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_ule_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp ule <2 x double> %x, %y
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_ule_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.gt $push[[T0:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: v128.not $push[[R:[0-9]+]]=, $pop[[T0]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_ule_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp ule <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_une_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -632,13 +1391,44 @@ define <2 x i1> @compare_une_v2f64 (<2 x double> %x, <2 x double> %y) {
   ret <2 x i1> %res
 }
 
+; CHECK-LABEL: compare_sext_une_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[R:[0-9]+]]=, $0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_une_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp une <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: compare_uno_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
 ; SIMD128-NEXT: .param v128, v128{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: f64x2.ne
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i1> @compare_uno_v2f64 (<2 x double> %x, <2 x double> %y) {
   %res = fcmp uno <2 x double> %x, %y
   ret <2 x i1> %res
 }
+
+; CHECK-LABEL: compare_sext_uno_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T0:[0-9]+]]=, $0, $0{{$}}
+; SIMD128-NEXT: f64x2.ne $push[[T1:[0-9]+]]=, $1, $1{{$}}
+; SIMD128-NEXT: v128.or $push[[R:[0-9]+]]=, $pop[[T0]], $pop[[T1]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @compare_sext_uno_v2f64 (<2 x double> %x, <2 x double> %y) {
+  %cmp = fcmp uno <2 x double> %x, %y
+  %res = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %res
+}
diff --git a/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll b/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f128483cb9add23c16e95bc87cf9a25d3135af27
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-ext-load-trunc-store.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128 | FileCheck %s
+
+; Check that store in memory with smaller lanes are loaded and stored
+; as expected. This is a regression test for part of bug 39275.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: load_ext_2xi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L2:[0-9]+]]=, 4($0){{$}}
+; CHECK-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L1]], 1, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i32> @load_ext_2xi32(<2 x i32>* %p) {
+  %1 = load <2 x i32>, <2 x i32>* %p, align 4
+  ret <2 x i32> %1
+}
+
+; CHECK-LABEL: load_zext_2xi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
+; CHECK-NEXT: i64.load32_u $push[[L2:[0-9]+]]=, 4($0){{$}}
+; CHECK-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L1]], 1, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_zext_2xi32(<2 x i32>* %p) {
+  %1 = load <2 x i32>, <2 x i32>* %p, align 4
+  %2 = zext <2 x i32> %1 to <2 x i64>
+  ret <2 x i64> %2
+}
+
+; CHECK-LABEL: load_sext_2xi32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i64.load32_s $push[[L0:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: i64x2.splat $push[[L1:[0-9]+]]=, $pop[[L0]]{{$}}
+; CHECK-NEXT: i64.load32_s $push[[L2:[0-9]+]]=, 4($0){{$}}
+; CHECK-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $pop[[L1]], 1, $pop[[L2]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_sext_2xi32(<2 x i32>* %p) {
+  %1 = load <2 x i32>, <2 x i32>* %p, align 4
+  %2 = sext <2 x i32> %1 to <2 x i64>
+  ret <2 x i64> %2
+}
+
+; CHECK-LABEL: store_trunc_2xi32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: i64x2.extract_lane $push[[L0:[0-9]+]]=, $1, 1
+; CHECK-NEXT: i64.store32 4($0), $pop[[L0]]
+; CHECK-NEXT: i64x2.extract_lane $push[[L1:[0-9]+]]=, $1, 0
+; CHECK-NEXT: i64.store32 0($0), $pop[[L1]]
+; CHECK-NEXT: return
+define void @store_trunc_2xi32(<2 x i32>* %p, <2 x i32> %x) {
+  store <2 x i32> %x, <2 x i32>* %p, align 4
+  ret void
+}
diff --git a/test/CodeGen/WebAssembly/simd-intrinsics.ll b/test/CodeGen/WebAssembly/simd-intrinsics.ll
index f9f4eb0cf9e8a716bfdfed4809f7d5d7d256e9ac..1cf990d11d45554394b2c127f05cd8dfe96c17ab 100644
--- a/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -16,11 +16,9 @@ target triple = "wasm32-unknown-unknown"
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i8x16.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <16 x i8> @llvm.wasm.add.saturate.signed.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
-  %a = call <16 x i8> @llvm.wasm.add.saturate.signed.v16i8(
-    <16 x i8> %x, <16 x i8> %y
-  )
+  %a = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %a
 }
 
@@ -29,11 +27,9 @@ define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i8x16.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <16 x i8> @llvm.wasm.add.saturate.unsigned.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
-  %a = call <16 x i8> @llvm.wasm.add.saturate.unsigned.v16i8(
-    <16 x i8> %x, <16 x i8> %y
-  )
+  %a = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
   ret <16 x i8> %a
 }
 
@@ -106,11 +102,9 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i16x8.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.add.saturate.signed.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
-  %a = call <8 x i16> @llvm.wasm.add.saturate.signed.v8i16(
-    <8 x i16> %x, <8 x i16> %y
-  )
+  %a = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
 }
 
@@ -119,11 +113,9 @@ define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i16x8.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
-declare <8 x i16> @llvm.wasm.add.saturate.unsigned.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
-  %a = call <8 x i16> @llvm.wasm.add.saturate.unsigned.v8i16(
-    <8 x i16> %x, <8 x i16> %y
-  )
+  %a = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
 }
 
@@ -226,6 +218,30 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
   ret <4 x i32> %a
 }
 
+; CHECK-LABEL: trunc_sat_s_v4i32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.trunc_sat_s/f32x4 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float>)
+define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
+  %a = call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %x)
+  ret <4 x i32> %a
+}
+
+; CHECK-LABEL: trunc_sat_u_v4i32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.trunc_sat_u/f32x4 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float>)
+define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
+  %a = call <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float> %x)
+  ret <4 x i32> %a
+}
+
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================
@@ -264,6 +280,30 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
   ret <2 x i64> %a
 }
 
+; CHECK-LABEL: trunc_sat_s_v2i64:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.trunc_sat_s/f64x2 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double>)
+define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
+  %a = call <2 x i64> @llvm.wasm.trunc.saturate.signed.v2i64.v2f64(<2 x double> %x)
+  ret <2 x i64> %a
+}
+
+; CHECK-LABEL: trunc_sat_u_v2i64:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.trunc_sat_u/f64x2 $push[[R:[0-9]+]]=, $0
+; SIMD128-NEXT: return $pop[[R]]
+declare <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double>)
+define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) {
+  %a = call <2 x i64> @llvm.wasm.trunc.saturate.unsigned.v2i64.v2f64(<2 x double> %x)
+  ret <2 x i64> %a
+}
+
 ; ==============================================================================
 ; 4 x f32
 ; ==============================================================================
diff --git a/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f113840c0499719fc066228102eab5fc5ffb6fce
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-load-store-alignment.ll
@@ -0,0 +1,534 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128 | FileCheck %s
+
+; Test loads and stores with custom alignment values.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; ==============================================================================
+; 16 x i8
+; ==============================================================================
+
+; CHECK-LABEL: load_v16i8_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a1(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 1
+  ret <16 x i8> %v
+}
+
+; CHECK-LABEL: load_v16i8_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a4(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 4
+  ret <16 x i8> %v
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v16i8_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a16(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 16
+  ret <16 x i8> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v16i8_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @load_v16i8_a32(<16 x i8> *%p) {
+  %v = load <16 x i8>, <16 x i8>* %p, align 32
+  ret <16 x i8> %v
+}
+
+; CHECK-LABEL: store_v16i8_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a1(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v16i8_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a4(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v16i8_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a16(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v16i8_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v16i8_a32(<16 x i8> *%p, <16 x i8> %v) {
+  store <16 x i8> %v, <16 x i8>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 8 x i16
+; ==============================================================================
+
+; CHECK-LABEL: load_v8i16_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a1(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 1
+  ret <8 x i16> %v
+}
+
+; CHECK-LABEL: load_v8i16_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a4(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 4
+  ret <8 x i16> %v
+}
+
+; 8 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v8i16_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a16(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 16
+  ret <8 x i16> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v8i16_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @load_v8i16_a32(<8 x i16> *%p) {
+  %v = load <8 x i16>, <8 x i16>* %p, align 32
+  ret <8 x i16> %v
+}
+
+; CHECK-LABEL: store_v8i16_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a1(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v8i16_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a4(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v8i16_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a16(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v8i16_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v8i16_a32(<8 x i16> *%p, <8 x i16> %v) {
+  store <8 x i16> %v, <8 x i16>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 4 x i32
+; ==============================================================================
+
+; CHECK-LABEL: load_v4i32_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a1(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 1
+  ret <4 x i32> %v
+}
+
+; CHECK-LABEL: load_v4i32_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a4(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 4
+  ret <4 x i32> %v
+}
+
+; 4 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v4i32_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a16(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 16
+  ret <4 x i32> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v4i32_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @load_v4i32_a32(<4 x i32> *%p) {
+  %v = load <4 x i32>, <4 x i32>* %p, align 32
+  ret <4 x i32> %v
+}
+
+; CHECK-LABEL: store_v4i32_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a1(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v4i32_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a4(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v4i32_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a16(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v4i32_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4i32_a32(<4 x i32> *%p, <4 x i32> %v) {
+  store <4 x i32> %v, <4 x i32>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 2 x i64
+; ==============================================================================
+
+; CHECK-LABEL: load_v2i64_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a1(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 1
+  ret <2 x i64> %v
+}
+
+; CHECK-LABEL: load_v2i64_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a4(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 4
+  ret <2 x i64> %v
+}
+
+; 2 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v2i64_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a16(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 16
+  ret <2 x i64> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v2i64_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @load_v2i64_a32(<2 x i64> *%p) {
+  %v = load <2 x i64>, <2 x i64>* %p, align 32
+  ret <2 x i64> %v
+}
+
+; CHECK-LABEL: store_v2i64_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a1(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v2i64_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a4(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v2i64_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a16(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v2i64_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2i64_a32(<2 x i64> *%p, <2 x i64> %v) {
+  store <2 x i64> %v, <2 x i64>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 4 x float
+; ==============================================================================
+
+; CHECK-LABEL: load_v4f32_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a1(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 1
+  ret <4 x float> %v
+}
+
+; CHECK-LABEL: load_v4f32_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a4(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 4
+  ret <4 x float> %v
+}
+
+; 4 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v4f32_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a16(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 16
+  ret <4 x float> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v4f32_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @load_v4f32_a32(<4 x float> *%p) {
+  %v = load <4 x float>, <4 x float>* %p, align 32
+  ret <4 x float> %v
+}
+
+; CHECK-LABEL: store_v4f32_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a1(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v4f32_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a4(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v4f32_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a16(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v4f32_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v4f32_a32(<4 x float> *%p, <4 x float> %v) {
+  store <4 x float> %v, <4 x float>* %p, align 32
+  ret void
+}
+
+; ==============================================================================
+; 2 x double
+; ==============================================================================
+
+; CHECK-LABEL: load_v2f64_a1:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a1(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 1
+  ret <2 x double> %v
+}
+
+; CHECK-LABEL: load_v2f64_a4:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a4(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 4
+  ret <2 x double> %v
+}
+
+; 2 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: load_v2f64_a16:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a16(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 16
+  ret <2 x double> %v
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: load_v2f64_a32:
+; CHECK-NEXT: .param i32{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @load_v2f64_a32(<2 x double> *%p) {
+  %v = load <2 x double>, <2 x double>* %p, align 32
+  ret <2 x double> %v
+}
+
+; CHECK-LABEL: store_v2f64_a1:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=0, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a1(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 1
+  ret void
+}
+
+; CHECK-LABEL: store_v2f64_a4:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0):p2align=2, $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a4(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 4
+  ret void
+}
+
+; 16 is the default alignment for v128 so no attribute is needed.
+
+; CHECK-LABEL: store_v2f64_a16:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a16(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 16
+  ret void
+}
+
+; 32 is greater than the default alignment so it is ignored.
+
+; CHECK-LABEL: store_v2f64_a32:
+; CHECK-NEXT: .param i32, v128{{$}}
+; CHECK-NEXT: v128.store 0($0), $1{{$}}
+; CHECK-NEXT: return{{$}}
+define void @store_v2f64_a32(<2 x double> *%p, <2 x double> %v) {
+  store <2 x double> %v, <2 x double>* %p, align 32
+  ret void
+}
diff --git a/test/CodeGen/WebAssembly/simd-nested-shuffles.ll b/test/CodeGen/WebAssembly/simd-nested-shuffles.ll
new file mode 100644
index 0000000000000000000000000000000000000000..51ba5a99be618dce515cdd57114ca43ba4a1757f
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-nested-shuffles.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s --check-prefixes CHECK
+
+; Check that shuffles maintain their type when being custom
+; lowered. Regression test for bug 39275.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK: v8x16.shuffle
+define <4 x i32> @foo(<4 x i32> %x) {
+  %1 = shufflevector <4 x i32> %x, <4 x i32> undef,
+    <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef,
+    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %3 = add <4 x i32> %2, %2
+  ret <4 x i32> %3
+}
diff --git a/test/CodeGen/WebAssembly/simd-offset.ll b/test/CodeGen/WebAssembly/simd-offset.ll
index 5ce0ca94dc433cdb0d6563120caf84983c0e500b..ed20225f02199fdb67c5f6da36dbf04dcacdcd8d 100644
--- a/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/test/CodeGen/WebAssembly/simd-offset.ll
@@ -14,7 +14,7 @@ target triple = "wasm32-unknown-unknown"
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8(<16 x i8>* %p) {
   %v = load <16 x i8>, <16 x i8>* %p
@@ -25,7 +25,7 @@ define <16 x i8> @load_v16i8(<16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
@@ -39,7 +39,7 @@ define <16 x i8> @load_v16i8_with_folded_offset(<16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
@@ -53,7 +53,7 @@ define <16 x i8> @load_v16i8_with_folded_gep_offset(<16 x i8>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_gep_negative_offset(<16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
@@ -67,7 +67,7 @@ define <16 x i8> @load_v16i8_with_unfolded_gep_negative_offset(<16 x i8>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
@@ -83,7 +83,7 @@ define <16 x i8> @load_v16i8_with_unfolded_offset(<16 x i8>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_with_unfolded_gep_offset(<16 x i8>* %p) {
   %s = getelementptr <16 x i8>, <16 x i8>* %p, i32 1
@@ -95,7 +95,7 @@ define <16 x i8> @load_v16i8_with_unfolded_gep_offset(<16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @load_v16i8_from_numeric_address() {
   %s = inttoptr i32 32 to <16 x i8>*
@@ -107,7 +107,7 @@ define <16 x i8> @load_v16i8_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v16i8($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v16i8($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v16i8 = global <16 x i8> <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
 define <16 x i8> @load_v16i8_from_global_address() {
@@ -118,7 +118,7 @@ define <16 x i8> @load_v16i8_from_global_address() {
 ; CHECK-LABEL: store_v16i8:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
   store <16 x i8> %v , <16 x i8>* %p
   ret void
@@ -127,7 +127,7 @@ define void @store_v16i8(<16 x i8> %v, <16 x i8>* %p) {
 ; CHECK-LABEL: store_v16i8_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
   %q = ptrtoint <16 x i8>* %p to i32
   %r = add nuw i32 %q, 16
@@ -139,7 +139,7 @@ define void @store_v16i8_with_folded_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; CHECK-LABEL: store_v16i8_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 1
   store <16 x i8> %v , <16 x i8>* %s
@@ -151,7 +151,7 @@ define void @store_v16i8_with_folded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_gep_negative_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   store <16 x i8> %v , <16 x i8>* %s
@@ -163,7 +163,7 @@ define void @store_v16i8_with_unfolded_gep_negative_offset(<16 x i8> %v, <16 x i
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i32 -1
   store <16 x i8> %v , <16 x i8>* %s
@@ -175,7 +175,7 @@ define void @store_v16i8_with_unfolded_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v16i8_with_unfolded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
   %s = getelementptr <16 x i8>, <16 x i8>* %p, i32 1
   store <16 x i8> %v , <16 x i8>* %s
@@ -186,7 +186,7 @@ define void @store_v16i8_with_unfolded_gep_offset(<16 x i8> %v, <16 x i8>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
   %s = inttoptr i32 32 to <16 x i8>*
   store <16 x i8> %v , <16 x i8>* %s
@@ -197,7 +197,7 @@ define void @store_v16i8_to_numeric_address(<16 x i8> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v16i8($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v16i8($pop[[R]]), $0{{$}}
 define void @store_v16i8_to_global_address(<16 x i8> %v) {
   store <16 x i8> %v , <16 x i8>* @gv_v16i8
   ret void
@@ -210,7 +210,7 @@ define void @store_v16i8_to_global_address(<16 x i8> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16(<8 x i16>* %p) {
   %v = load <8 x i16>, <8 x i16>* %p
@@ -221,7 +221,7 @@ define <8 x i16> @load_v8i16(<8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
@@ -235,7 +235,7 @@ define <8 x i16> @load_v8i16_with_folded_offset(<8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
@@ -249,7 +249,7 @@ define <8 x i16> @load_v8i16_with_folded_gep_offset(<8 x i16>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_gep_negative_offset(<8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
@@ -263,7 +263,7 @@ define <8 x i16> @load_v8i16_with_unfolded_gep_negative_offset(<8 x i16>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[L0:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[L0:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[L0]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
@@ -279,7 +279,7 @@ define <8 x i16> @load_v8i16_with_unfolded_offset(<8 x i16>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_with_unfolded_gep_offset(<8 x i16>* %p) {
   %s = getelementptr <8 x i16>, <8 x i16>* %p, i32 1
@@ -291,7 +291,7 @@ define <8 x i16> @load_v8i16_with_unfolded_gep_offset(<8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <8 x i16> @load_v8i16_from_numeric_address() {
   %s = inttoptr i32 32 to <8 x i16>*
@@ -303,7 +303,7 @@ define <8 x i16> @load_v8i16_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v8i16($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v8i16($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v8i16 = global <8 x i16> <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
 define <8 x i16> @load_v8i16_from_global_address() {
@@ -314,7 +314,7 @@ define <8 x i16> @load_v8i16_from_global_address() {
 ; CHECK-LABEL: store_v8i16:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
   store <8 x i16> %v , <8 x i16>* %p
   ret void
@@ -323,7 +323,7 @@ define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
   %q = ptrtoint <8 x i16>* %p to i32
   %r = add nuw i32 %q, 16
@@ -335,7 +335,7 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 1
   store <8 x i16> %v , <8 x i16>* %s
@@ -347,7 +347,7 @@ define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   store <8 x i16> %v , <8 x i16>* %s
@@ -359,7 +359,7 @@ define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i1
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i32 -1
   store <8 x i16> %v , <8 x i16>* %s
@@ -371,7 +371,7 @@ define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   %s = getelementptr <8 x i16>, <8 x i16>* %p, i32 1
   store <8 x i16> %v , <8 x i16>* %s
@@ -382,7 +382,7 @@ define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
   %s = inttoptr i32 32 to <8 x i16>*
   store <8 x i16> %v , <8 x i16>* %s
@@ -393,7 +393,7 @@ define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v8i16($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v8i16($pop[[R]]), $0{{$}}
 define void @store_v8i16_to_global_address(<8 x i16> %v) {
   store <8 x i16> %v , <8 x i16>* @gv_v8i16
   ret void
@@ -406,7 +406,7 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32(<4 x i32>* %p) {
   %v = load <4 x i32>, <4 x i32>* %p
@@ -417,7 +417,7 @@ define <4 x i32> @load_v4i32(<4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
@@ -431,7 +431,7 @@ define <4 x i32> @load_v4i32_with_folded_offset(<4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
@@ -445,7 +445,7 @@ define <4 x i32> @load_v4i32_with_folded_gep_offset(<4 x i32>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_gep_negative_offset(<4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
@@ -459,7 +459,7 @@ define <4 x i32> @load_v4i32_with_unfolded_gep_negative_offset(<4 x i32>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
@@ -475,7 +475,7 @@ define <4 x i32> @load_v4i32_with_unfolded_offset(<4 x i32>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_with_unfolded_gep_offset(<4 x i32>* %p) {
   %s = getelementptr <4 x i32>, <4 x i32>* %p, i32 1
@@ -487,7 +487,7 @@ define <4 x i32> @load_v4i32_with_unfolded_gep_offset(<4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x i32> @load_v4i32_from_numeric_address() {
   %s = inttoptr i32 32 to <4 x i32>*
@@ -499,7 +499,7 @@ define <4 x i32> @load_v4i32_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4i32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4i32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4i32 = global <4 x i32> <i32 42, i32 42, i32 42, i32 42>
 define <4 x i32> @load_v4i32_from_global_address() {
@@ -510,7 +510,7 @@ define <4 x i32> @load_v4i32_from_global_address() {
 ; CHECK-LABEL: store_v4i32:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
   store <4 x i32> %v , <4 x i32>* %p
   ret void
@@ -519,7 +519,7 @@ define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
   %q = ptrtoint <4 x i32>* %p to i32
   %r = add nuw i32 %q, 16
@@ -531,7 +531,7 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 1
   store <4 x i32> %v , <4 x i32>* %s
@@ -543,7 +543,7 @@ define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   store <4 x i32> %v , <4 x i32>* %s
@@ -555,7 +555,7 @@ define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i3
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr inbounds <4 x i32>, <4 x i32>* %p, i32 -1
   store <4 x i32> %v , <4 x i32>* %s
@@ -567,7 +567,7 @@ define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   %s = getelementptr <4 x i32>, <4 x i32>* %p, i32 1
   store <4 x i32> %v , <4 x i32>* %s
@@ -578,7 +578,7 @@ define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
   %s = inttoptr i32 32 to <4 x i32>*
   store <4 x i32> %v , <4 x i32>* %s
@@ -589,7 +589,7 @@ define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v4i32($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v4i32($pop[[R]]), $0{{$}}
 define void @store_v4i32_to_global_address(<4 x i32> %v) {
   store <4 x i32> %v , <4 x i32>* @gv_v4i32
   ret void
@@ -603,7 +603,7 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64(<2 x i64>* %p) {
   %v = load <2 x i64>, <2 x i64>* %p
@@ -615,7 +615,7 @@ define <2 x i64> @load_v2i64(<2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
@@ -630,7 +630,7 @@ define <2 x i64> @load_v2i64_with_folded_offset(<2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
@@ -645,7 +645,7 @@ define <2 x i64> @load_v2i64_with_folded_gep_offset(<2 x i64>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_gep_negative_offset(<2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
@@ -660,7 +660,7 @@ define <2 x i64> @load_v2i64_with_unfolded_gep_negative_offset(<2 x i64>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
@@ -677,7 +677,7 @@ define <2 x i64> @load_v2i64_with_unfolded_offset(<2 x i64>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_with_unfolded_gep_offset(<2 x i64>* %p) {
   %s = getelementptr <2 x i64>, <2 x i64>* %p, i32 1
@@ -690,7 +690,7 @@ define <2 x i64> @load_v2i64_with_unfolded_gep_offset(<2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x i64> @load_v2i64_from_numeric_address() {
   %s = inttoptr i32 32 to <2 x i64>*
@@ -703,7 +703,7 @@ define <2 x i64> @load_v2i64_from_numeric_address() {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2i64($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2i64($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2i64 = global <2 x i64> <i64 42, i64 42>
 define <2 x i64> @load_v2i64_from_global_address() {
@@ -715,7 +715,7 @@ define <2 x i64> @load_v2i64_from_global_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
   store <2 x i64> %v , <2 x i64>* %p
   ret void
@@ -725,7 +725,7 @@ define void @store_v2i64(<2 x i64> %v, <2 x i64>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
   %q = ptrtoint <2 x i64>* %p to i32
   %r = add nuw i32 %q, 16
@@ -738,7 +738,7 @@ define void @store_v2i64_with_folded_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 1
   store <2 x i64> %v , <2 x i64>* %s
@@ -751,7 +751,7 @@ define void @store_v2i64_with_folded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_gep_negative_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   store <2 x i64> %v , <2 x i64>* %s
@@ -764,7 +764,7 @@ define void @store_v2i64_with_unfolded_gep_negative_offset(<2 x i64> %v, <2 x i6
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i32 -1
   store <2 x i64> %v , <2 x i64>* %s
@@ -777,7 +777,7 @@ define void @store_v2i64_with_unfolded_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2i64_with_unfolded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
   %s = getelementptr <2 x i64>, <2 x i64>* %p, i32 1
   store <2 x i64> %v , <2 x i64>* %s
@@ -789,7 +789,7 @@ define void @store_v2i64_with_unfolded_gep_offset(<2 x i64> %v, <2 x i64>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
   %s = inttoptr i32 32 to <2 x i64>*
   store <2 x i64> %v , <2 x i64>* %s
@@ -801,7 +801,7 @@ define void @store_v2i64_to_numeric_address(<2 x i64> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v2i64($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v2i64($pop[[R]]), $0{{$}}
 define void @store_v2i64_to_global_address(<2 x i64> %v) {
   store <2 x i64> %v , <2 x i64>* @gv_v2i64
   ret void
@@ -814,7 +814,7 @@ define void @store_v2i64_to_global_address(<2 x i64> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32(<4 x float>* %p) {
   %v = load <4 x float>, <4 x float>* %p
@@ -825,7 +825,7 @@ define <4 x float> @load_v4f32(<4 x float>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
@@ -839,7 +839,7 @@ define <4 x float> @load_v4f32_with_folded_offset(<4 x float>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
@@ -853,7 +853,7 @@ define <4 x float> @load_v4f32_with_folded_gep_offset(<4 x float>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_gep_negative_offset(<4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
@@ -867,7 +867,7 @@ define <4 x float> @load_v4f32_with_unfolded_gep_negative_offset(<4 x float>* %p
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
@@ -883,7 +883,7 @@ define <4 x float> @load_v4f32_with_unfolded_offset(<4 x float>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_with_unfolded_gep_offset(<4 x float>* %p) {
   %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
@@ -895,7 +895,7 @@ define <4 x float> @load_v4f32_with_unfolded_gep_offset(<4 x float>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <4 x float> @load_v4f32_from_numeric_address() {
   %s = inttoptr i32 32 to <4 x float>*
@@ -907,7 +907,7 @@ define <4 x float> @load_v4f32_from_numeric_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4f32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v4f32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v4f32 = global <4 x float> <float 42., float 42., float 42., float 42.>
 define <4 x float> @load_v4f32_from_global_address() {
@@ -918,7 +918,7 @@ define <4 x float> @load_v4f32_from_global_address() {
 ; CHECK-LABEL: store_v4f32:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
   store <4 x float> %v , <4 x float>* %p
   ret void
@@ -927,7 +927,7 @@ define void @store_v4f32(<4 x float> %v, <4 x float>* %p) {
 ; CHECK-LABEL: store_v4f32_with_folded_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
   %q = ptrtoint <4 x float>* %p to i32
   %r = add nuw i32 %q, 16
@@ -939,7 +939,7 @@ define void @store_v4f32_with_folded_offset(<4 x float> %v, <4 x float>* %p) {
 ; CHECK-LABEL: store_v4f32_with_folded_gep_offset:
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 1
   store <4 x float> %v , <4 x float>* %s
@@ -951,7 +951,7 @@ define void @store_v4f32_with_folded_gep_offset(<4 x float> %v, <4 x float>* %p)
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_gep_negative_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   store <4 x float> %v , <4 x float>* %s
@@ -963,7 +963,7 @@ define void @store_v4f32_with_unfolded_gep_negative_offset(<4 x float> %v, <4 x
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr inbounds <4 x float>, <4 x float>* %p, i32 -1
   store <4 x float> %v , <4 x float>* %s
@@ -975,7 +975,7 @@ define void @store_v4f32_with_unfolded_offset(<4 x float> %v, <4 x float>* %p) {
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v4f32_with_unfolded_gep_offset(<4 x float> %v, <4 x float>* %p) {
   %s = getelementptr <4 x float>, <4 x float>* %p, i32 1
   store <4 x float> %v , <4 x float>* %s
@@ -986,7 +986,7 @@ define void @store_v4f32_with_unfolded_gep_offset(<4 x float> %v, <4 x float>* %
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v4f32_to_numeric_address(<4 x float> %v) {
   %s = inttoptr i32 32 to <4 x float>*
   store <4 x float> %v , <4 x float>* %s
@@ -997,7 +997,7 @@ define void @store_v4f32_to_numeric_address(<4 x float> %v) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v4f32($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v4f32($pop[[R]]), $0{{$}}
 define void @store_v4f32_to_global_address(<4 x float> %v) {
   store <4 x float> %v , <4 x float>* @gv_v4f32
   ret void
@@ -1011,7 +1011,7 @@ define void @store_v4f32_to_global_address(<4 x float> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64(<2 x double>* %p) {
   %v = load <2 x double>, <2 x double>* %p
@@ -1023,7 +1023,7 @@ define <2 x double> @load_v2f64(<2 x double>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
@@ -1038,7 +1038,7 @@ define <2 x double> @load_v2f64_with_folded_offset(<2 x double>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param i32{{$}}
 ; SIMD128-NEXT: .result v128{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 16($0){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
@@ -1053,7 +1053,7 @@ define <2 x double> @load_v2f64_with_folded_gep_offset(<2 x double>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
@@ -1068,7 +1068,7 @@ define <2 x double> @load_v2f64_with_unfolded_gep_negative_offset(<2 x double>*
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
@@ -1085,7 +1085,7 @@ define <2 x double> @load_v2f64_with_unfolded_offset(<2 x double>* %p) {
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($pop[[L1]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
   %s = getelementptr <2 x double>, <2 x double>* %p, i32 1
@@ -1098,7 +1098,7 @@ define <2 x double> @load_v2f64_with_unfolded_gep_offset(<2 x double>* %p) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 32($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 define <2 x double> @load_v2f64_from_numeric_address() {
   %s = inttoptr i32 32 to <2 x double>*
@@ -1111,7 +1111,7 @@ define <2 x double> @load_v2f64_from_numeric_address() {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .result v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2f64($pop[[L0]]):p2align=0{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, gv_v2f64($pop[[L0]]){{$}}
 ; SIMD128-NEXT: return $pop[[R]]{{$}}
 @gv_v2f64 = global <2 x double> <double 42., double 42.>
 define <2 x double> @load_v2f64_from_global_address() {
@@ -1123,7 +1123,7 @@ define <2 x double> @load_v2f64_from_global_address() {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 0($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($1), $0{{$}}
 define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
   store <2 x double> %v , <2 x double>* %p
   ret void
@@ -1133,7 +1133,7 @@ define void @store_v2f64(<2 x double> %v, <2 x double>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
   %q = ptrtoint <2 x double>* %p to i32
   %r = add nuw i32 %q, 16
@@ -1146,7 +1146,7 @@ define void @store_v2f64_with_folded_offset(<2 x double> %v, <2 x double>* %p) {
 ; NO-SIMD128-NOT: v128
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128, i32{{$}}
-; SIMD128-NEXT: v128.store 16($1):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 16($1), $0{{$}}
 define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 1
   store <2 x double> %v , <2 x double>* %s
@@ -1159,7 +1159,7 @@ define void @store_v2f64_with_folded_gep_offset(<2 x double> %v, <2 x double>* %
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_gep_negative_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   store <2 x double> %v , <2 x double>* %s
@@ -1172,7 +1172,7 @@ define void @store_v2f64_with_unfolded_gep_negative_offset(<2 x double> %v, <2 x
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, -16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr inbounds <2 x double>, <2 x double>* %p, i32 -1
   store <2 x double> %v , <2 x double>* %s
@@ -1185,7 +1185,7 @@ define void @store_v2f64_with_unfolded_offset(<2 x double> %v, <2 x double>* %p)
 ; SIMD128-NEXT: .param v128, i32{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 16{{$}}
 ; SIMD128-NEXT: i32.add $push[[R:[0-9]+]]=, $1, $pop[[L0]]{{$}}
-; SIMD128-NEXT: v128.store 0($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[R]]), $0{{$}}
 define void @store_v2f64_with_unfolded_gep_offset(<2 x double> %v, <2 x double>* %p) {
   %s = getelementptr <2 x double>, <2 x double>* %p, i32 1
   store <2 x double> %v , <2 x double>* %s
@@ -1197,7 +1197,7 @@ define void @store_v2f64_with_unfolded_gep_offset(<2 x double> %v, <2 x double>*
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store 32($pop[[L0]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store 32($pop[[L0]]), $0{{$}}
 define void @store_v2f64_to_numeric_address(<2 x double> %v) {
   %s = inttoptr i32 32 to <2 x double>*
   store <2 x double> %v , <2 x double>* %s
@@ -1209,7 +1209,7 @@ define void @store_v2f64_to_numeric_address(<2 x double> %v) {
 ; SIMD128-VM-NOT: v128
 ; SIMD128-NEXT: .param v128{{$}}
 ; SIMD128-NEXT: i32.const $push[[R:[0-9]+]]=, 0{{$}}
-; SIMD128-NEXT: v128.store gv_v2f64($pop[[R]]):p2align=0, $0{{$}}
+; SIMD128-NEXT: v128.store gv_v2f64($pop[[R]]), $0{{$}}
 define void @store_v2f64_to_global_address(<2 x double> %v) {
   store <2 x double> %v , <2 x double>* @gv_v2f64
   ret void
diff --git a/test/CodeGen/WebAssembly/simd-sext-inreg.ll b/test/CodeGen/WebAssembly/simd-sext-inreg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1001d0db16822d82a4011f292e00c113c4324fb3
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-sext-inreg.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128-VM
+; RUN: llc < %s -asm-verbose=false -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=-simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+
+; Test that vector sign extensions lower to shifts
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: sext_inreg_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i8x16.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i8x16.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @sext_inreg_v16i8(<16 x i1> %x) {
+  %res = sext <16 x i1> %x to <16 x i8>
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: sext_inreg_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i16x8.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i16x8.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @sext_inreg_v8i16(<8 x i1> %x) {
+  %res = sext <8 x i1> %x to <8 x i16>
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: sext_inreg_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 31{{$}}
+; SIMD128-NEXT: i32x4.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 31{{$}}
+; SIMD128-NEXT: i32x4.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @sext_inreg_v4i32(<4 x i1> %x) {
+  %res = sext <4 x i1> %x to <4 x i32>
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: sext_inreg_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SDIM128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32.const $push[[T0:[0-9]+]]=, 63{{$}}
+; SIMD128-NEXT: i64x2.shl $push[[T1:[0-9]+]]=, $0, $pop[[T0]]{{$}}
+; SIMD128-NEXT: i32.const $push[[T2:[0-9]+]]=, 63{{$}}
+; SIMD128-NEXT: i64x2.shr_s $push[[R:[0-9]+]]=, $pop[[T1]], $pop[[T2]]{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @sext_inreg_v2i64(<2 x i1> %x) {
+  %res = sext <2 x i1> %x to <2 x i64>
+  ret <2 x i64> %res
+}
diff --git a/test/CodeGen/WebAssembly/simd-vselect.ll b/test/CodeGen/WebAssembly/simd-vselect.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fd020511cb19b210193ae578a3335997d692179c
--- /dev/null
+++ b/test/CodeGen/WebAssembly/simd-vselect.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -wasm-enable-unimplemented-simd -mattr=+simd128,+sign-ext | FileCheck %s
+
+; Test that lanewise vector selects lower correctly to bitselects
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: vselect_v16i8:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 7{{$}}
+; CHECK-NEXT: i8x16.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 7{{$}}
+; CHECK-NEXT: i8x16.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @vselect_v16i8(<16 x i1> %c, <16 x i8> %x, <16 x i8> %y) {
+  %res = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: vselect_v8i16:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 15{{$}}
+; CHECK-NEXT: i16x8.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 15{{$}}
+; CHECK-NEXT: i16x8.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @vselect_v8i16(<8 x i1> %c, <8 x i16> %x, <8 x i16> %y) {
+  %res = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: vselect_v4i32:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @vselect_v4i32(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y) {
+  %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: vselect_v2i64:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @vselect_v2i64(<2 x i1> %c, <2 x i64> %x, <2 x i64> %y) {
+  %res = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: vselect_v4f32:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}}
+; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @vselect_v4f32(<4 x i1> %c, <4 x float> %x, <4 x float> %y) {
+  %res = select <4 x i1> %c, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: vselect_v2f64:
+; CHECK-NEXT: .param v128, v128, v128{{$}}
+; CHECK-NEXT: .result v128{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}}
+; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @vselect_v2f64(<2 x i1> %c, <2 x double> %x, <2 x double> %y) {
+  %res = select <2 x i1> %c, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %res
+}
diff --git a/test/CodeGen/WebAssembly/simd.ll b/test/CodeGen/WebAssembly/simd.ll
index 193e3120b9ed75bdb1566b42c3d3511d32abc05c..55a325b939c643905038bff6fb0312c794808320 100644
--- a/test/CodeGen/WebAssembly/simd.ll
+++ b/test/CodeGen/WebAssembly/simd.ll
@@ -54,6 +54,38 @@ define i32 @extract_v16i8_s(<16 x i8> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v16i8_s:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]
+; SIMD128-NEXT: i32.load8_s $push[[R:[0-9]+]]=, 0($pop[[L6]])
+; SIMD128-NEXT: return $pop[[R]]
+define i32 @extract_var_v16i8_s(<16 x i8> %v, i32 %i) {
+  %elem = extractelement <16 x i8> %v, i32 %i
+  %a = sext i8 %elem to i32
+  ret i32 %a
+}
+
+; CHECK-LABEL: extract_undef_v16i8_s:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v16i8_s(<16 x i8> %v) {
+  %elem = extractelement <16 x i8> %v, i8 undef
+  %a = sext i8 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v16i8_u:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -66,6 +98,38 @@ define i32 @extract_v16i8_u(<16 x i8> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v16i8_u:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32.load8_u $push[[R:[0-9]+]]=, 0($pop[[L6]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v16i8_u(<16 x i8> %v, i32 %i) {
+  %elem = extractelement <16 x i8> %v, i32 %i
+  %a = zext i8 %elem to i32
+  ret i32 %a
+}
+
+; CHECK-LABEL: extract_undef_v16i8_u:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v16i8_u(<16 x i8> %v) {
+  %elem = extractelement <16 x i8> %v, i8 undef
+  %a = zext i8 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128{{$}}
@@ -77,6 +141,36 @@ define i8 @extract_v16i8(<16 x i8> %v) {
   ret i8 %elem
 }
 
+; CHECK-LABEL: extract_var_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $2, $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32.load8_u $push[[R:[0-9]+]]=, 0($pop[[L6]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i8 @extract_var_v16i8(<16 x i8> %v, i32 %i) {
+  %elem = extractelement <16 x i8> %v, i32 %i
+  ret i8 %elem
+}
+
+; CHECK-LABEL: extract_undef_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i8x16.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i8 @extract_undef_v16i8(<16 x i8> %v) {
+  %elem = extractelement <16 x i8> %v, i8 undef
+  ret i8 %elem
+}
+
 ; CHECK-LABEL: replace_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -88,6 +182,37 @@ define <16 x i8> @replace_v16i8(<16 x i8> %v, i8 %x) {
   ret <16 x i8> %res
 }
 
+; CHECK-LABEL: replace_var_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 15{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L6:[0-9]+]]=, $3, $pop[[L5]]{{$}}
+; SIMD128-NEXT: i32.store8 0($pop[[L6]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @replace_var_v16i8(<16 x i8> %v, i32 %i, i8 %x) {
+  %res = insertelement <16 x i8> %v, i8 %x, i32 %i
+  ret <16 x i8> %res
+}
+
+; CHECK-LABEL: replace_undef_v16i8:
+; NO-SIMD128-NOT: i8x16
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i8x16.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @replace_undef_v16i8(<16 x i8> %v, i8 %x) {
+  %res = insertelement <16 x i8> %v, i8 %x, i32 undef
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: shuffle_v16i8:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -102,6 +227,22 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
   ret <16 x i8> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v16i8:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
+  %res = shufflevector <16 x i8> %x, <16 x i8> %y,
+    <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef,
+                i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <16 x i8> %res
+}
+
 ; CHECK-LABEL: build_v16i8:
 ; NO-SIMD128-NOT: i8x16
 ; SIMD128-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32{{$}}
@@ -190,6 +331,40 @@ define i32 @extract_v8i16_s(<8 x i16> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v8i16_s:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load16_s $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v8i16_s(<8 x i16> %v, i32 %i) {
+  %elem = extractelement <8 x i16> %v, i32 %i
+  %a = sext i16 %elem to i32
+  ret i32 %a
+}
+
+; CHECK-LABEL: extract_undef_v8i16_s:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_s $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v8i16_s(<8 x i16> %v) {
+  %elem = extractelement <8 x i16> %v, i16 undef
+  %a = sext i16 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v8i16_u:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -202,6 +377,40 @@ define i32 @extract_v8i16_u(<8 x i16> %v) {
   ret i32 %a
 }
 
+; CHECK-LABEL: extract_var_v8i16_u:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load16_u $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v8i16_u(<8 x i16> %v, i32 %i) {
+  %elem = extractelement <8 x i16> %v, i32 %i
+  %a = zext i16 %elem to i32
+  ret i32 %a
+}
+
+; CHECK-LABEL: extract_undef_v8i16_u:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v8i16_u(<8 x i16> %v) {
+  %elem = extractelement <8 x i16> %v, i16 undef
+  %a = zext i16 %elem to i32
+  ret i32 %a
+}
+
 ; CHECK-LABEL: extract_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128{{$}}
@@ -213,6 +422,38 @@ define i16 @extract_v8i16(<8 x i16> %v) {
   ret i16 %elem
 }
 
+; CHECK-LABEL: extract_var_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load16_u $push[[R:[0-9]+]]=, 0($pop[[L8]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i16 @extract_var_v8i16(<8 x i16> %v, i32 %i) {
+  %elem = extractelement <8 x i16> %v, i32 %i
+  ret i16 %elem
+}
+
+; CHECK-LABEL: extract_undef_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i16x8.extract_lane_u $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i16 @extract_undef_v8i16(<8 x i16> %v) {
+  %elem = extractelement <8 x i16> %v, i16 undef
+  ret i16 %elem
+}
+
 ; CHECK-LABEL: replace_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -224,6 +465,39 @@ define <8 x i16> @replace_v8i16(<8 x i16> %v, i16 %x) {
   ret <8 x i16> %res
 }
 
+; CHECK-LABEL: replace_var_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 7{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L8:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.store16 0($pop[[L8]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @replace_var_v8i16(<8 x i16> %v, i32 %i, i16 %x) {
+  %res = insertelement <8 x i16> %v, i16 %x, i32 %i
+  ret <8 x i16> %res
+}
+
+; CHECK-LABEL: replace_undef_v8i16:
+; NO-SIMD128-NOT: i16x8
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i16x8.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @replace_undef_v8i16(<8 x i16> %v, i16 %x) {
+  %res = insertelement <8 x i16> %v, i16 %x, i32 undef
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: shuffle_v8i16:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -237,6 +511,20 @@ define <8 x i16> @shuffle_v8i16(<8 x i16> %x, <8 x i16> %y) {
   ret <8 x i16> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v8i16:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <8 x i16> @shuffle_undef_v8i16(<8 x i16> %x, <8 x i16> %y) {
+  %res = shufflevector <8 x i16> %x, <8 x i16> %y,
+    <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef,
+               i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %res
+}
+
 ; CHECK-LABEL: build_v8i16:
 ; NO-SIMD128-NOT: i16x8
 ; SIMD128-NEXT: .param i32, i32, i32, i32, i32, i32, i32, i32{{$}}
@@ -305,6 +593,38 @@ define i32 @extract_v4i32(<4 x i32> %v) {
   ret i32 %elem
 }
 
+; CHECK-LABEL: extract_var_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L4:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.load $push[[R:[0-9]+]]=, 0($pop[[L4]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_var_v4i32(<4 x i32> %v, i32 %i) {
+  %elem = extractelement <4 x i32> %v, i32 %i
+  ret i32 %elem
+}
+
+; CHECK-LABEL: extract_undef_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i32{{$}}
+; SIMD128-NEXT: i32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i32 @extract_undef_v4i32(<4 x i32> %v) {
+  %elem = extractelement <4 x i32> %v, i32 undef
+  ret i32 %elem
+}
+
 ; CHECK-LABEL: replace_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param v128, i32{{$}}
@@ -316,6 +636,39 @@ define <4 x i32> @replace_v4i32(<4 x i32> %v, i32 %x) {
   ret <4 x i32> %res
 }
 
+; CHECK-LABEL: replace_var_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, i32, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L4:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L4]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L4:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i32.store 0($pop[[L4]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @replace_var_v4i32(<4 x i32> %v, i32 %i, i32 %x) {
+  %res = insertelement <4 x i32> %v, i32 %x, i32 %i
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: replace_undef_v4i32:
+; NO-SIMD128-NOT: i32x4
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @replace_undef_v4i32(<4 x i32> %v, i32 %x) {
+  %res = insertelement <4 x i32> %v, i32 %x, i32 undef
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: shuffle_v4i32:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -329,6 +682,19 @@ define <4 x i32> @shuffle_v4i32(<4 x i32> %x, <4 x i32> %y) {
   ret <4 x i32> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v4i32:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x i32> @shuffle_undef_v4i32(<4 x i32> %x, <4 x i32> %y) {
+  %res = shufflevector <4 x i32> %x, <4 x i32> %y,
+    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: build_v4i32:
 ; NO-SIMD128-NOT: i32x4
 ; SIMD128-NEXT: .param i32, i32, i32, i32{{$}}
@@ -390,6 +756,39 @@ define i64 @extract_v2i64(<2 x i64> %v) {
   ret i64 %elem
 }
 
+; CHECK-LABEL: extract_var_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i64.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i64 @extract_var_v2i64(<2 x i64> %v, i32 %i) {
+  %elem = extractelement <2 x i64> %v, i32 %i
+  ret i64 %elem
+}
+
+; CHECK-LABEL: extract_undef_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result i64{{$}}
+; SIMD128-NEXT: i64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define i64 @extract_undef_v2i64(<2 x i64> %v) {
+  %elem = extractelement <2 x i64> %v, i64 undef
+  ret i64 %elem
+}
+
 ; CHECK-LABEL: replace_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
@@ -402,6 +801,41 @@ define <2 x i64> @replace_v2i64(<2 x i64> %v, i64 %x) {
   ret <2 x i64> %res
 }
 
+; CHECK-LABEL: replace_var_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128, i32, i64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: i64.store 0($pop[[L2]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @replace_var_v2i64(<2 x i64> %v, i32 %i, i64 %x) {
+  %res = insertelement <2 x i64> %v, i64 %x, i32 %i
+  ret <2 x i64> %res
+}
+
+; CHECK-LABEL: replace_undef_v2i64:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-VM-NOT: i64x2
+; SIMD128-NEXT: .param v128, i64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: i64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @replace_undef_v2i64(<2 x i64> %v, i64 %x) {
+  %res = insertelement <2 x i64> %v, i64 %x, i32 undef
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: shuffle_v2i64:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -414,6 +848,19 @@ define <2 x i64> @shuffle_v2i64(<2 x i64> %x, <2 x i64> %y) {
   ret <2 x i64> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v2i64:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x i64> @shuffle_undef_v2i64(<2 x i64> %x, <2 x i64> %y) {
+  %res = shufflevector <2 x i64> %x, <2 x i64> %y,
+    <2 x i32> <i32 1, i32 undef>
+  ret <2 x i64> %res
+}
+
 ; CHECK-LABEL: build_v2i64:
 ; NO-SIMD128-NOT: i64x2
 ; SIMD128-VM-NOT: i64x2
@@ -472,6 +919,38 @@ define float @extract_v4f32(<4 x float> %v) {
   ret float %elem
 }
 
+; CHECK-LABEL: extract_var_v4f32:
+; NO-SIMD128-NOT: i64x2
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f32.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define float @extract_var_v4f32(<4 x float> %v, i32 %i) {
+  %elem = extractelement <4 x float> %v, i32 %i
+  ret float %elem
+}
+
+; CHECK-LABEL: extract_undef_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result f32{{$}}
+; SIMD128-NEXT: f32x4.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define float @extract_undef_v4f32(<4 x float> %v) {
+  %elem = extractelement <4 x float> %v, i32 undef
+  ret float %elem
+}
+
 ; CHECK-LABEL: replace_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param v128, f32{{$}}
@@ -483,6 +962,39 @@ define <4 x float> @replace_v4f32(<4 x float> %v, float %x) {
   ret <4 x float> %res
 }
 
+; CHECK-LABEL: replace_var_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, i32, f32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 2{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f32.store 0($pop[[L2]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @replace_var_v4f32(<4 x float> %v, i32 %i, float %x) {
+  %res = insertelement <4 x float> %v, float %x, i32 %i
+  ret <4 x float> %res
+}
+
+; CHECK-LABEL: replace_undef_v4f32:
+; NO-SIMD128-NOT: f32x4
+; SIMD128-NEXT: .param v128, f32{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f32x4.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @replace_undef_v4f32(<4 x float> %v, float %x) {
+  %res = insertelement <4 x float> %v, float %x, i32 undef
+  ret <4 x float> %res
+}
+
 ; CHECK-LABEL: shuffle_v4f32:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -496,6 +1008,19 @@ define <4 x float> @shuffle_v4f32(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v4f32:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <4 x float> @shuffle_undef_v4f32(<4 x float> %x, <4 x float> %y) {
+  %res = shufflevector <4 x float> %x, <4 x float> %y,
+    <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  ret <4 x float> %res
+}
+
 ; CHECK-LABEL: build_v4f32:
 ; NO-SIMD128-NOT: f32x4
 ; SIMD128-NEXT: .param f32, f32, f32, f32{{$}}
@@ -556,6 +1081,39 @@ define double @extract_v2f64(<2 x double> %v) {
   ret double %elem
 }
 
+; CHECK-LABEL: extract_var_v2f64:
+; NO-SIMD128-NOT: i62x2
+; SIMD128-NEXT: .param v128, i32{{$}}
+; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $2=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $2, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f64.load $push[[R:[0-9]+]]=, 0($pop[[L2]]){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define double @extract_var_v2f64(<2 x double> %v, i32 %i) {
+  %elem = extractelement <2 x double> %v, i32 %i
+  ret double %elem
+}
+
+; CHECK-LABEL: extract_undef_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128{{$}}
+; SIMD128-NEXT: .result f64{{$}}
+; SIMD128-NEXT: f64x2.extract_lane $push[[R:[0-9]+]]=, $0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define double @extract_undef_v2f64(<2 x double> %v) {
+  %elem = extractelement <2 x double> %v, i32 undef
+  ret double %elem
+}
+
 ; CHECK-LABEL: replace_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
@@ -568,6 +1126,41 @@ define <2 x double> @replace_v2f64(<2 x double> %v, double %x) {
   ret <2 x double> %res
 }
 
+; CHECK-LABEL: replace_var_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, i32, f64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: get_global $push[[L0:[0-9]+]]=, __stack_pointer@GLOBAL{{$}}
+; SIMD128-NEXT: i32.const $push[[L1:[0-9]+]]=, 16{{$}}
+; SIMD128-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; SIMD128-NEXT: tee_local $push[[L3:[0-9]+]]=, $3=, $pop[[L2]]{{$}}
+; SIMD128-NEXT: v128.store 0($pop[[L3]]), $0{{$}}
+; SIMD128-NEXT: i32.const $push[[L2:[0-9]+]]=, 1{{$}}
+; SIMD128-NEXT: i32.and $push[[L5:[0-9]+]]=, $1, $pop[[L2]]{{$}}
+; SIMD128-NEXT: i32.const $push[[L6:[0-9]+]]=, 3{{$}}
+; SIMD128-NEXT: i32.shl $push[[L7:[0-9]+]]=, $pop[[L5]], $pop[[L6]]{{$}}
+; SIMD128-NEXT: i32.or $push[[L2:[0-9]+]]=, $3, $pop[[L7]]{{$}}
+; SIMD128-NEXT: f64.store 0($pop[[L2]]), $2{{$}}
+; SIMD128-NEXT: v128.load $push[[R:[0-9]+]]=, 0($3){{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @replace_var_v2f64(<2 x double> %v, i32 %i, double %x) {
+  %res = insertelement <2 x double> %v, double %x, i32 %i
+  ret <2 x double> %res
+}
+
+; CHECK-LABEL: replace_undef_v2f64:
+; NO-SIMD128-NOT: f64x2
+; SIMD128-VM-NOT: f64x2
+; SIMD128-NEXT: .param v128, f64{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: f64x2.replace_lane $push[[R:[0-9]+]]=, $0, 0, $1{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @replace_undef_v2f64(<2 x double> %v, double %x) {
+  %res = insertelement <2 x double> %v, double %x, i32 undef
+  ret <2 x double> %res
+}
+
 ; CHECK-LABEL: shuffle_v2f64:
 ; NO-SIMD128-NOT: v8x16
 ; SIMD128-NEXT: .param v128, v128{{$}}
@@ -581,6 +1174,19 @@ define <2 x double> @shuffle_v2f64(<2 x double> %x, <2 x double> %y) {
   ret <2 x double> %res
 }
 
+; CHECK-LABEL: shuffle_undef_v2f64:
+; NO-SIMD128-NOT: v8x16
+; SIMD128-NEXT: .param v128, v128{{$}}
+; SIMD128-NEXT: .result v128{{$}}
+; SIMD128-NEXT: v8x16.shuffle $push[[R:[0-9]+]]=, $0, $0,
+; SIMD128-SAME: 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0{{$}}
+; SIMD128-NEXT: return $pop[[R]]{{$}}
+define <2 x double> @shuffle_undef_v2f64(<2 x double> %x, <2 x double> %y) {
+  %res = shufflevector <2 x double> %x, <2 x double> %y,
+    <2 x i32> <i32 1, i32 undef>
+  ret <2 x double> %res
+}
+
 ; CHECK-LABEL: build_v2f64:
 ; NO-SIMD128-NOT: f64x2
 ; SIMD128-VM-NOT: f64x2
diff --git a/test/CodeGen/WebAssembly/stack-insts.ll b/test/CodeGen/WebAssembly/stack-insts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0876b4a42797d6c6fcb38ff0ee8e118e3f0e824d
--- /dev/null
+++ b/test/CodeGen/WebAssembly/stack-insts.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare void @foo0()
+declare void @foo1()
+
+; Tests if br_table is printed correctly with a tab.
+; CHECK-LABEL: test0:
+; CHECK-NOT: br_table0, 1, 0, 1, 0
+; CHECK: br_table 0, 1, 0, 1, 0
+define void @test0(i32 %n) {
+entry:
+  switch i32 %n, label %sw.epilog [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb.1
+    i32 2, label %sw.bb
+    i32 3, label %sw.bb.1
+  ]
+
+sw.bb:                                            ; preds = %entry, %entry
+  tail call void @foo0()
+  br label %sw.epilog
+
+sw.bb.1:                                          ; preds = %entry, %entry
+  tail call void @foo1()
+  br label %sw.epilog
+
+sw.epilog:                                        ; preds = %entry, %sw.bb, %sw.bb.1
+  ret void
+}
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index 4a6b48b977f8ae465dc5587eea5fcd7f5e648d44..aa4acae5e0791eae896c631ac7871aea7c4bff3e 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -330,4 +330,6 @@ define void @inline_asm() {
   ret void
 }
 
+; CHECK: .globaltype	__stack_pointer, i32{{$}}
+
 ; TODO: test over-aligned alloca
diff --git a/test/CodeGen/WebAssembly/wasmehprepare.ll b/test/CodeGen/WebAssembly/wasmehprepare.ll
index e6005e34057ce2133aea9526f29c2fbc319cf7de..67e198eb05882e4f3b70b7140f9fa21c089a0ab0 100644
--- a/test/CodeGen/WebAssembly/wasmehprepare.ll
+++ b/test/CodeGen/WebAssembly/wasmehprepare.ll
@@ -30,7 +30,7 @@ catch.start:                                      ; preds = %catch.dispatch
 ; CHECK: catch.start:
 ; CHECK-NEXT:   %[[CATCHPAD:.*]] = catchpad
 ; CHECK-NEXT:   %[[EXN:.*]] = call i8* @llvm.wasm.catch(i32 0)
-; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(i32 0)
+; CHECK-NEXT:   call void @llvm.wasm.landingpad.index(token %[[CATCHPAD]], i32 0)
 ; CHECK-NEXT:   store i32 0, i32* getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 0)
 ; CHECK-NEXT:   %[[LSDA:.*]] = call i8* @llvm.wasm.lsda()
 ; CHECK-NEXT:   store i8* %[[LSDA]], i8** getelementptr inbounds ({ i32, i8*, i32 }, { i32, i8*, i32 }* @__wasm_lpad_context, i32 0, i32 1)
@@ -98,7 +98,7 @@ catch.start3:                                     ; preds = %catch.dispatch2
   %matches = icmp eq i32 %8, %9
   br i1 %matches, label %catch4, label %rethrow
 ; CHECK: catch.start3:
-; CHECK:   call void @llvm.wasm.landingpad.index(i32 0)
+; CHECK:   call void @llvm.wasm.landingpad.index(token %{{.+}}, i32 0)
 
 catch4:                                           ; preds = %catch.start3
   %10 = call i8* @__cxa_begin_catch(i8* %7) [ "funclet"(token %6) ]
@@ -311,7 +311,7 @@ declare void @__cxa_rethrow()
 declare void @__clang_call_terminate(i8*)
 
 ; CHECK-DAG: declare i8* @llvm.wasm.catch(i32)
-; CHECK-DAG: declare void @llvm.wasm.landingpad.index(i32)
+; CHECK-DAG: declare void @llvm.wasm.landingpad.index(token, i32)
 ; CHECK-DAG: declare i8* @llvm.wasm.lsda()
 ; CHECK-DAG: declare i32 @_Unwind_CallPersonality(i8*)
 
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 945f47337ba2a096f0e1bd6d0f1f5579526c42d2..0cce34fb7bd65fa1984b6d2a1c0e441802dd12fd 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; REQUIRES: asserts
-; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm"
+; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "6 machinelicm"
 ; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
@@ -24,15 +24,17 @@ define %struct.__vv* @t(%struct.Key* %desc, i64 %p) nounwind ssp {
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_3: ## %bb.i
 ; CHECK-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    pinsrd $1, 4, %xmm0
-; CHECK-NEXT:    pinsrd $2, 8, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; CHECK-NEXT:    psrld $16, %xmm0
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; CHECK-NEXT:    addps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    addps %xmm1, %xmm0
+; CHECK-NEXT:    movl 0, %eax
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    cvtsi2ssq %rax, %xmm0
+; CHECK-NEXT:    movl 4, %eax
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    cvtsi2ssq %rax, %xmm1
+; CHECK-NEXT:    movl 8, %eax
+; CHECK-NEXT:    xorps %xmm2, %xmm2
+; CHECK-NEXT:    cvtsi2ssq %rax, %xmm2
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
 ; CHECK-NEXT:    movaps %xmm0, 0
 ; CHECK-NEXT:  LBB0_1: ## %bb4
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index d1d05a1900157e8bda7f0469b2af6e0089e02f8d..be8563a9f2fe2b24e937818bae70b75b34275cdc 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,9 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-- -mcpu=core2 | FileCheck %s
 
 define <4 x i16> @a(i32* %x1) nounwind {
 ; CHECK-LABEL: a:
-; CHECK:         shrl %[[R:[^,]+]]
-; CHECK-NEXT:    movd %[[R]], %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
@@ -15,9 +19,12 @@ define <4 x i16> @a(i32* %x1) nounwind {
 
 define <8 x i16> @b(i32* %x1) nounwind {
 ; CHECK-LABEL: b:
-; CHECK:         shrl %e[[R:.]]x
-; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
-; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
@@ -29,9 +36,12 @@ define <8 x i16> @b(i32* %x1) nounwind {
 
 define <8 x i8> @c(i32* %x1) nounwind {
 ; CHECK-LABEL: c:
-; CHECK:         shrl %e[[R:.]]x
-; CHECK-NEXT:    movzwl %[[R]]x, %e[[R]]x
-; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
@@ -43,9 +53,12 @@ define <8 x i8> @c(i32* %x1) nounwind {
 
 define <16 x i8> @d(i32* %x1) nounwind {
 ; CHECK-LABEL: d:
-; CHECK:         shrl %e[[R:.]]x
-; CHECK-NEXT:    movzbl %[[R]]l, %e[[R]]x
-; CHECK-NEXT:    movd %e[[R]]x, %xmm0
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    shrl %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    movd %eax, %xmm0
 ; CHECK-NEXT:    retl
 
   %x2 = load i32, i32* %x1
diff --git a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index 2ae3d389d055bde8b7f421a10fccf6bbbb49a647..d8a6823f7b8f25700d2a1f74c54ed04af6d27892 100644
--- a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=corei7 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i8:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -13,15 +14,29 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Also make sure that we sign-extend it.
 ; Based on /gcc-4_2-testsuite/src/gcc.c-torture/execute/pr23135.c
 
-; CHECK: main
 define i32 @main() nounwind uwtable {
+; CHECK-LABEL: main:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pmovsxbq {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pmovsxbq {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pextrq $1, %xmm1, %rax
+; CHECK-NEXT:    pextrq $1, %xmm0, %rcx
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rcx
+; CHECK-NEXT:    movq %rax, %xmm2
+; CHECK-NEXT:    movq %xmm1, %rax
+; CHECK-NEXT:    movq %xmm0, %rcx
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    idivq %rcx
+; CHECK-NEXT:    movq %rax, %xmm0
+; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; CHECK-NEXT:    pextrw $0, %xmm0, {{.*}}(%rip)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
 entry:
-; CHECK: pmovsxbq  i(%rip), %
-; CHECK: pmovsxbq  j(%rip), %
   %0 = load <2 x i8>, <2 x i8>* @i, align 8
   %1 = load <2 x i8>, <2 x i8>* @j, align 8
   %div = sdiv <2 x i8> %1, %0
   store <2 x i8> %div, <2 x i8>* getelementptr inbounds (%union.anon, %union.anon* @res, i32 0, i32 0), align 8
   ret i32 0
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index a84a85e2ecde55875f17ddb1108dd731a9a61baf..44d9569bc57ff55705e08fe6383aa5bbb3fe2a86 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -8,8 +8,7 @@
 define void @simple_widen(<2 x float> %a, <2 x float> %b) {
 ; X32-LABEL: simple_widen:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    extractps $1, %xmm1, (%eax)
-; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: simple_widen:
@@ -28,8 +27,7 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
 ; X32-NEXT:    movaps %xmm0, %xmm2
 ; X32-NEXT:    cmpordps %xmm0, %xmm0
 ; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; X32-NEXT:    extractps $1, %xmm1, (%eax)
-; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: complex_inreg_work:
@@ -50,8 +48,7 @@ define void @zero_test() {
 ; X32-LABEL: zero_test:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    extractps $1, %xmm0, (%eax)
-; X32-NEXT:    movss %xmm0, (%eax)
+; X32-NEXT:    movlps %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: zero_test:
@@ -75,18 +72,15 @@ define void @full_test() {
 ; X32-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; X32-NEXT:    xorps %xmm0, %xmm0
 ; X32-NEXT:    cmpltps %xmm2, %xmm0
-; X32-NEXT:    movaps {{.*#+}} xmm3 = <1,1,u,u>
+; X32-NEXT:    movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u>
 ; X32-NEXT:    addps %xmm1, %xmm3
 ; X32-NEXT:    movaps %xmm1, %xmm4
 ; X32-NEXT:    blendvps %xmm0, %xmm3, %xmm4
 ; X32-NEXT:    cmpeqps %xmm2, %xmm1
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm4
-; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT:    movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; X32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movlps %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movlps %xmm4, {{[0-9]+}}(%esp)
 ; X32-NEXT:    addl $60, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
@@ -98,7 +92,7 @@ define void @full_test() {
 ; X64-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    cmpltps %xmm2, %xmm0
-; X64-NEXT:    movaps {{.*#+}} xmm3 = <1,1,u,u>
+; X64-NEXT:    movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u>
 ; X64-NEXT:    addps %xmm1, %xmm3
 ; X64-NEXT:    movaps %xmm1, %xmm4
 ; X64-NEXT:    blendvps %xmm0, %xmm3, %xmm4
diff --git a/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
index 1b80dc9b1d2e0d1518cdf49cf6d7ac27c2bbd91c..ad3a17071d5189c89f83987d92dd30b6d350e6e4 100644
--- a/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
+++ b/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
 define void @ui_to_fp_conv(<8 x float> * nocapture %aFOO, <8 x float>* nocapture %RET) nounwind {
 ; CHECK-LABEL: ui_to_fp_conv:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT:    xorps %xmm1, %xmm1
 ; CHECK-NEXT:    movups %xmm1, 16(%rsi)
 ; CHECK-NEXT:    movups %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 61c25021bbaa95bd11bbd39c5fd4747fe855d0d6..ab57b61770d0dbad5b6d523457fb0da324a18fe7 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -4,17 +4,10 @@
 define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: vcast:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $16, %rsp
-; CHECK-NEXT:    .seh_stackalloc 16
-; CHECK-NEXT:    .seh_endprologue
-; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    addq $16, %rsp
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    .seh_handlerdata
-; CHECK-NEXT:    .text
-; CHECK-NEXT:    .seh_endproc
   %af = bitcast <2 x float> %a to <2 x i32>
   %bf = bitcast <2 x float> %b to <2 x i32>
   %x = sub <2 x i32> %af, %bf
diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
index c4b307e5a5d32fc75433d88238f1df9a915a77c1..95d78e47479adcef282701a302f4b9b946c35e2e 100644
--- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
+++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
@@ -1,10 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; CHECK: build_vector_again
 define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
+; CHECK-LABEL: build_vector_again:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT:    retq
 entry:
   %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: pmovzxbd
   ret <4 x i8> %out
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 6e5f48e4e0dd66ef34f31468d170ec3aeb4865f1..afa8bf44c206e45660fab1614313f9db6d57361e 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -7,23 +7,19 @@
 define void @func(<4 x float> %a, <16 x i8> %b, <16 x i8> %c, <8 x float> %d, <8 x float> %e, <8 x float>* %f) nounwind ssp {
 ; CHECK-LABEL: func:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqu 0, %xmm3
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm3[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
-; CHECK-NEXT:    vmovdqu 32, %xmm3
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm2 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
-; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT:    vmulps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vmulps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vmovdqu 0, %xmm0
+; CHECK-NEXT:    vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vmulps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmulps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vaddps %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT:    vhaddps %ymm4, %ymm0, %ymm0
 ; CHECK-NEXT:    vsubps %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
 ; CHECK-NEXT:    vmovaps %ymm0, (%rdi)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
index bfe025eaa9145801f6c44401a96981aed8119b49..27778be3b71b7bef62d8d5825fa988ae90a71cc7 100644
--- a/test/CodeGen/X86/4char-promote.ll
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -1,14 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; A test for checking PR 9623
 ; RUN: llc -mcpu=corei7 < %s | FileCheck %s
 
 target triple = "x86_64-apple-darwin"
 
-; CHECK:  pmulld
-; CHECK:  paddd
-; CHECK-NOT:  movdqa
-; CHECK:  ret
-
 define <4 x i8> @foo(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: foo:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pmulld %xmm0, %xmm1
+; CHECK-NEXT:    paddd %xmm1, %xmm0
+; CHECK-NEXT:    retq
 entry:
  %binop = mul <4 x i8> %x, %y
  %binop6 = add <4 x i8> %binop, %x
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
index cf9b80390968ab1348df527b25eadfa2ea55586a..71f1facfb81f291c3175d97396b759d7b867d119 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
@@ -288,12 +288,12 @@ body:             |
     liveins: $edi
 
     ; X32-LABEL: name: test_sext_i1toi8
-    ; X32: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X32: $al = COPY [[DEF]](s8)
+    ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X32: $al = COPY [[C]](s8)
     ; X32: RET 0, implicit $al
     ; X64-LABEL: name: test_sext_i1toi8
-    ; X64: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X64: $al = COPY [[DEF]](s8)
+    ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X64: $al = COPY [[C]](s8)
     ; X64: RET 0, implicit $al
     %0(s1) = G_IMPLICIT_DEF
     %1(s8) = G_SEXT %0(s1)
@@ -314,12 +314,12 @@ body:             |
     liveins: $edi
 
     ; X32-LABEL: name: test_sext_i1toi16
-    ; X32: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X32: $ax = COPY [[DEF]](s16)
+    ; X32: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; X32: $ax = COPY [[C]](s16)
     ; X32: RET 0, implicit $ax
     ; X64-LABEL: name: test_sext_i1toi16
-    ; X64: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X64: $ax = COPY [[DEF]](s16)
+    ; X64: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 0
+    ; X64: $ax = COPY [[C]](s16)
     ; X64: RET 0, implicit $ax
     %0(s1) = G_IMPLICIT_DEF
     %1(s16) = G_SEXT %0(s1)
@@ -341,12 +341,12 @@ body:             |
     liveins: $edi
 
     ; X32-LABEL: name: test_sext_i1
-    ; X32: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: $eax = COPY [[DEF]](s32)
+    ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; X32: $eax = COPY [[C]](s32)
     ; X32: RET 0, implicit $eax
     ; X64-LABEL: name: test_sext_i1
-    ; X64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X64: $eax = COPY [[DEF]](s32)
+    ; X64: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; X64: $eax = COPY [[C]](s32)
     ; X64: RET 0, implicit $eax
     %0(s1) = G_IMPLICIT_DEF
     %2(s32) = G_SEXT %0(s1)
diff --git a/test/CodeGen/X86/GlobalISel/legalize-undef.mir b/test/CodeGen/X86/GlobalISel/legalize-undef.mir
index 997064b366dde5c8f52d529bbddd9c2f4a96f891..4a865e4e5821ca9c87f869f3ff3ba2274eae9088 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-undef.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-undef.mir
@@ -11,32 +11,32 @@ body: |
     liveins:
     ; X64-LABEL: name: test_implicit_def
     ; X64: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+    ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X64: G_STORE [[C]](s8), [[DEF]](p0) :: (store 1)
     ; X64: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
     ; X64: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1)
-    ; X64: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 1)
-    ; X64: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 2)
-    ; X64: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4)
-    ; X64: [[DEF5:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; X64: G_STORE [[DEF5]](s64), [[DEF]](p0) :: (store 8)
+    ; X64: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; X64: G_STORE [[DEF2]](s16), [[DEF]](p0) :: (store 2)
+    ; X64: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; X64: G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store 4)
+    ; X64: [[DEF4:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; X64: G_STORE [[DEF4]](s64), [[DEF]](p0) :: (store 8)
     ; X32-LABEL: name: test_implicit_def
     ; X32: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+    ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+    ; X32: G_STORE [[C]](s8), [[DEF]](p0) :: (store 1)
     ; X32: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
     ; X32: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1)
-    ; X32: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 1)
-    ; X32: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 2)
+    ; X32: [[DEF2:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; X32: G_STORE [[DEF2]](s16), [[DEF]](p0) :: (store 2)
+    ; X32: [[DEF3:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; X32: G_STORE [[DEF3]](s32), [[DEF]](p0) :: (store 4)
     ; X32: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4)
     ; X32: [[DEF5:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: [[DEF6:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; X32: G_STORE [[DEF5]](s32), [[DEF]](p0) :: (store 4, align 8)
-    ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s32)
-    ; X32: G_STORE [[DEF6]](s32), [[GEP]](p0) :: (store 4)
+    ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 4, align 8)
+    ; X32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C1]](s32)
+    ; X32: G_STORE [[DEF5]](s32), [[GEP]](p0) :: (store 4)
     %5:_(p0) = G_IMPLICIT_DEF
     %0:_(s1) = G_IMPLICIT_DEF
     G_STORE %0, %5 ::(store 1)
diff --git a/test/CodeGen/X86/MachineSink-eflags.ll b/test/CodeGen/X86/MachineSink-eflags.ll
index 4e52c8c5f7d0cca6e56e59bb65e33e6d96aa572a..6302b3be6717820800d16d99fa3058e240ac9fac 100644
--- a/test/CodeGen/X86/MachineSink-eflags.ll
+++ b/test/CodeGen/X86/MachineSink-eflags.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-pc-linux"
@@ -11,6 +12,36 @@ target triple = "x86_64-pc-linux"
 %5 = type <{ void (i32)*, i8*, i32 (i8*, ...)* }>
 
 define void @foo(i8* nocapture %_stubArgs) nounwind {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq 48(%rdi), %rax
+; CHECK-NEXT:    movl 64(%rdi), %edx
+; CHECK-NEXT:    movl $200, %esi
+; CHECK-NEXT:    addl 68(%rdi), %esi
+; CHECK-NEXT:    imull $46, %edx, %ecx
+; CHECK-NEXT:    addq %rsi, %rcx
+; CHECK-NEXT:    shlq $4, %rcx
+; CHECK-NEXT:    imull $47, %edx, %edx
+; CHECK-NEXT:    addq %rsi, %rdx
+; CHECK-NEXT:    shlq $4, %rdx
+; CHECK-NEXT:    movaps (%rax,%rdx), %xmm0
+; CHECK-NEXT:    cmpl $0, (%rdi)
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    movaps (%rax,%rcx), %xmm1
+; CHECK-NEXT:  .LBB0_3: # %entry
+; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rsp
+; CHECK-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %entry
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:  .LBB0_5: # %entry
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    addq $152, %rsp
+; CHECK-NEXT:    retq
 entry:
  %i0 = alloca i8*, align 8
  %i2 = alloca i8*, align 8
@@ -60,8 +91,6 @@ entry:
  %cmp432.i = icmp ult i32 %tmp156.i, %tmp1
 
 ; %shl.i should not be sinked below the compare.
-; CHECK: cmpl
-; CHECK-NOT: shlq
 
  %cond.i = select i1 %cmp432.i, <2 x double> %tmp162.i, <2 x double> zeroinitializer
  store <2 x double> %cond.i, <2 x double>* %ptr4438.i, align 16
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 05c0f358e7befb012615ef4fcceedbe41af4b4d5..d9a093b8c59d51523a54bb497bb49123dfc89bba 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -1,4 +1,7 @@
-; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
+; pass. Ignore it with 'grep -v'.
+; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 \
+; RUN:   | grep -v 'Verify generated machine code' | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/test/CodeGen/X86/O3-pipeline.ll b/test/CodeGen/X86/O3-pipeline.ll
index 93e184c43713001b524f019bf956e00e6facc6e0..9828d1eeab104fd2ff6f237c6701ad45aacb2711 100644
--- a/test/CodeGen/X86/O3-pipeline.ll
+++ b/test/CodeGen/X86/O3-pipeline.ll
@@ -1,4 +1,7 @@
-; RUN: llc -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+; When EXPENSIVE_CHECKS are enabled, the machine verifier appears between each
+; pass. Ignore it with 'grep -v'.
+; RUN: llc -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 \
+; RUN:   | grep -v 'Verify generated machine code' | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/test/CodeGen/X86/PR37310.mir b/test/CodeGen/X86/PR37310.mir
index a3e17b55c4a6ae89aa68c7dbf6ba9a6911d8dfe9..6f09a8987eb076eaf63b9fc1b0d20e215530b48d 100644
--- a/test/CodeGen/X86/PR37310.mir
+++ b/test/CodeGen/X86/PR37310.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -start-before dwarfehprepare -no-stack-coloring=false -stop-after stack-coloring -o - %s
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -no-stack-coloring=false -run-pass stack-coloring -o - %s
 
 # Test to insure that the liveness analysis in the StackColoring
 # pass gracefully handles statically unreachable blocks. See PR 37310.
@@ -79,7 +79,7 @@ registers:
   - { id: 4, class: gr64, preferred-register: '' }
   - { id: 5, class: gr32, preferred-register: '' }
 liveins:         
-  - { reg: '$edi', virtual-reg: '%0' }
+  - { reg: '$edi' }
 frameInfo:       
   isFrameAddressTaken: false
   isReturnAddressTaken: false
diff --git a/test/CodeGen/X86/aes-schedule.ll b/test/CodeGen/X86/aes-schedule.ll
index 344b2aa6a420e47bf4f4dd7903299b08b9928858..c622899ca09168b6e95bdb34646035fcbee38045 100644
--- a/test/CodeGen/X86/aes-schedule.ll
+++ b/test/CodeGen/X86/aes-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+aes,-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=+aes,-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -92,6 +94,18 @@ define <2 x i64> @test_aesdec(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesdec:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesdec %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesdec (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_aesdec:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesdec (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesdec:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesdec %xmm1, %xmm0 # sched: [3:1.00]
@@ -195,6 +209,18 @@ define <2 x i64> @test_aesdeclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesdeclast:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesdeclast %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesdeclast (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_aesdeclast:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesdeclast:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesdeclast %xmm1, %xmm0 # sched: [3:1.00]
@@ -298,6 +324,18 @@ define <2 x i64> @test_aesenc(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesenc:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesenc %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesenc (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_aesenc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesenc (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesenc:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesenc %xmm1, %xmm0 # sched: [3:1.00]
@@ -401,6 +439,18 @@ define <2 x i64> @test_aesenclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesenclast:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesenclast %xmm1, %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    aesenclast (%rdi), %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_aesenclast:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaesenclast (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesenclast:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesenclast %xmm1, %xmm0 # sched: [3:1.00]
@@ -517,6 +567,20 @@ define <2 x i64> @test_aesimc(<2 x i64> %a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aesimc:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aesimc %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    aesimc (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_aesimc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaesimc (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaesimc %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aesimc:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aesimc %xmm0, %xmm1 # sched: [2:1.00]
@@ -637,6 +701,20 @@ define <2 x i64> @test_aeskeygenassist(<2 x i64> %a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_aeskeygenassist:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    aeskeygenassist $7, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    aeskeygenassist $7, (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_aeskeygenassist:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaeskeygenassist $7, (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_aeskeygenassist:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    aeskeygenassist $7, %xmm0, %xmm1 # sched: [2:1.00]
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index 7e2bff4116c94846f251c2a31c9eab2bbb8d1ecd..519b169c0f9e424fff90253bc8bddcbc579b864e 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -61,22 +61,22 @@ define void @atomic_fetch_and32() nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock andl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    andl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB2_2
 ; X64-NEXT:    jmp .LBB2_1
 ; X64-NEXT:  .LBB2_2: # %atomicrmw.end
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    lock andl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -85,18 +85,18 @@ define void @atomic_fetch_and32() nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    lock andl $3, sc32
 ; X86-NEXT:    movl sc32, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB2_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    andl $5, %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-NEXT:    jne .LBB2_2
 ; X86-NEXT:    jmp .LBB2_1
 ; X86-NEXT:  .LBB2_2: # %atomicrmw.end
@@ -115,22 +115,22 @@ define void @atomic_fetch_or32() nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock orl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    orl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB3_2
 ; X64-NEXT:    jmp .LBB3_1
 ; X64-NEXT:  .LBB3_2: # %atomicrmw.end
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    lock orl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -139,18 +139,18 @@ define void @atomic_fetch_or32() nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    lock orl $3, sc32
 ; X86-NEXT:    movl sc32, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB3_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    orl $5, %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-NEXT:    jne .LBB3_2
 ; X86-NEXT:    jmp .LBB3_1
 ; X86-NEXT:  .LBB3_2: # %atomicrmw.end
@@ -169,22 +169,22 @@ define void @atomic_fetch_xor32() nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    lock xorl $3, {{.*}}(%rip)
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
 ; X64-NEXT:    xorl $5, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %dl
 ; X64-NEXT:    testb $1, %dl
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB4_2
 ; X64-NEXT:    jmp .LBB4_1
 ; X64-NEXT:  .LBB4_2: # %atomicrmw.end
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    lock xorl %eax, {{.*}}(%rip)
 ; X64-NEXT:    retq
 ;
@@ -193,18 +193,18 @@ define void @atomic_fetch_xor32() nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    lock xorl $3, sc32
 ; X86-NEXT:    movl sc32, %eax
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB4_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorl $5, %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
 ; X86-NEXT:    sete %dl
 ; X86-NEXT:    testb $1, %dl
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-NEXT:    jne .LBB4_2
 ; X86-NEXT:    jmp .LBB4_1
 ; X86-NEXT:  .LBB4_2: # %atomicrmw.end
@@ -222,19 +222,19 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_nand32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    andl %edx, %ecx
 ; X64-NEXT:    notl %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB5_2
 ; X64-NEXT:    jmp .LBB5_1
 ; X64-NEXT:  .LBB5_2: # %atomicrmw.end
@@ -246,13 +246,13 @@ define void @atomic_fetch_nand32(i32 %x) nounwind {
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl sc32, %ecx
-; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:  .LBB5_1: # %atomicrmw.start
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    andl %edx, %ecx
 ; X86-NEXT:    notl %ecx
 ; X86-NEXT:    lock cmpxchgl %ecx, sc32
@@ -273,20 +273,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_max32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmovgel %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB6_2
 ; X64-NEXT:    jmp .LBB6_1
 ; X64-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -298,20 +298,20 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmovgel %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB6_2
 ; X86-CMOV-NEXT:    jmp .LBB6_1
 ; X86-CMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -326,34 +326,34 @@ define void @atomic_fetch_max32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB6_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jge .LBB6_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB6_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB6_2
 ; X86-NOCMOV-NEXT:    jmp .LBB6_1
 ; X86-NOCMOV-NEXT:  .LBB6_2: # %atomicrmw.end
@@ -369,20 +369,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_min32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmovlel %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB7_2
 ; X64-NEXT:    jmp .LBB7_1
 ; X64-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -394,20 +394,20 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmovlel %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB7_2
 ; X86-CMOV-NEXT:    jmp .LBB7_1
 ; X86-CMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -422,34 +422,34 @@ define void @atomic_fetch_min32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB7_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jle .LBB7_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB7_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB7_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB7_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB7_2
 ; X86-NOCMOV-NEXT:    jmp .LBB7_1
 ; X86-NOCMOV-NEXT:  .LBB7_2: # %atomicrmw.end
@@ -465,20 +465,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_umax32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmoval %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB8_2
 ; X64-NEXT:    jmp .LBB8_1
 ; X64-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -490,20 +490,20 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmoval %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB8_2
 ; X86-CMOV-NEXT:    jmp .LBB8_1
 ; X86-CMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -518,34 +518,34 @@ define void @atomic_fetch_umax32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB8_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    ja .LBB8_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB8_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB8_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB8_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB8_2
 ; X86-NOCMOV-NEXT:    jmp .LBB8_1
 ; X86-NOCMOV-NEXT:  .LBB8_2: # %atomicrmw.end
@@ -561,20 +561,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_umin32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl sc32, %eax
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %edx # 4-byte Reload
+; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
 ; X64-NEXT:    subl %edx, %ecx
 ; X64-NEXT:    cmovbel %eax, %edx
 ; X64-NEXT:    lock cmpxchgl %edx, {{.*}}(%rip)
 ; X64-NEXT:    sete %sil
 ; X64-NEXT:    testb $1, %sil
-; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    jne .LBB9_2
 ; X64-NEXT:    jmp .LBB9_1
 ; X64-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -586,20 +586,20 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-CMOV-NEXT:    subl $12, %esp
 ; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-CMOV-NEXT:    movl sc32, %ecx
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-CMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X86-CMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-CMOV-NEXT:    movl %eax, %ecx
-; X86-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-CMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-CMOV-NEXT:    subl %edx, %ecx
 ; X86-CMOV-NEXT:    cmovbel %eax, %edx
 ; X86-CMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-CMOV-NEXT:    sete %bl
 ; X86-CMOV-NEXT:    testb $1, %bl
+; X86-CMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-CMOV-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-CMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
 ; X86-CMOV-NEXT:    jne .LBB9_2
 ; X86-CMOV-NEXT:    jmp .LBB9_1
 ; X86-CMOV-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -614,34 +614,34 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
 ; X86-NOCMOV-NEXT:    subl $24, %esp
 ; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOCMOV-NEXT:    movl sc32, %ecx
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB9_1: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, %ecx
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    subl %edx, %ecx
 ; X86-NOCMOV-NEXT:    movl %eax, %esi
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-NOCMOV-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jbe .LBB9_4
 ; X86-NOCMOV-NEXT:  # %bb.3: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB9_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:  .LBB9_4: # %atomicrmw.start
 ; X86-NOCMOV-NEXT:    # in Loop: Header=BB9_1 Depth=1
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X86-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NOCMOV-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    movl %ecx, %eax
 ; X86-NOCMOV-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NOCMOV-NEXT:    lock cmpxchgl %edx, sc32
 ; X86-NOCMOV-NEXT:    sete %bl
 ; X86-NOCMOV-NEXT:    testb $1, %bl
-; X86-NOCMOV-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NOCMOV-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NOCMOV-NEXT:    jne .LBB9_2
 ; X86-NOCMOV-NEXT:    jmp .LBB9_1
 ; X86-NOCMOV-NEXT:  .LBB9_2: # %atomicrmw.end
@@ -659,7 +659,7 @@ define void @atomic_fetch_cmpxchg32() nounwind {
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    movl $1, %ecx
 ; X64-NEXT:    lock cmpxchgl %ecx, {{.*}}(%rip)
-; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: atomic_fetch_cmpxchg32:
@@ -694,7 +694,7 @@ define void @atomic_fetch_swap32(i32 %x) nounwind {
 ; X64-LABEL: atomic_fetch_swap32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    xchgl %edi, {{.*}}(%rip)
-; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: atomic_fetch_swap32:
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index e8a03fe6a7b79f5c6e8a4d4533fe690891eb82c1..f090585951bfa90ce4cf4e131fb323a7ea4fd602 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -1256,7 +1256,8 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX1-NEXT:    # xmm2 = mem[0,0]
 ; AVX1-NEXT:    vpavgb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1310,7 +1311,8 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
 ; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpavgb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -2140,243 +2142,231 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX1-NEXT:    pushq %r12
 ; AVX1-NEXT:    pushq %rbx
 ; AVX1-NEXT:    subq $24, %rsp
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm6, %rdi
+; AVX1-NEXT:    vmovq %xmm6, %rbp
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
 ; AVX1-NEXT:    vpextrq $1, %xmm5, %rbx
-; AVX1-NEXT:    vmovq %xmm5, %rbp
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rsi
-; AVX1-NEXT:    vmovq %xmm4, %rcx
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r8
-; AVX1-NEXT:    vmovq %xmm4, %r11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %r13
-; AVX1-NEXT:    vmovq %xmm3, %r12
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
-; AVX1-NEXT:    vmovq %xmm4, %rdi
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    vmovq %xmm3, %r10
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX1-NEXT:    addq %rbx, %rdx
-; AVX1-NEXT:    vmovq %xmm4, %r9
-; AVX1-NEXT:    addq %rbp, %r9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    addq %rsi, %rax
-; AVX1-NEXT:    movq %rax, %r14
-; AVX1-NEXT:    vmovq %xmm3, %rbp
-; AVX1-NEXT:    addq %rcx, %rbp
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vmovq %xmm5, %rsi
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rdx
+; AVX1-NEXT:    vmovq %xmm5, %rcx
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm6, %r13
+; AVX1-NEXT:    vmovq %xmm6, %r12
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r11
+; AVX1-NEXT:    vmovq %xmm5, %r14
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rsi
-; AVX1-NEXT:    addq %r8, %rsi
-; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    addq %r11, %rax
-; AVX1-NEXT:    movq %rax, %r11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %r9
+; AVX1-NEXT:    vmovq %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    vmovq %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    addq %r13, %rax
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    addq %r12, %rax
-; AVX1-NEXT:    movq %rax, %r8
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    addq %r15, %rax
-; AVX1-NEXT:    movq %rax, %rbx
-; AVX1-NEXT:    vmovq %xmm3, %rax
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rax
 ; AVX1-NEXT:    addq %rdi, %rax
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    vmovq %xmm5, %rax
+; AVX1-NEXT:    addq %rbp, %rax
+; AVX1-NEXT:    movq %rax, %rbp
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, %r15
+; AVX1-NEXT:    addq %rbx, %r15
+; AVX1-NEXT:    vmovq %xmm4, %r10
+; AVX1-NEXT:    addq %rsi, %r10
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX1-NEXT:    addq %rdx, %rax
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    vmovq %xmm4, %r8
+; AVX1-NEXT:    addq %rcx, %r8
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm5, %rcx
+; AVX1-NEXT:    addq %r13, %rcx
+; AVX1-NEXT:    vmovq %xmm5, %rax
+; AVX1-NEXT:    addq %r12, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm4, %rax
+; AVX1-NEXT:    addq %r11, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX1-NEXT:    vmovq %xmm4, %rax
+; AVX1-NEXT:    addq %r14, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX1-NEXT:    addq %r9, %rax
+; AVX1-NEXT:    movq %rax, %r13
+; AVX1-NEXT:    vmovq %xmm1, %rbx
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    vmovq %xmm2, %r12
-; AVX1-NEXT:    addq %r10, %r12
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpextrq $1, %xmm0, %r10
-; AVX1-NEXT:    addq %rax, %r10
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vmovq %xmm0, %rdi
-; AVX1-NEXT:    addq %rax, %rdi
-; AVX1-NEXT:    addq $-1, %rdx
-; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %eax
-; AVX1-NEXT:    adcq $-1, %rax
+; AVX1-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r9
-; AVX1-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX1-NEXT:    addq %rax, %rsi
+; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    vmovq %xmm2, %rax
+; AVX1-NEXT:    vmovq %xmm0, %rsi
+; AVX1-NEXT:    addq %rax, %rsi
+; AVX1-NEXT:    addq $-1, %rdi
+; AVX1-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r14
-; AVX1-NEXT:    movq %r14, (%rsp) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rbp
+; AVX1-NEXT:    movq %rbp, (%rsp) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rbp
-; AVX1-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r15
+; AVX1-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %r10
+; AVX1-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %r11
-; AVX1-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    addq $-1, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
 ; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    addq $-1, %rcx
-; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %ebp
-; AVX1-NEXT:    adcq $-1, %rbp
 ; AVX1-NEXT:    addq $-1, %r8
 ; AVX1-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movl $0, %r15d
-; AVX1-NEXT:    adcq $-1, %r15
-; AVX1-NEXT:    addq $-1, %rbx
-; AVX1-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movl $0, %r12d
+; AVX1-NEXT:    adcq $-1, %r12
+; AVX1-NEXT:    addq $-1, %rcx
+; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX1-NEXT:    movl $0, %eax
 ; AVX1-NEXT:    adcq $-1, %rax
-; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    addq $-1, %rax
+; AVX1-NEXT:    movl $0, %ecx
+; AVX1-NEXT:    adcq $-1, %rcx
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX1-NEXT:    movl $0, %r13d
-; AVX1-NEXT:    adcq $-1, %r13
+; AVX1-NEXT:    movl $0, %edx
+; AVX1-NEXT:    adcq $-1, %rdx
 ; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %r15d
+; AVX1-NEXT:    adcq $-1, %r15
+; AVX1-NEXT:    addq $-1, %r13
 ; AVX1-NEXT:    movl $0, %r14d
 ; AVX1-NEXT:    adcq $-1, %r14
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rdx
+; AVX1-NEXT:    addq $-1, %rbx
 ; AVX1-NEXT:    movl $0, %r11d
 ; AVX1-NEXT:    adcq $-1, %r11
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    addq $-1, %rax
-; AVX1-NEXT:    movl $0, %ebx
-; AVX1-NEXT:    adcq $-1, %rbx
-; AVX1-NEXT:    addq $-1, %r12
-; AVX1-NEXT:    movl $0, %r9d
-; AVX1-NEXT:    adcq $-1, %r9
-; AVX1-NEXT:    addq $-1, %r10
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movl $0, %r8d
 ; AVX1-NEXT:    adcq $-1, %r8
-; AVX1-NEXT:    addq $-1, %rdi
-; AVX1-NEXT:    movl $0, %ecx
-; AVX1-NEXT:    adcq $-1, %rcx
-; AVX1-NEXT:    shldq $63, %rdi, %rcx
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %edi
+; AVX1-NEXT:    adcq $-1, %rdi
+; AVX1-NEXT:    addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX1-NEXT:    movl $0, %r10d
+; AVX1-NEXT:    adcq $-1, %r10
+; AVX1-NEXT:    movq %rsi, %rbp
+; AVX1-NEXT:    addq $-1, %rbp
+; AVX1-NEXT:    movl $0, %r9d
+; AVX1-NEXT:    adcq $-1, %r9
+; AVX1-NEXT:    shldq $63, %rbx, %r11
+; AVX1-NEXT:    shldq $63, %r13, %r14
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbx, %r15
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbx, %rdx
+; AVX1-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX1-NEXT:    shldq $63, %rax, %rcx
 ; AVX1-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    shldq $63, %r10, %r8
-; AVX1-NEXT:    shldq $63, %r12, %r9
-; AVX1-NEXT:    shldq $63, %rax, %rbx
-; AVX1-NEXT:    shldq $63, %rdx, %r11
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %r14
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %r13
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rsi
-; AVX1-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r15
+; AVX1-NEXT:    shldq $63, %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rbp
+; AVX1-NEXT:    shldq $63, %rax, %r12
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rsi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rax, %rdx
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; AVX1-NEXT:    shldq $63, %rax, %rcx
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %rdi
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX1-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r12
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rax, %r10
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX1-NEXT:    shldq $63, %rdx, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm8
-; AVX1-NEXT:    vmovq %r10, %xmm0
-; AVX1-NEXT:    vmovq %r12, %xmm1
-; AVX1-NEXT:    vmovq %rdi, %xmm11
-; AVX1-NEXT:    vmovq %rcx, %xmm2
-; AVX1-NEXT:    vmovq %rsi, %xmm13
-; AVX1-NEXT:    vmovq %rbp, %xmm14
-; AVX1-NEXT:    vmovq %r15, %xmm15
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload
+; AVX1-NEXT:    movq (%rsp), %rbx # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbx, %rax
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %r13, %rbx
+; AVX1-NEXT:    shldq $63, %rbp, %r9
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %r10
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %rdi
+; AVX1-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX1-NEXT:    shldq $63, %rbp, %r8
+; AVX1-NEXT:    vmovq %rbx, %xmm8
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vmovq %rcx, %xmm1
+; AVX1-NEXT:    vmovq %rdx, %xmm11
+; AVX1-NEXT:    vmovq %rsi, %xmm2
+; AVX1-NEXT:    vmovq %r12, %xmm13
+; AVX1-NEXT:    vmovq %r8, %xmm14
+; AVX1-NEXT:    vmovq %rdi, %xmm15
+; AVX1-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Reload
 ; AVX1-NEXT:    # xmm9 = mem[0],zero
-; AVX1-NEXT:    vmovq %r13, %xmm10
-; AVX1-NEXT:    vmovq %r14, %xmm12
-; AVX1-NEXT:    vmovq %r11, %xmm3
-; AVX1-NEXT:    vmovq %rbx, %xmm4
-; AVX1-NEXT:    vmovq %r9, %xmm5
-; AVX1-NEXT:    vmovq %r8, %xmm6
-; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload
-; AVX1-NEXT:    # xmm7 = mem[0],zero
+; AVX1-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 8-byte Reload
+; AVX1-NEXT:    # xmm10 = mem[0],zero
+; AVX1-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 8-byte Folded Reload
+; AVX1-NEXT:    # xmm12 = mem[0],zero
+; AVX1-NEXT:    vmovq %r15, %xmm3
+; AVX1-NEXT:    vmovq %r14, %xmm4
+; AVX1-NEXT:    vmovq %r11, %xmm5
+; AVX1-NEXT:    vmovq %r10, %xmm6
+; AVX1-NEXT:    vmovq %r9, %xmm7
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm11[0],xmm1[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm0[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm8 = xmm0[0,2],xmm8[0,2]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm13[0],xmm2[0]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm15[0],xmm14[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm11 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm11 = xmm1[0,2],xmm0[0,2]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm8, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm11, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm10[0],xmm9[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm10[0],xmm9[0]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm2[0,2]
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm5[0],xmm4[0]
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm6[0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm3[0,2]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 80bbf29c1c550dab3a15b59c4b2e85d2746e634b..b85fd4e6482fb23cae0ce12d605631d9ae243bc8 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -92,7 +92,7 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
 define <16 x float> @fneg(<16 x float> %a) nounwind {
 ; CHECK-LABEL: fneg:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vxorps %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/avx-fp2int.ll b/test/CodeGen/X86/avx-fp2int.ll
index f06564b0f58ab10d61b3ebcc8bea9a1c5c204f68..d1aa1f281fde8edb0530a97d86476f4d581e965b 100644
--- a/test/CodeGen/X86/avx-fp2int.ll
+++ b/test/CodeGen/X86/avx-fp2int.ll
@@ -1,19 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
 ;; Check that FP_TO_SINT and FP_TO_UINT generate convert with truncate
 
-; CHECK-LABEL: test1:
-; CHECK: vcvttpd2dq
-; CHECK: ret
-; CHECK-LABEL: test2:
-; CHECK: vcvttpd2dq
-; CHECK: ret
-
 define <4 x i8> @test1(<4 x double> %d) {
+; CHECK-LABEL: test1:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %c = fptoui <4 x double> %d to <4 x i8>
   ret <4 x i8> %c
 }
 define <4 x i8> @test2(<4 x double> %d) {
+; CHECK-LABEL: test2:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
   %c = fptosi <4 x double> %d to <4 x i8>
   ret <4 x i8> %c
 }
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 6e58ffe09621214482028ee5b17bc636109c87b2..84b3b007310c7b34a22e08fbab2620bb291e9c48 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -85,7 +85,10 @@ define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) no
 define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
 ; CHECK-LABEL: test_mm256_andnot_ps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vandnps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = bitcast <8 x float> %a0 to <8 x i32>
   %2 = bitcast <8 x float> %a1 to <8 x i32>
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index d55bbac5dc19fc69d9709dc0d9c2856d1c0b86e1..ea42aa34d8ce4d11858b24ef2b3cf37e05980b54 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -13,15 +13,15 @@ define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>*
 ; CHECK-NEXT:    movq %rsi, %r15
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
-; CHECK-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps (%rsi), %ymm1
-; CHECK-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps (%rdx), %ymm2
 ; CHECK-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
 ; CHECK-NEXT:    callq dummy
-; CHECK-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vmovaps %ymm0, (%rbx)
-; CHECK-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vmovaps %ymm0, (%r15)
 ; CHECK-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; CHECK-NEXT:    vmovaps %ymm0, (%r14)
@@ -38,21 +38,21 @@ define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>*
 ; CHECK_O0-NEXT:    vmovapd (%rdi), %ymm0
 ; CHECK_O0-NEXT:    vmovaps (%rsi), %ymm1
 ; CHECK_O0-NEXT:    vmovdqa (%rdx), %ymm2
-; CHECK_O0-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK_O0-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK_O0-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp) # 32-byte Spill
-; CHECK_O0-NEXT:    movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK_O0-NEXT:    movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
-; CHECK_O0-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK_O0-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK_O0-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK_O0-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK_O0-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK_O0-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK_O0-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK_O0-NEXT:    callq dummy
-; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload
-; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; CHECK_O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; CHECK_O0-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK_O0-NEXT:    vmovapd %ymm0, (%rdx)
-; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
-; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm1 # 32-byte Reload
+; CHECK_O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; CHECK_O0-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; CHECK_O0-NEXT:    vmovaps %ymm1, (%rsi)
-; CHECK_O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; CHECK_O0-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm2 # 32-byte Reload
+; CHECK_O0-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; CHECK_O0-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; CHECK_O0-NEXT:    vmovdqa %ymm2, (%rdi)
 ; CHECK_O0-NEXT:    addq $152, %rsp
 ; CHECK_O0-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll
index 0fe5cbacc84486009eb58f77a04dd0e95b4daf13..44d0993b68dafc8f674014153a8c91c66745a4cf 100644
--- a/test/CodeGen/X86/avx-logic.ll
+++ b/test/CodeGen/X86/avx-logic.ll
@@ -314,7 +314,7 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -338,17 +338,17 @@ define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
 ; AVX1-LABEL: andn_disguised_i8_elts:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: andn_disguised_i8_elts:
@@ -417,17 +417,17 @@ define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
 define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
 ; AVX1-LABEL: andn_variable_mask_operand_concat:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT:    vpandn %xmm1, %xmm4, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpandn %xmm2, %xmm4, %xmm1
+; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; INT256-LABEL: andn_variable_mask_operand_concat:
@@ -450,7 +450,7 @@ define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z)
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -479,7 +479,7 @@ define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -537,7 +537,7 @@ define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [281470681808895,281470681808895]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
 ; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
@@ -566,7 +566,7 @@ define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [281470681808895,281470681808895]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
 ; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index ea9626979aece543f470b48294dfb992c91df5b9..c9481ccdbf9c0667d04768e89ad8b6a12a87c9f1 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -6,6 +6,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -46,6 +47,12 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -100,6 +107,12 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -154,6 +167,12 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -209,6 +228,12 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_addsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -270,6 +295,13 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andnotpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -339,6 +371,13 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andnotps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -408,6 +447,13 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -475,6 +521,13 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vandps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -542,6 +595,13 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [2:1.00]
+; BDVER2-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0,1],mem[2,3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:1.00]
@@ -605,6 +665,13 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [2:1.00]
+; BDVER2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],mem[2],ymm1[3],mem[4,5,6],ymm1[7] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:1.00]
@@ -662,6 +729,12 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendvpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:3.00]
+; BDVER2-NEXT:    vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -717,6 +790,12 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blendvps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:3.00]
+; BDVER2-NEXT:    vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -766,6 +845,11 @@ define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
 ; SKX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastf128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_broadcastf128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:2.00]
@@ -811,6 +895,11 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
 ; SKX-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastsd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [6:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_broadcastsd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastsd (%rdi), %ymm0 # sched: [6:2.00]
@@ -857,6 +946,11 @@ define <4 x float> @test_broadcastss(float *%a0) {
 ; SKX-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_broadcastss:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastss (%rdi), %xmm0 # sched: [6:1.00]
@@ -903,6 +997,11 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
 ; SKX-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_broadcastss_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [6:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_broadcastss_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vbroadcastss (%rdi), %ymm0 # sched: [6:2.00]
@@ -961,6 +1060,13 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmppd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
+; BDVER2-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vorpd %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
@@ -1027,6 +1133,13 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
+; BDVER2-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    vorps %ymm0, %ymm1, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
@@ -1093,6 +1206,13 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtdq2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtdq2pd (%rdi), %ymm1 # sched: [8:2.00]
@@ -1158,6 +1278,13 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtdq2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtdq2ps (%rdi), %ymm1 # sched: [8:2.00]
@@ -1221,6 +1348,13 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2dqy (%rdi), %xmm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtpd2dq %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtpd2dqy (%rdi), %xmm1 # sched: [11:2.00]
@@ -1285,6 +1419,13 @@ define <8 x i32> @test_cvttpd2dq(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvttpd2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvttpd2dqy (%rdi), %xmm1 # sched: [11:2.00]
@@ -1348,6 +1489,13 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpd2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtpd2psy (%rdi), %xmm1 # sched: [11:2.00]
@@ -1411,6 +1559,13 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2dq (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vcvtps2dq %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtps2dq (%rdi), %ymm1 # sched: [8:2.00]
@@ -1475,6 +1630,13 @@ define <8 x i32> @test_cvttps2dq(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vcvttps2dq %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvttps2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvttps2dq (%rdi), %ymm1 # sched: [8:2.00]
@@ -1532,6 +1694,12 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [21:8.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_divpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    vdivpd (%rdi), %ymm0, %ymm0 # sched: [14:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
@@ -1586,6 +1754,12 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [18:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_divps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    vdivps (%rdi), %ymm0, %ymm0 # sched: [14:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
@@ -1640,6 +1814,12 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
 ; SKX-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [20:1.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [27:3.00]
+; BDVER2-NEXT:    vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [32:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:6.00]
@@ -1701,6 +1881,13 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_extractf128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vextractf128 $1, %ymm1, (%rdi) # sched: [7:0.50]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_extractf128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50]
@@ -1756,6 +1943,12 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; SKX-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_haddpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1811,6 +2004,12 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ; SKX-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_haddps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1866,6 +2065,12 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; SKX-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_hsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1921,6 +2126,12 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ; SKX-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_hsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [11:2.00]
+; BDVER2-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -1982,6 +2193,13 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_insertf128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_insertf128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
@@ -2035,6 +2253,11 @@ define <32 x i8> @test_lddqu(i8* %a0) {
 ; SKX-NEXT:    vlddqu (%rdi), %ymm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lddqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vlddqu (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vlddqu (%rdi), %ymm0 # sched: [5:1.00]
@@ -2092,6 +2315,13 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
 ; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
+; BDVER2-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maskmovpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
@@ -2155,6 +2385,13 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2
 ; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
+; BDVER2-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovapd %ymm2, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maskmovpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
@@ -2218,6 +2455,13 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
+; BDVER2-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maskmovps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
@@ -2281,6 +2525,13 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
 ; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
+; BDVER2-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [6:2.00]
+; BDVER2-NEXT:    vmovaps %ymm2, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maskmovps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
@@ -2338,6 +2589,12 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maxpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2393,6 +2650,12 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maxps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2448,6 +2711,12 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_minpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vminpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vminpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2503,6 +2772,12 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_minps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vminps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vminps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -2564,6 +2839,13 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movapd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovapd (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovapd (%rdi), %ymm0 # sched: [5:1.00]
@@ -2626,6 +2908,13 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movaps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovaps (%rdi), %ymm0 # sched: [5:1.00]
@@ -2688,6 +2977,13 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movddup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:1.00]
+; BDVER2-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [2:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [6:2.00]
@@ -2745,6 +3041,12 @@ define i32 @test_movmskpd(<4 x double> %a0) {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movmskpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskpd %ymm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovmskpd %ymm0, %eax # sched: [3:1.00]
@@ -2797,6 +3099,12 @@ define i32 @test_movmskps(<8 x float> %a0) {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movmskps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskps %ymm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovmskps %ymm0, %eax # sched: [3:1.00]
@@ -2861,6 +3169,14 @@ define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vmovntdq %ymm0, (%rdi) # sched: [2:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movntdq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2916,6 +3232,12 @@ define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovntpd %ymm0, (%rdi) # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
@@ -2969,6 +3291,12 @@ define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vmovntps %ymm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovntps %ymm0, (%rdi) # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
@@ -3028,6 +3356,13 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movshdup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:1.00]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [2:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [6:2.00]
@@ -3091,6 +3426,13 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movsldup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:1.00]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [2:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [6:2.00]
@@ -3156,6 +3498,13 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movupd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovupd (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovupd (%rdi), %ymm0 # sched: [5:1.00]
@@ -3220,6 +3569,13 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movups:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovups (%rdi), %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovups (%rdi), %ymm0 # sched: [5:1.00]
@@ -3276,6 +3632,12 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_mulpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
@@ -3330,6 +3692,12 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_mulps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -3390,6 +3758,13 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: orpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vorpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: orpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -3457,6 +3832,13 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_orps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vorps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -3524,6 +3906,13 @@ define <4 x double> @test_perm2f128(<4 x double> %a0, <4 x double> %a1, <4 x dou
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_perm2f128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [4:0.50]
+; BDVER2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:0.50]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_perm2f128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
@@ -3587,6 +3976,13 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:0.50]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [2:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00]
@@ -3650,6 +4046,13 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [7:1.00]
+; BDVER2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [2:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:2.00]
@@ -3713,6 +4116,13 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
+; BDVER2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
@@ -3776,6 +4186,13 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [7:1.00]
+; BDVER2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [2:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:2.00]
@@ -3833,6 +4250,12 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
 ; SKX-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilvarpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
@@ -3888,6 +4311,12 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
 ; SKX-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER2-NEXT:    vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilvarpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -3943,6 +4372,12 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
 ; SKX-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilvarps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
@@ -3998,6 +4433,12 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
 ; SKX-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_permilvarps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER2-NEXT:    vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_permilvarps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vpermilps %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
@@ -4059,6 +4500,13 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps (%rdi), %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps (%rdi), %ymm1 # sched: [7:2.00]
@@ -4123,6 +4571,13 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_roundpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vroundpd $7, %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vroundpd $7, (%rdi), %ymm1 # sched: [8:2.00]
@@ -4187,6 +4642,13 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_roundps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [9:2.00]
+; BDVER2-NEXT:    vroundps $7, %ymm0, %ymm0 # sched: [4:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vroundps $7, (%rdi), %ymm1 # sched: [8:2.00]
@@ -4251,6 +4713,13 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rsqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vrsqrtps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
@@ -4315,6 +4784,13 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shufpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [2:1.00]
+; BDVER2-NEXT:    vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
@@ -4378,6 +4854,13 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shufps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [2:1.00]
+; BDVER2-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[0,3],mem[0,0],ymm1[4,7],mem[4,4] sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
@@ -4441,6 +4924,13 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sqrtpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [14:27.00]
+; BDVER2-NEXT:    vsqrtpd %ymm0, %ymm0 # sched: [9:27.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
@@ -4505,6 +4995,13 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
 ; SKX-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [14:21.00]
+; BDVER2-NEXT:    vsqrtps %ymm0, %ymm0 # sched: [9:21.00]
+; BDVER2-NEXT:    vaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
@@ -4563,6 +5060,12 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_subpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -4617,6 +5120,12 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_subps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vsubps (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
@@ -4689,6 +5198,15 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; SKX-NEXT:    adcl $0, %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestpd (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_testpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -4775,6 +5293,16 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testpd_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestpd (%rdi), %ymm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_testpd_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -4856,6 +5384,15 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; SKX-NEXT:    adcl $0, %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestps (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_testps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -4942,6 +5479,16 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_testps_ymm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    xorl %eax, %eax # sched: [0:0.25]
+; BDVER2-NEXT:    vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vtestps (%rdi), %ymm0 # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $0, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_testps_ymm:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    xorl %eax, %eax # sched: [0:0.50]
@@ -5011,6 +5558,13 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpckhpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [2:1.00]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
@@ -5068,6 +5622,12 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpckhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [2:1.00]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
@@ -5128,6 +5688,13 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
 ; SKX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpcklpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [2:1.00]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
@@ -5185,6 +5752,12 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_unpcklps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [2:1.00]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
@@ -5245,6 +5818,13 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xorpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vxorpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -5312,6 +5892,13 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
 ; SKX-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xorps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vxorps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
@@ -5367,6 +5954,11 @@ define void @test_zeroall() {
 ; SKX-NEXT:    vzeroall # sched: [12:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_zeroall:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vzeroall # sched: [90:8.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_zeroall:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vzeroall # sched: [90:36.50]
@@ -5412,6 +6004,11 @@ define void @test_zeroupper() {
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_zeroupper:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_zeroupper:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vzeroupper # sched: [46:18.50]
@@ -5486,6 +6083,16 @@ define void @test_avx256_zero_idioms() {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_avx256_zero_idioms:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vxorps %ymm0, %ymm0, %ymm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnps %ymm2, %ymm2, %ymm2 # sched: [2:1.00]
+; BDVER2-NEXT:    vandnpd %ymm3, %ymm3, %ymm3 # sched: [2:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_avx256_zero_idioms:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 5686e3abc9771aeebd8f47446831b69fa16151f9..b136c72366e9a4536fd38e7d7a96f4827b99feda 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -316,12 +316,12 @@ entry:
 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: _e2:
 ; X32:       ## %bb.0: ## %entry
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: _e2:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X64-NEXT:    retq
 entry:
    %vecinit.i = insertelement <4 x float> undef, float       0xbf80000000000000, i32 0
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 75a11845b1e305fd2c2291285353afa487125053..f470c97e4726d188ea86cf7983716b2af09c4908 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -224,7 +224,7 @@ entry:
 define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
 ; AVX1-LABEL: shuffle_v16i16_4501_mem:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
@@ -532,7 +532,7 @@ define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounw
 ; AVX2-LABEL: ld0_hi0_lo1_4f64:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -552,7 +552,7 @@ define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounw
 ; AVX2-LABEL: ld1_hi0_hi1_4f64:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -572,7 +572,7 @@ define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind
 ; AVX2-LABEL: ld0_hi0_lo1_8f32:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
@@ -592,7 +592,7 @@ define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind
 ; AVX2-LABEL: ld1_hi0_hi1_8f32:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index e5bff60109eeec42feaa6406c12132c04a5fa62a..3662e39a641b088c2a45579d11bfcfdd0a26542d 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-ymm-zmm
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=bdver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BDVER2
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
 
 declare i32 @foo()
@@ -56,6 +57,20 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
 ; FAST-ymm-zmm-NEXT:    addq $56, %rsp
 ; FAST-ymm-zmm-NEXT:    retq
 ;
+; BDVER2-LABEL: test01:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    subq $56, %rsp
+; BDVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; BDVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; BDVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; BDVER2-NEXT:    addq $56, %rsp
+; BDVER2-NEXT:    retq
+;
 ; BTVER2-LABEL: test01:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    subq $56, %rsp
@@ -81,16 +96,25 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
 define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
 ; VZ-LABEL: test02:
 ; VZ:       # %bb.0:
-; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    jmp do_sse # TAILCALL
 ;
-; NO-VZ-LABEL: test02:
-; NO-VZ:       # %bb.0:
-; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NO-VZ-NEXT:    jmp do_sse # TAILCALL
+; FAST-ymm-zmm-LABEL: test02:
+; FAST-ymm-zmm:       # %bb.0:
+; FAST-ymm-zmm-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; FAST-ymm-zmm-NEXT:    jmp do_sse # TAILCALL
+;
+; BDVER2-LABEL: test02:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    jmp do_sse # TAILCALL
+;
+; BTVER2-LABEL: test02:
+; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    jmp do_sse # TAILCALL
   %add.i = fadd <8 x float> %a, %b
   %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
   %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
@@ -162,6 +186,37 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
 ; FAST-ymm-zmm-NEXT:    popq %rbx
 ; FAST-ymm-zmm-NEXT:    retq
 ;
+; BDVER2-LABEL: test03:
+; BDVER2:       # %bb.0: # %entry
+; BDVER2-NEXT:    pushq %rbx
+; BDVER2-NEXT:    subq $16, %rsp
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; BDVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BDVER2-NEXT:    .p2align 4, 0x90
+; BDVER2-NEXT:  .LBB3_1: # %while.cond
+; BDVER2-NEXT:    # =>This Inner Loop Header: Depth=1
+; BDVER2-NEXT:    callq foo
+; BDVER2-NEXT:    testl %eax, %eax
+; BDVER2-NEXT:    jne .LBB3_1
+; BDVER2-NEXT:  # %bb.2: # %for.body.preheader
+; BDVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; BDVER2-NEXT:    movl $4, %ebx
+; BDVER2-NEXT:    .p2align 4, 0x90
+; BDVER2-NEXT:  .LBB3_3: # %for.body
+; BDVER2-NEXT:    # =>This Inner Loop Header: Depth=1
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
+; BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    callq do_sse
+; BDVER2-NEXT:    decl %ebx
+; BDVER2-NEXT:    jne .LBB3_3
+; BDVER2-NEXT:  # %bb.4: # %for.end
+; BDVER2-NEXT:    addq $16, %rsp
+; BDVER2-NEXT:    popq %rbx
+; BDVER2-NEXT:    retq
+;
 ; BTVER2-LABEL: test03:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    pushq %rbx
@@ -230,15 +285,36 @@ define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
 ; VZ-NEXT:    vzeroupper
 ; VZ-NEXT:    retq
 ;
-; NO-VZ-LABEL: test04:
-; NO-VZ:       # %bb.0:
-; NO-VZ-NEXT:    pushq %rax
-; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NO-VZ-NEXT:    callq do_avx
-; NO-VZ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NO-VZ-NEXT:    popq %rax
-; NO-VZ-NEXT:    retq
+; FAST-ymm-zmm-LABEL: test04:
+; FAST-ymm-zmm:       # %bb.0:
+; FAST-ymm-zmm-NEXT:    pushq %rax
+; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; FAST-ymm-zmm-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; FAST-ymm-zmm-NEXT:    callq do_avx
+; FAST-ymm-zmm-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; FAST-ymm-zmm-NEXT:    popq %rax
+; FAST-ymm-zmm-NEXT:    retq
+;
+; BDVER2-LABEL: test04:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pushq %rax
+; BDVER2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; BDVER2-NEXT:    callq do_avx
+; BDVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BDVER2-NEXT:    popq %rax
+; BDVER2-NEXT:    vzeroupper
+; BDVER2-NEXT:    retq
+;
+; BTVER2-LABEL: test04:
+; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    pushq %rax
+; BTVER2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; BTVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; BTVER2-NEXT:    callq do_avx
+; BTVER2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; BTVER2-NEXT:    popq %rax
+; BTVER2-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
   %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/test/CodeGen/X86/avx2-fma-fneg-combine.ll
index 9ef7bcf026c196ecf0a99b442abfc77bb5e225af..d9f41539bfe1559d93cd24bbe5092d98708c48eb 100644
--- a/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -44,14 +44,14 @@ define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c)  {
 ; X32-LABEL: test3:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
-; X32-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; X32-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test3:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; X64-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index d19c58eed73a1682f91fedd9ecab0c3c9c3cfba5..81e10a5d2427a619ff74ee9dac25ea3653f54edf 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -5,15 +5,10 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=X64 --check-prefix=X64-AVX512
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_pblendw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pblendw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -21,15 +16,10 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind
 
 
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pblendd_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -37,15 +27,10 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind
 
 
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pblendd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -70,15 +55,10 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
 
 
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_mpsadbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_mpsadbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_mpsadbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -86,15 +66,10 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind re
 
 
 define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psll_dq_bs:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psll_dq_bs:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psll_dq_bs:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -102,15 +77,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psrl_dq_bs:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psrl_dq_bs:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psrl_dq_bs:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -118,15 +88,10 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psll_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psll_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psll_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -134,15 +99,10 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_psrl_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_psrl_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_psrl_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -150,17 +110,11 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
 
 
 define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_vextracti128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X86-NEXT:    vzeroupper
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vextracti128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vextracti128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7)
   ret <2 x i64> %res
 }
@@ -168,15 +122,10 @@ declare <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64>, i8) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
-; X86-LABEL: test_x86_avx2_vinserti128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vinserti128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vinserti128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7)
   ret <4 x i64> %res
 }
@@ -184,15 +133,10 @@ declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind
 
 
 define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
-; X86-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
   ret <4 x double> %res
 }
@@ -200,15 +144,10 @@ declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind
 
 
 define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
-; X86-LABEL: test_x86_avx2_vbroadcast_ss_ps:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vbroadcast_ss_ps:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
   ret <4 x float> %res
 }
@@ -216,15 +155,10 @@ declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readon
 
 
 define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
-; X86-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
   ret <8 x float> %res
 }
@@ -232,15 +166,10 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re
 
 
 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastb_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastb %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastb_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastb_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
   ret <16 x i8> %res
 }
@@ -248,15 +177,10 @@ declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
 
 
 define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastb_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastb_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastb_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
   ret <32 x i8> %res
 }
@@ -264,15 +188,10 @@ declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
 
 
 define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastw_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastw_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastw_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
   ret <8 x i16> %res
 }
@@ -280,15 +199,10 @@ declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
 
 
 define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastw_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastw %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastw_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastw_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
   ret <16 x i16> %res
 }
@@ -296,15 +210,10 @@ declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
 
 
 define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastd_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastd_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
   ret <4 x i32> %res
 }
@@ -312,15 +221,10 @@ declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
 
 
 define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastss %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
   ret <8 x i32> %res
 }
@@ -328,15 +232,10 @@ declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
 
 
 define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastq_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastq_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastq_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
   ret <2 x i64> %res
 }
@@ -344,15 +243,10 @@ declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
 
 
 define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
-; X86-LABEL: test_x86_avx2_pbroadcastq_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pbroadcastq_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pbroadcastq_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
   ret <4 x i64> %res
 }
@@ -360,15 +254,10 @@ declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
 
 
 define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxbd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxbd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxbd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxbd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxbd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -376,15 +265,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxbq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxbq %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxbq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxbq %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxbq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -392,15 +276,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxbw %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxbw %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -408,15 +287,10 @@ declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxdq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxdq %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxdq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxdq %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxdq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -424,15 +298,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxwd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxwd %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxwd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxwd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -440,15 +309,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovsxwq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovsxwq %xmm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovsxwq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovsxwq %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovsxwq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -456,15 +320,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxbd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxbd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxbd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -472,15 +331,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxbq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxbq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxbq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -488,15 +342,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -504,15 +353,10 @@ declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxdq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxdq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxdq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -520,15 +364,10 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxwd:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxwd:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxwd:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -536,15 +375,10 @@ declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pmovzxwq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmovzxwq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmovzxwq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -576,240 +410,160 @@ define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
 
 define <32 x i8> @mm256_max_epi8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_max_epi8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epi8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epi8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_max_epi16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_max_epi16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epi16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epi16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_max_epi32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_max_epi32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epi32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epi32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_max_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_max_epu8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epu8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epu8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_max_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_max_epu16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epu16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epu16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_max_epu32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_max_epu32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_max_epu32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_max_epu32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_min_epi8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_min_epi8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epi8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epi8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_min_epi16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_min_epi16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epi16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epi16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_min_epi32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_min_epi32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epi32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epi32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_min_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_min_epu8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epu8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epu8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_min_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_min_epu16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epu16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epu16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: mm256_min_epu32:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpminud %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_min_epu32:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpminud %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_min_epu32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: mm256_avg_epu8:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_avg_epu8:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_avg_epu8:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: mm256_avg_epu16:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: mm256_avg_epu16:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: mm256_avg_epu16:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pabs_b:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpabsb %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pabs_b:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpabsb %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pabs_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpabsb %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
 declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
-; X86-LABEL: test_x86_avx2_pabs_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpabsd %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pabs_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpabsd %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pabs_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpabsd %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -817,15 +571,10 @@ declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
-; X86-LABEL: test_x86_avx2_pabs_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpabsw %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pabs_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpabsw %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pabs_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpabsw %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -833,15 +582,10 @@ declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
-; X86-LABEL: test_x86_avx2_vperm2i128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_vperm2i128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_vperm2i128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -849,15 +593,10 @@ declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind r
 
 
 define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pmulu_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmulu_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmulu_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -865,15 +604,10 @@ declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnon
 
 
 define <4 x i64> @test_x86_avx2_pmul_dq(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pmul_dq:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
-; X86-NEXT:    retl
-;
-; X64-LABEL: test_x86_avx2_pmul_dq:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_x86_avx2_pmul_dq:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -881,25 +615,10 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_paddus_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_paddus_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_paddus_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -907,25 +626,10 @@ declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnon
 
 
 define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_paddus_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_paddus_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_paddus_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -933,25 +637,10 @@ declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubus_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubus_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psubus_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -959,25 +648,10 @@ declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnon
 
 
 define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubus_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubus_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psubus_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 7eaa7f1cf98e9c2d17ca8d1e8194b55a48ace9f1..101448e22acbeeb3523446437e23ef9feb7462c7 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1,29 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX
+; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
 
 define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packssdw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packssdw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packssdw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packssdw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packssdw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packssdw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -64,25 +54,15 @@ define <16 x i16> @test_x86_avx2_packssdw_fold() {
 
 
 define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packsswb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packsswb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packsswb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packsswb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packsswb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packsswb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -123,25 +103,15 @@ define <32 x i8> @test_x86_avx2_packsswb_fold() {
 
 
 define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packuswb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packuswb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packuswb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packuswb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packuswb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packuswb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -182,25 +152,15 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() {
 
 
 define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_padds_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_padds_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_padds_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_padds_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_padds_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -208,25 +168,15 @@ declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_padds_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_padds_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_padds_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_padds_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_padds_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_padds_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -234,25 +184,15 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmadd_wd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmadd_wd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmadd_wd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -260,25 +200,15 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxs_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxs_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxs_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -286,25 +216,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxu_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxu_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxu_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -312,25 +232,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmins_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmins_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmins_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmins_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmins_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmins_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -338,25 +248,15 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminu_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminu_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminu_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminu_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminu_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminu_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -364,17 +264,11 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
-; X86-LABEL: test_x86_avx2_pmovmskb:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pmovmskb:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pmovmskb:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -382,25 +276,15 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmulh_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmulh_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmulh_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmulh_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmulh_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulh_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -408,25 +292,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmulhu_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmulhu_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmulhu_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -434,25 +308,15 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psad_bw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psad_bw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psad_bw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psad_bw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psad_bw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psad_bw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -460,25 +324,15 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psll_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psll_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psll_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psll_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psll_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -486,25 +340,15 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psll_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psll_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psll_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psll_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psll_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -512,25 +356,15 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psll_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psll_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psll_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psll_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psll_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psll_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -538,25 +372,15 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon
 
 
 define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_pslli_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pslli_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pslli_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pslli_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pslli_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -564,25 +388,15 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_pslli_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pslli_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pslli_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pslli_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pslli_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -590,25 +404,15 @@ declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_pslli_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pslli_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pslli_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pslli_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pslli_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pslli_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -616,25 +420,15 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psra_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psra_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psra_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psra_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psra_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -642,25 +436,15 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psra_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psra_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psra_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psra_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psra_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psra_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -668,25 +452,15 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon
 
 
 define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrai_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrai_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrai_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrai_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrai_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -694,25 +468,15 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrai_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrai_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrai_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrai_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrai_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrai_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -720,25 +484,15 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrl_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrl_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrl_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrl_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrl_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -746,25 +500,15 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrl_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrl_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrl_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrl_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrl_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -772,51 +516,59 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrl_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrl_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrl_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrl_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
 
 
-define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrli_d:
+define <16 x i16> @test_x86_avx2_psrl_w_load(<16 x i16> %a0, <8 x i16>* %p) {
+; X86-AVX-LABEL: test_x86_avx2_psrl_w_load:
 ; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT:    vpsrlw (%eax), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0x00]
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrli_d:
+; X86-AVX512VL-LABEL: test_x86_avx2_psrl_w_load:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; X86-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT:    vpsrlw (%eax), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0x00]
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
-; X64-AVX-LABEL: test_x86_avx2_psrli_d:
+; X64-AVX-LABEL: test_x86_avx2_psrl_w_load:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; X64-AVX-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0x07]
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrli_d:
+; X64-AVX512VL-LABEL: test_x86_avx2_psrl_w_load:
 ; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; X64-AVX512VL-NEXT:    vpsrlw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0x07]
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+  %a1 = load <8 x i16>, <8 x i16>* %p
+  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+
+
+define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
+; AVX2-LABEL: test_x86_avx2_psrli_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -824,25 +576,15 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrli_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrli_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrli_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrli_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrli_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -850,25 +592,15 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
-; X86-AVX-LABEL: test_x86_avx2_psrli_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrli_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrli_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrli_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrli_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrli_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -876,25 +608,15 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubs_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubs_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psubs_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -902,40 +624,25 @@ declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psubs_w:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psubs_w:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psubs_w:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psubs_w:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_phadd_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phadd_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phadd_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -943,15 +650,10 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phadd_sw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phadd_sw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phadd_sw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -959,15 +661,10 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phadd_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phadd_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phadd_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -975,15 +672,10 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_phsub_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phsub_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phsub_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -991,15 +683,10 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phsub_sw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phsub_sw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phsub_sw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1007,15 +694,10 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read
 
 
 define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_phsub_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_phsub_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_phsub_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1023,25 +705,15 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1080,25 +752,15 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8>
 }
 
 define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmul_hr_sw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmul_hr_sw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1106,25 +768,15 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re
 
 
 define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pshuf_b:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pshuf_b:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pshuf_b:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pshuf_b:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pshuf_b:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pshuf_b:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1132,15 +784,10 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_psign_b:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_psign_b:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psign_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1148,15 +795,10 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_psign_d:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_psign_d:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psign_d:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1164,15 +806,10 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_psign_w:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_psign_w:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_psign_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1180,15 +817,10 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-LABEL: test_x86_avx2_mpsadbw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_mpsadbw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_mpsadbw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1196,25 +828,15 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
 
 
 define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_packusdw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_packusdw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_packusdw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_packusdw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_packusdw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_packusdw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1226,28 +848,28 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() {
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI50_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT:    vmovaps LCPI51_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI50_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI50_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
   ret <16 x i16> %res
@@ -1255,15 +877,10 @@ define <16 x i16> @test_x86_avx2_packusdw_fold() {
 
 
 define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
-; X86-LABEL: test_x86_avx2_pblendvb:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendvb:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendvb:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1271,17 +888,11 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
 
 
 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-LABEL: test_x86_avx2_pblendw:
-; X86:       ## %bb.0:
-; X86-NEXT:    vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
-; X86-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendw:
-; X64:       ## %bb.0:
-; X64-NEXT:    vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
-; X64-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendw:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1289,25 +900,15 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
 
 
 define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxsb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxsb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxsb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxsb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxsb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1315,25 +916,15 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxsd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxsd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxsd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxsd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxsd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxsd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1341,25 +932,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxud:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxud:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxud:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxud:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxud:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxud:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1367,25 +948,15 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pmaxuw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pmaxuw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pmaxuw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pmaxuw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pmaxuw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pmaxuw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1393,25 +964,15 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminsb:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminsb:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminsb:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminsb:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminsb:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsb:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
   ret <32 x i8> %res
 }
@@ -1419,25 +980,15 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminsd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminsd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminsd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminsd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminsd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminsd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1445,25 +996,15 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminud:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminud:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminud:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminud:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminud:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminud:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1471,25 +1012,15 @@ declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
 
 define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_pminuw:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_pminuw:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_pminuw:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_pminuw:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_pminuw:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_pminuw:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
   ret <16 x i16> %res
 }
@@ -1497,17 +1028,11 @@ declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readn
 
 
 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_128:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
-; X86-NEXT:    ## xmm0 = xmm1[0,1,2],xmm0[3]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendd_128:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
-; X64-NEXT:    ## xmm0 = xmm1[0,1,2],xmm0[3]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
+; CHECK-NEXT:    ## xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1515,17 +1040,11 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
 
 
 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-LABEL: test_x86_avx2_pblendd_256:
-; X86:       ## %bb.0:
-; X86-NEXT:    vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
-; X86-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X86-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-LABEL: test_x86_avx2_pblendd_256:
-; X64:       ## %bb.0:
-; X64-NEXT:    vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
-; X64-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; X64-NEXT:    retq ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pblendd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1536,25 +1055,15 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind
 ; and its lowering. Indeed, the offsets are the first source in
 ; the instruction.
 define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_permd:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_permd:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_permd:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_permd:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_permd:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_permd:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1565,25 +1074,15 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
 ; and its lowering. Indeed, the offsets are the first source in
 ; the instruction.
 define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_permps:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_permps:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_permps:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_permps:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_permps:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_permps:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
 }
@@ -1731,25 +1230,15 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
 
 
 define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1757,25 +1246,15 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_d_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_d_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_d_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1783,25 +1262,15 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
 
 
 define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -1809,25 +1278,15 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psllv_q_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psllv_q_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psllv_q_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -1835,25 +1294,15 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
 
 
 define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1861,25 +1310,15 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 
 define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_d_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -1887,25 +1326,15 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
 
 
 define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_q:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_q:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_q:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -1913,25 +1342,15 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
 
 
 define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrlv_q_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
   ret <4 x i64> %res
 }
@@ -1939,25 +1358,15 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
 
 
 define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrav_d:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrav_d:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrav_d:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
 }
@@ -1967,36 +1376,36 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI82_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI83_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI82_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI83_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI82_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI82_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI82_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI83_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI82_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI82_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
   ret <4 x i32> %res
@@ -2004,25 +1413,15 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
-; X86-AVX-LABEL: test_x86_avx2_psrav_d_256:
-; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X86-AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
-; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
-;
-; X64-AVX-LABEL: test_x86_avx2_psrav_d_256:
-; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X64-AVX-NEXT:    retq ## encoding: [0xc3]
-;
-; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
-; X64-AVX512VL:       ## %bb.0:
-; X64-AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_avx2_psrav_d_256:
+; AVX2:       ## %bb.0:
+; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
+; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
+; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
   ret <8 x i32> %res
 }
@@ -2032,36 +1431,36 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1)
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI84_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI85_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI84_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI85_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI84_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI85_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
   ret <8 x i32> %res
diff --git a/test/CodeGen/X86/avx2-phaddsub.ll b/test/CodeGen/X86/avx2-phaddsub.ll
index 67ea37575abca0f44991b52c149917023a878758..99cdb100e3f2a44556a58f9882ae6aaee5c9ba45 100644
--- a/test/CodeGen/X86/avx2-phaddsub.ll
+++ b/test/CodeGen/X86/avx2-phaddsub.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686--   -mattr=+avx2           | FileCheck %s --check-prefixes=X32,X32-SLOW
+; RUN: llc < %s -mtriple=i686--   -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X32,X32-FAST
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2           | FileCheck %s --check-prefixes=X64,X64-SLOW
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X64,X64-FAST
 
 define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
 ; X32-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
 }
 
 define <8 x i32> @phaddd3(<8 x i32> %x) {
-; X32-LABEL: phaddd3:
-; X32:       # %bb.0:
-; X32-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; X32-NEXT:    retl
+; X32-SLOW-LABEL: phaddd3:
+; X32-SLOW:       # %bb.0:
+; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X32-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X32-SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; X32-SLOW-NEXT:    retl
 ;
-; X64-LABEL: phaddd3:
-; X64:       # %bb.0:
-; X64-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; X64-NEXT:    retq
+; X32-FAST-LABEL: phaddd3:
+; X32-FAST:       # %bb.0:
+; X32-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X32-FAST-NEXT:    retl
+;
+; X64-SLOW-LABEL: phaddd3:
+; X64-SLOW:       # %bb.0:
+; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; X64-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; X64-SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; X64-SLOW-NEXT:    retq
+;
+; X64-FAST-LABEL: phaddd3:
+; X64-FAST:       # %bb.0:
+; X64-FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; X64-FAST-NEXT:    retq
   %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = add <8 x i32> %a, %b
diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll
index 1bfe60e31042a3299dde16f3a0d429e45e131d04..4cec0508f33e484aa9c34c70c2e3b5a958234c5e 100644
--- a/test/CodeGen/X86/avx2-schedule.ll
+++ b/test/CodeGen/X86/avx2-schedule.ll
@@ -171,66 +171,58 @@ define <8 x float> @test_broadcastss_ymm(<4 x float> %a0) {
   ret <8 x float> %2
 }
 
-define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) {
+define <4 x i32> @test_extracti128(<8 x i16> %a0, <4 x i32> *%a1) {
 ; GENERIC-LABEL: test_extracti128:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
-; GENERIC-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_extracti128:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
-; HASWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; HASWELL-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; HASWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_extracti128:
 ; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
-; BROADWELL-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; BROADWELL-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; BROADWELL-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; BROADWELL-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; BROADWELL-NEXT:    vzeroupper # sched: [4:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_extracti128:
 ; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
-; SKYLAKE-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SKYLAKE-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; SKYLAKE-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKYLAKE-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_extracti128:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
-; SKX-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_extracti128:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.25]
-; ZNVER1-NEXT:    vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT:    vextracti128 $1, %ymm0, %xmm0 # sched: [2:0.25]
-; ZNVER1-NEXT:    vextracti128 $1, %ymm2, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; ZNVER1-NEXT:    vextracti128 $1, %ymm1, %xmm0 # sched: [2:0.25]
+; ZNVER1-NEXT:    vextracti128 $1, %ymm1, (%rdi) # sched: [1:0.50]
 ; ZNVER1-NEXT:    vzeroupper # sched: [100:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = add <8 x i32> %a0, %a1
-  %2 = sub <8 x i32> %a0, %a1
-  %3 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %4 = shufflevector <8 x i32> %2, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  store <4 x i32> %3, <4 x i32> *%a2
-  ret <4 x i32> %4
+  %z = zext <8 x i16> %a0 to <8 x i32>
+  %ext = shufflevector <8 x i32> %z, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  store <4 x i32> %ext, <4 x i32> *%a1
+  ret <4 x i32> %ext
 }
 
 define <2 x double> @test_gatherdpd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3) {
@@ -4734,46 +4726,52 @@ define <4 x i64> @test_pmovzxwq(<8 x i16> %a0, <8 x i16> *%a1) {
   ret <4 x i64> %6
 }
 
-define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> *%a3) {
 ; GENERIC-LABEL: test_pmuldq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; GENERIC-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; GENERIC-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [12:1.00]
+; GENERIC-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_pmuldq:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [12:1.00]
+; HASWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmuldq:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; BROADWELL-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [11:1.00]
+; BROADWELL-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmuldq:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [11:0.50]
+; SKYLAKE-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmuldq:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vpmuldq (%rdi), %ymm2, %ymm1 # sched: [11:0.50]
+; SKX-NEXT:    vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_pmuldq:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    vpmuldq (%rdi), %ymm2, %ymm2 # sched: [11:1.00]
 ; ZNVER1-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
-; ZNVER1-NEXT:    vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT:    vpor %ymm2, %ymm0, %ymm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
-  %2 = bitcast <4 x i64> %1 to <8 x i32>
-  %3 = load <8 x i32>, <8 x i32> *%a2, align 32
-  %4 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %2, <8 x i32> %3)
+  %2 = load <8 x i32>, <8 x i32> *%a3, align 32
+  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a2, <8 x i32> %2)
+  %4 = or <4 x i64> %1, %3
   ret <4 x i64> %4
 }
 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 5d7ac684e54358fed03a565d7e1e83c9f425ea41..b333e9109bdcdbec8a84f3337a8c38efbbf9a683 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -615,13 +615,13 @@ entry:
 define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
 ; X32-AVX2-LABEL: V113:
 ; X32-AVX2:       ## %bb.0: ## %entry
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X32-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; X32-AVX2-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: V113:
 ; X64-AVX2:       ## %bb.0: ## %entry
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X64-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    retq
 ;
@@ -642,12 +642,12 @@ entry:
 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: _e2:
 ; X32:       ## %bb.0:
-; X32-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: _e2:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3]
 ; X64-NEXT:    retq
   %vecinit.i = insertelement <4 x float> undef, float        0xbf80000000000000, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float  0xbf80000000000000, i32 1
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index d836e9e439be099db032bd9b40dcef110b59ad23..29793a7e0bc62d19359f82c3ace35380b0deb033 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -601,17 +601,17 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
 ; AVX512F-LABEL: andd512fold:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: andd512fold:
 ; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: andd512fold:
 ; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: andd512fold:
@@ -969,7 +969,7 @@ define <16 x float>  @test_fxor(<16 x float> %a) {
 define <8 x float>  @test_fxor_8f32(<8 x float> %a) {
 ; AVX512F-LABEL: test_fxor_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; AVX512F-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512F-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
@@ -980,13 +980,13 @@ define <8 x float>  @test_fxor_8f32(<8 x float> %a) {
 ;
 ; AVX512BW-LABEL: test_fxor_8f32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; AVX512BW-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512BW-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_fxor_8f32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; AVX512DQ-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX512DQ-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 4c089ac379c0d5d2394668b5d52aed8d08617ceb..e99cdaf1ce92d16b3d7b5870f6061469dcd5b500 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -484,32 +484,12 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
 define <8 x double> @ulto8f64(<8 x i64> %a) {
 ; NODQ-LABEL: ulto8f64:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; NODQ-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
 ; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: ulto8f64:
@@ -524,32 +504,12 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 ;
 ; KNL_WIDEN-LABEL: ulto8f64:
 ; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_WIDEN-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
 ; KNL_WIDEN-NEXT:    retq
   %b = uitofp <8 x i64> %a to <8 x double>
   ret <8 x double> %b
@@ -558,58 +518,22 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 define <16 x double> @ulto16f64(<16 x i64> %a) {
 ; NODQ-LABEL: ulto16f64:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; NODQ-NEXT:    vpandq %zmm2, %zmm0, %zmm3
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-NEXT:    vporq %zmm4, %zmm3, %zmm3
+; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; NODQ-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; NODQ-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; NODQ-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
+; NODQ-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
+; NODQ-NEXT:    vpandq %zmm2, %zmm1, %zmm2
+; NODQ-NEXT:    vporq %zmm4, %zmm2, %zmm2
+; NODQ-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; NODQ-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; NODQ-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
+; NODQ-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
 ; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: ulto16f64:
@@ -626,58 +550,22 @@ define <16 x double> @ulto16f64(<16 x i64> %a) {
 ;
 ; KNL_WIDEN-LABEL: ulto16f64:
 ; KNL_WIDEN:       # %bb.0:
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm0, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL_WIDEN-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovq %xmm2, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL_WIDEN-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL_WIDEN-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm3, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL_WIDEN-NEXT:    vmovq %xmm1, %rax
-; KNL_WIDEN-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; KNL_WIDEN-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL_WIDEN-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL_WIDEN-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
+; KNL_WIDEN-NEXT:    vpandq %zmm2, %zmm0, %zmm3
+; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; KNL_WIDEN-NEXT:    vporq %zmm4, %zmm3, %zmm3
+; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; KNL_WIDEN-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; KNL_WIDEN-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
+; KNL_WIDEN-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
+; KNL_WIDEN-NEXT:    vpandq %zmm2, %zmm1, %zmm2
+; KNL_WIDEN-NEXT:    vporq %zmm4, %zmm2, %zmm2
+; KNL_WIDEN-NEXT:    vpsrlq $32, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
+; KNL_WIDEN-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
 ; KNL_WIDEN-NEXT:    retq
   %b = uitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %b
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index c23a474a97f6ff68381528f45d0a4bbc7509b717..d56cf0fe09ee9c8999a62f68f818c4bffeac8355 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -2157,7 +2157,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ; ALL-LABEL: zext_4xi1_to_4x32:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; ALL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255]
 ; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; ALL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
@@ -2171,7 +2171,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ; ALL-LABEL: zext_2xi1_to_2xi64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; ALL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255]
 ; ALL-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; ALL-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; ALL-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll
index 510553b56d46ff606dfe30ecc3639a104efd87ad..00063521c6d13745d92343b4f9edc804893699ce 100644
--- a/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -8,7 +8,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; KNL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; KNL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovd %xmm0, %eax
 ; KNL-NEXT:    retq
 ;
@@ -17,7 +17,7 @@ define i32 @hadd_16(<16 x i32> %x225) {
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vmovd %xmm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -35,7 +35,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; KNL-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; KNL-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovd %xmm0, %eax
 ; KNL-NEXT:    retq
 ;
@@ -44,7 +44,7 @@ define i32 @hsub_16(<16 x i32> %x225) {
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SKX-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SKX-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vmovd %xmm0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -62,8 +62,7 @@ define float @fhadd_16(<16 x float> %x225) {
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; KNL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; KNL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fhadd_16:
@@ -71,8 +70,7 @@ define float @fhadd_16(<16 x float> %x225) {
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; SKX-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SKX-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -89,8 +87,7 @@ define float @fhsub_16(<16 x float> %x225) {
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; KNL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; KNL-NEXT:    vsubps %zmm1, %zmm0, %zmm0
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT:    vsubps %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fhsub_16:
@@ -98,8 +95,7 @@ define float @fhsub_16(<16 x float> %x225) {
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; SKX-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SKX-NEXT:    vsubps %zmm1, %zmm0, %zmm0
-; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; SKX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -178,18 +174,16 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
 define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fadd_noundef_low:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; KNL-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; KNL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; KNL-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_low:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; SKX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -203,16 +197,18 @@ define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; KNL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; KNL-NEXT:    vextractf64x4 $1, %zmm2, %ymm1
+; KNL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_high:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; SKX-NEXT:    vextractf64x4 $1, %zmm2, %ymm1
+; SKX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
@@ -227,16 +223,14 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; KNL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; KNL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: hadd_16_3_sv:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; SKX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; SKX-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
 , i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
@@ -252,18 +246,14 @@ define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
 define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fadd_noundef_eel:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; KNL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fadd_noundef_eel:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; SKX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -278,19 +268,19 @@ define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
 define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
 ; KNL-LABEL: fsub_noundef_ee:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; KNL-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; KNL-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
-; KNL-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; KNL-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
+; KNL-NEXT:    vbroadcastsd %xmm0, %zmm1
+; KNL-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
+; KNL-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fsub_noundef_ee:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; SKX-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
-; SKX-NEXT:    vextractf32x4 $2, %zmm0, %xmm0
+; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
+; SKX-NEXT:    vbroadcastsd %xmm0, %zmm1
+; SKX-NEXT:    vextractf32x4 $2, %zmm1, %xmm1
+; SKX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
 ; SKX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index db3716c9530807a758f64fc3826a56d19a3c04dd..6944d3ea27b64d566a50b1b7bcce6a7219f9c3ef 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -851,7 +851,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-NEXT:    kxorb %k2, %k1, %k1
 ; SKX-NEXT:    kshiftlb $7, %k1, %k1
 ; SKX-NEXT:    kshiftrb $5, %k1, %k1
-; SKX-NEXT:    kxorb %k1, %k0, %k0
+; SKX-NEXT:    kxorw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq
@@ -890,7 +890,7 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
 ; SKX-NEXT:    kshiftrb $7, %k0, %k0
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftlb $1, %k1, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    korw %k1, %k0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: def $al killed $al killed $eax
 ; SKX-NEXT:    retq
@@ -993,7 +993,6 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
 ; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $2, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1020,10 +1019,9 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpminub %ymm3, %ymm1, %ymm0
 ; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1056,10 +1054,9 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpminub %ymm3, %ymm1, %ymm0
 ; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index f889bb905505c015542a1b2499f727f4ed88c9da..e04d8e3194420f4c13768b4de240bda0d2678aa5 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -3156,7 +3156,7 @@ entry:
 define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_round_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0
@@ -3387,7 +3387,7 @@ entry:
 define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
 ; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
@@ -3613,9 +3613,9 @@ entry:
 define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_round_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
-; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
+; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
@@ -3836,9 +3836,9 @@ entry:
 define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) {
 ; CHECK-LABEL: test_mm512_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0]
-; CHECK-NEXT:    vpxorq %zmm3, %zmm0, %zmm4
-; CHECK-NEXT:    vpxorq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT:    vpbroadcastd {{.*#+}} zmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vpxord %zmm3, %zmm0, %zmm4
+; CHECK-NEXT:    vpxord %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 entry:
@@ -6860,7 +6860,8 @@ define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -6989,7 +6990,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X86-NEXT:    vmovd %xmm0, %eax
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -7004,7 +7006,8 @@ define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
 ; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; X64-NEXT:    vmovd %xmm0, %eax
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7301,7 +7304,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7318,7 +7321,7 @@ define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7351,7 +7354,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7368,7 +7371,7 @@ define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7456,7 +7459,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movb 8(%ebp), %al
 ; X86-NEXT:    kmovw %eax, %k1
-; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
+; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X86-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
@@ -7475,7 +7478,7 @@ define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W)
 ; X64-LABEL: test_mm512_mask_reduce_mul_pd:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
+; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X64-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
@@ -7513,7 +7516,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7532,7 +7535,7 @@ define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -7562,7 +7565,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
 ; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X86-NEXT:    vmulps %ymm0, %ymm1, %ymm0
@@ -7570,7 +7573,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
@@ -7582,7 +7585,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X64-LABEL: test_mm512_mask_reduce_mul_ps:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    kmovw %edi, %k1
-; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
 ; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
 ; X64-NEXT:    vmulps %ymm0, %ymm1, %ymm0
@@ -7590,7 +7593,7 @@ define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W)
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index a70ff9bc1b165120b4d9b45c3631b88b263e766a..26e8636df8f8a4526cf6849354ba91531a8607bb 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1658,7 +1658,7 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
 define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_xor_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1]
+; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xef,0xc1]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
@@ -1687,7 +1687,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16
 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_or_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1]
+; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xeb,0xc1]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
@@ -1716,7 +1716,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x
 define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_and_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1]
+; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xdb,0xc1]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
index bb1e8550ba23e496911ad0cf760e6c9fb98dac2b..65d9d67b2caa166461d93f1e71cde0a6e027f1c6 100644
--- a/test/CodeGen/X86/avx512-logic.ll
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -7,7 +7,7 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; ALL-LABEL: vpandd:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -21,7 +21,7 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno
 ; ALL-LABEL: vpandnd:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vpandnd %zmm0, %zmm1, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -37,7 +37,7 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone
 ; ALL-LABEL: vpord:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -51,7 +51,7 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; ALL-LABEL: vpxord:
 ; ALL:       ## %bb.0: ## %entry
 ; ALL-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; ALL-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -132,7 +132,7 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
 define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
 ; KNL-LABEL: andd512fold:
 ; KNL:       ## %bb.0: ## %entry
-; KNL-NEXT:    vpandq (%rdi), %zmm0, %zmm0
+; KNL-NEXT:    vpandd (%rdi), %zmm0, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: andd512fold:
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 1449f5cf7b4260af4ad4681030cb8ceb159588d5..13ffb9f65bf76f9d809d70987b22342bfcb13039 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -3177,7 +3177,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    pushq %rax
 ; KNL-NEXT:    .cfi_def_cfa_offset 16
-; KNL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; KNL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; KNL-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testw %ax, %ax
@@ -3196,7 +3196,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    pushq %rax
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
-; SKX-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testw %ax, %ax
@@ -3215,7 +3215,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; AVX512BW:       ## %bb.0:
 ; AVX512BW-NEXT:    pushq %rax
 ; AVX512BW-NEXT:    .cfi_def_cfa_offset 16
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    testw %ax, %ax
@@ -3234,7 +3234,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; AVX512DQ:       ## %bb.0:
 ; AVX512DQ-NEXT:    pushq %rax
 ; AVX512DQ-NEXT:    .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
 ; AVX512DQ-NEXT:    testw %ax, %ax
@@ -3253,7 +3253,7 @@ define void @ktest_signed(<16 x i32> %x, <16 x i32> %y) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kmovd %k0, %eax
 ; X86-NEXT:    testw %ax, %ax
@@ -3287,7 +3287,7 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; CHECK-NEXT:    kortestw %k0, %k0
 ; CHECK-NEXT:    jb LBB65_2
@@ -3303,7 +3303,7 @@ define void @ktest_allones(<16 x i32> %x, <16 x i32> %y) {
 ; X86:       ## %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 16
-; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; X86-NEXT:    kortestw %k0, %k0
 ; X86-NEXT:    jb LBB65_2
diff --git a/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
index 4d939bd5b8ca69f7d763abf26f958c3ac3e0ea95..fed87ebf6eb46242dde68237135b2c95cf77985a 100755
--- a/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
+++ b/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
@@ -17,50 +17,50 @@ declare i32 @check_mask16(i16 zeroext %res_mask, i16 zeroext %exp_mask, i8* %fna
 define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,i8* %arraydecay,i8* %fname){
 ; CHECK-LABEL: test_xmm:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subq $56, %rsp
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k0
 ; CHECK-NEXT:    movl $2, %esi
 ; CHECK-NEXT:    movl $8, %eax
 ; CHECK-NEXT:    movq %rdx, %rdi
-; CHECK-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill
+; CHECK-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT:    kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; CHECK-NEXT:    callq _calc_expected_mask_val
 ; CHECK-NEXT:    movl %eax, %edx
 ; CHECK-NEXT:    movw %dx, %r8w
 ; CHECK-NEXT:    movzwl %r8w, %esi
-; CHECK-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
+; CHECK-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload
 ; CHECK-NEXT:    kmovb %k0, %edi
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; CHECK-NEXT:    callq _check_mask16
-; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload
+; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; CHECK-NEXT:    vpmovd2m %xmm0, %k0
 ; CHECK-NEXT:    kmovq %k0, %k1
 ; CHECK-NEXT:    kmovd %k0, %esi
 ; CHECK-NEXT:    movb %sil, %r9b
 ; CHECK-NEXT:    movzbl %r9b, %esi
 ; CHECK-NEXT:    movw %si, %r8w
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; CHECK-NEXT:    movl $4, %esi
-; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx ## 4-byte Reload
-; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT:    movw %r8w, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; CHECK-NEXT:    movw %r8w, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
 ; CHECK-NEXT:    callq _calc_expected_mask_val
 ; CHECK-NEXT:    movw %ax, %r8w
-; CHECK-NEXT:    movw {{[0-9]+}}(%rsp), %r10w ## 2-byte Reload
+; CHECK-NEXT:    movw {{[-0-9]+}}(%r{{[sb]}}p), %r10w ## 2-byte Reload
 ; CHECK-NEXT:    movzwl %r10w, %edi
 ; CHECK-NEXT:    movzwl %r8w, %esi
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
+; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload
 ; CHECK-NEXT:    callq _check_mask16
-; CHECK-NEXT:    movl %eax, (%rsp) ## 4-byte Spill
-; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT:    addq $72, %rsp
 ; CHECK-NEXT:    retq
   %d2 = bitcast <2 x i64> %a to <8 x i16>
   %m2 = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %d2)
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index ea705d16c33d665256511e399fdaa1ff281e2ac9..5ce1705e37740f9fdcca63774543fdb024449aa4 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-pc-win32       -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq  | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-win32        -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq  | FileCheck %s --check-prefix=WIN64
-; RUN: llc < %s -mtriple=x86_64-linux-gnu    -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq  | FileCheck %s --check-prefix=LINUXOSX64
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39437.
+; RUN: llc < %s -mtriple=i386-pc-win32       -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq -verify-machineinstrs=0  | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-win32        -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq -verify-machineinstrs=0  | FileCheck %s --check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu    -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq -verify-machineinstrs=0  | FileCheck %s --check-prefix=LINUXOSX64
 
 ; Test regcall when receiving/returning i1
 define x86_regcallcc i1 @test_argReti1(i1 %a)  {
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
index 71dabf70a18017bfd03fa2299d1a06b5865e4c57..7ea2aef6ba4c77e58ee68acebbfaa11a09964435 100755
--- a/test/CodeGen/X86/avx512-schedule.ll
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -4711,7 +4711,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
 define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ; GENERIC-LABEL: zext_4xi1_to_4x32:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
+; GENERIC-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] sched: [7:0.50]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -4720,7 +4720,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 ;
 ; SKX-LABEL: zext_4xi1_to_4x32:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
+; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [255,255,255,255] sched: [6:0.50]
 ; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -4734,7 +4734,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
 define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ; GENERIC-LABEL: zext_2xi1_to_2xi64:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
+; GENERIC-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255] sched: [7:0.50]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -4743,7 +4743,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
 ;
 ; SKX-LABEL: zext_2xi1_to_2xi64:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
+; SKX-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [255,255] sched: [6:0.50]
 ; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
 ; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
@@ -5029,13 +5029,13 @@ define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; GENERIC-LABEL: vpandd:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandd:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpandd %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
@@ -5049,13 +5049,13 @@ define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readno
 ; GENERIC-LABEL: vpandnd:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpandnd:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpandnd %zmm0, %zmm1, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
@@ -5071,13 +5071,13 @@ define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone
 ; GENERIC-LABEL: vpord:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpord:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpord %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
@@ -5091,13 +5091,13 @@ define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnon
 ; GENERIC-LABEL: vpxord:
 ; GENERIC:       # %bb.0: # %entry
 ; GENERIC-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT:    vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: vpxord:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpxord %zmm1, %zmm0, %zmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 entry:
   ; Force the execution domain with an add.
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 2ad2251bc1a179cfa1fc2e0b2764e56f4704532f..90e533c09b7f6c31db8f611ac0b7ea696d546a81 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -11,18 +11,18 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
 ; X86-NEXT:  # %bb.1:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; X86-NEXT:  .LBB0_2:
-; X86-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; X86-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: select00:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    cmpl $255, %edi
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    je .LBB0_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1
 ; X64-NEXT:  .LBB0_2:
-; X64-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; X64-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    retq
   %cmpres = icmp eq i32 %a, 255
   %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
@@ -44,8 +44,8 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
 ;
 ; X64-LABEL: select01:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    cmpl $255, %edi
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    je .LBB1_2
 ; X64-NEXT:  # %bb.1:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm1
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index d198fe87bed14bcf6a3cd994f5f44fce0211e3d2..0768508cca977d6b3655716d5af21ea8d1f9a7b9 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4019,11 +4019,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,5,5]
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4034,11 +4034,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index d9dc5087f54fdc816751e923b0d958e621893f57..5338eb3c3a16d980efdeb3b245bdcc0c6019ca0e 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -73,7 +73,7 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovq2m %xmm0, %k1
 ; CHECK-NEXT:    kshiftlb $2, %k0, %k0
-; CHECK-NEXT:    korb %k0, %k1, %k0
+; CHECK-NEXT:    korw %k0, %k1, %k0
 ; CHECK-NEXT:    vpmovm2d %k0, %xmm0
 ; CHECK-NEXT:    retq
 
@@ -89,7 +89,7 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
 ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovq2m %xmm0, %k1
 ; CHECK-NEXT:    kshiftlb $2, %k0, %k0
-; CHECK-NEXT:    korb %k0, %k1, %k0
+; CHECK-NEXT:    korw %k0, %k1, %k0
 ; CHECK-NEXT:    vpmovm2b %k0, %xmm0
 ; CHECK-NEXT:    retq
 
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index c17ba57d11ad6e907bba446722f3afa176d8f201..cf52746c3a533ff69cf607147a9764c05df01abe 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -disable-peephole -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 define i32 @test_int_x86_avx512_kadd_d(<32 x i16> %A, <32 x i16> %B) nounwind {
 ; CHECK-LABEL: test_int_x86_avx512_kadd_d:
@@ -1948,6 +1948,21 @@ define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a
 }
 declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
 
+define <32 x i16> @test_x86_avx512_psrl_w_512_load(<32 x i16> %a0, <8 x i16>* %p) {
+; X86-LABEL: test_x86_avx512_psrl_w_512_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsrlw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_psrl_w_512_load:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %a1 = load <8 x i16>, <8 x i16>* %p
+  %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
+  ret <32 x i16> %res
+}
 
 define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) {
 ; CHECK-LABEL: test_x86_avx512_psrli_w_512:
diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
index 70cadc78c18b7052be73d40646a645f97bb3ede0..ecb76b3f9a7a6788d47a2d3ec661a2ed70059e18 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
@@ -1,14 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BW
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -19,13 +30,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -38,12 +60,22 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -54,13 +86,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andb %dil, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andb %dil, %al
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andb %dil, %al
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -73,12 +116,22 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -89,13 +142,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -108,12 +172,22 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -124,13 +198,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andb %dil, %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm_mask_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %xmm0, %xmm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andb %dil, %al
+; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm_mask_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andb %dil, %al
+; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -143,12 +228,21 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -159,13 +253,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_test_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_test_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -178,13 +282,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -195,14 +309,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_test_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_test_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -215,12 +340,21 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -231,13 +365,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_testn_epi8_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmb %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_testn_epi8_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -250,13 +394,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -267,14 +421,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; AVX512BWVL-LABEL: TEST_mm256_mask_testn_epi16_mask:
+; AVX512BWVL:       # %bb.0: # %entry
+; AVX512BWVL-NEXT:    vptestnmw %ymm0, %ymm1, %k0
+; AVX512BWVL-NEXT:    kmovd %k0, %eax
+; AVX512BWVL-NEXT:    andl %edi, %eax
+; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
+;
+; AVX512BW-LABEL: TEST_mm256_mask_testn_epi16_mask:
+; AVX512BW:       # %bb.0: # %entry
+; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl %edi, %eax
+; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
index 80dc7fcd7039e3a0c6ca8e2fd8d1039601040722..ffce664b8d6d8822b795de6db595232318c2d071 100644
--- a/test/CodeGen/X86/avx512vbmi-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -77,9 +77,8 @@ define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x
 ; X86-NEXT:    vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda]
 ; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x75,0xe2]
-; X86-NEXT:    vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3]
+; X86-NEXT:    vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2]
+; X86-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -89,15 +88,14 @@ define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x
 ; X64-NEXT:    vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda]
 ; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x75,0xe2]
-; X64-NEXT:    vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3]
+; X64-NEXT:    vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2]
+; X64-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
   %2 = bitcast i64 %x3 to <64 x i1>
   %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1
-  %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2)
+  %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %1, <64 x i8> %x2)
   %5 = bitcast i64 %x3 to <64 x i1>
   %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer
   %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2)
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
index 272ac903a9a8f51786ba5f143a2e74e92a3b96f0..79f32103dddeca8617576287025a11633fe64a67 100644
--- a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -139,9 +139,8 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x
 ; X86-NEXT:    vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
-; X86-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
+; X86-NEXT:    vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2]
+; X86-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -151,15 +150,14 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x
 ; X64-NEXT:    vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
-; X64-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
+; X64-NEXT:    vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2]
+; X64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1
-  %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2)
+  %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %1, <16 x i8> %x2)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer
   %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2)
@@ -177,9 +175,8 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x
 ; X86-NEXT:    vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca]
-; X86-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X86-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
-; X86-NEXT:    vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3]
+; X86-NEXT:    vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2]
+; X86-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
 ; X86-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
@@ -189,15 +186,14 @@ define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x
 ; X64-NEXT:    vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda]
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca]
-; X64-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
-; X64-NEXT:    vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
-; X64-NEXT:    vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3]
+; X64-NEXT:    vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2]
+; X64-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
 ; X64-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1
-  %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2)
+  %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %1, <32 x i8> %x2)
   %5 = bitcast i32 %x3 to <32 x i1>
   %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer
   %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2)
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 8c3fe90033679ec5bb3acc530befc1f428889258..79de4aec42b3eeb858a9505d21f06c3e83157521 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -9780,7 +9780,6 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9807,7 +9806,6 @@ define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -9835,7 +9833,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -9866,7 +9863,6 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -9897,7 +9893,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -9925,7 +9920,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -9954,7 +9948,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -9985,7 +9978,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10017,12 +10009,10 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10052,12 +10042,10 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10088,14 +10076,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -10130,14 +10116,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -10172,7 +10156,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -10201,7 +10184,6 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -10231,7 +10213,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10264,7 +10245,6 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10296,7 +10276,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10323,7 +10302,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10351,7 +10329,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10382,7 +10359,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10413,7 +10389,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10441,7 +10416,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10470,7 +10444,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10502,7 +10475,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -10535,7 +10507,6 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10563,7 +10534,6 @@ define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -10592,7 +10562,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10624,7 +10593,6 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10656,7 +10624,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10685,7 +10652,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -10715,7 +10681,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10747,7 +10712,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10782,12 +10746,10 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10820,12 +10782,10 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -10856,7 +10816,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10866,7 +10825,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -10901,7 +10859,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -10911,7 +10868,6 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -14768,7 +14724,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -14795,7 +14750,6 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -14824,7 +14778,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -14855,7 +14808,6 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -14887,7 +14839,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -14915,7 +14866,6 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -14945,7 +14895,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -14976,7 +14925,6 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15009,12 +14957,10 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15044,12 +14990,10 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15081,14 +15025,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -15123,14 +15065,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
 ; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -15166,7 +15106,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -15195,7 +15134,6 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -15226,7 +15164,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15259,7 +15196,6 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15292,7 +15228,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15319,7 +15254,6 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15348,7 +15282,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15379,7 +15312,6 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15411,7 +15343,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15439,7 +15370,6 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15469,7 +15399,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15501,7 +15430,6 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
-; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kmovw %k0, %eax
@@ -15535,7 +15463,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15563,7 +15490,6 @@ define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
@@ -15593,7 +15519,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15625,7 +15550,6 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15658,7 +15582,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15687,7 +15610,6 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    movzwl %ax, %eax
@@ -15718,7 +15640,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15750,7 +15671,6 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15785,14 +15705,12 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpmaxuw %ymm3, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpcmpeqw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15823,14 +15741,12 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    vpmaxuw 32(%rdi), %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    shll $16, %eax
@@ -15862,7 +15778,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
 ; NoVLX-NEXT:    vpternlogq $15, %zmm2, %zmm2, %zmm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15873,7 +15788,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
@@ -15908,7 +15822,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm1
 ; NoVLX-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    andl %edi, %eax
@@ -15918,7 +15831,6 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %ecx
 ; NoVLX-NEXT:    andl %edi, %ecx
diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
index c662226fde9c5d0069947e1c6574123821f5126f..ae74be241d5a0b52d44ce811d82ead10fbcad321 100644
--- a/test/CodeGen/X86/avx512vl-vec-test-testn.ll
+++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
@@ -1,22 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X86_64
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=I386
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F --check-prefix=AVX512F-X64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F --check-prefix=AVX512F-X86
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmq %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmq %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_test_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmq %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_test_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
@@ -27,19 +34,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmd %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmd %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_test_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmd %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_test_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -51,21 +63,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmq %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmq %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_test_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmq %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_test_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
@@ -76,21 +92,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_test_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_test_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
@@ -101,22 +119,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_test_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
@@ -130,22 +175,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_test_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -161,24 +233,51 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_test_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_test_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
@@ -192,23 +291,45 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    andb %dil, %al
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    andb {{[0-9]+}}(%esp), %al
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    vptestmd %ymm0, %ymm1, %k0
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    andb %dil, %al
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    vptestmd %ymm0, %ymm1, %k0
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    andb %dil, %al
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_test_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    vptestmd %zmm0, %zmm1, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
@@ -221,19 +342,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmq %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmq %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_testn_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_testn_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
@@ -244,19 +370,24 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmd %xmm0, %xmm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmd %xmm0, %xmm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm_testn_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm_testn_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -268,21 +399,25 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmq %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmq %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_testn_epi64_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_testn_epi64_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
@@ -293,21 +428,23 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-LABEL: TEST_mm256_testn_epi32_mask:
+; AVX512VL:       # %bb.0: # %entry
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm1, %k0
+; AVX512VL-NEXT:    kmovw %k0, %eax
+; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    ret{{[l|q]}}
+;
+; AVX512F-LABEL: TEST_mm256_testn_epi32_mask:
+; AVX512F:       # %bb.0: # %entry
+; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
@@ -318,22 +455,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_testn_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
@@ -347,22 +511,49 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm_mask_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm_mask_testn_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
@@ -378,24 +569,51 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    kmovw %edi, %k1
-; X86_64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_testn_epi64_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NEXT:    kmovw %eax, %k1
-; I386-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    kmovw %edi, %k1
+; AVX512VL-X64-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512VL-X86-NEXT:    kmovw %eax, %k1
+; AVX512VL-X86-NEXT:    vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    kmovw %edi, %k1
+; AVX512F-X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X64-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; AVX512F-X86-NEXT:    kmovw %eax, %k1
+; AVX512F-X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-X86-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
@@ -409,23 +627,45 @@ entry:
 
 ; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
-; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
-; X86_64:       # %bb.0: # %entry
-; X86_64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; X86_64-NEXT:    kmovw %k0, %eax
-; X86_64-NEXT:    andb %dil, %al
-; X86_64-NEXT:    # kill: def $al killed $al killed $eax
-; X86_64-NEXT:    vzeroupper
-; X86_64-NEXT:    retq
-;
-; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
-; I386:       # %bb.0: # %entry
-; I386-NEXT:    vptestnmd %ymm0, %ymm1, %k0
-; I386-NEXT:    kmovw %k0, %eax
-; I386-NEXT:    andb {{[0-9]+}}(%esp), %al
-; I386-NEXT:    # kill: def $al killed $al killed $eax
-; I386-NEXT:    vzeroupper
-; I386-NEXT:    retl
+; AVX512VL-X64-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512VL-X64:       # %bb.0: # %entry
+; AVX512VL-X64-NEXT:    vptestnmd %ymm0, %ymm1, %k0
+; AVX512VL-X64-NEXT:    kmovw %k0, %eax
+; AVX512VL-X64-NEXT:    andb %dil, %al
+; AVX512VL-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X64-NEXT:    vzeroupper
+; AVX512VL-X64-NEXT:    retq
+;
+; AVX512VL-X86-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512VL-X86:       # %bb.0: # %entry
+; AVX512VL-X86-NEXT:    vptestnmd %ymm0, %ymm1, %k0
+; AVX512VL-X86-NEXT:    kmovw %k0, %eax
+; AVX512VL-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512VL-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512VL-X86-NEXT:    vzeroupper
+; AVX512VL-X86-NEXT:    retl
+;
+; AVX512F-X64-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512F-X64:       # %bb.0: # %entry
+; AVX512F-X64-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-X64-NEXT:    kmovw %k0, %eax
+; AVX512F-X64-NEXT:    andb %dil, %al
+; AVX512F-X64-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X64-NEXT:    vzeroupper
+; AVX512F-X64-NEXT:    retq
+;
+; AVX512F-X86-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; AVX512F-X86:       # %bb.0: # %entry
+; AVX512F-X86-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-X86-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0
+; AVX512F-X86-NEXT:    kmovw %k0, %eax
+; AVX512F-X86-NEXT:    andb {{[0-9]+}}(%esp), %al
+; AVX512F-X86-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512F-X86-NEXT:    vzeroupper
+; AVX512F-X86-NEXT:    retl
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 74c48e35bfe0edbaaa9de9e03d83ed0e3fffc78a..b1a63ffedf3fc7ba4e5cd2c78e4244adf98fcf10 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -158,7 +158,8 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
@@ -380,31 +381,15 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i32_32i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovd %edi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i32_32i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovd %edi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ext_i32_32i8:
 ; AVX512:       # %bb.0:
@@ -672,7 +657,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307]
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
@@ -696,43 +681,18 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i64_64i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovq %rdi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i64_64i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovq %rdi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: ext_i64_64i8:
 ; AVX512:       # %bb.0:
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 6cd52c4d25c459b35f4865ef312e9b862318e54f..c524021866d656d033033f91327a4066c8d4e5f6 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -200,7 +200,8 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
@@ -484,35 +485,17 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i32_32i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovd %edi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i32_32i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovd %edi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i32_32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ext_i32_32i8:
 ; AVX512F:       # %bb.0:
@@ -862,7 +845,7 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307]
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
@@ -895,53 +878,23 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
-; AVX2-SLOW-LABEL: ext_i64_64i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovq %rdi, %xmm0
-; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-SLOW-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-SLOW-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: ext_i64_64i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovq %rdi, %xmm0
-; AVX2-FAST-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpsrlw $7, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-FAST-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15]
-; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpsrlw $7, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT:    retq
+; AVX2-LABEL: ext_i64_64i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %rdi, %xmm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
+; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlw $7, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: ext_i64_64i8:
 ; AVX512F:       # %bb.0:
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 1acc83485ce1f6023713f3de098a7be105c1a6ef..6a8726b3a2ac270dd8d775e9bbfb10a2ac7faa08 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -163,7 +163,8 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovd %edi, %xmm0
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
+; AVX1-NEXT:    # xmm1 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
@@ -225,12 +226,8 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
 ; AVX2-LABEL: bitcast_i32_32i1:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/bitcast-int-to-vector.ll b/test/CodeGen/X86/bitcast-int-to-vector.ll
index 1a04fef9e01d0f1a9ca7ebc61c74a480dfa93f5c..e319255e8f0c339ffd91a1734450061e314ed9d1 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector.ll
@@ -17,8 +17,10 @@ define i1 @foo(i64 %a) {
 ;
 ; X86-SSE-LABEL: foo:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    ucomiss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT:    movaps %xmm0, %xmm1
+; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86-SSE-NEXT:    ucomiss %xmm1, %xmm0
 ; X86-SSE-NEXT:    setp %al
 ; X86-SSE-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index c43b19a717910b5c01da0a8ae1e188c2a5cedc75..3f78d0c9c5ccfaef6148b7b5a0cd68d314626749 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -708,7 +708,6 @@ define i64 @v16i8_widened_with_ones(<16 x i8> %a, <16 x i8> %b) {
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vinserti128 $1, {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
 ; AVX2-NEXT:    movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000
 ; AVX2-NEXT:    orq %rcx, %rax
diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll
index 2e35fde6c55e4c22701f1fcd7c2209850fc9469f..aeac9e88dd0b5135e17912f3222b7af55bb72b41 100644
--- a/test/CodeGen/X86/bitreverse.ll
+++ b/test/CodeGen/X86/bitreverse.ll
@@ -523,3 +523,621 @@ define <2 x i16> @undef_v2i16() {
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
   ret <2 x i16> %b
 }
+
+; Make sure we don't assert during type legalization promoting a large
+; bitreverse due to the need for a large shift that won't fit in the i8 returned
+; from getShiftAmountTy.
+define i528 @large_promotion(i528 %A) nounwind {
+; X86-LABEL: large_promotion:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    bswapl %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebp
+; X86-NEXT:    andl $-252645136, %ebx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %ebx
+; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %ebx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %ebx
+; X86-NEXT:    leal (%ebx,%ebp,4), %ebx
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    andl $1431633920, %ebp # imm = 0x55550000
+; X86-NEXT:    andl $-1431699456, %ebx # imm = 0xAAAA0000
+; X86-NEXT:    shrl %ebx
+; X86-NEXT:    leal (%ebx,%ebp,2), %ebx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    bswapl %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ebx
+; X86-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %edi
+; X86-NEXT:    orl %ebx, %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %edi
+; X86-NEXT:    leal (%edi,%ebx,4), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %edi # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %edi
+; X86-NEXT:    leal (%edi,%ebx,2), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bswapl %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    andl $-252645136, %esi # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %esi # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %esi
+; X86-NEXT:    leal (%esi,%edi,4), %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %esi # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %esi
+; X86-NEXT:    leal (%esi,%edi,2), %ebx
+; X86-NEXT:    bswapl %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %esi
+; X86-NEXT:    andl $-252645136, %edx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %edx
+; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %edx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %edx
+; X86-NEXT:    leal (%edx,%esi,4), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %edx # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %edx
+; X86-NEXT:    leal (%edx,%esi,2), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    andl $-252645136, %ecx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %ecx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %edx
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %esi
+; X86-NEXT:    shrdl $16, %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %ebx
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ebp, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shrdl $16, %ebx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %ebx
+; X86-NEXT:    shrdl $16, %edi, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrdl $16, %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edi, 60(%eax)
+; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    movl %ebx, 52(%eax)
+; X86-NEXT:    movl %ebp, 48(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 44(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 40(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 32(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 28(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 24(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 20(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 8(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    shrl $16, %edx
+; X86-NEXT:    movw %dx, 64(%eax)
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: large_promotion:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    bswapq %rbx
+; X64-NEXT:    movabsq $1085102592571150095, %r13 # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    andq %r13, %r10
+; X64-NEXT:    shlq $4, %r10
+; X64-NEXT:    movabsq $-1085102592571150096, %rax # imm = 0xF0F0F0F0F0F0F0F0
+; X64-NEXT:    andq %rax, %rbx
+; X64-NEXT:    shrq $4, %rbx
+; X64-NEXT:    orq %r10, %rbx
+; X64-NEXT:    movabsq $3689348814741910323, %r11 # imm = 0x3333333333333333
+; X64-NEXT:    movq %rbx, %r10
+; X64-NEXT:    andq %r11, %r10
+; X64-NEXT:    movabsq $-3689348814741910324, %r14 # imm = 0xCCCCCCCCCCCCCCCC
+; X64-NEXT:    andq %r14, %rbx
+; X64-NEXT:    shrq $2, %rbx
+; X64-NEXT:    leaq (%rbx,%r10,4), %r10
+; X64-NEXT:    movabsq $6148820866244280320, %rbx # imm = 0x5555000000000000
+; X64-NEXT:    andq %r10, %rbx
+; X64-NEXT:    movabsq $-6149102341220990976, %rdi # imm = 0xAAAA000000000000
+; X64-NEXT:    andq %r10, %rdi
+; X64-NEXT:    shrq %rdi
+; X64-NEXT:    leaq (%rdi,%rbx,2), %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    bswapq %rbp
+; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    andq %r13, %rdi
+; X64-NEXT:    shlq $4, %rdi
+; X64-NEXT:    andq %rax, %rbp
+; X64-NEXT:    shrq $4, %rbp
+; X64-NEXT:    orq %rdi, %rbp
+; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    andq %r11, %rdi
+; X64-NEXT:    andq %r14, %rbp
+; X64-NEXT:    shrq $2, %rbp
+; X64-NEXT:    leaq (%rbp,%rdi,4), %rbp
+; X64-NEXT:    movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %rbx, %r10
+; X64-NEXT:    movabsq $-6148914691236517206, %rdi # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT:    andq %rdi, %rbp
+; X64-NEXT:    shrq %rbp
+; X64-NEXT:    leaq (%rbp,%r10,2), %rbp
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    bswapq %rbp
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %r13, %r10
+; X64-NEXT:    shlq $4, %r10
+; X64-NEXT:    andq %rax, %rbp
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    shrq $4, %rbp
+; X64-NEXT:    orq %r10, %rbp
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %r11, %r10
+; X64-NEXT:    andq %r14, %rbp
+; X64-NEXT:    shrq $2, %rbp
+; X64-NEXT:    leaq (%rbp,%r10,4), %rbp
+; X64-NEXT:    movq %rbp, %r10
+; X64-NEXT:    andq %rbx, %r10
+; X64-NEXT:    andq %rdi, %rbp
+; X64-NEXT:    shrq %rbp
+; X64-NEXT:    leaq (%rbp,%r10,2), %rbp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    bswapq %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    movq %r15, %rdi
+; X64-NEXT:    andq %r15, %r10
+; X64-NEXT:    shrq $4, %r10
+; X64-NEXT:    orq %rax, %r10
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %r10
+; X64-NEXT:    shrq $2, %r10
+; X64-NEXT:    leaq (%r10,%rax,4), %rax
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    andq %rbx, %r10
+; X64-NEXT:    movabsq $-6148914691236517206, %r15 # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%r10,2), %r10
+; X64-NEXT:    bswapq %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %r9
+; X64-NEXT:    shrq $4, %r9
+; X64-NEXT:    orq %rax, %r9
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %r9
+; X64-NEXT:    shrq $2, %r9
+; X64-NEXT:    leaq (%r9,%rax,4), %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    andq %rbx, %r9
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%r9,2), %r9
+; X64-NEXT:    bswapq %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %r8
+; X64-NEXT:    shrq $4, %r8
+; X64-NEXT:    orq %rax, %r8
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %r8
+; X64-NEXT:    shrq $2, %r8
+; X64-NEXT:    leaq (%r8,%rax,4), %rax
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    andq %rbx, %r8
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%r8,2), %r8
+; X64-NEXT:    bswapq %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %rcx
+; X64-NEXT:    shrq $2, %rcx
+; X64-NEXT:    leaq (%rcx,%rax,4), %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    andq %rbx, %rcx
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%rcx,2), %rcx
+; X64-NEXT:    bswapq %rdx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    andq %r13, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    andq %rdi, %rdx
+; X64-NEXT:    shrq $4, %rdx
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    andq %r11, %rax
+; X64-NEXT:    andq %r14, %rdx
+; X64-NEXT:    shrq $2, %rdx
+; X64-NEXT:    leaq (%rdx,%rax,4), %rax
+; X64-NEXT:    movq %rax, %rdx
+; X64-NEXT:    andq %rbx, %rdx
+; X64-NEXT:    andq %r15, %rax
+; X64-NEXT:    shrq %rax
+; X64-NEXT:    leaq (%rax,%rdx,2), %rax
+; X64-NEXT:    bswapq %rsi
+; X64-NEXT:    andq %rsi, %r13
+; X64-NEXT:    andq %rdi, %rsi
+; X64-NEXT:    shlq $4, %r13
+; X64-NEXT:    shrq $4, %rsi
+; X64-NEXT:    orq %r13, %rsi
+; X64-NEXT:    andq %rsi, %r11
+; X64-NEXT:    andq %r14, %rsi
+; X64-NEXT:    shrq $2, %rsi
+; X64-NEXT:    leaq (%rsi,%r11,4), %rdx
+; X64-NEXT:    andq %rdx, %rbx
+; X64-NEXT:    andq %r15, %rdx
+; X64-NEXT:    shrq %rdx
+; X64-NEXT:    leaq (%rdx,%rbx,2), %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    shrdq $48, %rdi, %rsi
+; X64-NEXT:    shrdq $48, %rbp, %rdi
+; X64-NEXT:    shrdq $48, %r10, %rbp
+; X64-NEXT:    shrdq $48, %r9, %r10
+; X64-NEXT:    shrdq $48, %r8, %r9
+; X64-NEXT:    shrdq $48, %rcx, %r8
+; X64-NEXT:    shrdq $48, %rax, %rcx
+; X64-NEXT:    shrdq $48, %rdx, %rax
+; X64-NEXT:    movq %rax, 56(%r12)
+; X64-NEXT:    movq %rcx, 48(%r12)
+; X64-NEXT:    movq %r8, 40(%r12)
+; X64-NEXT:    movq %r9, 32(%r12)
+; X64-NEXT:    movq %r10, 24(%r12)
+; X64-NEXT:    movq %rbp, 16(%r12)
+; X64-NEXT:    movq %rdi, 8(%r12)
+; X64-NEXT:    movq %rsi, (%r12)
+; X64-NEXT:    shrq $48, %rdx
+; X64-NEXT:    movw %dx, 64(%r12)
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
+  %Z = call i528 @llvm.bitreverse.i528(i528 %A)
+  ret i528 %Z
+}
+declare i528 @llvm.bitreverse.i528(i528)
diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll
index bd0ba7e72c81b5d37750885f0fffdbb94a5ba20b..5b5b388c100c3f8505ef229a978a480a8f1abbe0 100644
--- a/test/CodeGen/X86/bmi-schedule.ll
+++ b/test/CodeGen/X86/bmi-schedule.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl     | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1  | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -36,6 +37,13 @@ define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andn_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [5:0.50]
+; BDVER2-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_andn_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [4:1.00]
@@ -86,6 +94,13 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_andn_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [5:0.50]
+; BDVER2-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_andn_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [4:1.00]
@@ -136,6 +151,13 @@ define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bextr_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bextr_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [4:1.00]
@@ -186,6 +208,13 @@ define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bextr_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bextr_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [4:1.00]
@@ -236,6 +265,13 @@ define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsi_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    blsil %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blsi_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsil (%rsi), %ecx # sched: [5:1.00]
@@ -287,6 +323,13 @@ define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsi_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    blsiq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blsi_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsiq (%rsi), %rcx # sched: [5:1.00]
@@ -338,6 +381,13 @@ define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsmsk_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    blsmskl %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blsmsk_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [5:1.00]
@@ -389,6 +439,13 @@ define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsmsk_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    blsmskq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blsmsk_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [5:1.00]
@@ -440,6 +497,13 @@ define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsr_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    blsrl %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blsr_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsrl (%rsi), %ecx # sched: [5:1.00]
@@ -491,6 +555,13 @@ define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_blsr_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    blsrq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_blsr_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    blsrq (%rsi), %rcx # sched: [5:1.00]
@@ -546,6 +617,14 @@ define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
 ; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cttz_i16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzcntw (%rsi), %cx # sched: [6:1.00]
+; BDVER2-NEXT:    tzcntw %di, %ax # sched: [2:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cttz_i16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    tzcntw (%rsi), %cx # sched: [5:1.00]
@@ -598,6 +677,13 @@ define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cttz_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [6:1.00]
+; BDVER2-NEXT:    tzcntl %edi, %eax # sched: [2:1.00]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cttz_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [5:1.00]
@@ -648,6 +734,13 @@ define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cttz_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [6:1.00]
+; BDVER2-NEXT:    tzcntq %rdi, %rax # sched: [2:1.00]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cttz_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [5:1.00]
diff --git a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir b/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir
deleted file mode 100644
index bbefc4f920a10de5ba02bc3bd4e48c90574b98dc..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir
+++ /dev/null
@@ -1,288 +0,0 @@
-# RUN: llc -mcpu=haswell -filetype=obj -start-before stack-protector -O2 %s -o - | llvm-objdump -d - | FileCheck %s
-
-# Test 1:
-#
-# Source C code:
-# volatile int y;
-# volatile int x;
-# 
-# int switchCase(int z, int w) {
-# 	int result = 0;
-# 	while (x > 0 && y < 0) {
-# 		switch(z) {
-# 			case 0:
-# 			result+=result*5;break;
-# 			case 1:
-# 			result--; break;
-# 			case 2:
-# 			result *= result; break;
-# 			case 3:
-# 			result <<= 7; break;
-# 			case 4:
-# 			result >>= 7; break;
-# 			case 5:
-# 			result = result * 16 | ~result; break;
-# 		}
-# 	}
-# 	return result;
-# }
-#
-# CHECK:       49:       eb 4a   jmp     74 <switchCase+0x95>
-# CHECK:       57:       eb 3c   jmp     60 <switchCase+0x95>
-# CHECK:       65:       eb 2e   jmp     46 <switchCase+0x95>
-# CHECK:       73:       eb 20   jmp     32 <switchCase+0x95>
-# CHECK:       81:       eb 12   jmp     18 <switchCase+0x95>
-# CHECK:       93:       7f 8b   jg      -117 <switchCase+0x20>
-
-# Test 2:
-#
-# Source C code:
-# 
-# int ifElse(int z) {
-# 	int w = 0;
-# 	while(1) {
-# 		if(x < 0)
-# 			w++;
-# 		else if(y > 0)
-# 			w--;
-# 		else if((x & y) == 3)
-# 			w*=2;
-# 		else if ((x | y) == 18)
-# 			w += 2;
-# 		else if ((y ^ x) == 154)
-# 			w -= 3;
-# 		else if(((y ^ x) & 1) != 0)
-# 			break;
-# 	}
-# 	return w;
-# }
-#
-# CHECK:       129:       eb 13   jmp     19 <ifElse+0x7e>
-# CHECK:       12e:       eb a0   jmp     -96 <ifElse+0x10>
-# CHECK:       132:       eb 9c   jmp     -100 <ifElse+0x10>
-# CHECK:       137:       eb 97   jmp     -105 <ifElse+0x10>
-# CHECK:       13c:       eb 92   jmp     -110 <ifElse+0x10>
---- |
-  ; ModuleID = 'D:\iusers\opaparo\dev_test\branch_instruction_and_target_split_perf_nops.ll'
-  source_filename = "D:\5C\5Ciusers\5C\5Copaparo\5C\5Cdev_test\5C\5Cbranch_instruction_and_target_split_perf_nops.c"
-  target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
-  target triple = "x86_64-pc-windows-msvc19.0.24210"
-  
-  @x = common global i32 0, align 4
-  @y = common global i32 0, align 4
-  
-  ; Function Attrs: norecurse nounwind uwtable
-  define i32 @switchCase(i32 %z, i32 %w) local_unnamed_addr #0 {
-  entry:
-    %0 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %cmp19 = icmp sgt i32 %0, 0
-    br i1 %cmp19, label %land.rhs.preheader, label %while.end
-  
-  land.rhs.preheader:                               ; preds = %entry
-    br label %land.rhs
-  
-  land.rhs:                                         ; preds = %sw.epilog, %land.rhs.preheader
-    %result.020 = phi i32 [ %result.1, %sw.epilog ], [ 0, %land.rhs.preheader ]
-    %1 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %cmp1 = icmp slt i32 %1, 0
-    br i1 %cmp1, label %while.body, label %while.end
-  
-  while.body:                                       ; preds = %land.rhs
-    switch i32 %z, label %sw.epilog [
-      i32 0, label %sw.bb
-      i32 1, label %sw.bb2
-      i32 2, label %sw.bb3
-      i32 3, label %sw.bb5
-      i32 4, label %sw.bb6
-      i32 5, label %sw.bb7
-    ]
-  
-  sw.bb:                                            ; preds = %while.body
-    %add = mul nsw i32 %result.020, 6
-    br label %sw.epilog
-  
-  sw.bb2:                                           ; preds = %while.body
-    %dec = add nsw i32 %result.020, -1
-    br label %sw.epilog
-  
-  sw.bb3:                                           ; preds = %while.body
-    %mul4 = mul nsw i32 %result.020, %result.020
-    br label %sw.epilog
-  
-  sw.bb5:                                           ; preds = %while.body
-    %shl = shl i32 %result.020, 7
-    br label %sw.epilog
-  
-  sw.bb6:                                           ; preds = %while.body
-    %shr = ashr i32 %result.020, 7
-    br label %sw.epilog
-  
-  sw.bb7:                                           ; preds = %while.body
-    %mul8 = shl nsw i32 %result.020, 4
-    %neg = xor i32 %result.020, -1
-    %or = or i32 %mul8, %neg
-    br label %sw.epilog
-  
-  sw.epilog:                                        ; preds = %sw.bb7, %sw.bb6, %sw.bb5, %sw.bb3, %sw.bb2, %sw.bb, %while.body
-    %result.1 = phi i32 [ %result.020, %while.body ], [ %or, %sw.bb7 ], [ %shr, %sw.bb6 ], [ %shl, %sw.bb5 ], [ %mul4, %sw.bb3 ], [ %dec, %sw.bb2 ], [ %add, %sw.bb ]
-    %2 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %cmp = icmp sgt i32 %2, 0
-    br i1 %cmp, label %land.rhs, label %while.end
-  
-  while.end:                                        ; preds = %sw.epilog, %land.rhs, %entry
-    %result.0.lcssa = phi i32 [ 0, %entry ], [ %result.020, %land.rhs ], [ %result.1, %sw.epilog ]
-    ret i32 %result.0.lcssa
-  }
-  
-  ; Function Attrs: norecurse nounwind uwtable
-  define i32 @ifElse(i32 %z) local_unnamed_addr #0 {
-  entry:
-    br label %while.cond.outer
-  
-  while.cond.outer:                                 ; preds = %if.then, %if.then2, %if.then5, %if.then8, %if.then11, %entry
-    %w.0.ph = phi i32 [ 0, %entry ], [ %sub, %if.then11 ], [ %add, %if.then8 ], [ %mul, %if.then5 ], [ %dec, %if.then2 ], [ %inc, %if.then ]
-    br label %while.cond
-  
-  while.cond:                                       ; preds = %if.else12, %while.cond.outer
-    %0 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %cmp = icmp slt i32 %0, 0
-    br i1 %cmp, label %if.then, label %if.else
-  
-  if.then:                                          ; preds = %while.cond
-    %inc = add nsw i32 %w.0.ph, 1
-    br label %while.cond.outer
-  
-  if.else:                                          ; preds = %while.cond
-    %1 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %cmp1 = icmp sgt i32 %1, 0
-    br i1 %cmp1, label %if.then2, label %if.else3
-  
-  if.then2:                                         ; preds = %if.else
-    %dec = add nsw i32 %w.0.ph, -1
-    br label %while.cond.outer
-  
-  if.else3:                                         ; preds = %if.else
-    %2 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %3 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %and = and i32 %3, %2
-    %cmp4 = icmp eq i32 %and, 3
-    br i1 %cmp4, label %if.then5, label %if.else6
-  
-  if.then5:                                         ; preds = %if.else3
-    %mul = shl nsw i32 %w.0.ph, 1
-    br label %while.cond.outer
-  
-  if.else6:                                         ; preds = %if.else3
-    %4 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %5 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %or = or i32 %5, %4
-    %cmp7 = icmp eq i32 %or, 18
-    br i1 %cmp7, label %if.then8, label %if.else9
-  
-  if.then8:                                         ; preds = %if.else6
-    %add = add nsw i32 %w.0.ph, 2
-    br label %while.cond.outer
-  
-  if.else9:                                         ; preds = %if.else6
-    %6 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %7 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %xor = xor i32 %7, %6
-    %cmp10 = icmp eq i32 %xor, 154
-    br i1 %cmp10, label %if.then11, label %if.else12
-  
-  if.then11:                                        ; preds = %if.else9
-    %sub = add nsw i32 %w.0.ph, -3
-    br label %while.cond.outer
-  
-  if.else12:                                        ; preds = %if.else9
-    %8 = load volatile i32, i32* @y, align 4, !tbaa !3
-    %9 = load volatile i32, i32* @x, align 4, !tbaa !3
-    %xor13 = xor i32 %9, %8
-    %and14 = and i32 %xor13, 1
-    %cmp15 = icmp eq i32 %and14, 0
-    br i1 %cmp15, label %while.cond, label %while.end
-  
-  while.end:                                        ; preds = %if.else12
-    ret i32 %w.0.ph
-  }
-  
-  attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
-  
-  !llvm.module.flags = !{!0, !1}
-  !llvm.ident = !{!2}
-  
-  !0 = !{i32 1, !"wchar_size", i32 2}
-  !1 = !{i32 7, !"PIC Level", i32 2}
-  !2 = !{!"clang version 6.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_clang_worldread 3789ad4283ec09df1ed8411abbb227d76e7ef8cb) (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_llvm_worldread 42897913cc9fac0d94e8636d9aed4dc193d7864e)"}
-  !3 = !{!4, !4, i64 0}
-  !4 = !{!"int", !5, i64 0}
-  !5 = !{!"omnipotent char", !6, i64 0}
-  !6 = !{!"Simple C/C++ TBAA"}
-
-...
----
-name:            switchCase
-alignment:       4
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:       
-liveins:         
-frameInfo:       
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      
-stack:           
-constants:       
-body:             |
-
-...
----
-name:            ifElse
-alignment:       4
-exposesReturnsTwice: false
-legalized:       false
-regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-registers:       
-liveins:         
-frameInfo:       
-  isFrameAddressTaken: false
-  isReturnAddressTaken: false
-  hasStackMap:     false
-  hasPatchPoint:   false
-  stackSize:       0
-  offsetAdjustment: 0
-  maxAlignment:    0
-  adjustsStack:    false
-  hasCalls:        false
-  stackProtector:  ''
-  maxCallFrameSize: 4294967295
-  hasOpaqueSPAdjustment: false
-  hasVAStart:      false
-  hasMustTailInVarArgFunc: false
-  savePoint:       ''
-  restorePoint:    ''
-fixedStack:      
-stack:           
-constants:       
-body:             |
-
-...
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 90f65597810a3fc4bb06906b4bb4673eb68360ea..353faabba2de2b6279fc286878d8113549b47069 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -87,21 +87,24 @@ define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
 define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
 ; AVX-LABEL: f16xi8_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f16xi8_i64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xi8_i64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
@@ -202,7 +205,8 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
 ; AVX-LABEL: f32xi8_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -219,7 +223,8 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
 ; AVX-64-LABEL: f32xi8_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX-64-NEXT:    # xmm2 = mem[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; AVX-64-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -354,7 +359,8 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-LABEL: f64i8_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -362,7 +368,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -386,7 +392,8 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-64-LABEL: f64i8_i32:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
+; AVX-64-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-64-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -394,7 +401,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -420,11 +427,13 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
 }
 
 
+; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454
 define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-LABEL: f64xi8_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -432,7 +441,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -456,7 +465,8 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-64-LABEL: f64xi8_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -464,7 +474,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -489,7 +499,6 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
   ret <64 x i8> %res2
 }
 
-
 define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX-LABEL: f64xi8_i128:
 ; AVX:       # %bb.0:
@@ -502,7 +511,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -536,7 +545,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -675,21 +684,24 @@ define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
 define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
 ; AVX-LABEL: f8xi16_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f8xi16_i64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f8xi16_i64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
@@ -750,7 +762,8 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
 ; AVX-LABEL: f16xi16_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -767,7 +780,8 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
 ; AVX-64-LABEL: f16xi16_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX-64-NEXT:    # xmm2 = mem[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; AVX-64-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -832,7 +846,8 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-LABEL: f32xi16_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -840,7 +855,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -864,7 +879,8 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-64-LABEL: f32xi16_i32:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
+; AVX-64-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-64-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -872,7 +888,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -898,11 +914,13 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
 }
 
 
+; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454
 define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-LABEL: f32xi16_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -910,7 +928,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -934,7 +952,8 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-64-LABEL: f32xi16_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -942,7 +961,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -980,7 +999,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
 ; AVX-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -1014,7 +1033,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -1120,21 +1139,24 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
 define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
 ; AVX-LABEL: f4xi32_i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f4xi32_i64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f4xi32_i64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    retq
@@ -1155,7 +1177,8 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
 ; AVX-LABEL: f8xi32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1172,7 +1195,8 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
 ; AVX-64-LABEL: f8xi32_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm2 = [2.1219957909652723E-314,2.1219957909652723E-314]
+; AVX-64-NEXT:    # xmm2 = mem[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX-64-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1233,11 +1257,13 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
 }
 
 
+; FIXME the load should be folded with the MOVDDUP with AVX1. PR39454
 define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-LABEL: f16xi32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1245,7 +1271,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -1263,13 +1289,14 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xi32_i64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX-64-NEXT:    vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0]
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -1277,7 +1304,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -1295,7 +1322,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
 ; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    retq
   %res1 = add <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %a
   %res2 = and <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>, %res1
@@ -1315,7 +1342,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    retl
@@ -1335,7 +1362,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xi32_i128:
@@ -1349,7 +1376,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
 ; AVX-64-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
+; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm2
 ; AVX-64-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    retq
@@ -1369,7 +1396,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
 ; AVX512F-64-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-64-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    retq
   %res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
   %res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
@@ -1573,21 +1600,24 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
 define <4 x float> @f4xf32_f64(<4 x float> %a) {
 ; AVX-LABEL: f4xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f4xf32_f64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; ALL32-NEXT:    vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3]
+; ALL32-NEXT:    # xmm1 = mem[0,0]
 ; ALL32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; ALL32-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f4xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-64-NEXT:    vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3]
+; AVX-64-NEXT:    # xmm1 = mem[0,0]
 ; AVX-64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-64-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX-64-NEXT:    retq
@@ -1607,21 +1637,21 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
 define <8 x float> @f8xf32_f64(<8 x float> %a) {
 ; AVX-LABEL: f8xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retl
 ;
 ; ALL32-LABEL: f8xf32_f64:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; ALL32-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; ALL32-NEXT:    retl
 ;
 ; AVX-64-LABEL: f8xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX-64-NEXT:    retq
@@ -1641,7 +1671,7 @@ define <8 x float> @f8xf32_f64(<8 x float> %a) {
 define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ; AVX-LABEL: f8xf32_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1649,7 +1679,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ;
 ; ALL32-LABEL: f8xf32_f128:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL32-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; ALL32-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1657,7 +1687,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ;
 ; AVX-64-LABEL: f8xf32_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1665,7 +1695,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 ;
 ; ALL64-LABEL: f8xf32_f128:
 ; ALL64:       # %bb.0:
-; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4,1,2,3,4,1,2,3]
+; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; ALL64-NEXT:    vdivps %ymm0, %ymm1, %ymm0
@@ -1679,7 +1709,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
 define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ; AVX-LABEL: f16xf32_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1688,7 +1718,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ;
 ; AVX2-LABEL: f16xf32_f64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1697,14 +1727,14 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 ;
 ; AVX512-LABEL: f16xf32_f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX512-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retl
 ;
 ; AVX-64-LABEL: f16xf32_f64:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
+; AVX-64-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3]
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1735,7 +1765,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
 define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ; AVX-LABEL: f16xf32_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1745,7 +1775,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX2-LABEL: f16xf32_f128:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1755,7 +1785,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX512-LABEL: f16xf32_f128:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3]
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1763,7 +1793,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX-64-LABEL: f16xf32_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1773,7 +1803,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX2-64-LABEL: f16xf32_f128:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4,1,2,3,4,1,2,3]
+; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
@@ -1783,7 +1813,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 ;
 ; AVX512F-64-LABEL: f16xf32_f128:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3]
+; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1797,7 +1827,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
 define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ; AVX-LABEL: f16xf32_f256:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1806,7 +1836,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX2-LABEL: f16xf32_f256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1815,7 +1845,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX512-LABEL: f16xf32_f256:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1823,7 +1853,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX-64-LABEL: f16xf32_f256:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1832,7 +1862,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX2-64-LABEL: f16xf32_f256:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8,1,2,3,4,5,6,7]
+; AVX2-64-NEXT:    vmovaps {{.*#+}} ymm2 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT:    vdivps %ymm0, %ymm2, %ymm0
@@ -1841,7 +1871,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 ;
 ; AVX512F-64-LABEL: f16xf32_f256:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7]
+; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivps %zmm0, %zmm1, %zmm0
@@ -1855,7 +1885,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
 define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ; AVX-LABEL: f4xf64_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1863,7 +1893,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ;
 ; ALL32-LABEL: f4xf64_f128:
 ; ALL32:       # %bb.0:
-; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; ALL32-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; ALL32-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; ALL32-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1871,7 +1901,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ;
 ; AVX-64-LABEL: f4xf64_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1879,7 +1909,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 ;
 ; ALL64-LABEL: f4xf64_f128:
 ; ALL64:       # %bb.0:
-; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2,1,2,1]
+; ALL64-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; ALL64-NEXT:    # ymm1 = mem[0,1,0,1]
 ; ALL64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; ALL64-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
@@ -1893,7 +1923,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
 define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ; AVX-LABEL: f8xf64_f128:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1903,7 +1933,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX2-LABEL: f8xf64_f128:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1913,7 +1943,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX512-LABEL: f8xf64_f128:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2,1,2,1,2,1,2,1]
+; AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
@@ -1921,7 +1951,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX-64-LABEL: f8xf64_f128:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1931,7 +1961,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX2-64-LABEL: f8xf64_f128:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2,1,2,1]
+; AVX2-64-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX2-64-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
@@ -1941,7 +1971,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 ;
 ; AVX512F-64-LABEL: f8xf64_f128:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2,1,2,1,2,1,2,1]
+; AVX512F-64-NEXT:    vbroadcastf32x4 {{.*#+}} zmm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0,2.0E+0,1.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
@@ -1962,7 +1992,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
 define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ; AVX-LABEL: f8xf64_f256:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -1971,7 +2001,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX2-LABEL: f8xf64_f256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX2-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -1980,7 +2010,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX512-LABEL: f8xf64_f256:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3]
+; AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
@@ -1988,7 +2018,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX-64-LABEL: f8xf64_f256:
 ; AVX-64:       # %bb.0:
-; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -1997,7 +2027,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX2-64-LABEL: f8xf64_f256:
 ; AVX2-64:       # %bb.0:
-; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4,1,2,3]
+; AVX2-64-NEXT:    vmovapd {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
 ; AVX2-64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX2-64-NEXT:    vdivpd %ymm0, %ymm2, %ymm0
@@ -2006,7 +2036,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
 ;
 ; AVX512F-64-LABEL: f8xf64_f256:
 ; AVX512F-64:       # %bb.0:
-; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4,1,2,3,4,1,2,3]
+; AVX512F-64-NEXT:    vbroadcastf64x4 {{.*#+}} zmm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0]
 ; AVX512F-64-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-64-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll
index 756dd7fa6f6b2102eb23d73c70ecaceef4de8d8f..4753fc27cc01cc453739557ba398f3c919923654 100644
--- a/test/CodeGen/X86/bswap.ll
+++ b/test/CodeGen/X86/bswap.ll
@@ -206,3 +206,153 @@ define i64 @finally_useful_bswap() {
   ret i64 %swapped
 }
 
+; Make sure we don't assert during type legalization promoting a large
+; bswap due to the need for a large shift that won't fit in the i8 returned
+; from getShiftAmountTy.
+define i528 @large_promotion(i528 %A) nounwind {
+; CHECK-LABEL: large_promotion:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $44, %esp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %edx
+; CHECK-NEXT:    shrdl $16, %edx, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    shrdl $16, %esi, %edx
+; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shrdl $16, %edi, %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %ebx
+; CHECK-NEXT:    shrdl $16, %ebx, %edi
+; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    bswapl %ebp
+; CHECK-NEXT:    shrdl $16, %ebp, %ebx
+; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %ebp
+; CHECK-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    shrdl $16, %eax, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    shrdl $16, %eax, %ecx
+; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    bswapl %ebp
+; CHECK-NEXT:    shrdl $16, %ebp, %eax
+; CHECK-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    bswapl %ebx
+; CHECK-NEXT:    shrdl $16, %ebx, %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    bswapl %esi
+; CHECK-NEXT:    shrdl $16, %esi, %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    bswapl %edx
+; CHECK-NEXT:    shrdl $16, %edx, %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    bswapl %ecx
+; CHECK-NEXT:    shrdl $16, %ecx, %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    bswapl %edi
+; CHECK-NEXT:    shrdl $16, %edi, %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %ecx, 60(%eax)
+; CHECK-NEXT:    movl %edx, 56(%eax)
+; CHECK-NEXT:    movl %esi, 52(%eax)
+; CHECK-NEXT:    movl %ebx, 48(%eax)
+; CHECK-NEXT:    movl %ebp, 44(%eax)
+; CHECK-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 40(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 36(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 32(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 28(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 24(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 20(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 16(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 12(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 8(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, 4(%eax)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    shrl $16, %edi
+; CHECK-NEXT:    movw %di, 64(%eax)
+; CHECK-NEXT:    addl $44, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl $4
+;
+; CHECK64-LABEL: large_promotion:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    pushq %rbx
+; CHECK64-NEXT:    movq %rdi, %rax
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK64-NEXT:    bswapq %r10
+; CHECK64-NEXT:    bswapq %rdi
+; CHECK64-NEXT:    shrdq $48, %rdi, %r10
+; CHECK64-NEXT:    bswapq %r11
+; CHECK64-NEXT:    shrdq $48, %r11, %rdi
+; CHECK64-NEXT:    bswapq %rbx
+; CHECK64-NEXT:    shrdq $48, %rbx, %r11
+; CHECK64-NEXT:    bswapq %r9
+; CHECK64-NEXT:    shrdq $48, %r9, %rbx
+; CHECK64-NEXT:    bswapq %r8
+; CHECK64-NEXT:    shrdq $48, %r8, %r9
+; CHECK64-NEXT:    bswapq %rcx
+; CHECK64-NEXT:    shrdq $48, %rcx, %r8
+; CHECK64-NEXT:    bswapq %rdx
+; CHECK64-NEXT:    shrdq $48, %rdx, %rcx
+; CHECK64-NEXT:    bswapq %rsi
+; CHECK64-NEXT:    shrdq $48, %rsi, %rdx
+; CHECK64-NEXT:    shrq $48, %rsi
+; CHECK64-NEXT:    movq %rdx, 56(%rax)
+; CHECK64-NEXT:    movq %rcx, 48(%rax)
+; CHECK64-NEXT:    movq %r8, 40(%rax)
+; CHECK64-NEXT:    movq %r9, 32(%rax)
+; CHECK64-NEXT:    movq %rbx, 24(%rax)
+; CHECK64-NEXT:    movq %r11, 16(%rax)
+; CHECK64-NEXT:    movq %rdi, 8(%rax)
+; CHECK64-NEXT:    movq %r10, (%rax)
+; CHECK64-NEXT:    movw %si, 64(%rax)
+; CHECK64-NEXT:    popq %rbx
+; CHECK64-NEXT:    retq
+  %Z = call i528 @llvm.bswap.i528(i528 %A)
+  ret i528 %Z
+}
+declare i528 @llvm.bswap.i528(i528)
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index ce7614c16fb74437781795a069e448cc6b95b77f..065d87ed8881d75f246299dea10b9834dc38a1e7 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -65,7 +65,7 @@ entry:
 define <2 x double> @test_negative_zero_2(<2 x double> %A) {
 ; SSE2-LABEL: test_negative_zero_2:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movapd {{.*#+}} xmm1 = <u,-0>
+; SSE2-NEXT:    movapd {{.*#+}} xmm1 = <u,-0.0E+0>
 ; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index ff41083835f4ff39b90eb7448bb6341059110d05..03efb54021674ae6438a90ea996ac714ebb618c7 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -93,15 +93,14 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
 ; AVX1-LABEL: zext:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT:    vblendvps %ymm0, %ymm2, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext:
@@ -357,17 +356,16 @@ define void @example25() nounwind {
 ; AVX2-LABEL: example25:
 ; AVX2:       # %bb.0: # %vector.ph
 ; AVX2-NEXT:    movq $-4096, %rax # imm = 0xF000
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    .p2align 4, 0x90
 ; AVX2-NEXT:  .LBB5_1: # %vector.body
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT:    vmovups da+4096(%rax), %ymm1
-; AVX2-NEXT:    vcmpltps db+4096(%rax), %ymm1, %ymm1
-; AVX2-NEXT:    vmovups dc+4096(%rax), %ymm2
-; AVX2-NEXT:    vcmpltps dd+4096(%rax), %ymm2, %ymm2
-; AVX2-NEXT:    vandps %ymm0, %ymm2, %ymm2
-; AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vmovups %ymm1, dj+4096(%rax)
+; AVX2-NEXT:    vmovups da+4096(%rax), %ymm0
+; AVX2-NEXT:    vcmpltps db+4096(%rax), %ymm0, %ymm0
+; AVX2-NEXT:    vmovups dc+4096(%rax), %ymm1
+; AVX2-NEXT:    vcmpltps dd+4096(%rax), %ymm1, %ymm1
+; AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, dj+4096(%rax)
 ; AVX2-NEXT:    addq $32, %rax
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %for.end
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 961ec2be59e66d4491f2e97e7a978b3529f0c9e1..983c7342603353e9797d1b828c51e770e0cd742c 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -296,59 +296,10 @@ define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
 }
 
 define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
-; SSE2-LABEL: _clearupper16xi8a:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: _clearupper16xi8a:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    andps {{.*}}(%rip), %xmm0
-; SSE42-NEXT:    retq
+; SSE-LABEL: _clearupper16xi8a:
+; SSE:       # %bb.0:
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper16xi8a:
 ; AVX:       # %bb.0:
@@ -422,107 +373,12 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
 }
 
 define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
-; SSE2-LABEL: _clearupper32xi8a:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movd %eax, %xmm6
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: _clearupper32xi8a:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE42-NEXT:    andps %xmm2, %xmm0
-; SSE42-NEXT:    andps %xmm2, %xmm1
-; SSE42-NEXT:    retq
+; SSE-LABEL: _clearupper32xi8a:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper32xi8a:
 ; AVX:       # %bb.0:
diff --git a/test/CodeGen/X86/cmov-schedule.ll b/test/CodeGen/X86/cmov-schedule.ll
index 8993c30d1f8f99f70a3c8ca1a3f5bbdbeaff01ba..de3e8637a183b90c03aea3577a395de1acb9c2fd 100644
--- a/test/CodeGen/X86/cmov-schedule.ll
+++ b/test/CodeGen/X86/cmov-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -540,6 +541,72 @@ define void @test_cmov_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmov_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmovow %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnow %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovsw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnsw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlew %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmovow (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnow (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovsw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnsw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlew (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgw (%rdx), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmov_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1204,6 +1271,72 @@ define void @test_cmov_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmov_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmovol %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnol %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovael %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoval %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoval %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovsl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnsl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovll %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovll %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlel %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovol (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnol (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovael (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoval (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovsl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnsl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovll (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlel (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgl (%rdx), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmov_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1868,6 +2001,72 @@ define void @test_cmov_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmov_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmovoq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnoq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmoveq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovneq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovbeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovaq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovsq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnsq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovlq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovleq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovgq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmovoq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnoq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmoveq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovneq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovbeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovaq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovsq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnsq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovnpq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovlq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgeq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovleq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    cmovgq (%rdx), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmov_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir b/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir
index 3c339bb013d08e7eb42892759ce21124243598ad..8fa3d82a966945f661c5ac74030c55da782449c8 100644
--- a/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir
+++ b/test/CodeGen/X86/coalesce-dbg-value-subreg-rewrite.mir
@@ -36,7 +36,7 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     %0:gr16_abcd = MOV16ri 0
-    DBG_VALUE debug-use %0.sub_8bit:gr16_abcd, debug-use $noreg, !11, !DIExpression(), debug-location !13
+    DBG_VALUE %0.sub_8bit:gr16_abcd, $noreg, !11, !DIExpression(), debug-location !13
     undef %6.sub_8bit:gr16_abcd = COPY killed %0.sub_8bit
     dead $dx = COPY killed %6
 
@@ -48,4 +48,4 @@ body:             |
 #
 # CHECK:      bb.0.entry:
 # CHECK-NEXT:    $dx = MOV16ri 0
-# CHECK-NEXT:    DBG_VALUE debug-use $dl,
+# CHECK-NEXT:    DBG_VALUE $dl,
diff --git a/test/CodeGen/X86/code-model-elf-memset.ll b/test/CodeGen/X86/code-model-elf-memset.ll
index ba34aaeddcbc6ee83362df3601a1415fb63ead09..2f429f32eab067301d97c7de4a7b639992dc9b99 100644
--- a/test/CodeGen/X86/code-model-elf-memset.ll
+++ b/test/CodeGen/X86/code-model-elf-memset.ll
@@ -56,16 +56,16 @@ define i32 @main() #0 {
 ; LARGE-PIC:       # %bb.0: # %entry
 ; LARGE-PIC-NEXT:    subq $424, %rsp # imm = 0x1A8
 ; LARGE-PIC-NEXT:    .cfi_def_cfa_offset 432
-; LARGE-PIC-NEXT:  .Ltmp0:
-; LARGE-PIC-NEXT:    leaq {{.*}}(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp0, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
+; LARGE-PIC-NEXT:  .L0$pb:
+; LARGE-PIC-NEXT:    leaq .L0$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movl $0, {{[0-9]+}}(%rsp)
 ; LARGE-PIC-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; LARGE-PIC-NEXT:    movabsq $memset@GOT, %rcx
+; LARGE-PIC-NEXT:    movabsq $memset@GOT, %rax
 ; LARGE-PIC-NEXT:    xorl %esi, %esi
 ; LARGE-PIC-NEXT:    movl $400, %edx # imm = 0x190
-; LARGE-PIC-NEXT:    callq *(%rax,%rcx)
+; LARGE-PIC-NEXT:    callq *(%rcx,%rax)
 ; LARGE-PIC-NEXT:    xorl %eax, %eax
 ; LARGE-PIC-NEXT:    addq $424, %rsp # imm = 0x1A8
 ; LARGE-PIC-NEXT:    .cfi_def_cfa_offset 8
diff --git a/test/CodeGen/X86/code-model-elf.ll b/test/CodeGen/X86/code-model-elf.ll
index 6d62f25617980351ca858420f958521555acf882..56d3f4c102f0f16a748b912ee5edfadb503cd220 100644
--- a/test/CodeGen/X86/code-model-elf.ll
+++ b/test/CodeGen/X86/code-model-elf.ll
@@ -2,12 +2,12 @@
 ; Run with --no_x86_scrub_rip because we care a lot about how globals are
 ; accessed in the code model.
 
-; RUN: llc < %s -relocation-model=static -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-STATIC
-; RUN: llc < %s -relocation-model=static -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-STATIC
-; RUN: llc < %s -relocation-model=static -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-STATIC
-; RUN: llc < %s -relocation-model=pic    -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-PIC
-; RUN: llc < %s -relocation-model=pic    -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-PIC
-; RUN: llc < %s -relocation-model=pic    -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-STATIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-STATIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-STATIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=small  | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-PIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-PIC
+; RUN: llc -verify-machineinstrs < %s -relocation-model=pic    -code-model=large  | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC
 
 ; Generated from this C source:
 ;
@@ -68,9 +68,9 @@ define dso_local i32* @lea_static_data() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_static_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp0:
-; LARGE-PIC-NEXT:    leaq .Ltmp0(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp0, %rax
+; LARGE-PIC-NEXT:  .L0$pb:
+; LARGE-PIC-NEXT:    leaq .L0$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $static_data@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -108,9 +108,9 @@ define dso_local i32* @lea_global_data() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_global_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp1:
-; LARGE-PIC-NEXT:    leaq .Ltmp1(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp1, %rax
+; LARGE-PIC-NEXT:  .L1$pb:
+; LARGE-PIC-NEXT:    leaq .L1$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L1$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -146,12 +146,12 @@ define dso_local i32* @lea_extern_data() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_extern_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp2:
-; LARGE-PIC-NEXT:    leaq .Ltmp2(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp2, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rcx
-; LARGE-PIC-NEXT:    movq (%rax,%rcx), %rax
+; LARGE-PIC-NEXT:  .L2$pb:
+; LARGE-PIC-NEXT:    leaq .L2$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L2$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rax
+; LARGE-PIC-NEXT:    movq (%rcx,%rax), %rax
 ; LARGE-PIC-NEXT:    retq
   ret i32* getelementptr inbounds ([10 x i32], [10 x i32]* @extern_data, i64 0, i64 0)
 }
@@ -188,12 +188,12 @@ define dso_local i32 @load_global_data() #0 {
 ;
 ; LARGE-PIC-LABEL: load_global_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp3:
-; LARGE-PIC-NEXT:    leaq .Ltmp3(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp3, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rcx
-; LARGE-PIC-NEXT:    movl 8(%rax,%rcx), %eax
+; LARGE-PIC-NEXT:  .L3$pb:
+; LARGE-PIC-NEXT:    leaq .L3$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L3$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $global_data@GOTOFF, %rax
+; LARGE-PIC-NEXT:    movl 8(%rcx,%rax), %eax
 ; LARGE-PIC-NEXT:    retq
   %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @global_data, i64 0, i64 2)
   ret i32 %rv
@@ -231,12 +231,12 @@ define dso_local i32 @load_extern_data() #0 {
 ;
 ; LARGE-PIC-LABEL: load_extern_data:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp4:
-; LARGE-PIC-NEXT:    leaq .Ltmp4(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp4, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rcx
-; LARGE-PIC-NEXT:    movq (%rax,%rcx), %rax
+; LARGE-PIC-NEXT:  .L4$pb:
+; LARGE-PIC-NEXT:    leaq .L4$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L4$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $extern_data@GOT, %rax
+; LARGE-PIC-NEXT:    movq (%rcx,%rax), %rax
 ; LARGE-PIC-NEXT:    movl 8(%rax), %eax
 ; LARGE-PIC-NEXT:    retq
   %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @extern_data, i64 0, i64 2)
@@ -287,9 +287,9 @@ define dso_local void ()* @lea_static_fn() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_static_fn:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp5:
-; LARGE-PIC-NEXT:    leaq .Ltmp5(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp5, %rax
+; LARGE-PIC-NEXT:  .L7$pb:
+; LARGE-PIC-NEXT:    leaq .L7$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L7$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $static_fn@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -325,9 +325,9 @@ define dso_local void ()* @lea_global_fn() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_global_fn:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp6:
-; LARGE-PIC-NEXT:    leaq .Ltmp6(%rip), %rcx
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp6, %rax
+; LARGE-PIC-NEXT:  .L8$pb:
+; LARGE-PIC-NEXT:    leaq .L8$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L8$pb, %rcx
 ; LARGE-PIC-NEXT:    addq %rax, %rcx
 ; LARGE-PIC-NEXT:    movabsq $global_fn@GOTOFF, %rax
 ; LARGE-PIC-NEXT:    addq %rcx, %rax
@@ -363,12 +363,12 @@ define dso_local void ()* @lea_extern_fn() #0 {
 ;
 ; LARGE-PIC-LABEL: lea_extern_fn:
 ; LARGE-PIC:       # %bb.0:
-; LARGE-PIC-NEXT:  .Ltmp7:
-; LARGE-PIC-NEXT:    leaq .Ltmp7(%rip), %rax
-; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp7, %rcx
-; LARGE-PIC-NEXT:    addq %rcx, %rax
-; LARGE-PIC-NEXT:    movabsq $extern_fn@GOT, %rcx
-; LARGE-PIC-NEXT:    movq (%rax,%rcx), %rax
+; LARGE-PIC-NEXT:  .L9$pb:
+; LARGE-PIC-NEXT:    leaq .L9$pb(%rip), %rax
+; LARGE-PIC-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L9$pb, %rcx
+; LARGE-PIC-NEXT:    addq %rax, %rcx
+; LARGE-PIC-NEXT:    movabsq $extern_fn@GOT, %rax
+; LARGE-PIC-NEXT:    movq (%rcx,%rax), %rax
 ; LARGE-PIC-NEXT:    retq
   ret void ()* @extern_fn
 }
diff --git a/test/CodeGen/X86/combine-64bit-vec-binop.ll b/test/CodeGen/X86/combine-64bit-vec-binop.ll
index e434bfc11c4c6a1691dbbd0e7bafcc4e8e9c5318..4bd1ebbc93edea5667564a897067f5c5d7b027f5 100644
--- a/test/CodeGen/X86/combine-64bit-vec-binop.ll
+++ b/test/CodeGen/X86/combine-64bit-vec-binop.ll
@@ -97,7 +97,7 @@ define double @test2_mul(double %A, double %B) {
   ret double %3
 }
 
-; There is no legal ISD::MUL with type MVT::v8i16.
+; There is no legal ISD::MUL with type MVT::v16i8.
 define double @test3_mul(double %A, double %B) {
 ; SSE41-LABEL: test3_mul:
 ; SSE41:       # %bb.0:
diff --git a/test/CodeGen/X86/combine-fabs.ll b/test/CodeGen/X86/combine-fabs.ll
index c71eeb3962319235fbac6ea5eb320399dd1f2d02..b779c589cf9dde2c04a49bdd2bbf9570eae9e1bf 100644
--- a/test/CodeGen/X86/combine-fabs.ll
+++ b/test/CodeGen/X86/combine-fabs.ll
@@ -24,12 +24,12 @@ define float @combine_fabs_constant() {
 define <4 x float> @combine_vec_fabs_constant() {
 ; SSE-LABEL: combine_vec_fabs_constant:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,0,2,2]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_fabs_constant:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,2,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,2.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> <float 0.0, float -0.0, float 2.0, float -2.0>)
   ret <4 x float> %1
diff --git a/test/CodeGen/X86/combine-fcopysign.ll b/test/CodeGen/X86/combine-fcopysign.ll
index 4b416085c5d3f8daa11387366d83b0042acb190d..5d27fdfa889589b79578a84fda49ecd0dc2aba52 100644
--- a/test/CodeGen/X86/combine-fcopysign.ll
+++ b/test/CodeGen/X86/combine-fcopysign.ll
@@ -62,7 +62,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant0(<4 x float> %x) {
 ;
 ; AVX-LABEL: combine_vec_fcopysign_neg_constant0:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> <float -2.0, float -2.0, float -2.0, float -2.0>)
@@ -77,7 +77,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant1(<4 x float> %x) {
 ;
 ; AVX-LABEL: combine_vec_fcopysign_neg_constant1:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> <float -0.0, float -2.0, float -4.0, float -8.0>)
@@ -92,7 +92,7 @@ define <4 x float> @combine_vec_fcopysign_fneg_fabs_sgn(<4 x float> %x, <4 x flo
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
@@ -112,7 +112,7 @@ define <4 x float> @combine_vec_fcopysign_fabs_mag(<4 x float> %x, <4 x float> %
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fabs_mag:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -134,7 +134,7 @@ define <4 x float> @combine_vec_fcopysign_fneg_mag(<4 x float> %x, <4 x float> %
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fneg_mag:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -156,7 +156,7 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_mag(<4 x float> %x, <4 x flo
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fcopysign_mag:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -178,7 +178,7 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_sgn(<4 x float> %x, <4 x flo
 ;
 ; AVX-LABEL: combine_vec_fcopysign_fcopysign_sgn:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
@@ -202,7 +202,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
 ; SSE-NEXT:    movaps {{.*#+}} xmm7
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    andps %xmm7, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm8 = [-0,-0]
+; SSE-NEXT:    movaps {{.*#+}} xmm8 = [-0.0E+0,-0.0E+0]
 ; SSE-NEXT:    andps %xmm8, %xmm4
 ; SSE-NEXT:    orps %xmm4, %xmm2
 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -232,7 +232,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
 ; AVX-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm2
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    vcvtps2pd %xmm1, %ymm1
-; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
@@ -249,7 +249,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
 ; SSE-NEXT:    movaps {{.*#+}} xmm5
 ; SSE-NEXT:    andps %xmm5, %xmm0
 ; SSE-NEXT:    cvtsd2ss %xmm1, %xmm6
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-0,-0,-0,-0]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; SSE-NEXT:    andps %xmm4, %xmm6
 ; SSE-NEXT:    orps %xmm6, %xmm0
 ; SSE-NEXT:    movshdup {{.*#+}} xmm6 = xmm3[1,1,3,3]
@@ -282,7 +282,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
 ; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vcvtpd2ps %ymm1, %xmm1
-; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX-NEXT:    vandpd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/combine-pmuldq.ll b/test/CodeGen/X86/combine-pmuldq.ll
index c735b204344f0613fedd74dd1a9912b93e6fcb86..cd58947b186db8c067326b615f0d56cd3523a241 100644
--- a/test/CodeGen/X86/combine-pmuldq.ll
+++ b/test/CodeGen/X86/combine-pmuldq.ll
@@ -130,3 +130,48 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
   %2 = mul nuw nsw <8 x i64> %1, <i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883, i64 715827883>
   ret <8 x i64> %2
 }
+
+define void @PR39398() {
+; SSE-LABEL: PR39398:
+; SSE:       # %bb.0: # %bb
+; SSE-NEXT:    .p2align 4, 0x90
+; SSE-NEXT:  .LBB5_1: # %bb10
+; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE-NEXT:    cmpl $232, %eax
+; SSE-NEXT:    jne .LBB5_1
+; SSE-NEXT:  # %bb.2: # %bb34
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39398:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    .p2align 4, 0x90
+; AVX-NEXT:  .LBB5_1: # %bb10
+; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    cmpl $232, %eax
+; AVX-NEXT:    jne .LBB5_1
+; AVX-NEXT:  # %bb.2: # %bb34
+; AVX-NEXT:    retq
+bb:
+  %tmp9 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+  br label %bb10
+
+bb10:                                             ; preds = %bb10, %bb
+  %tmp12 = phi <4 x i32> [ <i32 9, i32 8, i32 7, i32 6>, %bb ], [ zeroinitializer, %bb10 ]
+  %tmp16 = add <4 x i32> %tmp12, <i32 -4, i32 -4, i32 -4, i32 -4>
+  %tmp18 = zext <4 x i32> %tmp12 to <4 x i64>
+  %tmp19 = zext <4 x i32> %tmp16 to <4 x i64>
+  %tmp20 = xor <4 x i64> %tmp18, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %tmp21 = xor <4 x i64> %tmp19, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %tmp24 = mul <4 x i64> %tmp9, %tmp20
+  %tmp25 = mul <4 x i64> %tmp9, %tmp21
+  %tmp26 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp24
+  %tmp27 = select <4 x i1> undef, <4 x i64> zeroinitializer, <4 x i64> %tmp25
+  %tmp28 = add <4 x i64> zeroinitializer, %tmp26
+  %tmp29 = add <4 x i64> zeroinitializer, %tmp27
+  %tmp33 = icmp eq i32 undef, 232
+  br i1 %tmp33, label %bb34, label %bb10
+
+bb34:                                             ; preds = %bb10
+  %tmp35 = add <4 x i64> %tmp29, %tmp28
+  ret void
+}
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index 72d458c8513f3269cfac3f2b332cb501dab9cad6..a78ecd27c99057693f5d3bad8cea85b41655f5ba 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -107,99 +107,25 @@ define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (sdiv 0, x) -> 0
+; fold (sdiv 0, x) -> 0
 define i32 @combine_sdiv_zero(i32 %x) {
 ; CHECK-LABEL: combine_sdiv_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    idivl %edi
 ; CHECK-NEXT:    retq
   %1 = sdiv i32 0, %x
   ret i32 %1
 }
 
 define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
-; SSE2-LABEL: combine_vec_sdiv_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: combine_vec_sdiv_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %esi
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %esi
-; SSE41-NEXT:    movd %eax, %xmm1
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm1
-; SSE41-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_vec_sdiv_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_sdiv_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = sdiv <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
@@ -726,7 +652,8 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm3
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -777,7 +704,9 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; XOP-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
 ; XOP-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
 ; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; XOP-NEXT:    vpcmov {{.*}}(%rip), %ymm0, %ymm1, %ymm0
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
+; XOP-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
 ; XOP-NEXT:    retq
   %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
   ret <16 x i16> %1
@@ -960,7 +889,8 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; AVX1-NEXT:    vpsraw $1, %xmm5, %xmm5
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
 ; AVX1-NEXT:    vandnps %ymm0, %ymm5, %ymm0
 ; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
@@ -1055,7 +985,8 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; XOP-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
 ; XOP-NEXT:    vpshaw %xmm3, %xmm5, %xmm5
 ; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; XOP-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
+; XOP-NEXT:    # ymm5 = mem[0,1,0,1]
 ; XOP-NEXT:    vpcmov %ymm5, %ymm0, %ymm2, %ymm0
 ; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOP-NEXT:    vpsraw $15, %xmm2, %xmm6
@@ -3289,322 +3220,16 @@ define i1 @bool_sdiv(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_sdiv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    negb %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    cbtw
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    negb %sil
-; CHECK-NEXT:    idivb %sil
 ; CHECK-NEXT:    retq
   %r = sdiv i1 %x, %y
   ret i1 %r
 }
 
 define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
-; SSE2-LABEL: boolvec_sdiv:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pslld $31, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pslld $31, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    cltd
-; SSE2-NEXT:    idivl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: boolvec_sdiv:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pslld $31, %xmm1
-; SSE41-NEXT:    psrad $31, %xmm1
-; SSE41-NEXT:    pslld $31, %xmm0
-; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pextrd $1, %xmm0, %eax
-; SSE41-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    movd %xmm1, %esi
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %esi
-; SSE41-NEXT:    movd %eax, %xmm2
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE41-NEXT:    pextrd $2, %xmm0, %eax
-; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm2
-; SSE41-NEXT:    pextrd $3, %xmm0, %eax
-; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE41-NEXT:    cltd
-; SSE41-NEXT:    idivl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: boolvec_sdiv:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %ecx
-; AVX1-NEXT:    movl %eax, %ecx
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vmovd %xmm1, %esi
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %esi
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %ecx
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT:    cltd
-; AVX1-NEXT:    idivl %ecx
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: boolvec_sdiv:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %ecx
-; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %esi
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %ecx
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    cltd
-; AVX2-NEXT:    idivl %ecx
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: boolvec_sdiv:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k3
-; AVX512F-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
-; AVX512F-NEXT:    kshiftrw $3, %k4, %k1
-; AVX512F-NEXT:    kshiftrw $2, %k3, %k2
-; AVX512F-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512F-NEXT:    kmovw %k5, %ecx
-; AVX512F-NEXT:    kshiftrw $1, %k3, %k5
-; AVX512F-NEXT:    kmovw %k3, %edi
-; AVX512F-NEXT:    kshiftrw $1, %k4, %k3
-; AVX512F-NEXT:    kmovw %k4, %esi
-; AVX512F-NEXT:    kmovw %k5, %edx
-; AVX512F-NEXT:    kmovw %k3, %eax
-; AVX512F-NEXT:    andb $1, %al
-; AVX512F-NEXT:    negb %al
-; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    andb $1, %dl
-; AVX512F-NEXT:    negb %dl
-; AVX512F-NEXT:    idivb %dl
-; AVX512F-NEXT:    movl %eax, %edx
-; AVX512F-NEXT:    andb $1, %sil
-; AVX512F-NEXT:    negb %sil
-; AVX512F-NEXT:    movl %esi, %eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    andb $1, %dil
-; AVX512F-NEXT:    negb %dil
-; AVX512F-NEXT:    idivb %dil
-; AVX512F-NEXT:    movl %eax, %esi
-; AVX512F-NEXT:    andb $1, %cl
-; AVX512F-NEXT:    negb %cl
-; AVX512F-NEXT:    movl %ecx, %eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    kmovw %k2, %ecx
-; AVX512F-NEXT:    andb $1, %cl
-; AVX512F-NEXT:    negb %cl
-; AVX512F-NEXT:    idivb %cl
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    kmovw %k1, %eax
-; AVX512F-NEXT:    andb $1, %al
-; AVX512F-NEXT:    negb %al
-; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512F-NEXT:    cbtw
-; AVX512F-NEXT:    kmovw %k0, %edi
-; AVX512F-NEXT:    andb $1, %dil
-; AVX512F-NEXT:    negb %dil
-; AVX512F-NEXT:    idivb %dil
-; AVX512F-NEXT:    # kill: def $al killed $al def $eax
-; AVX512F-NEXT:    kmovw %edx, %k0
-; AVX512F-NEXT:    kmovw %esi, %k1
-; AVX512F-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512F-NEXT:    kxorw %k0, %k2, %k0
-; AVX512F-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512F-NEXT:    kxorw %k0, %k1, %k0
-; AVX512F-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512F-NEXT:    kmovw %ecx, %k2
-; AVX512F-NEXT:    kxorw %k2, %k1, %k1
-; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512F-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512F-NEXT:    kxorw %k1, %k0, %k0
-; AVX512F-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512F-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512F-NEXT:    korw %k1, %k0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: boolvec_sdiv:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512BW-NEXT:    vptestmd %xmm1, %xmm1, %k3
-; AVX512BW-NEXT:    kshiftrw $3, %k3, %k0
-; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k4
-; AVX512BW-NEXT:    kshiftrw $3, %k4, %k1
-; AVX512BW-NEXT:    kshiftrw $2, %k3, %k2
-; AVX512BW-NEXT:    kshiftrw $2, %k4, %k5
-; AVX512BW-NEXT:    kmovd %k5, %ecx
-; AVX512BW-NEXT:    kshiftrw $1, %k3, %k5
-; AVX512BW-NEXT:    kmovd %k3, %edi
-; AVX512BW-NEXT:    kshiftrw $1, %k4, %k3
-; AVX512BW-NEXT:    kmovd %k4, %esi
-; AVX512BW-NEXT:    kmovd %k5, %edx
-; AVX512BW-NEXT:    kmovd %k3, %eax
-; AVX512BW-NEXT:    andb $1, %al
-; AVX512BW-NEXT:    negb %al
-; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    andb $1, %dl
-; AVX512BW-NEXT:    negb %dl
-; AVX512BW-NEXT:    idivb %dl
-; AVX512BW-NEXT:    movl %eax, %edx
-; AVX512BW-NEXT:    andb $1, %sil
-; AVX512BW-NEXT:    negb %sil
-; AVX512BW-NEXT:    movl %esi, %eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    andb $1, %dil
-; AVX512BW-NEXT:    negb %dil
-; AVX512BW-NEXT:    idivb %dil
-; AVX512BW-NEXT:    movl %eax, %esi
-; AVX512BW-NEXT:    andb $1, %cl
-; AVX512BW-NEXT:    negb %cl
-; AVX512BW-NEXT:    movl %ecx, %eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    kmovd %k2, %ecx
-; AVX512BW-NEXT:    andb $1, %cl
-; AVX512BW-NEXT:    negb %cl
-; AVX512BW-NEXT:    idivb %cl
-; AVX512BW-NEXT:    movl %eax, %ecx
-; AVX512BW-NEXT:    kmovd %k1, %eax
-; AVX512BW-NEXT:    andb $1, %al
-; AVX512BW-NEXT:    negb %al
-; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT:    cbtw
-; AVX512BW-NEXT:    kmovd %k0, %edi
-; AVX512BW-NEXT:    andb $1, %dil
-; AVX512BW-NEXT:    negb %dil
-; AVX512BW-NEXT:    idivb %dil
-; AVX512BW-NEXT:    # kill: def $al killed $al def $eax
-; AVX512BW-NEXT:    kmovd %edx, %k0
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    kshiftrw $1, %k1, %k2
-; AVX512BW-NEXT:    kxorw %k0, %k2, %k0
-; AVX512BW-NEXT:    kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT:    kshiftrw $14, %k0, %k0
-; AVX512BW-NEXT:    kxorw %k0, %k1, %k0
-; AVX512BW-NEXT:    kshiftrw $2, %k0, %k1
-; AVX512BW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-NEXT:    kxorw %k2, %k1, %k1
-; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT:    kshiftrw $13, %k1, %k1
-; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
-; AVX512BW-NEXT:    kshiftlw $13, %k0, %k0
-; AVX512BW-NEXT:    kshiftrw $13, %k0, %k0
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    kshiftlw $3, %k1, %k1
-; AVX512BW-NEXT:    korw %k1, %k0, %k1
-; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; XOP-LABEL: boolvec_sdiv:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vpslld $31, %xmm1, %xmm1
-; XOP-NEXT:    vpsrad $31, %xmm1, %xmm1
-; XOP-NEXT:    vpslld $31, %xmm0, %xmm0
-; XOP-NEXT:    vpsrad $31, %xmm0, %xmm0
-; XOP-NEXT:    vpextrd $1, %xmm0, %eax
-; XOP-NEXT:    vpextrd $1, %xmm1, %ecx
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %ecx
-; XOP-NEXT:    movl %eax, %ecx
-; XOP-NEXT:    vmovd %xmm0, %eax
-; XOP-NEXT:    vmovd %xmm1, %esi
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %esi
-; XOP-NEXT:    vmovd %eax, %xmm2
-; XOP-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $2, %xmm0, %eax
-; XOP-NEXT:    vpextrd $2, %xmm1, %ecx
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %ecx
-; XOP-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $3, %xmm0, %eax
-; XOP-NEXT:    vpextrd $3, %xmm1, %ecx
-; XOP-NEXT:    cltd
-; XOP-NEXT:    idivl %ecx
-; XOP-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; XOP-NEXT:    retq
+; CHECK-LABEL: boolvec_sdiv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %r = sdiv <4 x i1> %x, %y
   ret <4 x i1> %r
 }
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index 7af33fea6dbc69004b7fde16a72db1b4a8a99b5c..dab3bdcedb264653e1b36037e2fc7f3cc152bb89 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -100,14 +100,11 @@ define <4 x i32> @combine_vec_srem_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (srem 0, x) -> 0
+; fold (srem 0, x) -> 0
 define i32 @combine_srem_zero(i32 %x) {
 ; CHECK-LABEL: combine_srem_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    idivl %edi
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = srem i32 0, %x
   ret i32 %1
@@ -116,53 +113,12 @@ define i32 @combine_srem_zero(i32 %x) {
 define <4 x i32> @combine_vec_srem_zero(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_srem_zero:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %esi
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_srem_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    movl %edx, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = srem <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
@@ -462,16 +418,7 @@ define i32 @ossfuzz6883() {
 define i1 @bool_srem(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_srem:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andb $1, %al
-; CHECK-NEXT:    negb %al
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    cbtw
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    negb %sil
-; CHECK-NEXT:    idivb %sil
-; CHECK-NEXT:    movsbl %ah, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    retq
   %r = srem i1 %x, %y
   ret i1 %r
@@ -479,61 +426,12 @@ define i1 @bool_srem(i1 %x, i1 %y) {
 define <4 x i1> @boolvec_srem(<4 x i1> %x, <4 x i1> %y) {
 ; SSE-LABEL: boolvec_srem:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pslld $31, %xmm1
-; SSE-NEXT:    psrad $31, %xmm1
-; SSE-NEXT:    pslld $31, %xmm0
-; SSE-NEXT:    psrad $31, %xmm0
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movd %xmm1, %esi
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    movd %edx, %xmm2
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm2
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: boolvec_srem:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    movl %edx, %ecx
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vmovd %xmm1, %esi
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %edx, %xmm2
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = srem <4 x i1> %x, %y
   ret <4 x i1> %r
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 9bd0be073f6bd293c9f47ddf6afc7640a31c80df..80dcb29209b431545752d03bfdb40d12505b85d9 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -63,17 +63,7 @@ define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
 define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_lshr_known_zero1:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $11, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrld $9, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $10, %xmm1
-; SSE-NEXT:    psrld $8, %xmm0
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_known_zero1:
@@ -357,55 +347,50 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; SSE-NEXT:    movdqa %xmm3, %xmm4
-; SSE-NEXT:    pshufb %xmm1, %xmm4
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    psrlw $4, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT:    movdqa %xmm3, %xmm4
 ; SSE-NEXT:    pshufb %xmm1, %xmm3
 ; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm4, %xmm1
-; SSE-NEXT:    paddb %xmm3, %xmm1
-; SSE-NEXT:    movdqa %xmm0, %xmm3
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; SSE-NEXT:    psrlw $8, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT:    pand %xmm0, %xmm5
+; SSE-NEXT:    pshufb %xmm5, %xmm4
+; SSE-NEXT:    pand %xmm1, %xmm4
+; SSE-NEXT:    paddb %xmm4, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
 ; SSE-NEXT:    psrlw $8, %xmm1
-; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    psrlw $8, %xmm3
+; SSE-NEXT:    paddw %xmm1, %xmm3
 ; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
 ; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    psrld $5, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm0
+; SSE-NEXT:    psrld $16, %xmm3
+; SSE-NEXT:    paddd %xmm3, %xmm0
+; SSE-NEXT:    psrld $5, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm4
-; AVX-NEXT:    vpand %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm1, %xmm5
-; AVX-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX-NEXT:    vpcmpeqb %xmm4, %xmm0, %xmm2
-; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm2
+; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm4
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
+; AVX-NEXT:    vpand %xmm3, %xmm4, %xmm3
+; AVX-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
+; AVX-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm3
+; AVX-NEXT:    vpsrlw $8, %xmm3, %xmm3
+; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm3
 ; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index d31d2504d65ff0572d3269c1bfe37e5621a4c053..346e54476273ddca8193293715d0cb57c76005d4 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -90,124 +90,30 @@ define <4 x i32> @combine_vec_udiv_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (udiv 0, x) -> 0
+; fold (udiv 0, x) -> 0
 define i32 @combine_udiv_zero(i32 %x) {
 ; CHECK-LABEL: combine_udiv_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %edi
 ; CHECK-NEXT:    retq
   %1 = udiv i32 0, %x
   ret i32 %1
 }
 
 define <4 x i32> @combine_vec_udiv_zero(<4 x i32> %x) {
-; SSE2-LABEL: combine_vec_udiv_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: combine_vec_udiv_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %esi
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %esi
-; SSE41-NEXT:    movd %eax, %xmm1
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE41-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm1
-; SSE41-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_vec_udiv_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_udiv_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %esi
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_udiv_zero:
 ; XOP:       # %bb.0:
-; XOP-NEXT:    vpextrd $1, %xmm0, %ecx
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    movl %eax, %ecx
-; XOP-NEXT:    vmovd %xmm0, %esi
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %esi
-; XOP-NEXT:    vmovd %eax, %xmm1
-; XOP-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; XOP-NEXT:    vpextrd $2, %xmm0, %ecx
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; XOP-NEXT:    vpextrd $3, %xmm0, %ecx
-; XOP-NEXT:    xorl %eax, %eax
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; XOP-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; XOP-NEXT:    retq
   %1 = udiv <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
@@ -763,20 +669,15 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmullw %xmm3, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    packuswb %xmm0, %xmm3
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    psllw $1, %xmm3
-; SSE41-NEXT:    psllw $8, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    psllw $8, %xmm0
+; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; SSE41-NEXT:    psllw $1, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
 ; SSE41-NEXT:    psrlw $8, %xmm2
-; SSE41-NEXT:    packuswb %xmm0, %xmm2
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
@@ -787,21 +688,16 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; AVX1-NEXT:    movl $171, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm2
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpsllw $1, %xmm3, %xmm3
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
@@ -911,166 +807,17 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 define i1 @bool_udiv(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_udiv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    # kill: def $eax killed $eax def $ax
-; CHECK-NEXT:    divb %sil
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %r = udiv i1 %x, %y
   ret i1 %r
 }
 
 define <4 x i1> @boolvec_udiv(<4 x i1> %x, <4 x i1> %y) {
-; SSE2-LABEL: boolvec_udiv:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; SSE2-NEXT:    movd %xmm2, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movd %xmm3, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movd %xmm1, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    xorl %edx, %edx
-; SSE2-NEXT:    divl %ecx
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: boolvec_udiv:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pextrd $1, %xmm0, %eax
-; SSE41-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    movl %eax, %ecx
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    movd %xmm1, %esi
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %esi
-; SSE41-NEXT:    movd %eax, %xmm2
-; SSE41-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE41-NEXT:    pextrd $2, %xmm0, %eax
-; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm2
-; SSE41-NEXT:    pextrd $3, %xmm0, %eax
-; SSE41-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE41-NEXT:    xorl %edx, %edx
-; SSE41-NEXT:    divl %ecx
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: boolvec_udiv:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    movl %eax, %ecx
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vmovd %xmm1, %esi
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %esi
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: boolvec_udiv:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    movl %eax, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %esi
-; AVX2-NEXT:    vmovd %eax, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    retq
-;
-; XOP-LABEL: boolvec_udiv:
-; XOP:       # %bb.0:
-; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
-; XOP-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; XOP-NEXT:    vpextrd $1, %xmm0, %eax
-; XOP-NEXT:    vpextrd $1, %xmm1, %ecx
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    movl %eax, %ecx
-; XOP-NEXT:    vmovd %xmm0, %eax
-; XOP-NEXT:    vmovd %xmm1, %esi
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %esi
-; XOP-NEXT:    vmovd %eax, %xmm2
-; XOP-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $2, %xmm0, %eax
-; XOP-NEXT:    vpextrd $2, %xmm1, %ecx
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; XOP-NEXT:    vpextrd $3, %xmm0, %eax
-; XOP-NEXT:    vpextrd $3, %xmm1, %ecx
-; XOP-NEXT:    xorl %edx, %edx
-; XOP-NEXT:    divl %ecx
-; XOP-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; XOP-NEXT:    retq
+; CHECK-LABEL: boolvec_udiv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    retq
   %r = udiv <4 x i1> %x, %y
   ret <4 x i1> %r
 }
diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll
index 11505edfb65b5e855133873b6ba7ad120c0de01c..b21ed8ec60cef83770ccdda12b0f6babf54da112 100644
--- a/test/CodeGen/X86/combine-urem.ll
+++ b/test/CodeGen/X86/combine-urem.ll
@@ -89,14 +89,11 @@ define <4 x i32> @combine_vec_urem_by_minsigned(<4 x i32> %x) {
   ret <4 x i32> %1
 }
 
-; TODO fold (urem 0, x) -> 0
+; fold (urem 0, x) -> 0
 define i32 @combine_urem_zero(i32 %x) {
 ; CHECK-LABEL: combine_urem_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:    divl %edi
-; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = urem i32 0, %x
   ret i32 %1
@@ -105,53 +102,12 @@ define i32 @combine_urem_zero(i32 %x) {
 define <4 x i32> @combine_vec_urem_zero(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_urem_zero:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrd $1, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %esi
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %esi
-; SSE-NEXT:    movd %edx, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm0, %ecx
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_urem_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    movl %edx, %ecx
-; AVX-NEXT:    vmovd %xmm0, %esi
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %esi
-; AVX-NEXT:    vmovd %edx, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %ecx
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    xorl %edx, %edx
-; AVX-NEXT:    divl %ecx
-; AVX-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = urem <4 x i32> zeroinitializer, %x
   ret <4 x i32> %1
@@ -383,13 +339,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
 define i1 @bool_urem(i1 %x, i1 %y) {
 ; CHECK-LABEL: bool_urem:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andb $1, %sil
-; CHECK-NEXT:    andb $1, %dil
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    # kill: def $eax killed $eax def $ax
-; CHECK-NEXT:    divb %sil
-; CHECK-NEXT:    movzbl %ah, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    retq
   %r = urem i1 %x, %y
   ret i1 %r
@@ -398,88 +348,13 @@ define i1 @bool_urem(i1 %x, i1 %y) {
 define <4 x i1> @boolvec_urem(<4 x i1> %x, <4 x i1> %y) {
 ; SSE-LABEL: boolvec_urem:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    pextrd $1, %xmm1, %ecx
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    movl %edx, %ecx
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movd %xmm1, %esi
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %esi
-; SSE-NEXT:    movd %edx, %xmm2
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm2
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    pextrd $2, %xmm1, %ecx
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $2, %edx, %xmm2
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    pextrd $3, %xmm1, %ecx
-; SSE-NEXT:    xorl %edx, %edx
-; SSE-NEXT:    divl %ecx
-; SSE-NEXT:    pinsrd $3, %edx, %xmm2
-; SSE-NEXT:    movdqa %xmm2, %xmm0
+; SSE-NEXT:    xorps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: boolvec_urem:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1]
-; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    movl %edx, %ecx
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vmovd %xmm1, %esi
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %esi
-; AVX1-NEXT:    vmovd %edx, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX1-NEXT:    xorl %edx, %edx
-; AVX1-NEXT:    divl %ecx
-; AVX1-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: boolvec_urem:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
-; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    movl %edx, %ecx
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vmovd %xmm1, %esi
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %esi
-; AVX2-NEXT:    vmovd %edx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
-; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    divl %ecx
-; AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: boolvec_urem:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %r = urem <4 x i1> %x, %y
   ret <4 x i1> %r
 }
diff --git a/test/CodeGen/X86/copy-eflags.ll b/test/CodeGen/X86/copy-eflags.ll
index 10fccacf1936585f911b321101944f5ed5127fc0..1e9a598c6511198e07010d0e6fc256e00870fb89 100644
--- a/test/CodeGen/X86/copy-eflags.ll
+++ b/test/CodeGen/X86/copy-eflags.ll
@@ -200,45 +200,37 @@ else:
 ; Test a function that gets special select lowering into CFG with copied EFLAGS
 ; threaded across the CFG. This requires our EFLAGS copy rewriting to handle
 ; cross-block rewrites in at least some narrow cases.
-define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2) {
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2, i32 %x) nounwind {
 ; X32-LABEL: PR37100:
 ; X32:       # %bb.0: # %bb
 ; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    .cfi_def_cfa_offset 12
 ; X32-NEXT:    pushl %edi
-; X32-NEXT:    .cfi_def_cfa_offset 16
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 20
-; X32-NEXT:    .cfi_offset %esi, -20
-; X32-NEXT:    .cfi_offset %edi, -16
-; X32-NEXT:    .cfi_offset %ebx, -12
-; X32-NEXT:    .cfi_offset %ebp, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X32-NEXT:    jmp .LBB3_1
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_5: # %bb1
 ; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    xorl %edx, %edx
-; X32-NEXT:    idivl %ebp
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    cltd
+; X32-NEXT:    idivl %edi
 ; X32-NEXT:  .LBB3_1: # %bb1
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X32-NEXT:    movsbl %cl, %eax
 ; X32-NEXT:    movl %eax, %edx
 ; X32-NEXT:    sarl $31, %edx
-; X32-NEXT:    cmpl %eax, %esi
+; X32-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    sbbl %edx, %eax
 ; X32-NEXT:    setl %al
 ; X32-NEXT:    setl %dl
-; X32-NEXT:    movzbl %dl, %ebp
-; X32-NEXT:    negl %ebp
+; X32-NEXT:    movzbl %dl, %edi
+; X32-NEXT:    negl %edi
 ; X32-NEXT:    testb %al, %al
 ; X32-NEXT:    jne .LBB3_3
 ; X32-NEXT:  # %bb.2: # %bb1
@@ -246,33 +238,34 @@ define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %
 ; X32-NEXT:    movb %ch, %cl
 ; X32-NEXT:  .LBB3_3: # %bb1
 ; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X32-NEXT:    movb %cl, (%ebx)
-; X32-NEXT:    movl (%edi), %edx
+; X32-NEXT:    movb %cl, (%ebp)
+; X32-NEXT:    movl (%ebx), %edx
 ; X32-NEXT:    testb %al, %al
 ; X32-NEXT:    jne .LBB3_5
 ; X32-NEXT:  # %bb.4: # %bb1
 ; X32-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X32-NEXT:    movl %edx, %ebp
+; X32-NEXT:    movl %edx, %edi
 ; X32-NEXT:    jmp .LBB3_5
 ;
 ; X64-LABEL: PR37100:
 ; X64:       # %bb.0: # %bb
-; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq %rdx, %r11
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; X64-NEXT:    jmp .LBB3_1
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_5: # %bb1
 ; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    movl %r10d, %eax
+; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %esi
 ; X64-NEXT:  .LBB3_1: # %bb1
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movsbq %dil, %rax
 ; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    cmpq %rax, %r11
 ; X64-NEXT:    setl %sil
 ; X64-NEXT:    negl %esi
-; X64-NEXT:    cmpq %rax, %r10
+; X64-NEXT:    cmpq %rax, %r11
 ; X64-NEXT:    jl .LBB3_3
 ; X64-NEXT:  # %bb.2: # %bb1
 ; X64-NEXT:    # in Loop: Header=BB3_1 Depth=1
@@ -300,7 +293,7 @@ bb1:
   store volatile i8 %tmp8, i8* %ptr1
   %tmp9 = load volatile i32, i32* %ptr2
   %tmp10 = select i1 %tmp6, i32 %tmp7, i32 %tmp9
-  %tmp11 = srem i32 0, %tmp10
+  %tmp11 = srem i32 %x, %tmp10
   %tmp12 = trunc i32 %tmp11 to i16
   br label %bb1
 }
@@ -308,47 +301,46 @@ bb1:
 ; Use a particular instruction pattern in order to lower to the post-RA pseudo
 ; used to lower SETB into an SBB pattern in order to make sure that kind of
 ; usage of a copied EFLAGS continues to work.
-define void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3) {
+define void @PR37431(i32* %arg1, i8* %arg2, i8* %arg3, i32 %x) nounwind {
 ; X32-LABEL: PR37431:
 ; X32:       # %bb.0: # %entry
+; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %esi, -8
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl (%eax), %eax
 ; X32-NEXT:    movl %eax, %ecx
 ; X32-NEXT:    sarl $31, %ecx
 ; X32-NEXT:    cmpl %eax, %eax
 ; X32-NEXT:    sbbl %ecx, %eax
-; X32-NEXT:    setb %al
-; X32-NEXT:    sbbb %cl, %cl
+; X32-NEXT:    setb %cl
+; X32-NEXT:    sbbb %dl, %dl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    movb %cl, (%edx)
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    xorl %ecx, %ecx
-; X32-NEXT:    subl %eax, %ecx
-; X32-NEXT:    xorl %eax, %eax
-; X32-NEXT:    xorl %edx, %edx
-; X32-NEXT:    idivl %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movb %dl, (%edi)
+; X32-NEXT:    movzbl %cl, %ecx
+; X32-NEXT:    xorl %edi, %edi
+; X32-NEXT:    subl %ecx, %edi
+; X32-NEXT:    cltd
+; X32-NEXT:    idivl %edi
 ; X32-NEXT:    movb %dl, (%esi)
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    .cfi_def_cfa_offset 4
+; X32-NEXT:    popl %edi
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: PR37431:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movslq (%rdi), %rax
-; X64-NEXT:    cmpq %rax, %rax
-; X64-NEXT:    sbbb %dl, %dl
-; X64-NEXT:    cmpq %rax, %rax
-; X64-NEXT:    movb %dl, (%rsi)
-; X64-NEXT:    sbbl %esi, %esi
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    idivl %esi
-; X64-NEXT:    movb %dl, (%rcx)
+; X64-NEXT:    movl %ecx, %eax
+; X64-NEXT:    movq %rdx, %r8
+; X64-NEXT:    movslq (%rdi), %rdx
+; X64-NEXT:    cmpq %rdx, %rax
+; X64-NEXT:    sbbb %cl, %cl
+; X64-NEXT:    cmpq %rdx, %rax
+; X64-NEXT:    movb %cl, (%rsi)
+; X64-NEXT:    sbbl %ecx, %ecx
+; X64-NEXT:    cltd
+; X64-NEXT:    idivl %ecx
+; X64-NEXT:    movb %dl, (%r8)
 ; X64-NEXT:    retq
 entry:
   %tmp = load i32, i32* %arg1
@@ -358,7 +350,7 @@ entry:
   %tmp4 = sub i8 0, %tmp3
   store i8 %tmp4, i8* %arg2
   %tmp5 = sext i8 %tmp4 to i32
-  %tmp6 = srem i32 0, %tmp5
+  %tmp6 = srem i32 %x, %tmp5
   %tmp7 = trunc i32 %tmp6 to i8
   store i8 %tmp7, i8* %arg3
   ret void
diff --git a/test/CodeGen/X86/cpus-amd-no-x86_64.ll b/test/CodeGen/X86/cpus-amd-no-x86_64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0dadc599abdb959f21444584e4ca290afc691821
--- /dev/null
+++ b/test/CodeGen/X86/cpus-amd-no-x86_64.ll
@@ -0,0 +1,17 @@
+; Check that we reject 64-bit mode on 32-bit only CPUs.
+; CHECK-NO-ERROR-NOT: not a recognized processor for this target
+; CHECK-ERROR64: LLVM ERROR: 64-bit code requested on a subtarget that doesn't support it!
+
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-tbird 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-mp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=geode 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/X86/cpus-intel-no-x86_64.ll b/test/CodeGen/X86/cpus-intel-no-x86_64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d28ac9a83fd82c7b89d64bb4ed69d86a6443e1c8
--- /dev/null
+++ b/test/CodeGen/X86/cpus-intel-no-x86_64.ll
@@ -0,0 +1,24 @@
+; Check that we reject 64-bit mode on 32-bit only CPUs.
+; CHECK-NO-ERROR-NOT: not a recognized processor for this target
+; CHECK-ERROR64: LLVM ERROR: 64-bit code requested on a subtarget that doesn't support it!
+
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i386 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i486 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i586 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-mmx 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i686 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentiumpro 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=lakemont 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
+
+define void @foo() {
+  ret void
+}
diff --git a/test/CodeGen/X86/cpus-no-x86_64.ll b/test/CodeGen/X86/cpus-no-x86_64.ll
index de873c81205af5edba91b7f05717e6ca7884b693..e2e000386717ea87a326887b4b88650ce92568b4 100644
--- a/test/CodeGen/X86/cpus-no-x86_64.ll
+++ b/test/CodeGen/X86/cpus-no-x86_64.ll
@@ -2,31 +2,6 @@
 ; CHECK-NO-ERROR-NOT: not a recognized processor for this target
 ; CHECK-ERROR64: LLVM ERROR: 64-bit code requested on a subtarget that doesn't support it!
 
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i386 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i486 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i586 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-mmx 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=i686 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentiumpro 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=lakemont 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k6-3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-tbird 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-mp 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
-; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=geode 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
 ; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=winchip-c6 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
 ; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=winchip2 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
 ; RUN: not llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=c3 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR64
diff --git a/test/CodeGen/X86/crash-O0.ll b/test/CodeGen/X86/crash-O0.ll
index dab15c19c69ed9e349730c2f0a954e8e37ffd915..1a234d45cb21904940233ffb55c2e5f3a42179d3 100644
--- a/test/CodeGen/X86/crash-O0.ll
+++ b/test/CodeGen/X86/crash-O0.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -O0 -relocation-model=pic -disable-fp-elim < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10"
@@ -9,6 +10,35 @@ target triple = "x86_64-apple-darwin10"
 ; aliased registers (AX and AL) - RegAllocFast does not like that.
 ; PR7312
 define i32 @div8() nounwind {
+; CHECK-LABEL: div8:
+; CHECK:       ## %bb.0: ## %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    ## implicit-def: $rdx
+; CHECK-NEXT:    movb %dl, %sil
+; CHECK-NEXT:    movzbw %cl, %ax
+; CHECK-NEXT:    divb %sil
+; CHECK-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Spill
+; CHECK-NEXT:    movzbw %cl, %ax
+; CHECK-NEXT:    divb %sil
+; CHECK-NEXT:    shrw $8, %ax
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    cmpb %sil, %cl
+; CHECK-NEXT:    jae LBB0_2
+; CHECK-NEXT:  ## %bb.1: ## %"39"
+; CHECK-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al ## 1-byte Reload
+; CHECK-NEXT:    movzbl %al, %ecx
+; CHECK-NEXT:    ## implicit-def: $edx
+; CHECK-NEXT:    imull %edx, %ecx
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    cmpl %edx, %ecx
+; CHECK-NEXT:    je LBB0_3
+; CHECK-NEXT:  LBB0_2: ## %"40"
+; CHECK-NEXT:    ud2
+; CHECK-NEXT:  LBB0_3: ## %"41"
+; CHECK-NEXT:    ud2
 entry:
   %0 = trunc i64 undef to i8                      ; <i8> [#uses=3]
   %1 = udiv i8 0, %0                              ; <i8> [#uses=1]
@@ -38,12 +68,22 @@ entry:
 ; An instruction gets between CQO and DIV64 because the load is folded
 ; into the division but it requires a sign extension.
 ; PR21700
-; CHECK-LABEL: addressModeWith32bitIndex:
-; CHECK: cqto
-; CHECK-NEXT: movslq
-; CHECK-NEXT: idivq
-; CHECK: retq
 define i64 @addressModeWith32bitIndex(i32 %V) {
+; CHECK-LABEL: addressModeWith32bitIndex:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    cqto
+; CHECK-NEXT:    movslq %edi, %rsi
+; CHECK-NEXT:    idivq (%rcx,%rsi,8)
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
   %gep = getelementptr i64, i64* null, i32 %V
   %load = load i64, i64* %gep
   %sdiv = sdiv i64 0, %load
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index 974324f4bdb4e56ba27de2d3b6608d09a7320c6a..c755d5f8bd7add836e0217279860d481bf0a2f1a 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -8,26 +8,27 @@
 define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v) {
 ; X32-LABEL: uitofp_2i32_cvt_buildvector:
 ; X32:       # %bb.0:
-; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X32-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X32-NEXT:    psrld $16, %xmm1
-; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X32-NEXT:    addps {{\.LCPI.*}}, %xmm1
-; X32-NEXT:    addps %xmm2, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT:    orpd %xmm2, %xmm1
+; X32-NEXT:    subsd %xmm2, %xmm1
+; X32-NEXT:    cvtsd2ss %xmm1, %xmm1
+; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT:    orpd %xmm2, %xmm3
+; X32-NEXT:    subsd %xmm2, %xmm3
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    cvtsd2ss %xmm3, %xmm2
+; X32-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X32-NEXT:    mulps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: uitofp_2i32_cvt_buildvector:
 ; X64:       # %bb.0:
-; X64-NEXT:    movd %edi, %xmm1
-; X64-NEXT:    pinsrd $1, %esi, %xmm1
-; X64-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; X64-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; X64-NEXT:    psrld $16, %xmm1
-; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2],mem[3],xmm1[4],mem[5],xmm1[6],mem[7]
-; X64-NEXT:    addps {{.*}}(%rip), %xmm1
-; X64-NEXT:    addps %xmm2, %xmm1
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    cvtsi2ssq %rax, %xmm1
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    cvtsi2ssq %rax, %xmm2
+; X64-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; X64-NEXT:    mulps %xmm1, %xmm0
 ; X64-NEXT:    retq
   %t1 = uitofp i32 %x to float
@@ -44,7 +45,7 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v)
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X32-NEXT:    movapd {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; X32-NEXT:    movapd {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X32-NEXT:    orpd %xmm1, %xmm2
 ; X32-NEXT:    subpd %xmm1, %xmm2
 ; X32-NEXT:    cvtpd2ps %xmm2, %xmm1
@@ -56,7 +57,7 @@ define <2 x float> @uitofp_2i32_buildvector_cvt(i32 %x, i32 %y, <2 x float> %v)
 ; X64-NEXT:    movd %esi, %xmm1
 ; X64-NEXT:    movd %edi, %xmm2
 ; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-NEXT:    movdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X64-NEXT:    por %xmm1, %xmm2
 ; X64-NEXT:    subpd %xmm1, %xmm2
 ; X64-NEXT:    cvtpd2ps %xmm2, %xmm1
@@ -74,7 +75,7 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
 ; X32-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X32-NEXT:    movaps {{.*#+}} xmm0 = [4503599627370496,4503599627370496]
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X32-NEXT:    orps %xmm0, %xmm2
 ; X32-NEXT:    subpd %xmm0, %xmm2
 ; X32-NEXT:    cvtpd2ps %xmm2, %xmm0
@@ -85,7 +86,7 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
 ; X64-NEXT:    blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [4503599627370496,4503599627370496]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [4.503599627370496E+15,4.503599627370496E+15]
 ; X64-NEXT:    orps %xmm0, %xmm2
 ; X64-NEXT:    subpd %xmm0, %xmm2
 ; X64-NEXT:    cvtpd2ps %xmm2, %xmm0
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir b/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir
index 1a9221ae9e95b4cd8d66ce4d8d40eed029d1ebd9..def14391a514920c255ae505c4466a9ee1ff64a4 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding2.mir
@@ -207,7 +207,7 @@ body:             |
     liveins: $rdi
   
     dead renamable $al = MOV8rm $rsp, 1, $noreg, -121, $noreg 
-    DBG_VALUE debug-use $al, debug-use $noreg, !16, !DIExpression(), debug-location !19
+    DBG_VALUE $al, $noreg, !16, !DIExpression(), debug-location !19
     renamable $ecx = XOR32rr undef $ecx, undef $ecx, implicit-def dead $eflags
     JMP_1 %bb.1
 ...
diff --git a/test/CodeGen/X86/dbg-value-superreg-copy.mir b/test/CodeGen/X86/dbg-value-superreg-copy.mir
index cd316dea88b98007e212eed8e15f68948aa71061..0a8af06b9abd1c1e4788d4ce18e033f17a8ebeb2 100644
--- a/test/CodeGen/X86/dbg-value-superreg-copy.mir
+++ b/test/CodeGen/X86/dbg-value-superreg-copy.mir
@@ -37,7 +37,7 @@ body:             |
     %0:gr16_abcd = MOV16ri 1
 
   bb.1:
-    DBG_VALUE debug-use %0.sub_8bit_hi, debug-use $noreg, !7, !DIExpression(), debug-location !9
+    DBG_VALUE %0.sub_8bit_hi, $noreg, !7, !DIExpression(), debug-location !9
     %1:gr16 = COPY %0
     %2:gr16 = COPY %0
 
diff --git a/test/CodeGen/X86/debug-loclists.ll b/test/CodeGen/X86/debug-loclists.ll
new file mode 100644
index 0000000000000000000000000000000000000000..874cdc196e4d04e83a77b9cbde31ca67df099bd3
--- /dev/null
+++ b/test/CodeGen/X86/debug-loclists.ll
@@ -0,0 +1,142 @@
+; RUN: llc -mtriple=x86_64-pc-linux -filetype=obj -o %t < %s
+; RUN: llvm-dwarfdump -v %t | FileCheck %s
+
+; CHECK:      0x00000033: DW_TAG_formal_parameter [3]
+; CHECK-NEXT:               DW_AT_location [DW_FORM_sec_offset]   (0x0000000c
+; CHECK-NEXT:                  [0x0000000000000000, 0x0000000000000004): DW_OP_breg5 RDI+0
+; CHECK-NEXT:                  [0x0000000000000004, 0x0000000000000012): DW_OP_breg3 RBX+0)
+; CHECK-NEXT:               DW_AT_name [DW_FORM_strx1]    ( indexed (0000000e) string = "a")
+; CHECK-NEXT:               DW_AT_decl_file [DW_FORM_data1]       ("/home/folder{{\\|\/}}test.cc")
+; CHECK-NEXT:               DW_AT_decl_line [DW_FORM_data1]       (6)
+; CHECK-NEXT:               DW_AT_type [DW_FORM_ref4]     (cu + 0x0040 => {0x00000040} "A")
+
+; CHECK:      .debug_loclists contents:
+; CHECK-NEXT: 0x00000000: locations list header: length = 0x00000017, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+; CHECK-NEXT: 0x00000000:
+; CHECK-NEXT:  [0x0000000000000000, 0x0000000000000004): DW_OP_breg5 RDI+0
+; CHECK-NEXT:  [0x0000000000000004, 0x0000000000000012): DW_OP_breg3 RBX+0
+
+; There is no way to use llvm-dwarfdump atm (2018, october) to verify the DW_LLE_* codes emited,
+; because dumper is not yet implements that. Use asm code to do this check instead.
+;
+; RUN: llc -mtriple=x86_64-pc-linux -filetype=asm < %s -o - | FileCheck %s --check-prefix=ASM
+; ASM:      .section .debug_loclists,"",@progbits
+; ASM-NEXT: .long .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0 # Length
+; ASM-NEXT: .Ldebug_loclist_table_start0:
+; ASM-NEXT:  .short 5                              # Version
+; ASM-NEXT:  .byte 8                               # Address size
+; ASM-NEXT:  .byte 0                               # Segment selector size
+; ASM-NEXT:  .long 0                               # Offset entry count
+; ASM-NEXT: .Lloclists_table_base0:                
+; ASM-NEXT: .Ldebug_loc0:
+; ASM-NEXT:  .byte 4                               # DW_LLE_offset_pair
+; ASM-NEXT:  .uleb128 .Lfunc_begin0-.Lfunc_begin0  # starting offset
+; ASM-NEXT:  .uleb128 .Ltmp0-.Lfunc_begin0         # ending offset
+; ASM-NEXT:  .short 2                              # Loc expr size
+; ASM-NEXT:  .byte 117                             # DW_OP_breg5
+; ASM-NEXT:  .byte 0                               # 0
+; ASM-NEXT:  .byte 4                               # DW_LLE_offset_pair
+; ASM-NEXT:  .uleb128 .Ltmp0-.Lfunc_begin0         # starting offset
+; ASM-NEXT:  .uleb128 .Ltmp1-.Lfunc_begin0         # ending offset
+; ASM-NEXT:  .short 2                              # Loc expr size
+; ASM-NEXT:  .byte 115                             # DW_OP_breg3
+; ASM-NEXT:  .byte 0                               # 0
+; ASM-NEXT:  .byte 0                               # DW_LLE_end_of_list
+; ASM-NEXT: .Ldebug_loclist_table_end0:
+
+; ModuleID = 'test.cc'
+source_filename = "test.cc"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { i32 (...)** }
+
+@_ZTV1A = dso_local unnamed_addr constant { [4 x i8*] } { [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI1A to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3fooEv to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3barEv to i8*)] }, align 8
+@_ZTVN10__cxxabiv117__class_type_infoE = external dso_local global i8*
+@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1
+@_ZTI1A = dso_local constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8*, i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @_ZTS1A, i32 0, i32 0) }, align 8
+
+; Function Attrs: noinline optnone uwtable
+define dso_local void @_Z3baz1A(%struct.A* %a) #0 !dbg !7 {
+entry:
+  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !23, metadata !DIExpression()), !dbg !24
+  call void @_ZN1A3fooEv(%struct.A* %a), !dbg !25
+  call void @_ZN1A3barEv(%struct.A* %a), !dbg !26
+  ret void, !dbg !27
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_ZN1A3fooEv(%struct.A* %this) unnamed_addr #2 align 2 !dbg !28 {
+entry:
+  %this.addr = alloca %struct.A*, align 8
+  store %struct.A* %this, %struct.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata %struct.A** %this.addr, metadata !29, metadata !DIExpression()), !dbg !31
+  %this1 = load %struct.A*, %struct.A** %this.addr, align 8
+  ret void, !dbg !32
+}
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_ZN1A3barEv(%struct.A* %this) unnamed_addr #2 align 2 !dbg !33 {
+entry:
+  %this.addr = alloca %struct.A*, align 8
+  store %struct.A* %this, %struct.A** %this.addr, align 8
+  call void @llvm.dbg.declare(metadata %struct.A** %this.addr, metadata !34, metadata !DIExpression()), !dbg !35
+  %this1 = load %struct.A*, %struct.A** %this.addr, align 8
+  ret void, !dbg !36
+}
+
+; Function Attrs: noinline norecurse nounwind optnone uwtable
+define dso_local i32 @main() #3 !dbg !37 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  ret i32 0, !dbg !38
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 344035)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "test.cc", directory: "/home/folder", checksumkind: CSK_MD5, checksum: "e0f357ad6dcb791a774a0dae55baf5e7")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 344035)"}
+!7 = distinct !DISubprogram(name: "baz", linkageName: "_Z3baz1A", scope: !1, file: !1, line: 6, type: !8, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !1, line: 1, size: 64, flags: DIFlagTypePassByReference, elements: !11, vtableHolder: !10, identifier: "_ZTS1A")
+!11 = !{!12, !18, !22}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$A", scope: !1, file: !1, baseType: !13, size: 64, flags: DIFlagArtificial)
+!13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64)
+!14 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "__vtbl_ptr_type", baseType: !15, size: 64)
+!15 = !DISubroutineType(types: !16)
+!16 = !{!17}
+!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!18 = !DISubprogram(name: "foo", linkageName: "_ZN1A3fooEv", scope: !10, file: !1, line: 2, type: !19, isLocal: false, isDefinition: false, scopeLine: 2, containingType: !10, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: false)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !21}
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!22 = !DISubprogram(name: "bar", linkageName: "_ZN1A3barEv", scope: !10, file: !1, line: 3, type: !19, isLocal: false, isDefinition: false, scopeLine: 3, containingType: !10, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 1, flags: DIFlagPrototyped, isOptimized: false)
+!23 = !DILocalVariable(name: "a", arg: 1, scope: !7, file: !1, line: 6, type: !10)
+!24 = !DILocation(line: 6, column: 19, scope: !7)
+!25 = !DILocation(line: 7, column: 6, scope: !7)
+!26 = !DILocation(line: 8, column: 6, scope: !7)
+!27 = !DILocation(line: 9, column: 1, scope: !7)
+!28 = distinct !DISubprogram(name: "foo", linkageName: "_ZN1A3fooEv", scope: !10, file: !1, line: 12, type: !19, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !18, retainedNodes: !2)
+!29 = !DILocalVariable(name: "this", arg: 1, scope: !28, type: !30, flags: DIFlagArtificial | DIFlagObjectPointer)
+!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!31 = !DILocation(line: 0, scope: !28)
+!32 = !DILocation(line: 12, column: 16, scope: !28)
+!33 = distinct !DISubprogram(name: "bar", linkageName: "_ZN1A3barEv", scope: !10, file: !1, line: 13, type: !19, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !22, retainedNodes: !2)
+!34 = !DILocalVariable(name: "this", arg: 1, scope: !33, type: !30, flags: DIFlagArtificial | DIFlagObjectPointer)
+!35 = !DILocation(line: 0, scope: !33)
+!36 = !DILocation(line: 13, column: 16, scope: !33)
+!37 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 15, type: !15, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!38 = !DILocation(line: 16, column: 3, scope: !37)
diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll
index b2a3115cf5e45146d4222965c0e353937a834de7..fa2080d1e1d31c743573ca52bfb62f1a00f88551 100644
--- a/test/CodeGen/X86/dwarf-headers.ll
+++ b/test/CodeGen/X86/dwarf-headers.ll
@@ -74,12 +74,12 @@
 ;
 ; O-5: .debug_info contents:
 ; O-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_skeleton abbr_offset
-; O-5-SAME:        DWO_id = 0x4ed74084f749d96b
+; O-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
 ; O-5: 0x00000014: DW_TAG_compile_unit
 ;
 ; DWO-5: .debug_info.dwo contents:
 ; DWO-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_compile abbr_offset
-; DWO-5-SAME:        DWO_id = 0x4ed74084f749d96b
+; DWO-5-SAME:        DWO_id = 0xccd7e58ef8bf4aa6
 ; DWO-5: 0x00000014: DW_TAG_compile_unit
 ;
 ; FIXME: V5 wants type units in .debug_info.dwo not .debug_types.dwo.
diff --git a/test/CodeGen/X86/epilogue-cfi-no-fp.ll b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
index 6b0e79fce4391253c66073a41f15d23fd9329677..6ff0604cdba1a728a1315b71932515311027315d 100644
--- a/test/CodeGen/X86/epilogue-cfi-no-fp.ll
+++ b/test/CodeGen/X86/epilogue-cfi-no-fp.ll
@@ -1,33 +1,19 @@
 ; RUN: llc -O0 < %s | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i686-pc-linux"
+target triple = "i686--"
 
 ; Function Attrs: noinline nounwind
 define i32 @foo(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m) {
 ; CHECK-LABEL:   foo:
-; CHECK:         addl	$20, %esp
+; CHECK:         popl   %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    popl	%esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    popl	%edi
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
 ; CHECK-NEXT:    popl	%ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl	%ebp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
 entry:
-  %i.addr = alloca i32, align 4
-  %j.addr = alloca i32, align 4
-  %k.addr = alloca i32, align 4
-  %l.addr = alloca i32, align 4
-  %m.addr = alloca i32, align 4
-  store i32 %i, i32* %i.addr, align 4
-  store i32 %j, i32* %j.addr, align 4
-  store i32 %k, i32* %k.addr, align 4
-  store i32 %l, i32* %l.addr, align 4
-  store i32 %m, i32* %m.addr, align 4
+  tail call void asm sideeffect "nop", "~{eax},~{ebx},~{ecx},~{edx},~{esi},~{edi},~{ebp}"()
   ret i32 0
 }
-
-
-
diff --git a/test/CodeGen/X86/extract-bits.ll b/test/CodeGen/X86/extract-bits.ll
index 6ee5b4a39a5379c5a1fa8622f5a03232637be1c9..cfe6ba571df8f1c2609925b365a2e1ca1132e973 100644
--- a/test/CodeGen/X86/extract-bits.ll
+++ b/test/CodeGen/X86/extract-bits.ll
@@ -48,17 +48,12 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a0:
@@ -86,11 +81,8 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a0:
@@ -105,6 +97,73 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
   ret i32 %masked
 }
 
+define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
+; X86-NOBMI-LABEL: bextr32_a0_arithmetic:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    sarl %cl, %esi
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    decl %eax
+; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: bextr32_a0_arithmetic:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    sarl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1BMI2-LABEL: bextr32_a0_arithmetic:
+; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    sarxl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: bextr32_a0_arithmetic:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    sarl %cl, %edi
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    movl %edx, %ecx
+; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    decl %eax
+; X64-NOBMI-NEXT:    andl %edi, %eax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: bextr32_a0_arithmetic:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    sarl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1BMI2-LABEL: bextr32_a0_arithmetic:
+; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    sarxl %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    retq
+  %shifted = ashr i32 %val, %numskipbits
+  %onebit = shl i32 1, %numlowbits
+  %mask = add nsw i32 %onebit, -1
+  %masked = and i32 %mask, %shifted
+  ret i32 %masked
+}
+
 define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
@@ -123,17 +182,12 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a1_indexzext:
@@ -161,11 +215,8 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a1_indexzext:
@@ -201,18 +252,13 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a2_load:
@@ -240,14 +286,11 @@ define i32 @bextr32_a2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-LABEL: bextr32_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %esi
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a2_load:
@@ -282,18 +325,13 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a3_load_indexzext:
@@ -321,14 +359,11 @@ define i32 @bextr32_a3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X64-BMI1NOTBM-LABEL: bextr32_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %esi
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a3_load_indexzext:
@@ -364,17 +399,12 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a4_commutative:
@@ -402,11 +432,8 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_a4_commutative:
@@ -447,25 +474,19 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_a5_skipextrauses:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    pushl %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    movl $1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    decl %esi
-; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
-; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_a5_skipextrauses:
@@ -504,11 +525,8 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $1, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    decl %ebx
-; X64-BMI1NOTBM-NEXT:    andl %edi, %ebx
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -548,22 +566,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB6_2
+; X86-NOBMI-NEXT:    je .LBB7_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB6_2:
+; X86-NOBMI-NEXT:  .LBB7_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB6_4
+; X86-NOBMI-NEXT:    je .LBB7_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB6_4:
+; X86-NOBMI-NEXT:  .LBB7_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -584,22 +602,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB6_2
+; X86-BMI1NOTBM-NEXT:    je .LBB7_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB6_2:
+; X86-BMI1NOTBM-NEXT:  .LBB7_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB6_4
+; X86-BMI1NOTBM-NEXT:    je .LBB7_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB6_4:
+; X86-BMI1NOTBM-NEXT:  .LBB7_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -620,22 +638,22 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB6_2
+; X86-BMI1BMI2-NEXT:    je .LBB7_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB6_2:
+; X86-BMI1BMI2-NEXT:  .LBB7_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB6_4
+; X86-BMI1BMI2-NEXT:    je .LBB7_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB6_4:
+; X86-BMI1BMI2-NEXT:  .LBB7_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -662,11 +680,8 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a0:
@@ -681,80 +696,82 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
   ret i64 %masked
 }
 
-define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
-; X86-NOBMI-LABEL: bextr64_a1_indexzext:
+define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
+; X86-NOBMI-LABEL: bextr64_a0_arithmetic:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:    sarl %cl, %esi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB7_2
+; X86-NOBMI-NEXT:    je .LBB8_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB7_2:
+; X86-NOBMI-NEXT:    sarl $31, %eax
+; X86-NOBMI-NEXT:    movl %esi, %edi
+; X86-NOBMI-NEXT:    movl %eax, %esi
+; X86-NOBMI-NEXT:  .LBB8_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB7_4
+; X86-NOBMI-NEXT:    je .LBB8_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB7_4:
+; X86-NOBMI-NEXT:  .LBB8_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
-; X86-NOBMI-NEXT:    andl %edi, %edx
+; X86-NOBMI-NEXT:    andl %edi, %eax
+; X86-NOBMI-NEXT:    andl %esi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
-; X86-BMI1NOTBM-LABEL: bextr64_a1_indexzext:
+; X86-BMI1NOTBM-LABEL: bextr64_a0_arithmetic:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    sarl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB7_2
+; X86-BMI1NOTBM-NEXT:    je .LBB8_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB7_2:
+; X86-BMI1NOTBM-NEXT:    sarl $31, %eax
+; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
+; X86-BMI1NOTBM-NEXT:    movl %eax, %esi
+; X86-BMI1NOTBM-NEXT:  .LBB8_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB7_4
+; X86-BMI1NOTBM-NEXT:    je .LBB8_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB7_4:
+; X86-BMI1NOTBM-NEXT:  .LBB8_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X86-BMI1NOTBM-NEXT:    andl %esi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
-; X86-BMI1BMI2-LABEL: bextr64_a1_indexzext:
+; X86-BMI1BMI2-LABEL: bextr64_a0_arithmetic:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
@@ -764,24 +781,25 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI1BMI2-NEXT:    sarxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB7_2
+; X86-BMI1BMI2-NEXT:    je .LBB8_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
+; X86-BMI1BMI2-NEXT:    sarl $31, %eax
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB7_2:
+; X86-BMI1BMI2-NEXT:    movl %eax, %edi
+; X86-BMI1BMI2-NEXT:  .LBB8_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB7_4
+; X86-BMI1BMI2-NEXT:    je .LBB8_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB7_4:
+; X86-BMI1BMI2-NEXT:  .LBB8_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -791,11 +809,11 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
-; X64-NOBMI-LABEL: bextr64_a1_indexzext:
+; X64-NOBMI-LABEL: bextr64_a0_arithmetic:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rdi
+; X64-NOBMI-NEXT:    movq %rsi, %rcx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NOBMI-NEXT:    sarq %cl, %rdi
 ; X64-NOBMI-NEXT:    movl $1, %eax
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
 ; X64-NOBMI-NEXT:    shlq %cl, %rax
@@ -803,64 +821,56 @@ define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-NOBMI-NEXT:    andq %rdi, %rax
 ; X64-NOBMI-NEXT:    retq
 ;
-; X64-BMI1NOTBM-LABEL: bextr64_a1_indexzext:
+; X64-BMI1NOTBM-LABEL: bextr64_a0_arithmetic:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-BMI1NOTBM-NEXT:    sarq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
-; X64-BMI1BMI2-LABEL: bextr64_a1_indexzext:
+; X64-BMI1BMI2-LABEL: bextr64_a0_arithmetic:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
-; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    sarxq %rsi, %rdi, %rax
 ; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
 ; X64-BMI1BMI2-NEXT:    retq
-  %skip = zext i8 %numskipbits to i64
-  %shifted = lshr i64 %val, %skip
-  %conv = zext i8 %numlowbits to i64
-  %onebit = shl i64 1, %conv
+  %shifted = ashr i64 %val, %numskipbits
+  %onebit = shl i64 1, %numlowbits
   %mask = add nsw i64 %onebit, -1
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
 
-define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
-; X86-NOBMI-LABEL: bextr64_a2_load:
+define i64 @bextr64_a1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %numlowbits) nounwind {
+; X86-NOBMI-LABEL: bextr64_a1_indexzext:
 ; X86-NOBMI:       # %bb.0:
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %esi
-; X86-NOBMI-NEXT:    movl 4(%eax), %eax
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB8_2
+; X86-NOBMI-NEXT:    je .LBB9_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB8_2:
+; X86-NOBMI-NEXT:  .LBB9_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB8_4
+; X86-NOBMI-NEXT:    je .LBB9_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB8_4:
+; X86-NOBMI-NEXT:  .LBB9_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -869,35 +879,34 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
-; X86-BMI1NOTBM-LABEL: bextr64_a2_load:
+; X86-BMI1NOTBM-LABEL: bextr64_a1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
-; X86-BMI1NOTBM-NEXT:    movl 4(%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB8_2
+; X86-BMI1NOTBM-NEXT:    je .LBB9_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB8_2:
+; X86-BMI1NOTBM-NEXT:  .LBB9_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB8_4
+; X86-BMI1NOTBM-NEXT:    je .LBB9_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB8_4:
+; X86-BMI1NOTBM-NEXT:  .LBB9_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -906,7 +915,157 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
-; X86-BMI1BMI2-LABEL: bextr64_a2_load:
+; X86-BMI1BMI2-LABEL: bextr64_a1_indexzext:
+; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    je .LBB9_2
+; X86-BMI1BMI2-NEXT:  # %bb.1:
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:  .LBB9_2:
+; X86-BMI1BMI2-NEXT:    movl $1, %eax
+; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
+; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
+; X86-BMI1BMI2-NEXT:    testb $32, %bl
+; X86-BMI1BMI2-NEXT:    je .LBB9_4
+; X86-BMI1BMI2-NEXT:  # %bb.3:
+; X86-BMI1BMI2-NEXT:    movl %eax, %edx
+; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
+; X86-BMI1BMI2-NEXT:  .LBB9_4:
+; X86-BMI1BMI2-NEXT:    addl $-1, %eax
+; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
+; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    andl %edi, %edx
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: bextr64_a1_indexzext:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %ecx
+; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NOBMI-NEXT:    shrq %cl, %rdi
+; X64-NOBMI-NEXT:    movl $1, %eax
+; X64-NOBMI-NEXT:    movl %edx, %ecx
+; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    decq %rax
+; X64-NOBMI-NEXT:    andq %rdi, %rax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: bextr64_a1_indexzext:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1BMI2-LABEL: bextr64_a1_indexzext:
+; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    retq
+  %skip = zext i8 %numskipbits to i64
+  %shifted = lshr i64 %val, %skip
+  %conv = zext i8 %numlowbits to i64
+  %onebit = shl i64 1, %conv
+  %mask = add nsw i64 %onebit, -1
+  %masked = and i64 %mask, %shifted
+  ret i64 %masked
+}
+
+define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
+; X86-NOBMI-LABEL: bextr64_a2_load:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    movl 4(%eax), %eax
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB10_2
+; X86-NOBMI-NEXT:  # %bb.1:
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:  .LBB10_2:
+; X86-NOBMI-NEXT:    movl $1, %eax
+; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movb %ch, %cl
+; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
+; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    testb $32, %ch
+; X86-NOBMI-NEXT:    je .LBB10_4
+; X86-NOBMI-NEXT:  # %bb.3:
+; X86-NOBMI-NEXT:    movl %eax, %edx
+; X86-NOBMI-NEXT:    xorl %eax, %eax
+; X86-NOBMI-NEXT:  .LBB10_4:
+; X86-NOBMI-NEXT:    addl $-1, %eax
+; X86-NOBMI-NEXT:    adcl $-1, %edx
+; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    andl %edi, %edx
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: bextr64_a2_load:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    movl 4(%eax), %eax
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1NOTBM-NEXT:    testb $32, %cl
+; X86-BMI1NOTBM-NEXT:    je .LBB10_2
+; X86-BMI1NOTBM-NEXT:  # %bb.1:
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
+; X86-BMI1NOTBM-NEXT:  .LBB10_2:
+; X86-BMI1NOTBM-NEXT:    movl $1, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
+; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
+; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    testb $32, %ch
+; X86-BMI1NOTBM-NEXT:    je .LBB10_4
+; X86-BMI1NOTBM-NEXT:  # %bb.3:
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
+; X86-BMI1NOTBM-NEXT:  .LBB10_4:
+; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
+; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
+; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1BMI2-LABEL: bextr64_a2_load:
 ; X86-BMI1BMI2:       # %bb.0:
 ; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
@@ -919,22 +1078,22 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB8_2
+; X86-BMI1BMI2-NEXT:    je .LBB10_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB8_2:
+; X86-BMI1BMI2-NEXT:  .LBB10_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB8_4
+; X86-BMI1BMI2-NEXT:    je .LBB10_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB8_4:
+; X86-BMI1BMI2-NEXT:  .LBB10_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -960,14 +1119,11 @@ define i64 @bextr64_a2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-LABEL: bextr64_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rsi
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a2_load:
@@ -997,22 +1153,22 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB9_2
+; X86-NOBMI-NEXT:    je .LBB11_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB9_2:
+; X86-NOBMI-NEXT:  .LBB11_2:
 ; X86-NOBMI-NEXT:    movl $1, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %eax, %edx
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB9_4
+; X86-NOBMI-NEXT:    je .LBB11_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB9_4:
+; X86-NOBMI-NEXT:  .LBB11_4:
 ; X86-NOBMI-NEXT:    addl $-1, %eax
 ; X86-NOBMI-NEXT:    adcl $-1, %edx
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -1034,22 +1190,22 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB9_2
+; X86-BMI1NOTBM-NEXT:    je .LBB11_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB9_2:
+; X86-BMI1NOTBM-NEXT:  .LBB11_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB9_4
+; X86-BMI1NOTBM-NEXT:    je .LBB11_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB9_4:
+; X86-BMI1NOTBM-NEXT:  .LBB11_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -1071,22 +1227,22 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB9_2
+; X86-BMI1BMI2-NEXT:    je .LBB11_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB9_2:
+; X86-BMI1BMI2-NEXT:  .LBB11_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB9_4
+; X86-BMI1BMI2-NEXT:    je .LBB11_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %eax, %eax
-; X86-BMI1BMI2-NEXT:  .LBB9_4:
+; X86-BMI1BMI2-NEXT:  .LBB11_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %eax
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %edx
 ; X86-BMI1BMI2-NEXT:    andl %esi, %eax
@@ -1111,15 +1267,13 @@ define i64 @bextr64_a3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rsi
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a3_load_indexzext:
@@ -1152,22 +1306,22 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB10_2
+; X86-NOBMI-NEXT:    je .LBB12_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB10_2:
+; X86-NOBMI-NEXT:  .LBB12_2:
 ; X86-NOBMI-NEXT:    movl $1, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB10_4
+; X86-NOBMI-NEXT:    je .LBB12_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB10_4:
+; X86-NOBMI-NEXT:  .LBB12_4:
 ; X86-NOBMI-NEXT:    addl $-1, %esi
 ; X86-NOBMI-NEXT:    adcl $-1, %edi
 ; X86-NOBMI-NEXT:    andl %esi, %eax
@@ -1188,22 +1342,22 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB10_2
+; X86-BMI1NOTBM-NEXT:    je .LBB12_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB10_2:
+; X86-BMI1NOTBM-NEXT:  .LBB12_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    movb %ch, %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %ch
-; X86-BMI1NOTBM-NEXT:    je .LBB10_4
+; X86-BMI1NOTBM-NEXT:    je .LBB12_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB10_4:
+; X86-BMI1NOTBM-NEXT:  .LBB12_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
@@ -1224,22 +1378,22 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB10_2
+; X86-BMI1BMI2-NEXT:    je .LBB12_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB10_2:
+; X86-BMI1BMI2-NEXT:  .LBB12_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ebx, %edi, %ecx
 ; X86-BMI1BMI2-NEXT:    testb $32, %bl
-; X86-BMI1BMI2-NEXT:    je .LBB10_4
+; X86-BMI1BMI2-NEXT:    je .LBB12_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ecx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %ecx, %ecx
-; X86-BMI1BMI2-NEXT:  .LBB10_4:
+; X86-BMI1BMI2-NEXT:  .LBB12_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %ecx
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    andl %ecx, %eax
@@ -1266,11 +1420,8 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_a4_commutative:
@@ -1302,22 +1453,22 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB11_2
+; X86-NOBMI-NEXT:    je .LBB13_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
-; X86-NOBMI-NEXT:  .LBB11_2:
+; X86-NOBMI-NEXT:  .LBB13_2:
 ; X86-NOBMI-NEXT:    movl $1, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %dl
-; X86-NOBMI-NEXT:    je .LBB11_4
+; X86-NOBMI-NEXT:    je .LBB13_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB11_4:
+; X86-NOBMI-NEXT:  .LBB13_4:
 ; X86-NOBMI-NEXT:    addl $-1, %esi
 ; X86-NOBMI-NEXT:    adcl $-1, %edi
 ; X86-NOBMI-NEXT:    andl %ebx, %esi
@@ -1352,22 +1503,22 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB11_2
+; X86-BMI1NOTBM-NEXT:    je .LBB13_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB11_2:
+; X86-BMI1NOTBM-NEXT:  .LBB13_2:
 ; X86-BMI1NOTBM-NEXT:    movl $1, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %dl
-; X86-BMI1NOTBM-NEXT:    je .LBB11_4
+; X86-BMI1NOTBM-NEXT:    je .LBB13_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB11_4:
+; X86-BMI1NOTBM-NEXT:  .LBB13_4:
 ; X86-BMI1NOTBM-NEXT:    addl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    adcl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    andl %ebx, %esi
@@ -1401,22 +1552,22 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB11_2
+; X86-BMI1BMI2-NEXT:    je .LBB13_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
-; X86-BMI1BMI2-NEXT:  .LBB11_2:
+; X86-BMI1BMI2-NEXT:  .LBB13_2:
 ; X86-BMI1BMI2-NEXT:    movl $1, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %edx, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %dl
-; X86-BMI1BMI2-NEXT:    je .LBB11_4
+; X86-BMI1BMI2-NEXT:    je .LBB13_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB11_4:
+; X86-BMI1BMI2-NEXT:  .LBB13_4:
 ; X86-BMI1BMI2-NEXT:    addl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    adcl $-1, %esi
 ; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
@@ -1454,13 +1605,10 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_a5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movl $1, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    decq %rbx
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rbx
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -1507,16 +1655,12 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b0:
@@ -1544,10 +1688,8 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b0:
@@ -1580,16 +1722,12 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b1_indexzext:
@@ -1617,10 +1755,8 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b1_indexzext:
@@ -1656,17 +1792,13 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b2_load:
@@ -1697,10 +1829,8 @@ define i32 @bextr32_b2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X64-BMI1NOTBM-NEXT:    andnl %eax, %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b2_load:
@@ -1735,17 +1865,13 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b3_load_indexzext:
@@ -1776,10 +1902,8 @@ define i32 @bextr32_b3_load_indexzext(i32* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X64-BMI1NOTBM-NEXT:    andnl %eax, %esi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b3_load_indexzext:
@@ -1815,16 +1939,12 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    andnl %edx, %esi, %eax
-; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b4_commutative:
@@ -1852,10 +1972,8 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_b4_commutative:
@@ -1896,24 +2014,19 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_b5_skipextrauses:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    pushl %eax
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
-; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
-; X86-BMI1NOTBM-NEXT:    andnl %esi, %edi, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
-; X86-BMI1NOTBM-NEXT:    calll use32
-; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
-; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_b5_skipextrauses:
@@ -1952,10 +2065,8 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %ebx
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -1995,22 +2106,22 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB18_2
+; X86-NOBMI-NEXT:    je .LBB20_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB18_2:
+; X86-NOBMI-NEXT:  .LBB20_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB18_4
+; X86-NOBMI-NEXT:    je .LBB20_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB18_4:
+; X86-NOBMI-NEXT:  .LBB20_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2032,22 +2143,22 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB18_2
+; X86-BMI1NOTBM-NEXT:    je .LBB20_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB18_2:
+; X86-BMI1NOTBM-NEXT:  .LBB20_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB18_4
+; X86-BMI1NOTBM-NEXT:    je .LBB20_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB18_4:
+; X86-BMI1NOTBM-NEXT:  .LBB20_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2067,21 +2178,21 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB18_2
+; X86-BMI1BMI2-NEXT:    je .LBB20_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB18_2:
+; X86-BMI1BMI2-NEXT:  .LBB20_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB18_4
+; X86-BMI1BMI2-NEXT:    je .LBB20_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB18_4:
+; X86-BMI1BMI2-NEXT:  .LBB20_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2106,10 +2217,8 @@ define i64 @bextr64_b0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b0:
@@ -2137,22 +2246,22 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB19_2
+; X86-NOBMI-NEXT:    je .LBB21_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB19_2:
+; X86-NOBMI-NEXT:  .LBB21_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB19_4
+; X86-NOBMI-NEXT:    je .LBB21_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB19_4:
+; X86-NOBMI-NEXT:  .LBB21_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2174,22 +2283,22 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB19_2
+; X86-BMI1NOTBM-NEXT:    je .LBB21_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB19_2:
+; X86-BMI1NOTBM-NEXT:  .LBB21_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB19_4
+; X86-BMI1NOTBM-NEXT:    je .LBB21_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB19_4:
+; X86-BMI1NOTBM-NEXT:  .LBB21_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2209,21 +2318,21 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB19_2
+; X86-BMI1BMI2-NEXT:    je .LBB21_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB19_2:
+; X86-BMI1BMI2-NEXT:  .LBB21_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB19_4
+; X86-BMI1BMI2-NEXT:    je .LBB21_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB19_4:
+; X86-BMI1BMI2-NEXT:  .LBB21_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2245,13 +2354,12 @@ define i64 @bextr64_b1_indexzext(i64 %val, i8 zeroext %numskipbits, i8 zeroext %
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b1_indexzext:
@@ -2284,22 +2392,22 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB20_2
+; X86-NOBMI-NEXT:    je .LBB22_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB20_2:
+; X86-NOBMI-NEXT:  .LBB22_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB20_4
+; X86-NOBMI-NEXT:    je .LBB22_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB20_4:
+; X86-NOBMI-NEXT:  .LBB22_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2322,22 +2430,22 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB20_2
+; X86-BMI1NOTBM-NEXT:    je .LBB22_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB20_2:
+; X86-BMI1NOTBM-NEXT:  .LBB22_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB20_4
+; X86-BMI1NOTBM-NEXT:    je .LBB22_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB20_4:
+; X86-BMI1NOTBM-NEXT:  .LBB22_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2358,21 +2466,21 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB20_2
+; X86-BMI1BMI2-NEXT:    je .LBB22_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB20_2:
+; X86-BMI1BMI2-NEXT:  .LBB22_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB20_4
+; X86-BMI1BMI2-NEXT:    je .LBB22_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB20_4:
+; X86-BMI1BMI2-NEXT:  .LBB22_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2399,10 +2507,8 @@ define i64 @bextr64_b2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rsi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    andnq %rax, %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b2_load:
@@ -2432,22 +2538,22 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB21_2
+; X86-NOBMI-NEXT:    je .LBB23_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB21_2:
+; X86-NOBMI-NEXT:  .LBB23_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %eax
 ; X86-NOBMI-NEXT:    shldl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB21_4
+; X86-NOBMI-NEXT:    je .LBB23_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %edx
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB21_4:
+; X86-NOBMI-NEXT:  .LBB23_4:
 ; X86-NOBMI-NEXT:    notl %edx
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %eax
@@ -2470,22 +2576,22 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB21_2
+; X86-BMI1NOTBM-NEXT:    je .LBB23_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB21_2:
+; X86-BMI1NOTBM-NEXT:  .LBB23_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB21_4
+; X86-BMI1NOTBM-NEXT:    je .LBB23_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB21_4:
+; X86-BMI1NOTBM-NEXT:  .LBB23_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2506,21 +2612,21 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB21_2
+; X86-BMI1BMI2-NEXT:    je .LBB23_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB21_2:
+; X86-BMI1BMI2-NEXT:  .LBB23_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB21_4
+; X86-BMI1BMI2-NEXT:    je .LBB23_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB21_4:
+; X86-BMI1BMI2-NEXT:  .LBB23_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2543,14 +2649,13 @@ define i64 @bextr64_b3_load_indexzext(i64* %w, i8 zeroext %numskipbits, i8 zeroe
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rsi
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rsi
-; X64-BMI1NOTBM-NEXT:    andnq %rax, %rsi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b3_load_indexzext:
@@ -2583,22 +2688,22 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edx
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB22_2
+; X86-NOBMI-NEXT:    je .LBB24_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edx, %eax
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB22_2:
+; X86-NOBMI-NEXT:  .LBB24_2:
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shll %cl, %esi
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %edi
 ; X86-NOBMI-NEXT:    testb $32, %ch
-; X86-NOBMI-NEXT:    je .LBB22_4
+; X86-NOBMI-NEXT:    je .LBB24_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB22_4:
+; X86-NOBMI-NEXT:  .LBB24_4:
 ; X86-NOBMI-NEXT:    notl %edi
 ; X86-NOBMI-NEXT:    andl %edi, %edx
 ; X86-NOBMI-NEXT:    notl %esi
@@ -2620,22 +2725,22 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB22_2
+; X86-BMI1NOTBM-NEXT:    je .LBB24_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB22_2:
+; X86-BMI1NOTBM-NEXT:  .LBB24_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB22_4
+; X86-BMI1NOTBM-NEXT:    je .LBB24_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB22_4:
+; X86-BMI1NOTBM-NEXT:  .LBB24_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    popl %esi
@@ -2655,21 +2760,21 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB22_2
+; X86-BMI1BMI2-NEXT:    je .LBB24_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB22_2:
+; X86-BMI1BMI2-NEXT:  .LBB24_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %edi
 ; X86-BMI1BMI2-NEXT:    shlxl %eax, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB22_4
+; X86-BMI1BMI2-NEXT:    je .LBB24_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB22_4:
+; X86-BMI1BMI2-NEXT:  .LBB24_4:
 ; X86-BMI1BMI2-NEXT:    andnl %edx, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebx, %eax
 ; X86-BMI1BMI2-NEXT:    popl %esi
@@ -2694,10 +2799,8 @@ define i64 @bextr64_b4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_b4_commutative:
@@ -2729,22 +2832,22 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
 ; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB23_2
+; X86-NOBMI-NEXT:    je .LBB25_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
-; X86-NOBMI-NEXT:  .LBB23_2:
+; X86-NOBMI-NEXT:  .LBB25_2:
 ; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    movl $-1, %edi
 ; X86-NOBMI-NEXT:    movl %edx, %ecx
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %dl
-; X86-NOBMI-NEXT:    je .LBB23_4
+; X86-NOBMI-NEXT:    je .LBB25_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB23_4:
+; X86-NOBMI-NEXT:  .LBB25_4:
 ; X86-NOBMI-NEXT:    notl %esi
 ; X86-NOBMI-NEXT:    andl %ebp, %esi
 ; X86-NOBMI-NEXT:    notl %edi
@@ -2779,22 +2882,22 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB23_2
+; X86-BMI1NOTBM-NEXT:    je .LBB25_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB23_2:
+; X86-BMI1NOTBM-NEXT:  .LBB25_2:
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %ecx
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebp
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %dl
-; X86-BMI1NOTBM-NEXT:    je .LBB23_4
+; X86-BMI1NOTBM-NEXT:    je .LBB25_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB23_4:
+; X86-BMI1NOTBM-NEXT:  .LBB25_4:
 ; X86-BMI1NOTBM-NEXT:    andnl %esi, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    andnl %edi, %ebp, %edi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
@@ -2826,21 +2929,21 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB23_2
+; X86-BMI1BMI2-NEXT:    je .LBB25_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB23_2:
+; X86-BMI1BMI2-NEXT:  .LBB25_2:
 ; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
 ; X86-BMI1BMI2-NEXT:    shlxl %edx, %ebp, %ebx
 ; X86-BMI1BMI2-NEXT:    movl %edx, %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %dl
-; X86-BMI1BMI2-NEXT:    je .LBB23_4
+; X86-BMI1BMI2-NEXT:    je .LBB25_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
 ; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB23_4:
+; X86-BMI1BMI2-NEXT:  .LBB25_4:
 ; X86-BMI1BMI2-NEXT:    andnl %esi, %ebp, %esi
 ; X86-BMI1BMI2-NEXT:    andnl %edi, %ebx, %edi
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
@@ -2876,12 +2979,10 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_b5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rbx
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -2913,68 +3014,133 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -2982,70 +3148,135 @@ define i32 @bextr32_c0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %skip = zext i8 %numskipbits to i32
   %shifted = lshr i32 %val, %skip
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -3053,72 +3284,137 @@ define i32 @bextr32_c1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl (%eax), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, (%eax), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebp, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebp, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %ebp
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -3126,67 +3422,131 @@ define i32 @bextr32_c2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl (%eax), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X86-BMI1BMI2-NEXT:    shrxl %edx, (%ecx), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebp, %ebx
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
 ; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebp, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, (%rdi), %ebp
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %skip = zext i8 %numskipbits to i32
@@ -3194,6 +3554,7 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   ret i32 %masked
 }
@@ -3201,68 +3562,133 @@ define i32 @bextr32_c3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1BMI2-NEXT:    shrxl %eax, {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebx
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %shifted, %mask ; swapped order
   ret i32 %masked
 }
@@ -3270,104 +3696,156 @@ define i32 @bextr32_c4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_c5_skipextrauses:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    subl $16, %esp
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NOBMI-NEXT:    movl %ebx, %ecx
+; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %esi
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
-; X86-NOBMI-NEXT:    movl %eax, (%esp)
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl %edi, %esi
+; X86-NOBMI-NEXT:    movl %ebx, (%esp)
 ; X86-NOBMI-NEXT:    calll use32
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    subl $16, %esp
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ebx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_c5_skipextrauses:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    subl $8, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, %edx, %esi
-; X86-BMI1BMI2-NEXT:    movl %ecx, (%esp)
+; X86-BMI1BMI2-NEXT:    subl $16, %esp
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    shrxl %edi, {{[0-9]+}}(%esp), %ebx
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, %ebx, %esi
+; X86-BMI1BMI2-NEXT:    movl %edi, (%esp)
 ; X86-BMI1BMI2-NEXT:    calll use32
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
-; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr32_c5_skipextrauses:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %r14
 ; X64-NOBMI-NEXT:    pushq %rbx
-; X64-NOBMI-NEXT:    movl %edi, %ebx
-; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    shrl %cl, %ebx
+; X64-NOBMI-NEXT:    movl %esi, %r14d
+; X64-NOBMI-NEXT:    movl %edi, %ebp
+; X64-NOBMI-NEXT:    movl %r14d, %ecx
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movl $-1, %ebx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shll %cl, %ebx
 ; X64-NOBMI-NEXT:    shrl %cl, %ebx
-; X64-NOBMI-NEXT:    movl %esi, %edi
+; X64-NOBMI-NEXT:    movl %ebx, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebp, %ebx
+; X64-NOBMI-NEXT:    movl %r14d, %edi
 ; X64-NOBMI-NEXT:    callq use32
 ; X64-NOBMI-NEXT:    movl %ebx, %eax
 ; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %r14
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %esi, %r14d
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %r14d, %ecx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebp, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %r14d, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_c5_skipextrauses:
 ; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %r14
 ; X64-BMI1BMI2-NEXT:    pushq %rbx
-; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %eax
-; X64-BMI1BMI2-NEXT:    bzhil %edx, %eax, %ebx
-; X64-BMI1BMI2-NEXT:    movl %esi, %edi
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebp
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %edi, %r14d
+; X64-BMI1BMI2-NEXT:    movl %edx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %r14d, %ebx
+; X64-BMI1BMI2-NEXT:    movl %ebp, %edi
 ; X64-BMI1BMI2-NEXT:    callq use32
 ; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
 ; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i32 %val, %numskipbits
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %shifted
   call void @use32(i32 %numskipbits)
   ret i32 %masked
@@ -3378,8 +3856,11 @@ define i32 @bextr32_c5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3387,33 +3868,46 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB30_2
+; X86-NOBMI-NEXT:    je .LBB32_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB30_2:
+; X86-NOBMI-NEXT:  .LBB32_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB30_4
+; X86-NOBMI-NEXT:    je .LBB32_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB30_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:  .LBB32_4:
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3421,93 +3915,149 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB30_2
+; X86-BMI1NOTBM-NEXT:    je .LBB32_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB30_2:
+; X86-BMI1NOTBM-NEXT:  .LBB32_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB30_4
+; X86-BMI1NOTBM-NEXT:    je .LBB32_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB30_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:  .LBB32_4:
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c0:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB30_2
+; X86-BMI1BMI2-NEXT:    je .LBB32_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB30_2:
+; X86-BMI1BMI2-NEXT:  .LBB32_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB30_4
+; X86-BMI1BMI2-NEXT:    je .LBB32_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB30_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:  .LBB32_4:
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3515,8 +4065,11 @@ define i64 @bextr64_c0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3524,33 +4077,46 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB31_2
+; X86-NOBMI-NEXT:    je .LBB33_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB31_2:
+; X86-NOBMI-NEXT:  .LBB33_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB31_4
+; X86-NOBMI-NEXT:    je .LBB33_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB31_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:  .LBB33_4:
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -3558,97 +4124,152 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB31_2
+; X86-BMI1NOTBM-NEXT:    je .LBB33_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB31_2:
+; X86-BMI1NOTBM-NEXT:  .LBB33_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB31_4
+; X86-BMI1NOTBM-NEXT:    je .LBB33_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB31_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:  .LBB33_4:
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB31_2
+; X86-BMI1BMI2-NEXT:    je .LBB33_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB31_2:
+; X86-BMI1BMI2-NEXT:  .LBB33_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB31_4
+; X86-BMI1BMI2-NEXT:    je .LBB33_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB31_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:  .LBB33_4:
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
 ; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %skip = zext i8 %numskipbits to i64
   %shifted = lshr i64 %val, %skip
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3656,8 +4277,11 @@ define i64 @bextr64_c1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
@@ -3666,33 +4290,46 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB32_2
+; X86-NOBMI-NEXT:    je .LBB34_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB32_2:
+; X86-NOBMI-NEXT:  .LBB34_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB32_4
+; X86-NOBMI-NEXT:    je .LBB34_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB32_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:  .LBB34_4:
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
@@ -3701,33 +4338,46 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB32_2
+; X86-BMI1NOTBM-NEXT:    je .LBB34_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB32_2:
+; X86-BMI1NOTBM-NEXT:  .LBB34_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB32_4
+; X86-BMI1NOTBM-NEXT:    je .LBB34_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB32_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:  .LBB34_4:
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
@@ -3735,61 +4385,104 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB32_2
+; X86-BMI1BMI2-NEXT:    je .LBB34_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB32_2:
+; X86-BMI1BMI2-NEXT:  .LBB34_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB32_4
+; X86-BMI1BMI2-NEXT:    je .LBB34_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB32_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:  .LBB34_4:
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3797,8 +4490,11 @@ define i64 @bextr64_c2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl (%eax), %esi
@@ -3807,33 +4503,46 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:    shrl %cl, %edi
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB33_2
+; X86-NOBMI-NEXT:    je .LBB35_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB33_2:
+; X86-NOBMI-NEXT:  .LBB35_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB33_4
+; X86-NOBMI-NEXT:    je .LBB35_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB33_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:  .LBB35_4:
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
@@ -3842,33 +4551,46 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB33_2
+; X86-BMI1NOTBM-NEXT:    je .LBB35_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB33_2:
+; X86-BMI1NOTBM-NEXT:  .LBB35_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB33_4
+; X86-BMI1NOTBM-NEXT:    je .LBB35_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB33_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:  .LBB35_4:
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
@@ -3876,58 +4598,99 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB33_2
+; X86-BMI1BMI2-NEXT:    je .LBB35_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB33_2:
+; X86-BMI1BMI2-NEXT:  .LBB35_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB33_4
+; X86-BMI1BMI2-NEXT:    je .LBB35_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB33_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:  .LBB35_4:
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negb %dl
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
 ; X64-BMI1NOTBM-NEXT:    negb %dl
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $edx killed $edx def $rdx
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %edx, %ebx
 ; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, (%rdi), %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %skip = zext i8 %numskipbits to i64
@@ -3935,6 +4698,7 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   ret i64 %masked
 }
@@ -3942,136 +4706,208 @@ define i64 @bextr64_c3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 define i64 @bextr64_c4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr64_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    pushl %ebx
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $12, %esp
 ; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOBMI-NEXT:    movl %esi, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB34_2
+; X86-NOBMI-NEXT:    je .LBB36_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
-; X86-NOBMI-NEXT:  .LBB34_2:
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:  .LBB36_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB34_4
+; X86-NOBMI-NEXT:    je .LBB36_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB34_4:
-; X86-NOBMI-NEXT:    andl %edi, %edx
-; X86-NOBMI-NEXT:    andl %esi, %eax
+; X86-NOBMI-NEXT:    movl %ebx, %ebp
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
+; X86-NOBMI-NEXT:  .LBB36_4:
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %ebp
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebp, %esi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $12, %esp
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
+; X86-NOBMI-NEXT:    popl %ebp
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-BMI1NOTBM-NEXT:    movl %esi, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB34_2
+; X86-BMI1NOTBM-NEXT:    je .LBB36_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB34_2:
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
+; X86-BMI1NOTBM-NEXT:  .LBB36_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB34_4
+; X86-BMI1NOTBM-NEXT:    je .LBB36_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
-; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB34_4:
-; X86-BMI1NOTBM-NEXT:    andl %edi, %edx
-; X86-BMI1NOTBM-NEXT:    andl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:  .LBB36_4:
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %ebp
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $12, %esp
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
+; X86-BMI1NOTBM-NEXT:    popl %ebp
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr64_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $12, %esp
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB34_2
+; X86-BMI1BMI2-NEXT:    je .LBB36_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB34_2:
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:  .LBB36_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB34_4
+; X86-BMI1BMI2-NEXT:    je .LBB36_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB34_4:
-; X86-BMI1BMI2-NEXT:    andl %edi, %edx
-; X86-BMI1BMI2-NEXT:    andl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    xorl %ebp, %ebp
+; X86-BMI1BMI2-NEXT:  .LBB36_4:
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebx, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebp, %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $12, %esp
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
+; X86-BMI1BMI2-NEXT:    popl %ebp
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bextr64_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %r14
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r14
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %shifted, %mask ; swapped order
   ret i64 %masked
 }
@@ -4084,37 +4920,40 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %esi, %ebx
-; X86-NOBMI-NEXT:    movl %eax, %ecx
-; X86-NOBMI-NEXT:    shrl %cl, %ebx
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %edx
-; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB35_2
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl %eax, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %eax, %esi
+; X86-NOBMI-NEXT:    testb $32, %cl
+; X86-NOBMI-NEXT:    je .LBB37_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %ebx, %edx
-; X86-NOBMI-NEXT:    xorl %ebx, %ebx
-; X86-NOBMI-NEXT:  .LBB35_2:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
+; X86-NOBMI-NEXT:  .LBB37_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %esi
-; X86-NOBMI-NEXT:    movl $-1, %edi
-; X86-NOBMI-NEXT:    shrl %cl, %edi
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    movl $-1, %ebp
+; X86-NOBMI-NEXT:    shrl %cl, %ebp
+; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB35_4
+; X86-NOBMI-NEXT:    je .LBB37_4
 ; X86-NOBMI-NEXT:  # %bb.3:
-; X86-NOBMI-NEXT:    movl %edi, %esi
-; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:  .LBB35_4:
-; X86-NOBMI-NEXT:    andl %ebx, %edi
-; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl %ebp, %ebx
+; X86-NOBMI-NEXT:    xorl %ebp, %ebp
+; X86-NOBMI-NEXT:  .LBB37_4:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ebp
-; X86-NOBMI-NEXT:    pushl %eax
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl %ebx, %esi
+; X86-NOBMI-NEXT:    andl %ebp, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NOBMI-NEXT:    calll use64
 ; X86-NOBMI-NEXT:    addl $16, %esp
 ; X86-NOBMI-NEXT:    movl %esi, %eax
@@ -4133,37 +4972,40 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $12, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %esi, %ebx
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %edx
-; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB35_2
-; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
-; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB35_2:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB35_4
-; X86-BMI1NOTBM-NEXT:  # %bb.3:
+; X86-BMI1NOTBM-NEXT:    je .LBB37_2
+; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB35_4:
-; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
-; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:  .LBB37_2:
+; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
+; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebp
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %ebx
+; X86-BMI1NOTBM-NEXT:    testb $32, %cl
+; X86-BMI1NOTBM-NEXT:    je .LBB37_4
+; X86-BMI1NOTBM-NEXT:  # %bb.3:
+; X86-BMI1NOTBM-NEXT:    movl %ebp, %ebx
+; X86-BMI1NOTBM-NEXT:    xorl %ebp, %ebp
+; X86-BMI1NOTBM-NEXT:  .LBB37_4:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ebp
-; X86-BMI1NOTBM-NEXT:    pushl %eax
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %esi
+; X86-BMI1NOTBM-NEXT:    andl %ebp, %edi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1NOTBM-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use64
 ; X86-BMI1NOTBM-NEXT:    addl $16, %esp
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
@@ -4182,35 +5024,38 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
 ; X86-BMI1BMI2-NEXT:    subl $12, %esp
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    movl %eax, %ecx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edx
-; X86-BMI1BMI2-NEXT:    shrxl %eax, %esi, %ebx
-; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB35_2
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edi
+; X86-BMI1BMI2-NEXT:    testb $32, %cl
+; X86-BMI1BMI2-NEXT:    je .LBB37_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
-; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB35_2:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
+; X86-BMI1BMI2-NEXT:  .LBB37_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %esi
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
+; X86-BMI1BMI2-NEXT:    movl $-1, %ebp
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %ebp, %ebx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %ebp, %ebp
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB35_4
+; X86-BMI1BMI2-NEXT:    je .LBB37_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
-; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
-; X86-BMI1BMI2-NEXT:  .LBB35_4:
-; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
-; X86-BMI1BMI2-NEXT:    andl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl %ebx, %ebp
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
+; X86-BMI1BMI2-NEXT:  .LBB37_4:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
 ; X86-BMI1BMI2-NEXT:    pushl %ebp
-; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl %ebp, %esi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1BMI2-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-BMI1BMI2-NEXT:    calll use64
 ; X86-BMI1BMI2-NEXT:    addl $16, %esp
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
@@ -4224,49 +5069,77 @@ define i64 @bextr64_c5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ;
 ; X64-NOBMI-LABEL: bextr64_c5_skipextrauses:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r15
+; X64-NOBMI-NEXT:    pushq %r14
 ; X64-NOBMI-NEXT:    pushq %rbx
-; X64-NOBMI-NEXT:    movq %rdi, %rbx
-; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rsi, %r14
+; X64-NOBMI-NEXT:    movq %rdi, %r15
+; X64-NOBMI-NEXT:    movl %r14d, %ecx
+; X64-NOBMI-NEXT:    shrq %cl, %r15
 ; X64-NOBMI-NEXT:    negl %edx
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    movl %edx, %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rbx
 ; X64-NOBMI-NEXT:    shrq %cl, %rbx
-; X64-NOBMI-NEXT:    movq %rsi, %rdi
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r15, %rbx
+; X64-NOBMI-NEXT:    movq %r14, %rdi
 ; X64-NOBMI-NEXT:    callq use64
 ; X64-NOBMI-NEXT:    movq %rbx, %rax
 ; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
+; X64-NOBMI-NEXT:    popq %r15
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r15
+; X64-BMI1NOTBM-NEXT:    pushq %r14
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %r14
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r15
+; X64-BMI1NOTBM-NEXT:    movl %r14d, %ecx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %r15
 ; X64-BMI1NOTBM-NEXT:    negl %edx
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r15, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %r14, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
 ; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
+; X64-BMI1NOTBM-NEXT:    popq %r15
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_c5_skipextrauses:
 ; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    pushq %r15
+; X64-BMI1BMI2-NEXT:    pushq %r14
 ; X64-BMI1BMI2-NEXT:    pushq %rbx
-; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %rax
-; X64-BMI1BMI2-NEXT:    bzhiq %rdx, %rax, %rbx
-; X64-BMI1BMI2-NEXT:    movq %rsi, %rdi
+; X64-BMI1BMI2-NEXT:    movq %rdx, %rbx
+; X64-BMI1BMI2-NEXT:    movq %rsi, %r14
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rdi, %r15
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r15, %rbx
+; X64-BMI1BMI2-NEXT:    movq %r14, %rdi
 ; X64-BMI1BMI2-NEXT:    callq use64
 ; X64-BMI1BMI2-NEXT:    movq %rbx, %rax
 ; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
+; X64-BMI1BMI2-NEXT:    popq %r15
 ; X64-BMI1BMI2-NEXT:    retq
   %shifted = lshr i64 %val, %numskipbits
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %shifted
   call void @use64(i64 %numskipbits)
   ret i64 %masked
@@ -4291,14 +5164,12 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d0:
@@ -4324,13 +5195,10 @@ define i32 @bextr32_d0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d0:
@@ -4360,14 +5228,12 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4393,13 +5259,10 @@ define i32 @bextr32_d1_indexzext(i32 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X64-BMI1NOTBM-LABEL: bextr32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d1_indexzext:
@@ -4432,15 +5295,13 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4470,10 +5331,8 @@ define i32 @bextr32_d2_load(i32* %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d2_load:
@@ -4505,15 +5364,13 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X86-BMI1NOTBM-LABEL: bextr32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %edx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4543,10 +5400,8 @@ define i32 @bextr32_d3_load_indexzext(i32* %w, i8 %numskipbits, i8 %numlowbits)
 ; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %eax, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr32_d3_load_indexzext:
@@ -4589,16 +5444,13 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl %eax, %ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %esi
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
-; X86-BMI1NOTBM-NEXT:    movl %eax, (%esp)
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%esp)
 ; X86-BMI1NOTBM-NEXT:    calll use32
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
 ; X86-BMI1NOTBM-NEXT:    addl $8, %esp
@@ -4639,13 +5491,10 @@ define i32 @bextr32_d5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr32_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %ebx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X64-BMI1NOTBM-NEXT:    shll $8, %edx
+; X64-BMI1NOTBM-NEXT:    bextrl %edx, %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %edi
 ; X64-BMI1NOTBM-NEXT:    callq use32
 ; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
@@ -4686,36 +5535,36 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB41_2
+; X86-NOBMI-NEXT:    je .LBB43_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB41_2:
+; X86-NOBMI-NEXT:  .LBB43_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB41_4
+; X86-NOBMI-NEXT:    jne .LBB43_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB41_4:
+; X86-NOBMI-NEXT:  .LBB43_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB41_6
+; X86-NOBMI-NEXT:    jne .LBB43_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB41_6:
+; X86-NOBMI-NEXT:  .LBB43_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB41_8
+; X86-NOBMI-NEXT:    jne .LBB43_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB41_8:
+; X86-NOBMI-NEXT:  .LBB43_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -4734,36 +5583,36 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB41_2
+; X86-BMI1NOTBM-NEXT:    je .LBB43_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB41_2:
+; X86-BMI1NOTBM-NEXT:  .LBB43_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB41_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB43_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB41_4:
+; X86-BMI1NOTBM-NEXT:  .LBB43_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB41_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB43_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB41_6:
+; X86-BMI1NOTBM-NEXT:  .LBB43_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB41_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB43_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB41_8:
+; X86-BMI1NOTBM-NEXT:  .LBB43_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -4780,32 +5629,32 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB41_2
+; X86-BMI1BMI2-NEXT:    je .LBB43_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB41_2:
+; X86-BMI1BMI2-NEXT:  .LBB43_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB41_4
+; X86-BMI1BMI2-NEXT:    je .LBB43_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB41_4:
+; X86-BMI1BMI2-NEXT:  .LBB43_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB41_6
+; X86-BMI1BMI2-NEXT:    jne .LBB43_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB41_6:
+; X86-BMI1BMI2-NEXT:  .LBB43_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB41_8
+; X86-BMI1BMI2-NEXT:    jne .LBB43_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB41_8:
+; X86-BMI1BMI2-NEXT:  .LBB43_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -4825,13 +5674,10 @@ define i64 @bextr64_d0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
 ; X64-BMI1NOTBM-LABEL: bextr64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d0:
@@ -4860,36 +5706,36 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB42_2
+; X86-NOBMI-NEXT:    je .LBB44_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB42_2:
+; X86-NOBMI-NEXT:  .LBB44_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB42_4
+; X86-NOBMI-NEXT:    jne .LBB44_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB42_4:
+; X86-NOBMI-NEXT:  .LBB44_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB42_6
+; X86-NOBMI-NEXT:    jne .LBB44_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB42_6:
+; X86-NOBMI-NEXT:  .LBB44_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB42_8
+; X86-NOBMI-NEXT:    jne .LBB44_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB42_8:
+; X86-NOBMI-NEXT:  .LBB44_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -4908,36 +5754,36 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB42_2
+; X86-BMI1NOTBM-NEXT:    je .LBB44_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB42_2:
+; X86-BMI1NOTBM-NEXT:  .LBB44_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB42_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB44_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB42_4:
+; X86-BMI1NOTBM-NEXT:  .LBB44_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB42_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB44_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB42_6:
+; X86-BMI1NOTBM-NEXT:  .LBB44_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB42_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB44_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB42_8:
+; X86-BMI1NOTBM-NEXT:  .LBB44_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -4954,32 +5800,32 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %esi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB42_2
+; X86-BMI1BMI2-NEXT:    je .LBB44_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB42_2:
+; X86-BMI1BMI2-NEXT:  .LBB44_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB42_4
+; X86-BMI1BMI2-NEXT:    je .LBB44_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB42_4:
+; X86-BMI1BMI2-NEXT:  .LBB44_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB42_6
+; X86-BMI1BMI2-NEXT:    jne .LBB44_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB42_6:
+; X86-BMI1BMI2-NEXT:  .LBB44_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB42_8
+; X86-BMI1BMI2-NEXT:    jne .LBB44_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB42_8:
+; X86-BMI1BMI2-NEXT:  .LBB44_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -4998,14 +5844,12 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d1_indexzext:
@@ -5039,36 +5883,36 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB43_2
+; X86-NOBMI-NEXT:    je .LBB45_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB43_2:
+; X86-NOBMI-NEXT:  .LBB45_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB43_4
+; X86-NOBMI-NEXT:    jne .LBB45_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB43_4:
+; X86-NOBMI-NEXT:  .LBB45_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB43_6
+; X86-NOBMI-NEXT:    jne .LBB45_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB43_6:
+; X86-NOBMI-NEXT:  .LBB45_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB43_8
+; X86-NOBMI-NEXT:    jne .LBB45_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB43_8:
+; X86-NOBMI-NEXT:  .LBB45_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -5088,36 +5932,36 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB43_2
+; X86-BMI1NOTBM-NEXT:    je .LBB45_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB43_2:
+; X86-BMI1NOTBM-NEXT:  .LBB45_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB43_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB45_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB43_4:
+; X86-BMI1NOTBM-NEXT:  .LBB45_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB43_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB45_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB43_6:
+; X86-BMI1NOTBM-NEXT:  .LBB45_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB43_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB45_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB43_8:
+; X86-BMI1NOTBM-NEXT:  .LBB45_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -5135,32 +5979,32 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB43_2
+; X86-BMI1BMI2-NEXT:    je .LBB45_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB43_2:
+; X86-BMI1BMI2-NEXT:  .LBB45_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB43_4
+; X86-BMI1BMI2-NEXT:    je .LBB45_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB43_4:
+; X86-BMI1BMI2-NEXT:  .LBB45_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB43_6
+; X86-BMI1BMI2-NEXT:    jne .LBB45_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB43_6:
+; X86-BMI1BMI2-NEXT:  .LBB45_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB43_8
+; X86-BMI1BMI2-NEXT:    jne .LBB45_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB43_8:
+; X86-BMI1BMI2-NEXT:  .LBB45_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -5183,10 +6027,8 @@ define i64 @bextr64_d2_load(i64* %w, i64 %numskipbits, i64 %numlowbits) nounwind
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d2_load:
@@ -5217,36 +6059,36 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    je .LBB44_2
+; X86-NOBMI-NEXT:    je .LBB46_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %eax, %edi
 ; X86-NOBMI-NEXT:    xorl %eax, %eax
-; X86-NOBMI-NEXT:  .LBB44_2:
+; X86-NOBMI-NEXT:  .LBB46_2:
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %edi, %eax
 ; X86-NOBMI-NEXT:    shll %cl, %edi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %edi, %ebx
-; X86-NOBMI-NEXT:    jne .LBB44_4
+; X86-NOBMI-NEXT:    jne .LBB46_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %eax, %ebx
-; X86-NOBMI-NEXT:  .LBB44_4:
+; X86-NOBMI-NEXT:  .LBB46_4:
 ; X86-NOBMI-NEXT:    movl %ebx, %eax
 ; X86-NOBMI-NEXT:    shrl %cl, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edx
-; X86-NOBMI-NEXT:    jne .LBB44_6
+; X86-NOBMI-NEXT:    jne .LBB46_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %edi, %esi
 ; X86-NOBMI-NEXT:    movl %eax, %edx
-; X86-NOBMI-NEXT:  .LBB44_6:
+; X86-NOBMI-NEXT:  .LBB46_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
-; X86-NOBMI-NEXT:    jne .LBB44_8
+; X86-NOBMI-NEXT:    jne .LBB46_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %esi, %eax
-; X86-NOBMI-NEXT:  .LBB44_8:
+; X86-NOBMI-NEXT:  .LBB46_8:
 ; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    popl %ebx
@@ -5266,36 +6108,36 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    je .LBB44_2
+; X86-BMI1NOTBM-NEXT:    je .LBB46_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edi
 ; X86-BMI1NOTBM-NEXT:    xorl %eax, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB44_2:
+; X86-BMI1NOTBM-NEXT:  .LBB46_2:
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %edi, %eax
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %edi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %ebx
-; X86-BMI1NOTBM-NEXT:    jne .LBB44_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB46_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %ebx
-; X86-BMI1NOTBM-NEXT:  .LBB44_4:
+; X86-BMI1NOTBM-NEXT:  .LBB46_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edx
-; X86-BMI1NOTBM-NEXT:    jne .LBB44_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB46_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
 ; X86-BMI1NOTBM-NEXT:    movl %eax, %edx
-; X86-BMI1NOTBM-NEXT:  .LBB44_6:
+; X86-BMI1NOTBM-NEXT:  .LBB46_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
-; X86-BMI1NOTBM-NEXT:    jne .LBB44_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB46_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
-; X86-BMI1NOTBM-NEXT:  .LBB44_8:
+; X86-BMI1NOTBM-NEXT:  .LBB46_8:
 ; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    popl %ebx
@@ -5313,32 +6155,32 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB44_2
+; X86-BMI1BMI2-NEXT:    je .LBB46_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %esi, %eax
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB44_2:
+; X86-BMI1BMI2-NEXT:  .LBB46_2:
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %eax, %esi
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %eax, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB44_4
+; X86-BMI1BMI2-NEXT:    je .LBB46_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
 ; X86-BMI1BMI2-NEXT:    movl $0, %edi
-; X86-BMI1BMI2-NEXT:  .LBB44_4:
+; X86-BMI1BMI2-NEXT:  .LBB46_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %eax
-; X86-BMI1BMI2-NEXT:    jne .LBB44_6
+; X86-BMI1BMI2-NEXT:    jne .LBB46_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %eax, %edx
-; X86-BMI1BMI2-NEXT:  .LBB44_6:
+; X86-BMI1BMI2-NEXT:  .LBB46_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %edi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    jne .LBB44_8
+; X86-BMI1BMI2-NEXT:    jne .LBB46_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %eax
-; X86-BMI1BMI2-NEXT:  .LBB44_8:
+; X86-BMI1BMI2-NEXT:  .LBB46_8:
 ; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
@@ -5357,14 +6199,13 @@ define i64 @bextr64_d3_load_indexzext(i64* %w, i8 %numskipbits, i8 %numlowbits)
 ;
 ; X64-BMI1NOTBM-LABEL: bextr64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
 ; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    negb %dl
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rax, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bextr64_d3_load_indexzext:
@@ -5401,37 +6242,37 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NOBMI-NEXT:    xorl %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %al
-; X86-NOBMI-NEXT:    je .LBB45_2
+; X86-NOBMI-NEXT:    je .LBB47_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %esi, %ebx
 ; X86-NOBMI-NEXT:    xorl %esi, %esi
-; X86-NOBMI-NEXT:  .LBB45_2:
+; X86-NOBMI-NEXT:  .LBB47_2:
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT:    shldl %cl, %ebx, %esi
 ; X86-NOBMI-NEXT:    shll %cl, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl %ebx, %ebp
-; X86-NOBMI-NEXT:    jne .LBB45_4
+; X86-NOBMI-NEXT:    jne .LBB47_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %ebp
-; X86-NOBMI-NEXT:  .LBB45_4:
+; X86-NOBMI-NEXT:  .LBB47_4:
 ; X86-NOBMI-NEXT:    movl %ebp, %esi
 ; X86-NOBMI-NEXT:    shrl %cl, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl $0, %edi
-; X86-NOBMI-NEXT:    jne .LBB45_6
+; X86-NOBMI-NEXT:    jne .LBB47_6
 ; X86-NOBMI-NEXT:  # %bb.5:
 ; X86-NOBMI-NEXT:    movl %ebx, %edx
 ; X86-NOBMI-NEXT:    movl %esi, %edi
-; X86-NOBMI-NEXT:  .LBB45_6:
+; X86-NOBMI-NEXT:  .LBB47_6:
 ; X86-NOBMI-NEXT:    shrdl %cl, %ebp, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    jne .LBB45_8
+; X86-NOBMI-NEXT:    jne .LBB47_8
 ; X86-NOBMI-NEXT:  # %bb.7:
 ; X86-NOBMI-NEXT:    movl %edx, %esi
-; X86-NOBMI-NEXT:  .LBB45_8:
+; X86-NOBMI-NEXT:  .LBB47_8:
 ; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    pushl %ecx
 ; X86-NOBMI-NEXT:    pushl %eax
@@ -5462,37 +6303,37 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %al
-; X86-BMI1NOTBM-NEXT:    je .LBB45_2
+; X86-BMI1NOTBM-NEXT:    je .LBB47_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %ebx
 ; X86-BMI1NOTBM-NEXT:    xorl %esi, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB45_2:
+; X86-BMI1NOTBM-NEXT:  .LBB47_2:
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1NOTBM-NEXT:    shldl %cl, %ebx, %esi
 ; X86-BMI1NOTBM-NEXT:    shll %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %ebp
-; X86-BMI1NOTBM-NEXT:    jne .LBB45_4
+; X86-BMI1NOTBM-NEXT:    jne .LBB47_4
 ; X86-BMI1NOTBM-NEXT:  # %bb.3:
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %ebp
-; X86-BMI1NOTBM-NEXT:  .LBB45_4:
+; X86-BMI1NOTBM-NEXT:  .LBB47_4:
 ; X86-BMI1NOTBM-NEXT:    movl %ebp, %esi
 ; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl $0, %edi
-; X86-BMI1NOTBM-NEXT:    jne .LBB45_6
+; X86-BMI1NOTBM-NEXT:    jne .LBB47_6
 ; X86-BMI1NOTBM-NEXT:  # %bb.5:
 ; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
 ; X86-BMI1NOTBM-NEXT:    movl %esi, %edi
-; X86-BMI1NOTBM-NEXT:  .LBB45_6:
+; X86-BMI1NOTBM-NEXT:  .LBB47_6:
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %ebp, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    jne .LBB45_8
+; X86-BMI1NOTBM-NEXT:    jne .LBB47_8
 ; X86-BMI1NOTBM-NEXT:  # %bb.7:
 ; X86-BMI1NOTBM-NEXT:    movl %edx, %esi
-; X86-BMI1NOTBM-NEXT:  .LBB45_8:
+; X86-BMI1NOTBM-NEXT:  .LBB47_8:
 ; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    pushl %ecx
 ; X86-BMI1NOTBM-NEXT:    pushl %eax
@@ -5520,33 +6361,33 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1BMI2-NEXT:    shrxl %eax, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    xorl %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %al
-; X86-BMI1BMI2-NEXT:    je .LBB45_2
+; X86-BMI1BMI2-NEXT:    je .LBB47_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
 ; X86-BMI1BMI2-NEXT:    movl %edx, %edi
 ; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
-; X86-BMI1BMI2-NEXT:  .LBB45_2:
+; X86-BMI1BMI2-NEXT:  .LBB47_2:
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
 ; X86-BMI1BMI2-NEXT:    shldl %cl, %edi, %edx
 ; X86-BMI1BMI2-NEXT:    shlxl %ecx, %edi, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
-; X86-BMI1BMI2-NEXT:    je .LBB45_4
+; X86-BMI1BMI2-NEXT:    je .LBB47_4
 ; X86-BMI1BMI2-NEXT:  # %bb.3:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
 ; X86-BMI1BMI2-NEXT:    movl $0, %ebx
-; X86-BMI1BMI2-NEXT:  .LBB45_4:
+; X86-BMI1BMI2-NEXT:  .LBB47_4:
 ; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %edi
-; X86-BMI1BMI2-NEXT:    jne .LBB45_6
+; X86-BMI1BMI2-NEXT:    jne .LBB47_6
 ; X86-BMI1BMI2-NEXT:  # %bb.5:
 ; X86-BMI1BMI2-NEXT:    movl %edi, %esi
-; X86-BMI1BMI2-NEXT:  .LBB45_6:
+; X86-BMI1BMI2-NEXT:  .LBB47_6:
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    jne .LBB45_8
+; X86-BMI1BMI2-NEXT:    jne .LBB47_8
 ; X86-BMI1BMI2-NEXT:  # %bb.7:
 ; X86-BMI1BMI2-NEXT:    movl %ebx, %edi
-; X86-BMI1BMI2-NEXT:  .LBB45_8:
+; X86-BMI1BMI2-NEXT:  .LBB47_8:
 ; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    pushl %ecx
 ; X86-BMI1BMI2-NEXT:    pushl %eax
@@ -5578,13 +6419,10 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X64-BMI1NOTBM-LABEL: bextr64_d5_skipextrauses:
 ; X64-BMI1NOTBM:       # %bb.0:
 ; X64-BMI1NOTBM-NEXT:    pushq %rbx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rbx
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    negl %edx
-; X64-BMI1NOTBM-NEXT:    movl %edx, %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rbx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rdi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rdx
+; X64-BMI1NOTBM-NEXT:    bextrq %rdx, %rdi, %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rdi
 ; X64-BMI1NOTBM-NEXT:    callq use64
 ; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
@@ -5615,23 +6453,69 @@ define i64 @bextr64_d5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 
 ; https://bugs.llvm.org/show_bug.cgi?id=38938
 define void @pr38938(i32* %a0, i64* %a1) {
-; X86-LABEL: pr38938:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    shrl $19, %ecx
-; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
-; X86-NEXT:    incl (%eax,%ecx)
-; X86-NEXT:    retl
+; X86-NOBMI-LABEL: pr38938:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    movl (%ecx), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NOBMI-NEXT:    incl (%eax,%ecx)
+; X86-NOBMI-NEXT:    retl
 ;
-; X64-LABEL: pr38938:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rsi), %rax
-; X64-NEXT:    shrq $19, %rax
-; X64-NEXT:    andl $4092, %eax # imm = 0xFFC
-; X64-NEXT:    incl (%rdi,%rax)
-; X64-NEXT:    retq
+; X86-BMI1NOTBM-LABEL: pr38938:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl $2581, %edx # imm = 0xA15
+; X86-BMI1NOTBM-NEXT:    bextrl %edx, (%ecx), %ecx
+; X86-BMI1NOTBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: pr38938:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1TBM-NEXT:    bextrl $2581, (%ecx), %ecx # imm = 0xA15
+; X86-BMI1TBM-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: pr38938:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2581, %edx # imm = 0xA15
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %edx, (%ecx), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    incl (%eax,%ecx,4)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: pr38938:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movq (%rsi), %rax
+; X64-NOBMI-NEXT:    shrq $19, %rax
+; X64-NOBMI-NEXT:    andl $4092, %eax # imm = 0xFFC
+; X64-NOBMI-NEXT:    incl (%rdi,%rax)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: pr38938:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2581, %eax # imm = 0xA15
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, (%rsi), %rax
+; X64-BMI1NOTBM-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: pr38938:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $2581, (%rsi), %rax # imm = 0xA15
+; X64-BMI1TBM-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: pr38938:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2581, %eax # imm = 0xA15
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, (%rsi), %rax
+; X64-BMI1NOTBMBMI2-NEXT:    incl (%rdi,%rax,4)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
   %tmp = load i64, i64* %a1, align 8
   %tmp1 = lshr i64 %tmp, 21
   %tmp2 = and i64 %tmp1, 1023
@@ -5880,3 +6764,332 @@ define i64 @c4_i64_bad(i64 %arg) {
   %tmp1 = and i64 %tmp0, 16382
   ret i64 %tmp1
 }
+
+; ---------------------------------------------------------------------------- ;
+; Constant, storing the result afterwards.
+; ---------------------------------------------------------------------------- ;
+
+; i32
+
+; The most canonical variant
+define void @c5_i32(i32 %arg, i32* %ptr) {
+; X86-NOBMI-LABEL: c5_i32:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c5_i32:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c5_i32:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c5_i32:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c5_i32:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrl $19, %edi
+; X64-NOBMI-NEXT:    andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT:    movl %edi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c5_i32:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2579, %eax # imm = 0xA13
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c5_i32:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrl $2579, %edi, %eax # imm = 0xA13
+; X64-BMI1TBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c5_i32:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2579, %eax # imm = 0xA13
+; X64-BMI1NOTBMBMI2-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBMBMI2-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i32(i32 %arg, i32* %ptr) {
+; X86-NOBMI-LABEL: c6_i32:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c6_i32:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c6_i32:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c6_i32:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c6_i32:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrl $19, %edi
+; X64-NOBMI-NEXT:    andl $4095, %edi # imm = 0xFFF
+; X64-NOBMI-NEXT:    movl %edi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c6_i32:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $3091, %eax # imm = 0xC13
+; X64-BMI1NOTBM-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c6_i32:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrl $3091, %edi, %eax # imm = 0xC13
+; X64-BMI1TBM-NEXT:    movl %eax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c6_i32:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $3091, %eax # imm = 0xC13
+; X64-BMI1NOTBMBMI2-NEXT:    bextrl %eax, %edi, %eax
+; X64-BMI1NOTBMBMI2-NEXT:    movl %eax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 4095
+  store i32 %tmp1, i32* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i32(i32 %arg, i32* %ptr) {
+; X86-LABEL: c7_i32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $17, %ecx
+; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    retl
+;
+; X64-LABEL: c7_i32:
+; X64:       # %bb.0:
+; X64-NEXT:    shrl $17, %edi
+; X64-NEXT:    andl $4092, %edi # imm = 0xFFC
+; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    retq
+  %tmp0 = lshr i32 %arg, 19
+  %tmp1 = and i32 %tmp0, 1023
+  %tmp2 = shl i32 %tmp1, 2
+  store i32 %tmp2, i32* %ptr
+  ret void
+}
+
+; i64
+
+; The most canonical variant
+define void @c5_i64(i64 %arg, i64* %ptr) {
+; X86-NOBMI-LABEL: c5_i64:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $1023, %ecx # imm = 0x3FF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    movl $0, 4(%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c5_i64:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c5_i64:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $2579, {{[0-9]+}}(%esp), %ecx # imm = 0xA13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c5_i64:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $2579, %ecx # imm = 0xA13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c5_i64:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrq $51, %rdi
+; X64-NOBMI-NEXT:    andl $1023, %edi # imm = 0x3FF
+; X64-NOBMI-NEXT:    movq %rdi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c5_i64:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $2611, %eax # imm = 0xA33
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c5_i64:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $2611, %rdi, %rax # imm = 0xA33
+; X64-BMI1TBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c5_i64:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $2611, %eax # imm = 0xA33
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBMBMI2-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the mask is shifted
+define void @c6_i64(i64 %arg, i64* %ptr) {
+; X86-NOBMI-LABEL: c6_i64:
+; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOBMI-NEXT:    shrl $19, %ecx
+; X86-NOBMI-NEXT:    andl $4095, %ecx # imm = 0xFFF
+; X86-NOBMI-NEXT:    movl %ecx, (%eax)
+; X86-NOBMI-NEXT:    movl $0, 4(%eax)
+; X86-NOBMI-NEXT:    retl
+;
+; X86-BMI1NOTBM-LABEL: c6_i64:
+; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBM-NEXT:    retl
+;
+; X86-BMI1TBM-LABEL: c6_i64:
+; X86-BMI1TBM:       # %bb.0:
+; X86-BMI1TBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1TBM-NEXT:    bextrl $3091, {{[0-9]+}}(%esp), %ecx # imm = 0xC13
+; X86-BMI1TBM-NEXT:    movl %ecx, (%eax)
+; X86-BMI1TBM-NEXT:    movl $0, 4(%eax)
+; X86-BMI1TBM-NEXT:    retl
+;
+; X86-BMI1NOTBMBMI2-LABEL: c6_i64:
+; X86-BMI1NOTBMBMI2:       # %bb.0:
+; X86-BMI1NOTBMBMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBMBMI2-NEXT:    movl $3091, %ecx # imm = 0xC13
+; X86-BMI1NOTBMBMI2-NEXT:    bextrl %ecx, {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBMBMI2-NEXT:    movl %ecx, (%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    movl $0, 4(%eax)
+; X86-BMI1NOTBMBMI2-NEXT:    retl
+;
+; X64-NOBMI-LABEL: c6_i64:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    shrq $51, %rdi
+; X64-NOBMI-NEXT:    andl $4095, %edi # imm = 0xFFF
+; X64-NOBMI-NEXT:    movq %rdi, (%rsi)
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI1NOTBM-LABEL: c6_i64:
+; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    movl $3123, %eax # imm = 0xC33
+; X64-BMI1NOTBM-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBM-NEXT:    retq
+;
+; X64-BMI1TBM-LABEL: c6_i64:
+; X64-BMI1TBM:       # %bb.0:
+; X64-BMI1TBM-NEXT:    bextrq $3123, %rdi, %rax # imm = 0xC33
+; X64-BMI1TBM-NEXT:    movq %rax, (%rsi)
+; X64-BMI1TBM-NEXT:    retq
+;
+; X64-BMI1NOTBMBMI2-LABEL: c6_i64:
+; X64-BMI1NOTBMBMI2:       # %bb.0:
+; X64-BMI1NOTBMBMI2-NEXT:    movl $3123, %eax # imm = 0xC33
+; X64-BMI1NOTBMBMI2-NEXT:    bextrq %rax, %rdi, %rax
+; X64-BMI1NOTBMBMI2-NEXT:    movq %rax, (%rsi)
+; X64-BMI1NOTBMBMI2-NEXT:    retq
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 4095
+  store i64 %tmp1, i64* %ptr
+  ret void
+}
+
+; Should be still fine, but the result is shifted left afterwards
+define void @c7_i64(i64 %arg, i64* %ptr) {
+; X86-LABEL: c7_i64:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shrl $17, %ecx
+; X86-NEXT:    andl $4092, %ecx # imm = 0xFFC
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    retl
+;
+; X64-LABEL: c7_i64:
+; X64:       # %bb.0:
+; X64-NEXT:    shrq $49, %rdi
+; X64-NEXT:    andl $4092, %edi # imm = 0xFFC
+; X64-NEXT:    movq %rdi, (%rsi)
+; X64-NEXT:    retq
+  %tmp0 = lshr i64 %arg, 51
+  %tmp1 = and i64 %tmp0, 1023
+  %tmp2 = shl i64 %tmp1, 2
+  store i64 %tmp2, i64* %ptr
+  ret void
+}
diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll
index 704309eb650724a49a8bd6f2ca24b099163052c1..029c69a34cb67be9e60b25d442d3f90e80452fc4 100644
--- a/test/CodeGen/X86/extract-concat.ll
+++ b/test/CodeGen/X86/extract-concat.ll
@@ -1,6 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 
 define void @foo(<4 x float> %in, <4 x i8>* %out) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    movl $255, %eax
+; CHECK-NEXT:    pinsrd $3, %eax, %xmm0
+; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT:    movd %xmm0, (%rdi)
+; CHECK-NEXT:    retq
   %t0 = fptosi <4 x float> %in to <4 x i32>
   %t1 = trunc <4 x i32> %t0 to <4 x i16>
   %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -9,9 +18,4 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
   %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
   store <4 x i8> %t5, <4 x i8>* %out
   ret void
-; CHECK: foo
-; CHECK: cvttps2dq
-; CHECK-NOT: pextrd
-; CHECK: pshufb
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/extract-insert.ll b/test/CodeGen/X86/extract-insert.ll
index 823390e86d1764617293080fd963ee755147d473..be5f9ed24fbc987ddb95254b7121302d93a94cb2 100644
--- a/test/CodeGen/X86/extract-insert.ll
+++ b/test/CodeGen/X86/extract-insert.ll
@@ -3,16 +3,112 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
 
 define i32 @extractelt_undef_insertelt(i32 %x, i32 %y) {
-; X86-LABEL: extractelt_undef_insertelt:
+; CHECK-LABEL: extractelt_undef_insertelt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ret{{[l|q]}}
+  %b = insertelement <4 x i32> zeroinitializer, i32 %x, i64 3
+  %c = icmp uge i32 %y, %y
+  %d = extractelement <4 x i32> %b, i1 %c
+  ret i32 %d
+}
+
+define i8 @extractelt_bitcast(i32 %x) nounwind {
+; X86-LABEL: extractelt_bitcast:
 ; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: extractelt_undef_insertelt:
+; X64-LABEL: extractelt_bitcast:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
-  %b = insertelement <4 x i32> zeroinitializer, i32 %x, i64 3
-  %c = icmp uge i32 %y, %y
-  %d = extractelement <4 x i32> %b, i1 %c
-  ret i32 %d
+  %bc = bitcast i32 %x to <4 x i8>
+  %ext = extractelement <4 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
+; TODO: This should have folded to avoid vector ops, but the transform
+; is guarded by 'hasOneUse'. That limitation apparently makes some AMDGPU
+; codegen better.
+
+define i8 @extractelt_bitcast_extra_use(i32 %x, <4 x i8>* %p) nounwind {
+; X86-LABEL: extractelt_bitcast_extra_use:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movd %eax, %xmm0
+; X86-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    popl %ecx
+; X86-NEXT:    retl
+;
+; X64-LABEL: extractelt_bitcast_extra_use:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %edi, %xmm0
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT:    movl %edi, (%rsi)
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %bc = bitcast i32 %x to <4 x i8>
+  store <4 x i8> %bc, <4 x i8>* %p
+  %ext = extractelement <4 x i8> %bc, i32 0
+  ret i8 %ext
+}
+
+define i32 @trunc_i64_to_i32_le(i64 %x) {
+; X86-LABEL: trunc_i64_to_i32_le:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc_i64_to_i32_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <4 x i32>
+  %ext = extractelement <4 x i32> %bc, i32 0
+  ret i32 %ext
+}
+
+define i16 @trunc_i64_to_i16_le(i64 %x) {
+; X86-LABEL: trunc_i64_to_i16_le:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc_i64_to_i16_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
+; X64-NEXT:    retq
+  %ins = insertelement <2 x i64> undef, i64 %x, i32 0
+  %bc = bitcast <2 x i64> %ins to <8 x i16>
+  %ext = extractelement <8 x i16> %bc, i32 0
+  ret i16 %ext
+}
+
+define i8 @trunc_i32_to_i8_le(i32 %x) {
+; X86-LABEL: trunc_i32_to_i8_le:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: trunc_i32_to_i8_le:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %ins = insertelement <4 x i32> undef, i32 %x, i32 0
+  %bc = bitcast <4 x i32> %ins to <16 x i8>
+  %ext = extractelement <16 x i8> %bc, i32 0
+  ret i8 %ext
 }
 
diff --git a/test/CodeGen/X86/extract-lowbits.ll b/test/CodeGen/X86/extract-lowbits.ll
index 4af130cd825c5b88e9e1bba3f272d272cc0e531e..8d18f29d332ece2083506d2f1fd482f7d1b4edf0 100644
--- a/test/CodeGen/X86/extract-lowbits.ll
+++ b/test/CodeGen/X86/extract-lowbits.ll
@@ -39,11 +39,9 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a0:
@@ -64,12 +62,8 @@ define i32 @bzhi32_a0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a0:
@@ -94,11 +88,9 @@ define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a1_indexzext:
@@ -119,12 +111,8 @@ define i32 @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a1_indexzext:
@@ -151,12 +139,10 @@ define i32 @bzhi32_a2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl (%edx), %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a2_load:
@@ -178,12 +164,8 @@ define i32 @bzhi32_a2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a2_load:
@@ -210,12 +192,10 @@ define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl (%edx), %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a3_load_indexzext:
@@ -237,12 +217,8 @@ define i32 @bzhi32_a3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl (%rdi), %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a3_load_indexzext:
@@ -269,11 +245,9 @@ define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_a4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    decl %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_a4_commutative:
@@ -294,12 +268,8 @@ define i32 @bzhi32_a4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_a4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    decl %eax
-; X64-BMI1NOTBM-NEXT:    andl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_a4_commutative:
@@ -384,12 +354,8 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a0:
@@ -472,12 +438,9 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a1_indexzext:
@@ -571,12 +534,8 @@ define i64 @bzhi64_a2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a2_load:
@@ -669,12 +628,9 @@ define i64 @bzhi64_a3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq (%rdi), %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a3_load_indexzext:
@@ -760,12 +716,8 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_a4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movl $1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    decq %rax
-; X64-BMI1NOTBM-NEXT:    andq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_a4_commutative:
@@ -794,10 +746,9 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_b0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b0:
@@ -818,11 +769,8 @@ define i32 @bzhi32_b0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b0:
@@ -847,10 +795,9 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_b1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b1_indexzext:
@@ -871,11 +818,8 @@ define i32 @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b1_indexzext:
@@ -904,9 +848,8 @@ define i32 @bzhi32_b2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %edx
-; X86-BMI1NOTBM-NEXT:    andnl (%eax), %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b2_load:
@@ -928,11 +871,8 @@ define i32 @bzhi32_b2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl (%rdi), %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b2_load:
@@ -961,9 +901,8 @@ define i32 @bzhi32_b3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %edx
-; X86-BMI1NOTBM-NEXT:    andnl (%eax), %edx, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b3_load_indexzext:
@@ -985,11 +924,8 @@ define i32 @bzhi32_b3_load_indexzext(i32* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl (%rdi), %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b3_load_indexzext:
@@ -1016,10 +952,9 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_b4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    andnl {{[0-9]+}}(%esp), %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_b4_commutative:
@@ -1040,11 +975,8 @@ define i32 @bzhi32_b4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_b4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    andnl %edi, %eax, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_b4_commutative:
@@ -1128,11 +1060,8 @@ define i64 @bzhi64_b0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b0:
@@ -1214,11 +1143,9 @@ define i64 @bzhi64_b1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b1_indexzext:
@@ -1307,11 +1234,8 @@ define i64 @bzhi64_b2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq (%rdi), %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b2_load:
@@ -1399,11 +1323,9 @@ define i64 @bzhi64_b3_load_indexzext(i64* %w, i8 zeroext %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq (%rdi), %rax, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b3_load_indexzext:
@@ -1488,11 +1410,8 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_b4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    andnq %rdi, %rax, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_b4_commutative:
@@ -1509,59 +1428,119 @@ define i64 @bzhi64_b4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; Pattern c. 32-bit
 ; ---------------------------------------------------------------------------- ;
 
+declare void @use32(i32)
+
 define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c0:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c0:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %esi, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1569,57 +1548,115 @@ define i32 @bzhi32_c0(i32 %val, i32 %numlowbits) nounwind {
 define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %bl
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    negb %al
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %ebx, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1627,60 +1664,106 @@ define i32 @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl %edx, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %edx, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    bzhil %eax, (%ecx), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    negl %ecx
+; X86-BMI1BMI2-NEXT:    movl $-1, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebx
+; X64-NOBMI-NEXT:    andl %eax, %ebx
+; X64-NOBMI-NEXT:    movl %eax, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebx
+; X64-BMI1NOTBM-NEXT:    andl %eax, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %ebx
+; X64-BMI1BMI2-NEXT:    negl %esi
+; X64-BMI1BMI2-NEXT:    movl $-1, %eax
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %eax, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1688,61 +1771,109 @@ define i32 @bzhi32_c2_load(i32* %w, i32 %numlowbits) nounwind {
 define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl (%eax), %eax
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %edx
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl %edx, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl %edx, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
 ; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %eax
+; X86-BMI1BMI2-NEXT:    bzhil %ecx, (%eax), %esi
+; X86-BMI1BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx def $ecx
+; X86-BMI1BMI2-NEXT:    negb %cl
+; X86-BMI1BMI2-NEXT:    movl $-1, %eax
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl (%rdi), %eax
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %eax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    movl (%rdi), %ebx
+; X64-NOBMI-NEXT:    andl %eax, %ebx
+; X64-NOBMI-NEXT:    movl %eax, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    movl %ebx, %eax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl (%rdi), %ebx
+; X64-BMI1NOTBM-NEXT:    andl %eax, %ebx
+; X64-BMI1NOTBM-NEXT:    movl %eax, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    bzhil %esi, (%rdi), %ebx
+; X64-BMI1BMI2-NEXT:    # kill: def $sil killed $sil killed $esi def $esi
+; X64-BMI1BMI2-NEXT:    negb %sil
+; X64-BMI1BMI2-NEXT:    movl $-1, %eax
+; X64-BMI1BMI2-NEXT:    shrxl %esi, %eax, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i32, i32* %w
   %numhighbits = sub i8 32, %numlowbits
   %sh_prom = zext i8 %numhighbits to i32
   %mask = lshr i32 -1, %sh_prom
+  call void @use32(i32 %mask)
   %masked = and i32 %mask, %val
   ret i32 %masked
 }
@@ -1750,142 +1881,275 @@ define i32 @bzhi32_c3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi32_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    subl $8, %esp
 ; X86-NOBMI-NEXT:    xorl %ecx, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    shll %cl, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
 ; X86-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NOBMI-NEXT:    shrl %cl, %eax
+; X86-NOBMI-NEXT:    shrl %cl, %esi
+; X86-NOBMI-NEXT:    movl %esi, (%esp)
+; X86-NOBMI-NEXT:    calll use32
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    addl $8, %esp
+; X86-NOBMI-NEXT:    popl %esi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
 ; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
 ; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, (%esp)
+; X86-BMI1NOTBM-NEXT:    calll use32
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    addl $8, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    negl %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X86-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %eax
+; X86-BMI1BMI2-NEXT:    movl %eax, (%esp)
+; X86-BMI1BMI2-NEXT:    calll use32
+; X86-BMI1BMI2-NEXT:    bzhil %esi, {{[0-9]+}}(%esp), %eax
+; X86-BMI1BMI2-NEXT:    addl $8, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi32_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbp
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edi, %ebx
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shll %cl, %eax
+; X64-NOBMI-NEXT:    movl $-1, %ebp
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrl %cl, %eax
+; X64-NOBMI-NEXT:    shrl %cl, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %edi
+; X64-NOBMI-NEXT:    callq use32
+; X64-NOBMI-NEXT:    andl %ebx, %ebp
+; X64-NOBMI-NEXT:    movl %ebp, %eax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %rbp
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbp
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
+; X64-BMI1NOTBM-NEXT:    movl %edi, %ebx
 ; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
+; X64-BMI1NOTBM-NEXT:    movl $-1, %ebp
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shrl %cl, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %edi
+; X64-BMI1NOTBM-NEXT:    callq use32
+; X64-BMI1NOTBM-NEXT:    andl %ebx, %ebp
+; X64-BMI1NOTBM-NEXT:    movl %ebp, %eax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %rbp
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhil %esi, %edi, %eax
+; X64-BMI1BMI2-NEXT:    pushq %rbp
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movl %edi, %ebp
+; X64-BMI1BMI2-NEXT:    movl %esi, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movl $-1, %ecx
+; X64-BMI1BMI2-NEXT:    shrxl %eax, %ecx, %edi
+; X64-BMI1BMI2-NEXT:    callq use32
+; X64-BMI1BMI2-NEXT:    bzhil %ebx, %ebp, %eax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %rbp
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i32 32, %numlowbits
   %mask = lshr i32 -1, %numhighbits
+  call void @use32(i32 %mask)
   %masked = and i32 %val, %mask ; swapped order
   ret i32 %masked
 }
 
 ; 64-bit
 
+declare void @use64(i64)
+
 define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c0:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB25_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB25_2:
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB25_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB25_2:
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c0:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB25_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB25_2:
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c0:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c0:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c0:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rsi, %rbx
+; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -1893,85 +2157,157 @@ define i64 @bzhi64_c0(i64 %val, i64 %numlowbits) nounwind {
 define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c1_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB26_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB26_2:
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB26_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB26_2:
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB26_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB26_2:
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c1_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c1_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movl %esi, %ebx
+; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negb %al
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -1979,93 +2315,153 @@ define i64 @bzhi64_c1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c2_load:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
 ; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edx, %edx
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB27_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %edx
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB27_2:
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
+; X86-NOBMI-NEXT:    movl (%eax), %esi
+; X86-NOBMI-NEXT:    andl %edx, %esi
+; X86-NOBMI-NEXT:    movl 4(%eax), %edi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edx
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %edx, %edx
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB27_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %edx
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB27_2:
-; X86-BMI1NOTBM-NEXT:    andl (%esi), %eax
-; X86-BMI1NOTBM-NEXT:    andl 4(%esi), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%eax), %esi
+; X86-BMI1NOTBM-NEXT:    andl %edx, %esi
+; X86-BMI1NOTBM-NEXT:    movl 4(%eax), %edi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edx
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c2_load:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %edx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %edx, %ebx
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %edx, %edx
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB27_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %edx
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB27_2:
-; X86-BMI1BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI1BMI2-NEXT:    andl 4(%esi), %edx
+; X86-BMI1BMI2-NEXT:    movl (%eax), %esi
+; X86-BMI1BMI2-NEXT:    andl %edx, %esi
+; X86-BMI1BMI2-NEXT:    movl 4(%eax), %edi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edx
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c2_load:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %rbx
+; X64-NOBMI-NEXT:    andq %rax, %rbx
+; X64-NOBMI-NEXT:    movq %rax, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rbx
+; X64-BMI1NOTBM-NEXT:    andq %rax, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rax, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c2_load:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rax
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rbx
+; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
+; X64-BMI1BMI2-NEXT:    negl %esi
+; X64-BMI1BMI2-NEXT:    movq $-1, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rax, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    movq %rbx, %rax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -2073,95 +2469,155 @@ define i64 @bzhi64_c2_load(i64* %w, i64 %numlowbits) nounwind {
 define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c3_load_indexzext:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movb $64, %cl
 ; X86-NOBMI-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
+; X86-NOBMI-NEXT:    movl $-1, %ebx
+; X86-NOBMI-NEXT:    shrl %cl, %ebx
 ; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB28_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %ebx, %eax
+; X86-NOBMI-NEXT:    xorl %ebx, %ebx
 ; X86-NOBMI-NEXT:  .LBB28_2:
-; X86-NOBMI-NEXT:    andl (%esi), %eax
-; X86-NOBMI-NEXT:    andl 4(%esi), %edx
+; X86-NOBMI-NEXT:    movl (%edx), %esi
+; X86-NOBMI-NEXT:    andl %eax, %esi
+; X86-NOBMI-NEXT:    movl 4(%edx), %edi
+; X86-NOBMI-NEXT:    andl %ebx, %edi
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %ebx
+; X86-NOBMI-NEXT:    pushl %eax
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
 ; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
+; X86-NOBMI-NEXT:    popl %ebx
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %edi
 ; X86-BMI1NOTBM-NEXT:    pushl %esi
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1NOTBM-NEXT:    movb $64, %cl
 ; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
+; X86-BMI1NOTBM-NEXT:    movl $-1, %ebx
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %ebx
 ; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB28_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %ebx, %eax
+; X86-BMI1NOTBM-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1NOTBM-NEXT:  .LBB28_2:
-; X86-BMI1NOTBM-NEXT:    andl (%esi), %eax
-; X86-BMI1NOTBM-NEXT:    andl 4(%esi), %edx
+; X86-BMI1NOTBM-NEXT:    movl (%edx), %esi
+; X86-BMI1NOTBM-NEXT:    andl %eax, %esi
+; X86-BMI1NOTBM-NEXT:    movl 4(%edx), %edi
+; X86-BMI1NOTBM-NEXT:    andl %ebx, %edi
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %ebx
+; X86-BMI1NOTBM-NEXT:    pushl %eax
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
 ; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
+; X86-BMI1NOTBM-NEXT:    popl %ebx
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %edi
 ; X86-BMI1BMI2-NEXT:    pushl %esi
-; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1BMI2-NEXT:    movb $64, %cl
 ; X86-BMI1BMI2-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %ebx
 ; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB28_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X86-BMI1BMI2-NEXT:    xorl %ebx, %ebx
 ; X86-BMI1BMI2-NEXT:  .LBB28_2:
-; X86-BMI1BMI2-NEXT:    andl (%esi), %eax
-; X86-BMI1BMI2-NEXT:    andl 4(%esi), %edx
+; X86-BMI1BMI2-NEXT:    movl (%edx), %esi
+; X86-BMI1BMI2-NEXT:    andl %eax, %esi
+; X86-BMI1BMI2-NEXT:    movl 4(%edx), %edi
+; X86-BMI1BMI2-NEXT:    andl %ebx, %edi
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %ebx
+; X86-BMI1BMI2-NEXT:    pushl %eax
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
 ; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
+; X86-BMI1BMI2-NEXT:    popl %ebx
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c3_load_indexzext:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %rbx
 ; X64-NOBMI-NEXT:    movl %esi, %ecx
-; X64-NOBMI-NEXT:    movq (%rdi), %rax
 ; X64-NOBMI-NEXT:    negb %cl
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rax
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    movq (%rdi), %rbx
+; X64-NOBMI-NEXT:    andq %rax, %rbx
+; X64-NOBMI-NEXT:    movq %rax, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    popq %rbx
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
 ; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rax
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rbx
+; X64-BMI1NOTBM-NEXT:    andq %rax, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rax, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    popq %rbx
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c3_load_indexzext:
 ; X64-BMI1BMI2:       # %bb.0:
+; X64-BMI1BMI2-NEXT:    pushq %rbx
 ; X64-BMI1BMI2-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rax
+; X64-BMI1BMI2-NEXT:    bzhiq %rsi, (%rdi), %rbx
+; X64-BMI1BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI1BMI2-NEXT:    negb %sil
+; X64-BMI1BMI2-NEXT:    movq $-1, %rax
+; X64-BMI1BMI2-NEXT:    shrxq %rsi, %rax, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    movq %rbx, %rax
+; X64-BMI1BMI2-NEXT:    popq %rbx
 ; X64-BMI1BMI2-NEXT:    retq
   %val = load i64, i64* %w
   %numhighbits = sub i8 64, %numlowbits
   %sh_prom = zext i8 %numhighbits to i64
   %mask = lshr i64 -1, %sh_prom
+  call void @use64(i64 %mask)
   %masked = and i64 %mask, %val
   ret i64 %masked
 }
@@ -2169,83 +2625,156 @@ define i64 @bzhi64_c3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 define i64 @bzhi64_c4_commutative(i64 %val, i64 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bzhi64_c4_commutative:
 ; X86-NOBMI:       # %bb.0:
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    movl $64, %ecx
 ; X86-NOBMI-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NOBMI-NEXT:    movl $-1, %eax
-; X86-NOBMI-NEXT:    movl $-1, %edx
-; X86-NOBMI-NEXT:    shrl %cl, %edx
-; X86-NOBMI-NEXT:    shrdl %cl, %eax, %eax
+; X86-NOBMI-NEXT:    movl $-1, %esi
+; X86-NOBMI-NEXT:    movl $-1, %edi
+; X86-NOBMI-NEXT:    shrl %cl, %edi
+; X86-NOBMI-NEXT:    shrdl %cl, %esi, %esi
 ; X86-NOBMI-NEXT:    testb $32, %cl
 ; X86-NOBMI-NEXT:    je .LBB29_2
 ; X86-NOBMI-NEXT:  # %bb.1:
-; X86-NOBMI-NEXT:    movl %edx, %eax
-; X86-NOBMI-NEXT:    xorl %edx, %edx
+; X86-NOBMI-NEXT:    movl %edi, %esi
+; X86-NOBMI-NEXT:    xorl %edi, %edi
 ; X86-NOBMI-NEXT:  .LBB29_2:
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    subl $8, %esp
+; X86-NOBMI-NEXT:    pushl %edi
+; X86-NOBMI-NEXT:    pushl %esi
+; X86-NOBMI-NEXT:    calll use64
+; X86-NOBMI-NEXT:    addl $16, %esp
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-NOBMI-NEXT:    movl %esi, %eax
+; X86-NOBMI-NEXT:    movl %edi, %edx
+; X86-NOBMI-NEXT:    addl $4, %esp
+; X86-NOBMI-NEXT:    popl %esi
+; X86-NOBMI-NEXT:    popl %edi
 ; X86-NOBMI-NEXT:    retl
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X86-BMI1NOTBM:       # %bb.0:
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    pushl %eax
 ; X86-BMI1NOTBM-NEXT:    movl $64, %ecx
 ; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    movl $-1, %eax
-; X86-BMI1NOTBM-NEXT:    movl $-1, %edx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %edx
-; X86-BMI1NOTBM-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1NOTBM-NEXT:    movl $-1, %esi
+; X86-BMI1NOTBM-NEXT:    movl $-1, %edi
+; X86-BMI1NOTBM-NEXT:    shrl %cl, %edi
+; X86-BMI1NOTBM-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1NOTBM-NEXT:    testb $32, %cl
 ; X86-BMI1NOTBM-NEXT:    je .LBB29_2
 ; X86-BMI1NOTBM-NEXT:  # %bb.1:
-; X86-BMI1NOTBM-NEXT:    movl %edx, %eax
-; X86-BMI1NOTBM-NEXT:    xorl %edx, %edx
+; X86-BMI1NOTBM-NEXT:    movl %edi, %esi
+; X86-BMI1NOTBM-NEXT:    xorl %edi, %edi
 ; X86-BMI1NOTBM-NEXT:  .LBB29_2:
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1NOTBM-NEXT:    subl $8, %esp
+; X86-BMI1NOTBM-NEXT:    pushl %edi
+; X86-BMI1NOTBM-NEXT:    pushl %esi
+; X86-BMI1NOTBM-NEXT:    calll use64
+; X86-BMI1NOTBM-NEXT:    addl $16, %esp
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1NOTBM-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1NOTBM-NEXT:    movl %esi, %eax
+; X86-BMI1NOTBM-NEXT:    movl %edi, %edx
+; X86-BMI1NOTBM-NEXT:    addl $4, %esp
+; X86-BMI1NOTBM-NEXT:    popl %esi
+; X86-BMI1NOTBM-NEXT:    popl %edi
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi64_c4_commutative:
 ; X86-BMI1BMI2:       # %bb.0:
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    pushl %eax
 ; X86-BMI1BMI2-NEXT:    movl $64, %ecx
 ; X86-BMI1BMI2-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1BMI2-NEXT:    movl $-1, %eax
-; X86-BMI1BMI2-NEXT:    shrxl %ecx, %eax, %edx
-; X86-BMI1BMI2-NEXT:    shrdl %cl, %eax, %eax
+; X86-BMI1BMI2-NEXT:    movl $-1, %esi
+; X86-BMI1BMI2-NEXT:    shrxl %ecx, %esi, %edi
+; X86-BMI1BMI2-NEXT:    shrdl %cl, %esi, %esi
 ; X86-BMI1BMI2-NEXT:    testb $32, %cl
 ; X86-BMI1BMI2-NEXT:    je .LBB29_2
 ; X86-BMI1BMI2-NEXT:  # %bb.1:
-; X86-BMI1BMI2-NEXT:    movl %edx, %eax
-; X86-BMI1BMI2-NEXT:    xorl %edx, %edx
+; X86-BMI1BMI2-NEXT:    movl %edi, %esi
+; X86-BMI1BMI2-NEXT:    xorl %edi, %edi
 ; X86-BMI1BMI2-NEXT:  .LBB29_2:
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-BMI1BMI2-NEXT:    subl $8, %esp
+; X86-BMI1BMI2-NEXT:    pushl %edi
+; X86-BMI1BMI2-NEXT:    pushl %esi
+; X86-BMI1BMI2-NEXT:    calll use64
+; X86-BMI1BMI2-NEXT:    addl $16, %esp
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X86-BMI1BMI2-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; X86-BMI1BMI2-NEXT:    movl %esi, %eax
+; X86-BMI1BMI2-NEXT:    movl %edi, %edx
+; X86-BMI1BMI2-NEXT:    addl $4, %esp
+; X86-BMI1BMI2-NEXT:    popl %esi
+; X86-BMI1BMI2-NEXT:    popl %edi
 ; X86-BMI1BMI2-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bzhi64_c4_commutative:
 ; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    pushq %r14
+; X64-NOBMI-NEXT:    pushq %rbx
+; X64-NOBMI-NEXT:    pushq %rax
 ; X64-NOBMI-NEXT:    movq %rsi, %rcx
-; X64-NOBMI-NEXT:    movq %rdi, %rax
+; X64-NOBMI-NEXT:    movq %rdi, %r14
 ; X64-NOBMI-NEXT:    negl %ecx
-; X64-NOBMI-NEXT:    shlq %cl, %rax
+; X64-NOBMI-NEXT:    movq $-1, %rbx
 ; X64-NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-NOBMI-NEXT:    shrq %cl, %rax
+; X64-NOBMI-NEXT:    shrq %cl, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rdi
+; X64-NOBMI-NEXT:    callq use64
+; X64-NOBMI-NEXT:    andq %r14, %rbx
+; X64-NOBMI-NEXT:    movq %rbx, %rax
+; X64-NOBMI-NEXT:    addq $8, %rsp
+; X64-NOBMI-NEXT:    popq %rbx
+; X64-NOBMI-NEXT:    popq %r14
 ; X64-NOBMI-NEXT:    retq
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1NOTBM:       # %bb.0:
+; X64-BMI1NOTBM-NEXT:    pushq %r14
+; X64-BMI1NOTBM-NEXT:    pushq %rbx
+; X64-BMI1NOTBM-NEXT:    pushq %rax
 ; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
+; X64-BMI1NOTBM-NEXT:    movq %rdi, %r14
 ; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    movq $-1, %rbx
 ; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shrq %cl, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rdi
+; X64-BMI1NOTBM-NEXT:    callq use64
+; X64-BMI1NOTBM-NEXT:    andq %r14, %rbx
+; X64-BMI1NOTBM-NEXT:    movq %rbx, %rax
+; X64-BMI1NOTBM-NEXT:    addq $8, %rsp
+; X64-BMI1NOTBM-NEXT:    popq %rbx
+; X64-BMI1NOTBM-NEXT:    popq %r14
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_c4_commutative:
 ; X64-BMI1BMI2:       # %bb.0:
-; X64-BMI1BMI2-NEXT:    bzhiq %rsi, %rdi, %rax
+; X64-BMI1BMI2-NEXT:    pushq %r14
+; X64-BMI1BMI2-NEXT:    pushq %rbx
+; X64-BMI1BMI2-NEXT:    pushq %rax
+; X64-BMI1BMI2-NEXT:    movq %rsi, %rbx
+; X64-BMI1BMI2-NEXT:    movq %rdi, %r14
+; X64-BMI1BMI2-NEXT:    movl %ebx, %eax
+; X64-BMI1BMI2-NEXT:    negl %eax
+; X64-BMI1BMI2-NEXT:    movq $-1, %rcx
+; X64-BMI1BMI2-NEXT:    shrxq %rax, %rcx, %rdi
+; X64-BMI1BMI2-NEXT:    callq use64
+; X64-BMI1BMI2-NEXT:    bzhiq %rbx, %r14, %rax
+; X64-BMI1BMI2-NEXT:    addq $8, %rsp
+; X64-BMI1BMI2-NEXT:    popq %rbx
+; X64-BMI1BMI2-NEXT:    popq %r14
 ; X64-BMI1BMI2-NEXT:    retq
   %numhighbits = sub i64 64, %numlowbits
   %mask = lshr i64 -1, %numhighbits
+  call void @use64(i64 %mask)
   %masked = and i64 %val, %mask ; swapped order
   ret i64 %masked
 }
@@ -2268,11 +2797,8 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2293,12 +2819,8 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d0:
@@ -2324,12 +2846,9 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X86-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
-; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-BMI1NOTBM-NEXT:    shll $8, %eax
+; X86-BMI1NOTBM-NEXT:    bextrl %eax, {{[0-9]+}}(%esp), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2350,12 +2869,8 @@ define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl %edi, %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, %edi, %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d1_indexzext:
@@ -2384,12 +2899,9 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d2_load:
@@ -2411,12 +2923,8 @@ define i32 @bzhi32_d2_load(i32* %w, i32 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d2_load:
@@ -2445,12 +2953,9 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ; X86-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X86-BMI1NOTBM:       # %bb.0:
 ; X86-BMI1NOTBM-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1NOTBM-NEXT:    movl (%eax), %eax
-; X86-BMI1NOTBM-NEXT:    xorl %ecx, %ecx
-; X86-BMI1NOTBM-NEXT:    subb {{[0-9]+}}(%esp), %cl
-; X86-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X86-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X86-BMI1NOTBM-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-BMI1NOTBM-NEXT:    shll $8, %ecx
+; X86-BMI1NOTBM-NEXT:    bextrl %ecx, (%eax), %eax
 ; X86-BMI1NOTBM-NEXT:    retl
 ;
 ; X86-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2472,12 +2977,8 @@ define i32 @bzhi32_d3_load_indexzext(i32* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi32_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movl (%rdi), %eax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shll %cl, %eax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrl %cl, %eax
+; X64-BMI1NOTBM-NEXT:    shll $8, %esi
+; X64-BMI1NOTBM-NEXT:    bextrl %esi, (%rdi), %eax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi32_d3_load_indexzext:
@@ -2617,12 +3118,8 @@ define i64 @bzhi64_d0(i64 %val, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d0:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d0:
@@ -2758,12 +3255,9 @@ define i64 @bzhi64_d1_indexzext(i64 %val, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d1_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq %rdi, %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, %rdi, %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d1_indexzext:
@@ -2904,12 +3398,8 @@ define i64 @bzhi64_d2_load(i64* %w, i64 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d2_load:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movq %rsi, %rcx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negl %ecx
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d2_load:
@@ -3049,12 +3539,9 @@ define i64 @bzhi64_d3_load_indexzext(i64* %w, i8 %numlowbits) nounwind {
 ;
 ; X64-BMI1NOTBM-LABEL: bzhi64_d3_load_indexzext:
 ; X64-BMI1NOTBM:       # %bb.0:
-; X64-BMI1NOTBM-NEXT:    movl %esi, %ecx
-; X64-BMI1NOTBM-NEXT:    movq (%rdi), %rax
-; X64-BMI1NOTBM-NEXT:    negb %cl
-; X64-BMI1NOTBM-NEXT:    shlq %cl, %rax
-; X64-BMI1NOTBM-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-BMI1NOTBM-NEXT:    shrq %cl, %rax
+; X64-BMI1NOTBM-NEXT:    # kill: def $esi killed $esi def $rsi
+; X64-BMI1NOTBM-NEXT:    shlq $8, %rsi
+; X64-BMI1NOTBM-NEXT:    bextrq %rsi, (%rdi), %rax
 ; X64-BMI1NOTBM-NEXT:    retq
 ;
 ; X64-BMI1BMI2-LABEL: bzhi64_d3_load_indexzext:
diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll
index a2155de783117cf896d22945669f589c96a74ac1..534c63f708c09889317ec23cc1eebbe05eaf8d26 100644
--- a/test/CodeGen/X86/f16c-schedule.ll
+++ b/test/CodeGen/X86/f16c-schedule.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -43,6 +44,13 @@ define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKYLAKE-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtph2ps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtph2ps %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_vcvtph2ps_128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00]
@@ -100,6 +108,13 @@ define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKYLAKE-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtph2ps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [13:2.00]
+; BDVER2-NEXT:    vcvtph2ps %xmm0, %ymm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_vcvtph2ps_256:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtph2ps (%rdi), %ymm1 # sched: [8:2.00]
@@ -152,6 +167,12 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16>
 ; SKYLAKE-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [6:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtps2ph_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_vcvtps2ph_128:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
@@ -207,6 +228,13 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16>
 ; SKYLAKE-NEXT:    vzeroupper # sched: [4:1.00]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_vcvtps2ph_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:2.00]
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_vcvtps2ph_256:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:2.00]
diff --git a/test/CodeGen/X86/fast-isel-extract.ll b/test/CodeGen/X86/fast-isel-extract.ll
index fb20fdd0d36ff4ff1518c59eb6a01ef173526fe1..62d5b440afac27cc390d2101e3d4d04cee5d1aec 100644
--- a/test/CodeGen/X86/fast-isel-extract.ll
+++ b/test/CodeGen/X86/fast-isel-extract.ll
@@ -12,7 +12,8 @@ define void @test1(i64*) nounwind ssp {
   ret void
 ; CHECK-LABEL: test1:
 ; CHECK: callq _f
-; CHECK-NEXT: addq	$10, %rax
+; CHECK-NOT: %rax
+; CHECK: addq $10, %rax
 }
 
 define void @test2(i64*) nounwind ssp {
@@ -23,7 +24,8 @@ define void @test2(i64*) nounwind ssp {
   ret void
 ; CHECK-LABEL: test2:
 ; CHECK: callq _f
-; CHECK-NEXT: addq	$10, %rdx
+; CHECK-NOT: %rdx
+; CHECK: addq $10, %rdx
 }
 
 declare %addovf @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll
index 88a22ca899d732099128de26ffc57c9c5d67d3e7..576990d2fe562faf5cdb93f0bd6cfda714d2ae26 100644
--- a/test/CodeGen/X86/fast-isel-gep.ll
+++ b/test/CodeGen/X86/fast-isel-gep.ll
@@ -24,7 +24,7 @@ define i32 @test2(i64 %t3, i32* %t1) nounwind {
        %t15 = load i32, i32* %t9            ; <i32> [#uses=1]
        ret i32 %t15
 ; X32-LABEL: test2:
-; X32:  	movl	(%edx,%ecx,4), %e
+; X32:  	movl	({{%e[a-z]+}},{{%e[a-z]+}},4), %e
 ; X32:  	ret
 
 ; X64-LABEL: test2:
@@ -81,8 +81,8 @@ define i64 @test5(i8* %A, i32 %I, i64 %B) nounwind {
   %v11 = add i64 %B, %v10
   ret i64 %v11
 ; X64-LABEL: test5:
-; X64: movslq	%e[[A1]], %rax
-; X64-NEXT: (%r[[A0]],%rax),
+; X64: movslq	%e[[A1]], [[R0:%r[a-z]+]]
+; X64-NEXT: (%r[[A0]],[[R0]]),
 ; X64: ret
 }
 
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index 7fb2670e6d13750a41fc14c051d85c6e250c0294..b1f380e3a85c87c02e82ecd32906fcf29b3e8efd 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -84,7 +84,7 @@ entry:
   ret i64 %mul
 
 ; CHECK-LABEL: test6:
-; CHECK: shlq	$3, %rdi
+; CHECK: shlq	$3, {{%r[a-z]+}}
 }
 
 define i32 @test7(i32 %x) nounwind ssp {
@@ -92,7 +92,7 @@ entry:
   %mul = mul nsw i32 %x, 8
   ret i32 %mul
 ; CHECK-LABEL: test7:
-; CHECK: shll	$3, %edi
+; CHECK: shll	$3, {{%e[a-z]+}}
 }
 
 
@@ -103,7 +103,7 @@ entry:
   ret i64 %add
 
 ; CHECK-LABEL: test8:
-; CHECK: addq	$7, %rdi
+; CHECK: addq	$7, {{%r[a-z]+}}
 }
 
 define i64 @test9(i64 %x) nounwind ssp {
@@ -297,8 +297,10 @@ define void @test23(i8* noalias sret %result) {
   %b = call i8* @foo23()
   ret void
 ; CHECK-LABEL: test23:
+; CHECK: movq %rdi, [[STACK:[0-9]+\(%rsp\)]]
 ; CHECK: call
-; CHECK: movq  %rdi, %rax
+; CHECK: movq [[STACK]], %rdi
+; CHECK: movq %rdi, %rax
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
index 62e86e3ad2cc18773d857eecf355cb36ab4a5497..c0c5baa2c8bfa6d5393dcfeea598fe908f32d0e9 100644
--- a/test/CodeGen/X86/fdiv-combine.ll
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
 
 ; More than one 'arcp' division using a single divisor operand
@@ -79,12 +80,12 @@ define float @div2_arcp_partial3(float %x, float %y, float %z) {
 }
 
 ; If the reciprocal is already calculated, we should not
-; generate an extra multiplication by 1.0. 
+; generate an extra multiplication by 1.0.
 
 define double @div3_arcp(double %x, double %y, double %z) {
 ; CHECK-LABEL: div3_arcp:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movsd{{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
 ; CHECK-NEXT:    divsd %xmm1, %xmm2
 ; CHECK-NEXT:    mulsd %xmm2, %xmm0
 ; CHECK-NEXT:    addsd %xmm2, %xmm0
@@ -132,9 +133,16 @@ define float @div_select_constant_fold_zero(i1 zeroext %arg) {
 
 define void @PR24141() {
 ; CHECK-LABEL: PR24141:
-; CHECK:	callq
-; CHECK-NEXT:	divsd
-; CHECK-NEXT:	jmp
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    # implicit-def: $xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB8_1: # %while.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    callq g
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    jmp .LBB8_1
 entry:
   br label %while.body
 
diff --git a/test/CodeGen/X86/fentry-insertion.ll b/test/CodeGen/X86/fentry-insertion.ll
index c5fb3b254b26168203584803105471f26b123bb7..56e32742c59766596d346795f5e6a192cb6da12a 100644
--- a/test/CodeGen/X86/fentry-insertion.ll
+++ b/test/CodeGen/X86/fentry-insertion.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -o - | FileCheck %s
+; RUN: llc %s -o - -verify-machineinstrs | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index ce36874456251ecd60106673b6af88216e86ab4c..35965a8b66e9dee55c9a4ae7e3cb95bfe9d308a3 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -8,7 +8,7 @@
 define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -24,7 +24,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x fl
 define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i32 4) #2
@@ -35,7 +35,7 @@ entry:
 define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
 ; CHECK-LABEL: test3:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -46,7 +46,7 @@ entry:
 define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 -1, i32 4) #2
@@ -105,7 +105,7 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
 define <8 x double> @test9(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
 ; CHECK-LABEL: test9:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4) #2
@@ -118,7 +118,7 @@ declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double
 define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
 ; CHECK-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
@@ -141,7 +141,7 @@ define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ;
 ; KNL-LABEL: test11:
 ; KNL:       # %bb.0: # %entry
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; KNL-NEXT:    vxorps %xmm3, %xmm2, %xmm3
 ; KNL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; KNL-NEXT:    kmovw %edi, %k1
@@ -160,13 +160,13 @@ define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 z
 ; SKX-LABEL: test11b:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test11b:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
@@ -180,14 +180,14 @@ define <8 x double> @test12(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
 ; SKX-LABEL: test12:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
 ; SKX-NEXT:    vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test12:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; KNL-NEXT:    vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2
 ; KNL-NEXT:    vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; KNL-NEXT:    retq
 entry:
@@ -297,13 +297,13 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
 ; SKX-LABEL: test17:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test17:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; KNL-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2
 ; KNL-NEXT:    retq
   %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
   %res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %sub.i, i32 4)
@@ -317,13 +317,13 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX-LABEL: test18:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test18:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
@@ -335,13 +335,13 @@ define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX-LABEL: test19:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test19:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
@@ -354,14 +354,14 @@ define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX-LABEL: test20:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
 ; SKX-NEXT:    vmovaps %xmm2, %xmm0
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test20:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT:    vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2
 ; KNL-NEXT:    vmovaps %xmm2, %xmm0
 ; KNL-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
index fbe282f01fff2a334ad6787cb7133f650756429d..d82fe58ec40f7275e1bd98bd9769ddc4d340c7b6 100644
--- a/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/fma-intrinsics-fast-isel.ll
@@ -160,7 +160,7 @@ entry:
 define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: test_mm_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorps %xmm3, %xmm0, %xmm4
 ; CHECK-NEXT:    vxorps %xmm3, %xmm2, %xmm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
@@ -175,7 +175,7 @@ entry:
 define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test_mm_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [-0,-0]
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm0, %xmm4
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm2, %xmm0
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0
@@ -342,7 +342,7 @@ entry:
 define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
 ; CHECK-LABEL: test_mm256_fnmsub_ps:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorps %ymm3, %ymm0, %ymm4
 ; CHECK-NEXT:    vxorps %ymm3, %ymm2, %ymm0
 ; CHECK-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
@@ -357,7 +357,7 @@ entry:
 define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
 ; CHECK-LABEL: test_mm256_fnmsub_pd:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [-0,-0,-0,-0]
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    vxorpd %ymm3, %ymm0, %ymm4
 ; CHECK-NEXT:    vxorpd %ymm3, %ymm2, %ymm0
 ; CHECK-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0
diff --git a/test/CodeGen/X86/fma-schedule.ll b/test/CodeGen/X86/fma-schedule.ll
index 819b9c7f27d5731402ac735e802c136513903dc6..6cdc615b231a86567ebb3fae9d9dcc85087c2fc7 100644
--- a/test/CodeGen/X86/fma-schedule.ll
+++ b/test/CodeGen/X86/fma-schedule.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
@@ -24,6 +25,18 @@ define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -113,6 +126,19 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd231pd {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -206,6 +232,18 @@ define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -295,6 +333,19 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmadd231ps {{.*#+}} ymm0 = (ymm1 * mem) + ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -388,6 +439,18 @@ define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132sd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132sd {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231sd {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -476,6 +539,18 @@ define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmadd231ss {{.*#+}} xmm0 = (xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -568,6 +643,18 @@ define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * xmm2) +/- xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} xmm0 = (xmm1 * xmm2) +/- xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} xmm0 = (xmm0 * mem) +/- xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} xmm0 = (xmm1 * mem) +/- xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -657,6 +744,19 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * ymm2) +/- ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132pd {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub231pd {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -750,6 +850,18 @@ define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * xmm2) +/- xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} xmm0 = (xmm1 * xmm2) +/- xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} xmm0 = (xmm0 * mem) +/- xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} xmm0 = (xmm1 * mem) +/- xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -839,6 +951,19 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmaddsubps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * ymm2) +/- ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * ymm2) +/- ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddsub132ps {{.*#+}} ymm0 = (ymm0 * mem) +/- ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddsub231ps {{.*#+}} ymm0 = (ymm1 * mem) +/- ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmaddsubps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -936,6 +1061,18 @@ define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * xmm2) -/+ xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} xmm0 = (xmm1 * xmm2) -/+ xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} xmm0 = (xmm0 * mem) -/+ xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} xmm0 = (xmm1 * mem) -/+ xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1025,6 +1162,19 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * ymm2) -/+ ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132pd {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd231pd {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1118,6 +1268,18 @@ define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * xmm2) -/+ xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} xmm0 = (xmm1 * xmm2) -/+ xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} xmm0 = (xmm0 * mem) -/+ xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) -/+ mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} xmm0 = (xmm1 * mem) -/+ xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1207,6 +1369,19 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubaddps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) -/+ ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * ymm2) -/+ ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsubadd132ps {{.*#+}} ymm0 = (ymm0 * mem) -/+ ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) -/+ mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsubadd231ps {{.*#+}} ymm0 = (ymm1 * mem) -/+ ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubaddps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1304,6 +1479,18 @@ define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1393,6 +1580,19 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub231pd {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1486,6 +1686,18 @@ define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1575,6 +1787,19 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132ps {{.*#+}} ymm0 = (ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfmsub231ps {{.*#+}} ymm0 = (ymm1 * mem) - ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1668,6 +1893,18 @@ define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132sd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231sd {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132sd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231sd {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1756,6 +1993,18 @@ define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfmsubss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfmsub132ss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfmsub132ss {{.*#+}} xmm0 = (xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfmsub231ss {{.*#+}} xmm0 = (xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfmsubss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1848,6 +2097,18 @@ define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -1937,6 +2198,19 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132pd {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd231pd {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2030,6 +2304,18 @@ define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2119,6 +2405,19 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * ymm2) + ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * mem) + ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmadd231ps {{.*#+}} ymm0 = -(ymm1 * mem) + ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2212,6 +2511,18 @@ define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132sd {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231sd {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132sd {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231sd {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2300,6 +2611,18 @@ define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmaddss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm2) + xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd231ss {{.*#+}} xmm0 = -(xmm1 * xmm2) + xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * mem) + xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmadd231ss {{.*#+}} xmm0 = -(xmm1 * mem) + xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmaddss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2392,6 +2715,18 @@ define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubpd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubpd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2481,6 +2816,19 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubpd_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132pd {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub231pd {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubpd_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2574,6 +2922,18 @@ define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubps_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubps_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2663,6 +3023,19 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubps_256:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * ymm2) - ymm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132ps {{.*#+}} ymm0 = -(ymm0 * mem) - ymm1 sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - mem sched: [10:1.00]
+; BDVER2-NEXT:    vfnmsub231ps {{.*#+}} ymm0 = -(ymm1 * mem) - ymm0 sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubps_256:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2756,6 +3129,18 @@ define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubsd_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132sd {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231sd {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132sd {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231sd {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubsd_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
@@ -2844,6 +3229,18 @@ define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_vfnmsubss_128:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    vfnmsub132ss {{.*#+}} xmm0 = -(xmm0 * xmm2) - xmm1 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub231ss {{.*#+}} xmm0 = -(xmm1 * xmm2) - xmm0 sched: [5:0.50]
+; BDVER2-NEXT:    vfnmsub132ss {{.*#+}} xmm0 = -(xmm0 * mem) - xmm1 sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - mem sched: [10:0.50]
+; BDVER2-NEXT:    vfnmsub231ss {{.*#+}} xmm0 = -(xmm1 * mem) - xmm0 sched: [10:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; HASWELL-LABEL: test_vfnmsubss_128:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    #APP
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 149053ce056450c55c415d1767bf86c79f6f260c..c894a9f3d40cc77c5163fce80e5083b245024c86 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -247,76 +247,6 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #
 ; FMA32-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v4f32:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x54]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x54]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v4f32:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
@@ -407,6 +337,76 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #
 ; AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v4f32:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x4c,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x54]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x54]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x10,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
   ret <4 x float> %call
@@ -419,165 +419,6 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #
 ; FMA32-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v8f32:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x13C
-; FMACALL32-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x78]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x78]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x13C
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v8f32:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
@@ -745,6 +586,165 @@ define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #
 ; AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v8f32:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $316, %esp ## encoding: [0x81,0xec,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x13C
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x5c,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x9c,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x90,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x84,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x84,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x90,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x9c,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xb4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x30,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x20,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    addl $316, %esp ## encoding: [0x81,0xc4,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x13C
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
   ret <8 x float> %call
@@ -765,321 +765,6 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; FMA32-NEXT:    popl %ebp ## encoding: [0x5d]
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v16f32:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    pushl %ebp ## encoding: [0x55]
-; FMACALL32-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
-; FMACALL32-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
-; FMACALL32-NEXT:    subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x1C0
-; FMACALL32-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60]
-; FMACALL32-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm3, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd8,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x50]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x50]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00]
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
-; FMACALL32-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
-; FMACALL32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]
-; FMACALL32-NEXT:    ## xmm2 = mem[0],zero,zero,zero
-; FMACALL32-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; FMACALL32-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; FMACALL32-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0,1,2],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
-; FMACALL32-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
-; FMACALL32-NEXT:    popl %ebp ## encoding: [0x5d]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v16f32:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213ps %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0xc4]
@@ -1378,6 +1063,321 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
 ; AVX512VL-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v16f32:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    pushl %ebp ## encoding: [0x55]
+; FMACALL32_BDVER2-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
+; FMACALL32_BDVER2-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
+; FMACALL32_BDVER2-NEXT:    subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x1C0
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x48,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x24,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x18,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x0c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xe8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $2, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x02]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $1, %xmm0, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x04,0x24,0x01]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x08]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfa,0x11,0x44,0x24,0x04]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03]
+; FMACALL32_BDVER2-NEXT:    vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xdc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x24]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xe8,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xf4,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x14]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x0c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x18,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x24,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x3c,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x48,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x54,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x34]
+; FMACALL32_BDVER2-NEXT:    calll _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x3c]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x38,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x14,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x34,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0,1],mem[0],xmm2[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x20,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x10,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x30,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x4c]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMACALL32_BDVER2-NEXT:    vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x48,0x10]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x44,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; FMACALL32_BDVER2-NEXT:    vinsertps $48, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x40,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0,1,2],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
+; FMACALL32_BDVER2-NEXT:    popl %ebp ## encoding: [0x5d]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
   ret <16 x float> %call
@@ -1390,41 +1390,6 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
 ; FMA32-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v2f64:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10]
-; FMACALL32-NEXT:    vmovlhps %xmm1, %xmm0, %xmm2 ## encoding: [0xc5,0xf8,0x16,0xd1]
-; FMACALL32-NEXT:    ## xmm2 = xmm0[0],xmm1[0]
-; FMACALL32-NEXT:    vmovups %xmm2, (%esp) ## encoding: [0xc5,0xf8,0x11,0x14,0x24]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
-; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58]
-; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v2f64:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
@@ -1477,6 +1442,41 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
 ; AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## xmm0 = (xmm1 * xmm0) + xmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v2f64:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $108, %esp ## encoding: [0x83,0xec,0x6c]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc1]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm1[0]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x54,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    addl $108, %esp ## encoding: [0x83,0xc4,0x6c]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
   ret <2 x double> %call
@@ -1489,90 +1489,6 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %
 ; FMA32-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v4f64:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70]
-; FMACALL32-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60]
-; FMACALL32-NEXT:    vmovlhps %xmm2, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc2]
-; FMACALL32-NEXT:    ## xmm0 = xmm1[0],xmm2[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x44]
-; FMACALL32-NEXT:    vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x38]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68]
-; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x38]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x44]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v4f64:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
@@ -1664,6 +1580,90 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %
 ; AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## ymm0 = (ymm1 * ymm0) + ymm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v4f64:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    subl $252, %esp ## encoding: [0x81,0xec,0xfc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    vmovupd {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfd,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x10,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x38]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x44]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x28]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x18]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    addl $252, %esp ## encoding: [0x81,0xc4,0xfc,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
   ret <4 x double> %call
@@ -1684,179 +1684,6 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMA32-NEXT:    popl %ebp ## encoding: [0x5d]
 ; FMA32-NEXT:    retl ## encoding: [0xc3]
 ;
-; FMACALL32-LABEL: test_v8f64:
-; FMACALL32:       ## %bb.0: ## %entry
-; FMACALL32-NEXT:    pushl %ebp ## encoding: [0x55]
-; FMACALL32-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
-; FMACALL32-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
-; FMACALL32-NEXT:    subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## imm = 0x180
-; FMACALL32-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
-; FMACALL32-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x30]
-; FMACALL32-NEXT:    vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x20]
-; FMACALL32-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
-; FMACALL32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
-; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
-; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x20]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x30]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78]
-; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00]
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70]
-; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60]
-; FMACALL32-NEXT:    ## xmm0 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[0],mem[0]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm1 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78]
-; FMACALL32-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70]
-; FMACALL32-NEXT:    ## xmm2 = mem[0],zero
-; FMACALL32-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68]
-; FMACALL32-NEXT:    ## xmm2 = xmm2[0],mem[0]
-; FMACALL32-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
-; FMACALL32-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
-; FMACALL32-NEXT:    popl %ebp ## encoding: [0x5d]
-; FMACALL32-NEXT:    retl ## encoding: [0xc3]
-;
 ; FMA64-LABEL: test_v8f64:
 ; FMA64:       ## %bb.0: ## %entry
 ; FMA64-NEXT:    vfmadd213pd %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0xc4]
@@ -2011,6 +1838,179 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; AVX512VL-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
 ; AVX512VL-NEXT:    ## zmm0 = (zmm1 * zmm0) + zmm2
 ; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL32_BDVER2-LABEL: test_v8f64:
+; FMACALL32_BDVER2:       ## %bb.0: ## %entry
+; FMACALL32_BDVER2-NEXT:    pushl %ebp ## encoding: [0x55]
+; FMACALL32_BDVER2-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
+; FMACALL32_BDVER2-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
+; FMACALL32_BDVER2-NEXT:    subl $384, %esp ## encoding: [0x81,0xec,0x80,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## imm = 0x180
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x9c,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x50,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],xmm2[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd 40(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 40(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x29,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x15,0xc1]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],xmm1[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x30,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd 8(%ebp), %ymm0 ## encoding: [0xc5,0xfd,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovaps 8(%ebp), %ymm0 ## encoding: [0xc5,0xfc,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x14,0x84,0x24,0x00,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    vmovapd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL32_BDVER2-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x20]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x58]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0x6c,0x24,0x30]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xa0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0xc0,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x9c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x88,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
+; FMACALL32_BDVER2-NEXT:    ## encoding: [0xdb,0xac,0x24,0x94,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL32_BDVER2-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
+; FMACALL32_BDVER2-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x16,0x44,0x24,0x58]
+; FMACALL32_BDVER2-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x48]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x16,0x54,0x24,0x68]
+; FMACALL32_BDVER2-NEXT:    ## xmm2 = xmm2[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = mem[0],zero
+; FMACALL32_BDVER2-NEXT:    vmovhpd {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x16,0x4c,0x24,0x78]
+; FMACALL32_BDVER2-NEXT:    ## xmm1 = xmm1[0],mem[0]
+; FMACALL32_BDVER2-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01]
+; FMACALL32_BDVER2-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
+; FMACALL32_BDVER2-NEXT:    popl %ebp ## encoding: [0x5d]
+; FMACALL32_BDVER2-NEXT:    retl ## encoding: [0xc3]
 entry:
   %call = call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c)
   ret <8 x double> %call
diff --git a/test/CodeGen/X86/fma4-schedule.ll b/test/CodeGen/X86/fma4-schedule.ll
index 65d5273bec7d861e5775fffbe533f33f2e7aeab7..c8b5debd3fbb61c54c97165a21a43c2471607939 100644
--- a/test/CodeGen/X86/fma4-schedule.ll
+++ b/test/CodeGen/X86/fma4-schedule.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefixes=CHECK,GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1              | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER12,BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER34,BDVER3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 -mattr=-fma  | FileCheck %s --check-prefixes=CHECK,BDVER,BDVER34,BDVER4
 
 ;
 ; VFMADD
@@ -19,14 +19,23 @@ define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddpd $2, $1, $0, $0 \0A\09 vfmaddpd $3, $1, $0, $0 \0A\09 vfmaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -42,15 +51,25 @@ define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddpd $2, $1, $0, $0 \0A\09 vfmaddpd $3, $1, $0, $0 \0A\09 vfmaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -65,14 +84,23 @@ define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddps $2, $1, $0, $0 \0A\09 vfmaddps $3, $1, $0, $0 \0A\09 vfmaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -88,15 +116,25 @@ define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddps $2, $1, $0, $0 \0A\09 vfmaddps $3, $1, $0, $0 \0A\09 vfmaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -111,14 +149,23 @@ define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsd $2, $1, $0, $0 \0A\09 vfmaddsd $3, $1, $0, $0 \0A\09 vfmaddsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -133,14 +180,23 @@ define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddss $2, $1, $0, $0 \0A\09 vfmaddss $3, $1, $0, $0 \0A\09 vfmaddss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -159,14 +215,23 @@ define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubpd $2, $1, $0, $0 \0A\09 vfmaddsubpd $3, $1, $0, $0 \0A\09 vfmaddsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -182,15 +247,25 @@ define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubpd $2, $1, $0, $0 \0A\09 vfmaddsubpd $3, $1, $0, $0 \0A\09 vfmaddsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -205,14 +280,23 @@ define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubps $2, $1, $0, $0 \0A\09 vfmaddsubps $3, $1, $0, $0 \0A\09 vfmaddsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -228,15 +312,25 @@ define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmaddsubps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmaddsubps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmaddsubps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmaddsubps $2, $1, $0, $0 \0A\09 vfmaddsubps $3, $1, $0, $0 \0A\09 vfmaddsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -255,14 +349,23 @@ define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddpd $2, $1, $0, $0 \0A\09 vfmsubaddpd $3, $1, $0, $0 \0A\09 vfmsubaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -278,15 +381,25 @@ define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x doubl
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddpd $2, $1, $0, $0 \0A\09 vfmsubaddpd $3, $1, $0, $0 \0A\09 vfmsubaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -301,14 +414,23 @@ define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddps $2, $1, $0, $0 \0A\09 vfmsubaddps $3, $1, $0, $0 \0A\09 vfmsubaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -324,15 +446,25 @@ define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubaddps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubaddps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubaddps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubaddps $2, $1, $0, $0 \0A\09 vfmsubaddps $3, $1, $0, $0 \0A\09 vfmsubaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -351,14 +483,23 @@ define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubpd $2, $1, $0, $0 \0A\09 vfmsubpd $3, $1, $0, $0 \0A\09 vfmsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -374,15 +515,25 @@ define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubpd $2, $1, $0, $0 \0A\09 vfmsubpd $3, $1, $0, $0 \0A\09 vfmsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -397,14 +548,23 @@ define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubps $2, $1, $0, $0 \0A\09 vfmsubps $3, $1, $0, $0 \0A\09 vfmsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -420,15 +580,25 @@ define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfmsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubps $2, $1, $0, $0 \0A\09 vfmsubps $3, $1, $0, $0 \0A\09 vfmsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -443,14 +613,23 @@ define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubsd $2, $1, $0, $0 \0A\09 vfmsubsd $3, $1, $0, $0 \0A\09 vfmsubsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -465,14 +644,23 @@ define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfmsubss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfmsubss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfmsubss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfmsubss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfmsubss $2, $1, $0, $0 \0A\09 vfmsubss $3, $1, $0, $0 \0A\09 vfmsubss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -491,14 +679,23 @@ define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddpd $2, $1, $0, $0 \0A\09 vfnmaddpd $3, $1, $0, $0 \0A\09 vfnmaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -514,15 +711,25 @@ define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddpd $2, $1, $0, $0 \0A\09 vfnmaddpd $3, $1, $0, $0 \0A\09 vfnmaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -537,14 +744,23 @@ define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddps $2, $1, $0, $0 \0A\09 vfnmaddps $3, $1, $0, $0 \0A\09 vfnmaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -560,15 +776,25 @@ define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddps $2, $1, $0, $0 \0A\09 vfnmaddps $3, $1, $0, $0 \0A\09 vfnmaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -583,14 +809,23 @@ define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddsd $2, $1, $0, $0 \0A\09 vfnmaddsd $3, $1, $0, $0 \0A\09 vfnmaddsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -605,14 +840,23 @@ define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmaddss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmaddss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmaddss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmaddss $2, $1, $0, $0 \0A\09 vfnmaddss $3, $1, $0, $0 \0A\09 vfnmaddss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -631,14 +875,23 @@ define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubpd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubpd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubpd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubpd $2, $1, $0, $0 \0A\09 vfnmsubpd $3, $1, $0, $0 \0A\09 vfnmsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -654,15 +907,25 @@ define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double>
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubpd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubpd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubpd_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubpd $2, $1, $0, $0 \0A\09 vfnmsubpd $3, $1, $0, $0 \0A\09 vfnmsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
   ret void
 }
@@ -677,14 +940,23 @@ define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubps_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubps $2, $1, $0, $0 \0A\09 vfnmsubps $3, $1, $0, $0 \0A\09 vfnmsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
@@ -700,15 +972,25 @@ define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubps_256:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER34-NEXT:    vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    vzeroupper
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubps $2, $1, $0, $0 \0A\09 vfnmsubps $3, $1, $0, $0 \0A\09 vfnmsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
   ret void
 }
@@ -723,14 +1005,23 @@ define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubsd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubsd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubsd_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubsd $2, $1, $0, $0 \0A\09 vfnmsubsd $3, $1, $0, $0 \0A\09 vfnmsubsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
   ret void
 }
@@ -745,14 +1036,23 @@ define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfnmsubss_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfnmsubss_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER12-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER34-LABEL: test_vfnmsubss_128:
+; BDVER34:       # %bb.0:
+; BDVER34-NEXT:    #APP
+; BDVER34-NEXT:    vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER34-NEXT:    vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER34-NEXT:    #NO_APP
+; BDVER34-NEXT:    retq
   tail call void asm "vfnmsubss $2, $1, $0, $0 \0A\09 vfnmsubss $3, $1, $0, $0 \0A\09 vfnmsubss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
   ret void
 }
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 2d01c570f99341736ba0ef2eb892a280238fd2ce..e59d6b66bc60a2397cb8bfcf70e4edf60ff292d4 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -636,6 +636,44 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
+;
+; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
+;
+; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; AVX512-NOINFS-NEXT:    retq
+  %a = fadd <4 x float> %x, <float 1.0, float undef, float 1.0, float undef>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
 ; FMA-INFS:       # %bb.0:
@@ -712,24 +750,62 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
+;
+; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
+;
+; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; AVX512-NOINFS-NEXT:    retq
+  %a = fadd <4 x float> %x, <float undef, float -1.0, float undef, float -1.0>
+  %m = fmul <4 x float> %y, %a
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -756,21 +832,21 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
 define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -794,24 +870,65 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
+;
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0>
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
+;
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
+; AVX512-NOINFS-NEXT:    retq
+  %s = fsub <4 x float> <float 1.0, float undef, float 1.0, float 1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -838,21 +955,21 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y
 define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-INFS-NEXT:    retq
 ;
 ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA4-INFS-NEXT:    retq
 ;
 ; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-INFS-NEXT:    retq
@@ -876,6 +993,47 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
+; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
+;
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0>
+; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
+;
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
+; AVX512-NOINFS-NEXT:    retq
+  %s = fsub <4 x float> <float -1.0, float -1.0, float undef, float -1.0>, %x
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
 ; FMA-INFS:       # %bb.0:
@@ -952,6 +1110,44 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
+;
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
+;
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
+; AVX512-NOINFS-NEXT:    retq
+  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float undef>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
 ; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
 ; FMA-INFS:       # %bb.0:
@@ -1028,6 +1224,44 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y
   ret <4 x float> %m
 }
 
+define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
+; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA-INFS:       # %bb.0:
+; FMA-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-INFS-NEXT:    retq
+;
+; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA4-INFS:       # %bb.0:
+; FMA4-INFS-NEXT:    vsubps {{.*}}(%rip), %xmm0, %xmm0
+; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA4-INFS-NEXT:    retq
+;
+; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; AVX512-INFS:       # %bb.0:
+; AVX512-INFS-NEXT:    vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX512-INFS-NEXT:    retq
+;
+; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA-NOINFS:       # %bb.0:
+; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; FMA-NOINFS-NEXT:    retq
+;
+; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; FMA4-NOINFS:       # %bb.0:
+; FMA4-NOINFS-NEXT:    vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
+; FMA4-NOINFS-NEXT:    retq
+;
+; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
+; AVX512-NOINFS:       # %bb.0:
+; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
+; AVX512-NOINFS-NEXT:    retq
+  %s = fsub <4 x float> %x, <float undef, float -1.0, float -1.0, float -1.0>
+  %m = fmul <4 x float> %y, %s
+  ret <4 x float> %m
+}
+
 ;
 ; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
 ;
@@ -1084,7 +1318,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
 define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
 ; FMA-INFS-LABEL: test_v4f32_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
 ; FMA-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1092,7 +1326,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
 ;
 ; FMA4-INFS-LABEL: test_v4f32_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
 ; FMA4-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; FMA4-INFS-NEXT:    vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
@@ -1100,7 +1334,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
 ;
 ; AVX512-INFS-LABEL: test_v4f32_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
 ; AVX512-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1133,7 +1367,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
 define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
 ; FMA-INFS-LABEL: test_v8f32_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
 ; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1141,7 +1375,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
 ;
 ; FMA4-INFS-LABEL: test_v8f32_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
 ; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
@@ -1149,7 +1383,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
 ;
 ; AVX512-INFS-LABEL: test_v8f32_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
 ; AVX512-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
 ; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1231,7 +1465,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
 define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
 ; FMA-INFS-LABEL: test_v2f64_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
 ; FMA-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
 ; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1239,7 +1473,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
 ;
 ; FMA4-INFS-LABEL: test_v2f64_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
 ; FMA4-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
 ; FMA4-INFS-NEXT:    vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
@@ -1247,7 +1481,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
 ;
 ; AVX512-INFS-LABEL: test_v2f64_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1,1]
+; AVX512-INFS-NEXT:    vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
 ; AVX512-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
 ; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
@@ -1280,7 +1514,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
 define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
 ; FMA-INFS-LABEL: test_v4f64_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
 ; FMA-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
@@ -1288,7 +1522,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
 ;
 ; FMA4-INFS-LABEL: test_v4f64_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
 ; FMA4-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
@@ -1296,7 +1530,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
 ;
 ; AVX512-INFS-LABEL: test_v4f64_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
 ; AVX512-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
index 2bd64135712ebdd5ed26c8dead17b9946cf691f3..bef31d8a8cc8a53bb376c6f3cacb841c4298b04f 100644
--- a/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -259,7 +259,7 @@ define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, <
 define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -268,7 +268,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -305,7 +305,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
 define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -314,7 +314,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -351,7 +351,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
 define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -360,7 +360,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -397,7 +397,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
 define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -406,7 +406,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -443,7 +443,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
 define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -452,7 +452,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -461,7 +461,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
 ;
 ; AVX512-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -490,7 +490,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
 define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -499,7 +499,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -508,7 +508,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
 ;
 ; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -537,7 +537,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
 define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -546,7 +546,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -555,7 +555,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
 ;
 ; AVX512-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -584,7 +584,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
 define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -593,7 +593,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -602,7 +602,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
 ;
 ; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
 ; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
 ; AVX512-INFS-NEXT:    retq
@@ -631,7 +631,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
 define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -640,7 +640,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -677,7 +677,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
 define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -686,7 +686,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -723,7 +723,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
 define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
 ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -732,7 +732,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
 ;
 ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1,-1,-1,-1,-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
@@ -769,7 +769,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
 define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
 ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -778,7 +778,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
 ;
 ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1,-1,-1,-1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm1, %ymm1
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm0, %ymm0
 ; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
@@ -819,7 +819,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
 define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
 ; FMA-INFS-LABEL: test_v16f32_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1]
+; FMA-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
 ; FMA-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
 ; FMA-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
@@ -830,7 +830,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
 ;
 ; FMA4-INFS-LABEL: test_v16f32_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1]
+; FMA4-INFS-NEXT:    vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
 ; FMA4-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
 ; FMA4-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
@@ -841,7 +841,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
 ;
 ; AVX512-INFS-LABEL: test_v16f32_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubps %zmm2, %zmm3, %zmm3
 ; AVX512-INFS-NEXT:    vmulps %zmm3, %zmm1, %zmm1
 ; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1
@@ -878,7 +878,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
 define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
 ; FMA-INFS-LABEL: test_v8f64_interp:
 ; FMA-INFS:       # %bb.0:
-; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1,1,1,1]
+; FMA-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
 ; FMA-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
 ; FMA-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
@@ -889,7 +889,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
 ;
 ; FMA4-INFS-LABEL: test_v8f64_interp:
 ; FMA4-INFS:       # %bb.0:
-; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1,1,1,1]
+; FMA4-INFS-NEXT:    vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
 ; FMA4-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
 ; FMA4-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
@@ -900,7 +900,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
 ;
 ; AVX512-INFS-LABEL: test_v8f64_interp:
 ; AVX512-INFS:       # %bb.0:
-; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
+; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-INFS-NEXT:    vsubpd %zmm2, %zmm3, %zmm3
 ; AVX512-INFS-NEXT:    vmulpd %zmm3, %zmm1, %zmm1
 ; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1
@@ -1143,7 +1143,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
 ; FMA:       # %bb.0:
 ; FMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
-; FMA-NEXT:    vmovapd {{.*#+}} ymm2 = [-0,-0,-0,-0]
+; FMA-NEXT:    vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; FMA-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
 ; FMA-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
 ; FMA-NEXT:    retq
@@ -1152,7 +1152,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
 ; FMA4:       # %bb.0:
 ; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
 ; FMA4-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmovapd {{.*#+}} ymm2 = [-0,-0,-0,-0]
+; FMA4-NEXT:    vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; FMA4-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
 ; FMA4-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
 ; FMA4-NEXT:    retq
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
index 85f86110d05928ef5c187512f9d6c2bc97a4860d..f9843dced1b4216aadb9f6c1149249a4df00cb01 100644
--- a/test/CodeGen/X86/fmul-combines.ll
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -61,7 +61,7 @@ define <4 x float> @fmul2_v4f32_undef(<4 x float> %x) {
 define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
 ; CHECK-LABEL: constant_fold_fmul_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,8,8,8]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8.0E+0,8.0E+0,8.0E+0,8.0E+0]
 ; CHECK-NEXT:    retq
   %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0>
   ret <4 x float> %y
@@ -70,7 +70,7 @@ define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) {
 define <4 x float> @constant_fold_fmul_v4f32_undef(<4 x float> %x) {
 ; CHECK-LABEL: constant_fold_fmul_v4f32_undef:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,NaN,8,NaN]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8.0E+0,NaN,8.0E+0,NaN]
 ; CHECK-NEXT:    retq
   %y = fmul <4 x float> <float 4.0, float undef, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float undef>
   ret <4 x float> %y
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index 5523846dd1921f60807331bcccf59c78f3ece766..115f2bf7a5b961b137820112fd03ca02895ee911 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -16,8 +16,8 @@ define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-NEXT:    callq ext
 ; CHECK-NEXT:    addq $24, %rsp
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/fold-sext-trunc.ll b/test/CodeGen/X86/fold-sext-trunc.ll
index 7cab8ebe537f24fa1cdc754e107ec64e8af3685f..475fbea4285c3ec3a3d9eab654ecf7a881819204 100644
--- a/test/CodeGen/X86/fold-sext-trunc.ll
+++ b/test/CodeGen/X86/fold-sext-trunc.ll
@@ -13,7 +13,7 @@ define void @int322(i32 %foo) !dbg !5 {
 entry:
   %val = load i64, i64* getelementptr (%0, %0* bitcast (%struct.S1* @g_10 to %0*), i32 0, i32 0), !dbg !16
   %0 = load i32, i32* getelementptr inbounds (%struct.S1, %struct.S1* @g_10, i32 0, i32 1), align 4, !dbg !17
-; MIR: renamable $rax = MOVSX64rm32 {{.*}}, @g_10 + 4,{{.*}} debug-location !17 :: (dereferenceable load 4 from `i64* getelementptr (%0, %0* bitcast (%struct.S1* @g_10 to %0*), i32 0, i32 0)` + 4)
+; MIR: renamable {{\$r[a-z]+}} = MOVSX64rm32 {{.*}}, @g_10 + 4,{{.*}} debug-location !17 :: (dereferenceable load 4 from `i64* getelementptr (%0, %0* bitcast (%struct.S1* @g_10 to %0*), i32 0, i32 0)` + 4)
   %1 = sext i32 %0 to i64, !dbg !18
   %tmp4.i = lshr i64 %val, 32, !dbg !19
   %tmp5.i = trunc i64 %tmp4.i to i32, !dbg !20
diff --git a/test/CodeGen/X86/fold-vector-trunc-sitofp.ll b/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
index e53e7f8f9c651783de44658be93be78b17c8bc37..73c7dc1fae56f08be627b8c3925c6db51722ce65 100644
--- a/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
+++ b/test/CodeGen/X86/fold-vector-trunc-sitofp.ll
@@ -7,7 +7,7 @@
 define <4 x float> @test1() {
 ; CHECK-LABEL: test1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,0,-1,0]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = trunc <4 x i3> <i3 -1, i3 -22, i3 7, i3 8> to <4 x i1>
   %2 = sitofp <4 x i1> %1 to <4 x float>
diff --git a/test/CodeGen/X86/fold-vex.ll b/test/CodeGen/X86/fold-vex.ll
index 006db6effdf631904a7b2953fed78cc8b9baaa8c..c7b376a053d4ac5a75e8b89d5bd1777bd8e94a4b 100644
--- a/test/CodeGen/X86/fold-vex.ll
+++ b/test/CodeGen/X86/fold-vex.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Use CPU parameters to ensure that a CPU-specific attribute is not overriding the AVX definition.
 
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  -mattr=+avx | FileCheck %s
@@ -14,18 +15,20 @@
 ; unless specially configured on some CPUs such as AMD Family 10H.
 
 define <4 x i32> @test1(<4 x i32>* %p0, <4 x i32> %in1) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vandps (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+;
+; SSE-LABEL: test1:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm1
+; SSE-NEXT:    andps %xmm1, %xmm0
+; SSE-NEXT:    retq
   %in0 = load <4 x i32>, <4 x i32>* %p0, align 2
   %a = and <4 x i32> %in0, %in1
   ret <4 x i32> %a
 
-; CHECK-LABEL: @test1
-; CHECK-NOT:   vmovups
-; CHECK:       vandps (%rdi), %xmm0, %xmm0
-; CHECK-NEXT:  ret
 
-; SSE-LABEL: @test1
-; SSE:       movups (%rdi), %xmm1
-; SSE-NEXT:  andps %xmm1, %xmm0
-; SSE-NEXT:  ret
 }
 
diff --git a/test/CodeGen/X86/fp128-compare.ll b/test/CodeGen/X86/fp128-compare.ll
index 7ee2e90657c06e174e0c5d88139e271aaee88262..6f2b0c514a87435604662457f63f2f7ad3551155 100644
--- a/test/CodeGen/X86/fp128-compare.ll
+++ b/test/CodeGen/X86/fp128-compare.ll
@@ -1,103 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
 ; RUN:     -enable-legalize-types-checking | FileCheck %s
 ; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \
 ; RUN:     -enable-legalize-types-checking | FileCheck %s
 
 define i32 @TestComp128GT(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128GT:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __gttf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setg %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp ogt fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128GT:
-; CHECK:       callq __gttf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       setg  %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128GE(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128GE:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __getf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp oge fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128GE:
-; CHECK:       callq __getf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       setns %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128LT(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128LT:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __lttf2
+; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp olt fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128LT:
-; CHECK:       callq __lttf2
-; CHECK-NEXT:  shrl $31, %eax
-; CHECK:       retq
-;
 ; The 'shrl' is a special optimization in llvm to combine
 ; the effect of 'fcmp olt' and 'zext'. The main purpose is
 ; to test soften call to __lttf2.
 }
 
 define i32 @TestComp128LE(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128LE:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __letf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setle %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp ole fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128LE:
-; CHECK:       callq __letf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       setle %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128EQ(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128EQ:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __eqtf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    sete %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp oeq fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128EQ:
-; CHECK:       callq __eqtf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       sete  %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define i32 @TestComp128NE(fp128 %d1, fp128 %d2) {
+; CHECK-LABEL: TestComp128NE:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __netf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp une fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: TestComp128NE:
-; CHECK:       callq __netf2
-; CHECK:       xorl  %ecx, %ecx
-; CHECK:       testl %eax, %eax
-; CHECK:       setne %cl
-; CHECK:       movl  %ecx, %eax
-; CHECK:       retq
 }
 
 define fp128 @TestMax(fp128 %x, fp128 %y) {
+; CHECK-LABEL: TestMax:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq __gttf2
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jg .LBB6_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:  .LBB6_2: # %entry
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cmp = fcmp ogt fp128 %x, %y
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
-; CHECK-LABEL: TestMax:
-; CHECK: movaps %xmm0
-; CHECK: movaps %xmm1
-; CHECK: callq __gttf2
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: testl %eax, %eax
-; CHECK: movaps {{.*}}, %xmm0
-; CHECK: retq
 }
diff --git a/test/CodeGen/X86/ftrunc.ll b/test/CodeGen/X86/ftrunc.ll
index 01112f48bf475726ff6a4e4f5a62debac65e0d1a..26cea1d71f3112a1532e7c63fcdb88c28cda04d8 100644
--- a/test/CodeGen/X86/ftrunc.ll
+++ b/test/CodeGen/X86/ftrunc.ll
@@ -63,25 +63,19 @@ define double @trunc_unsigned_f64(double %x) #0 {
 define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v4f32:
 ; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE2-NEXT:    movaps %xmm0, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE2-NEXT:    cvttss2si %xmm1, %rax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movaps %xmm0, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE2-NEXT:    cvttss2si %xmm2, %rax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    cvttss2si %xmm0, %rax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    cvttss2si %xmm0, %rax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    cmpltps %xmm2, %xmm1
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE2-NEXT:    subps %xmm2, %xmm0
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE2-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    andps %xmm1, %xmm3
+; SSE2-NEXT:    andnps %xmm0, %xmm1
+; SSE2-NEXT:    orps %xmm3, %xmm1
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    orps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    addps {{.*}}(%rip), %xmm1
@@ -106,39 +100,34 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 {
 define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rax
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rcx, %rax
-; SSE2-NEXT:    cvttsd2si %xmm1, %rdx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm1
-; SSE2-NEXT:    cmovaeq %rax, %rdx
 ; SSE2-NEXT:    movapd %xmm0, %xmm1
 ; SSE2-NEXT:    subsd %xmm2, %xmm1
 ; SSE2-NEXT:    cvttsd2si %xmm1, %rax
+; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE2-NEXT:    xorq %rcx, %rax
+; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
+; SSE2-NEXT:    ucomisd %xmm2, %xmm0
+; SSE2-NEXT:    cmovaeq %rax, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movapd %xmm0, %xmm3
+; SSE2-NEXT:    subsd %xmm2, %xmm3
+; SSE2-NEXT:    cvttsd2si %xmm3, %rax
 ; SSE2-NEXT:    xorq %rcx, %rax
 ; SSE2-NEXT:    cvttsd2si %xmm0, %rcx
 ; SSE2-NEXT:    ucomisd %xmm2, %xmm0
 ; SSE2-NEXT:    cmovaeq %rax, %rcx
-; SSE2-NEXT:    movq %rcx, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
+; SSE2-NEXT:    movq %rcx, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    addpd %xmm0, %xmm1
 ; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movq %rdx, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_unsigned_v2f64:
@@ -158,68 +147,62 @@ define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
 define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 {
 ; SSE2-LABEL: trunc_unsigned_v4f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    movapd %xmm3, %xmm4
-; SSE2-NEXT:    subsd %xmm2, %xmm4
-; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE2-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
-; SSE2-NEXT:    xorq %rdx, %rcx
-; SSE2-NEXT:    cvttsd2si %xmm3, %rax
-; SSE2-NEXT:    ucomisd %xmm2, %xmm3
-; SSE2-NEXT:    cmovaeq %rcx, %rax
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rsi
-; SSE2-NEXT:    xorq %rdx, %rsi
+; SSE2-NEXT:    movapd %xmm1, %xmm2
+; SSE2-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE2-NEXT:    subsd %xmm3, %xmm1
 ; SSE2-NEXT:    cvttsd2si %xmm1, %rcx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm1
-; SSE2-NEXT:    cmovaeq %rsi, %rcx
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    movapd %xmm1, %xmm3
-; SSE2-NEXT:    subsd %xmm2, %xmm3
-; SSE2-NEXT:    cvttsd2si %xmm3, %rsi
-; SSE2-NEXT:    xorq %rdx, %rsi
-; SSE2-NEXT:    cvttsd2si %xmm1, %rdi
-; SSE2-NEXT:    ucomisd %xmm2, %xmm1
-; SSE2-NEXT:    cmovaeq %rsi, %rdi
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    subsd %xmm2, %xmm1
-; SSE2-NEXT:    cvttsd2si %xmm1, %rsi
-; SSE2-NEXT:    xorq %rdx, %rsi
-; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
-; SSE2-NEXT:    ucomisd %xmm2, %xmm0
-; SSE2-NEXT:    cmovaeq %rsi, %rdx
+; SSE2-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm2, %rdx
+; SSE2-NEXT:    ucomisd %xmm3, %xmm2
+; SSE2-NEXT:    cmovaeq %rcx, %rdx
 ; SSE2-NEXT:    movq %rdx, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movq %rdi, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm4
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm4
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT:    movq %rcx, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm1
-; SSE2-NEXT:    movq %rax, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm3, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
+; SSE2-NEXT:    movapd %xmm2, %xmm4
+; SSE2-NEXT:    subsd %xmm3, %xmm4
+; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm2, %rdx
+; SSE2-NEXT:    ucomisd %xmm3, %xmm2
+; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm2
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movapd %xmm0, %xmm2
+; SSE2-NEXT:    subsd %xmm3, %xmm2
+; SSE2-NEXT:    cvttsd2si %xmm2, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm0, %rdx
+; SSE2-NEXT:    ucomisd %xmm3, %xmm0
+; SSE2-NEXT:    cmovaeq %rcx, %rdx
+; SSE2-NEXT:    movq %rdx, %xmm2
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movapd %xmm0, %xmm4
+; SSE2-NEXT:    subsd %xmm3, %xmm4
+; SSE2-NEXT:    cvttsd2si %xmm4, %rcx
+; SSE2-NEXT:    xorq %rax, %rcx
+; SSE2-NEXT:    cvttsd2si %xmm0, %rax
+; SSE2-NEXT:    ucomisd %xmm3, %xmm0
+; SSE2-NEXT:    cmovaeq %rcx, %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm2
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm2
+; SSE2-NEXT:    addpd %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
+; SSE2-NEXT:    addpd %xmm0, %xmm1
+; SSE2-NEXT:    movapd %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_unsigned_v4f64:
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index ca8fd2acfa3bc39684e8e39af8817bf9905953c9..6468523b3c4fea6ecbbde0749379c2b90b86cd8c 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -149,11 +149,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN-SSE2-NEXT:    andl %ecx, %edx
 ; LIN-SSE2-NEXT:    andl %ecx, %esi
 ; LIN-SSE2-NEXT:    andl %ecx, %edi
-; LIN-SSE2-NEXT:    movd %eax, %xmm0
-; LIN-SSE2-NEXT:    movd %edx, %xmm1
+; LIN-SSE2-NEXT:    movq %rax, %xmm0
+; LIN-SSE2-NEXT:    movq %rdx, %xmm1
 ; LIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN-SSE2-NEXT:    movd %edi, %xmm2
-; LIN-SSE2-NEXT:    movd %esi, %xmm1
+; LIN-SSE2-NEXT:    movq %rdi, %xmm2
+; LIN-SSE2-NEXT:    movq %rsi, %xmm1
 ; LIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; LIN-SSE2-NEXT:    retq
 ;
@@ -169,11 +169,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; LIN-SSE4-NEXT:    andl %ecx, %edx
 ; LIN-SSE4-NEXT:    andl %ecx, %esi
 ; LIN-SSE4-NEXT:    andl %ecx, %edi
-; LIN-SSE4-NEXT:    movd %edx, %xmm1
-; LIN-SSE4-NEXT:    movd %eax, %xmm0
+; LIN-SSE4-NEXT:    movq %rdx, %xmm1
+; LIN-SSE4-NEXT:    movq %rax, %xmm0
 ; LIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN-SSE4-NEXT:    movd %edi, %xmm2
-; LIN-SSE4-NEXT:    movd %esi, %xmm1
+; LIN-SSE4-NEXT:    movq %rdi, %xmm2
+; LIN-SSE4-NEXT:    movq %rsi, %xmm1
 ; LIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; LIN-SSE4-NEXT:    retq
 ;
@@ -192,11 +192,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; WIN-SSE2-NEXT:    andl %r9d, %ecx
 ; WIN-SSE2-NEXT:    andl %r9d, %r8d
 ; WIN-SSE2-NEXT:    andl %r9d, %edx
-; WIN-SSE2-NEXT:    movd %eax, %xmm0
-; WIN-SSE2-NEXT:    movd %ecx, %xmm1
+; WIN-SSE2-NEXT:    movq %rax, %xmm0
+; WIN-SSE2-NEXT:    movq %rcx, %xmm1
 ; WIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE2-NEXT:    movd %edx, %xmm2
-; WIN-SSE2-NEXT:    movd %r8d, %xmm1
+; WIN-SSE2-NEXT:    movq %rdx, %xmm2
+; WIN-SSE2-NEXT:    movq %r8, %xmm1
 ; WIN-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; WIN-SSE2-NEXT:    retq
 ;
@@ -212,11 +212,11 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
 ; WIN-SSE4-NEXT:    andl %r9d, %ecx
 ; WIN-SSE4-NEXT:    andl %r9d, %r8d
 ; WIN-SSE4-NEXT:    andl %r9d, %edx
-; WIN-SSE4-NEXT:    movd %ecx, %xmm1
-; WIN-SSE4-NEXT:    movd %eax, %xmm0
+; WIN-SSE4-NEXT:    movq %rcx, %xmm1
+; WIN-SSE4-NEXT:    movq %rax, %xmm0
 ; WIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE4-NEXT:    movd %edx, %xmm2
-; WIN-SSE4-NEXT:    movd %r8d, %xmm1
+; WIN-SSE4-NEXT:    movq %rdx, %xmm2
+; WIN-SSE4-NEXT:    movq %r8, %xmm1
 ; WIN-SSE4-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; WIN-SSE4-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/haddsub-shuf.ll b/test/CodeGen/X86/haddsub-shuf.ll
index ac5d5a70e30bf13eef0be81336c72e913b6b908d..0ece3fe1414d5a5bde050164b83784f8c3ac6a96 100644
--- a/test/CodeGen/X86/haddsub-shuf.ll
+++ b/test/CodeGen/X86/haddsub-shuf.ll
@@ -1,21 +1,54 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST
 
 ; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
 ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
 
 define <4 x float> @hadd_v4f32(<4 x float> %a) {
-; SSSE3-LABEL: hadd_v4f32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v4f32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    addps %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v4f32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v4f32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v4f32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v4f32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v4f32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fadd <2 x float> %a02, %a13
@@ -54,16 +87,51 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) {
 }
 
 define <8 x float> @hadd_v8f32b(<8 x float> %a) {
-; SSSE3-LABEL: hadd_v8f32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddps %xmm0, %xmm0
-; SSSE3-NEXT:    haddps %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v8f32b:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v8f32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    movaps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    addps %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    addps %xmm3, %xmm1
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v8f32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    haddps %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v8f32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v8f32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v8f32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v8f32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fadd <8 x float> %a0, %a1
@@ -72,15 +140,45 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) {
 }
 
 define <4 x float> @hsub_v4f32(<4 x float> %a) {
-; SSSE3-LABEL: hsub_v4f32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v4f32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    subps %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v4f32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v4f32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v4f32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v4f32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v4f32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
   %hop = fsub <2 x float> %a02, %a13
@@ -119,16 +217,51 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) {
 }
 
 define <8 x float> @hsub_v8f32b(<8 x float> %a) {
-; SSSE3-LABEL: hsub_v8f32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSSE3-NEXT:    hsubps %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v8f32b:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v8f32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
+; SSSE3_SLOW-NEXT:    movaps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    subps %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    subps %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v8f32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    hsubps %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v8f32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v8f32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v8f32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v8f32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = fsub <8 x float> %a0, %a1
@@ -137,15 +270,42 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) {
 }
 
 define <2 x double> @hadd_v2f64(<2 x double> %a) {
-; SSSE3-LABEL: hadd_v2f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v2f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v2f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v2f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1_SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v2f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v2f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2_SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v2f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fadd <2 x double> %a0, %a1
@@ -154,16 +314,47 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
 }
 
 define <4 x double> @hadd_v4f64(<4 x double> %a) {
-; SSSE3-LABEL: hadd_v4f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSSE3-NEXT:    haddpd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v4f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSSE3_SLOW-NEXT:    addpd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    addpd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v4f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v4f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX1_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v4f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v4f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX2_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v4f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fadd <4 x double> %a0, %a1
@@ -172,15 +363,42 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
 }
 
 define <2 x double> @hsub_v2f64(<2 x double> %a) {
-; SSSE3-LABEL: hsub_v2f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v2f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    subpd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v2f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v2f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1_SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v2f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v2f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2_SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v2f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %hop = fsub <2 x double> %a0, %a1
@@ -189,16 +407,47 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
 }
 
 define <4 x double> @hsub_v4f64(<4 x double> %a) {
-; SSSE3-LABEL: hsub_v4f64:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSSE3-NEXT:    hsubpd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v4f64:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
+; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSSE3_SLOW-NEXT:    subpd %xmm3, %xmm1
+; SSSE3_SLOW-NEXT:    subpd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
+; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v4f64:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    hsubpd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v4f64:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX1_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v4f64:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v4f64:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; AVX2_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
+; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v4f64:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
   %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
   %hop = fsub <4 x double> %a0, %a1
@@ -207,15 +456,44 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
 }
 
 define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
-; SSSE3-LABEL: hadd_v4i32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v4i32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v4i32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v4i32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v4i32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v4i32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v4i32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = add <4 x i32> %a02, %a13
@@ -254,25 +532,57 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
 }
 
 define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
-; SSSE3-LABEL: hadd_v8i32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    phaddd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hadd_v8i32b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hadd_v8i32b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v8i32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    paddd %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    paddd %xmm3, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v8i32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phaddd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1_SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v8i32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v8i32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v8i32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = add <8 x i32> %a0, %a1
@@ -281,15 +591,44 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
 }
 
 define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
-; SSSE3-LABEL: hsub_v4i32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v4i32:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    psubd %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v4i32:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v4i32:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1_SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v4i32:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v4i32:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2_SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v4i32:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %hop = sub <4 x i32> %a02, %a13
@@ -328,25 +667,57 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
 }
 
 define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
-; SSSE3-LABEL: hsub_v8i32b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    phsubd %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hsub_v8i32b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hsub_v8i32b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v8i32b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3_SLOW-NEXT:    psubd %xmm0, %xmm2
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSSE3_SLOW-NEXT:    psubd %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v8i32b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phsubd %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i32b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX1_SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1_SLOW-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v8i32b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v8i32b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2_SLOW-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v8i32b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
   %hop = sub <8 x i32> %a0, %a1
@@ -355,15 +726,45 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
 }
 
 define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
-; SSSE3-LABEL: hadd_v8i16:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hadd_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v8i16:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v8i16:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v8i16:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v8i16:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v8i16:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v8i16:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <8 x i16> %a0246, %a1357
@@ -402,25 +803,64 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
 }
 
 define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
-; SSSE3-LABEL: hadd_v16i16b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddw %xmm0, %xmm0
-; SSSE3-NEXT:    phaddw %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hadd_v16i16b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hadd_v16i16b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hadd_v16i16b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm3
+; SSSE3_SLOW-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    paddw %xmm3, %xmm0
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3_SLOW-NEXT:    paddw %xmm4, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hadd_v16i16b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phaddw %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hadd_v16i16b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hadd_v16i16b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hadd_v16i16b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hadd_v16i16b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = add <16 x i16> %a0, %a1
@@ -429,15 +869,45 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
 }
 
 define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
-; SSSE3-LABEL: hsub_v8i16:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    retq
-;
-; AVX-LABEL: hsub_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v8i16:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    psubw %xmm0, %xmm1
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v8i16:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v8i16:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
+; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v8i16:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v8i16:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2_SLOW-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
+; AVX2_SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v8i16:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; AVX2_FAST-NEXT:    retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <8 x i16> %a0246, %a1357
@@ -476,25 +946,64 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
 }
 
 define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
-; SSSE3-LABEL: hsub_v16i16b:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubw %xmm0, %xmm0
-; SSSE3-NEXT:    phsubw %xmm1, %xmm1
-; SSSE3-NEXT:    retq
-;
-; AVX1-LABEL: hsub_v16i16b:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: hsub_v16i16b:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; SSSE3_SLOW-LABEL: hsub_v16i16b:
+; SSSE3_SLOW:       # %bb.0:
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm3
+; SSSE3_SLOW-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3_SLOW-NEXT:    movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm0
+; SSSE3_SLOW-NEXT:    psubw %xmm0, %xmm3
+; SSSE3_SLOW-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3_SLOW-NEXT:    psubw %xmm1, %xmm4
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1]
+; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
+; SSSE3_SLOW-NEXT:    retq
+;
+; SSSE3_FAST-LABEL: hsub_v16i16b:
+; SSSE3_FAST:       # %bb.0:
+; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
+; SSSE3_FAST-NEXT:    phsubw %xmm1, %xmm1
+; SSSE3_FAST-NEXT:    retq
+;
+; AVX1_SLOW-LABEL: hsub_v16i16b:
+; AVX1_SLOW:       # %bb.0:
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
+; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1_SLOW-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
+; AVX1_SLOW-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
+; AVX1_SLOW-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
+; AVX1_SLOW-NEXT:    vpshufb %xmm4, %xmm3, %xmm2
+; AVX1_SLOW-NEXT:    vpsubw %xmm2, %xmm1, %xmm1
+; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_SLOW-NEXT:    retq
+;
+; AVX1_FAST-LABEL: hsub_v16i16b:
+; AVX1_FAST:       # %bb.0:
+; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
+; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
+; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1_FAST-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1_FAST-NEXT:    retq
+;
+; AVX2_SLOW-LABEL: hsub_v16i16b:
+; AVX2_SLOW:       # %bb.0:
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX2_SLOW-NEXT:    vpsubw %ymm0, %ymm1, %ymm0
+; AVX2_SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2_SLOW-NEXT:    retq
+;
+; AVX2_FAST-LABEL: hsub_v16i16b:
+; AVX2_FAST:       # %bb.0:
+; AVX2_FAST-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
+; AVX2_FAST-NEXT:    retq
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
   %hop = sub <16 x i16> %a0, %a1
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index 84decabcbce4c3b3538a1cf9cfb1722594dbc203..e0590a766159d13f71640132c266485a90b9bd06 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -1,7 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSE-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
 
 ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs.
 
@@ -339,8 +342,6 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
   ret <8 x i32> %vecinit5
 }
 
-; On AVX2, the following sequence can be folded into a single horizontal add.
-; If the Subtarget doesn't support AVX2, then we avoid emitting two packed
 ; integer horizontal adds instead of two scalar adds followed by vector inserts.
 define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: test15_undef:
@@ -451,17 +452,38 @@ define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
 }
 
 define <2 x double> @add_pd_003(<2 x double> %x) {
-; SSE-LABEL: add_pd_003:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    addpd %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_pd_003:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    addpd %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_pd_003:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_pd_003:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_pd_003:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_pd_003:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_pd_003:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_pd_003:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
@@ -470,39 +492,84 @@ define <2 x double> @add_pd_003(<2 x double> %x) {
 ; Change shuffle mask - no undefs.
 
 define <2 x double> @add_pd_003_2(<2 x double> %x) {
-; SSE-LABEL: add_pd_003_2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    movapd %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_pd_003_2:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
+; SSE-SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_pd_003_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_pd_003_2:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_pd_003_2:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_pd_003_2:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_pd_003_2:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_pd_003_2:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add = fadd <2 x double> %l, %x
   ret <2 x double> %add
 }
 
 define <2 x double> @add_pd_010(<2 x double> %x) {
-; SSE-LABEL: add_pd_010:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    addpd %xmm0, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    movapd %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_pd_010:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSE-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_pd_010:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_pd_010:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_pd_010:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_pd_010:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_pd_010:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_pd_010:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
   %add = fadd <2 x double> %l, %x
   %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -510,20 +577,42 @@ define <2 x double> @add_pd_010(<2 x double> %x) {
 }
 
 define <4 x float> @add_ps_007(<4 x float> %x) {
-; SSE-LABEL: add_ps_007:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_007:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_007:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_007:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_007:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_007:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_007:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_007:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -531,22 +620,48 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_030(<4 x float> %x) {
-; SSE-LABEL: add_ps_030:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_030:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_030:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_030:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_030:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_030:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_030:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_030:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = fadd <4 x float> %l, %r
@@ -555,19 +670,41 @@ define <4 x float> @add_ps_030(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_007_2(<4 x float> %x) {
-; SSE-LABEL: add_ps_007_2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_007_2:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_007_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_007_2:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_007_2:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_007_2:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_007_2:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_007_2:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
@@ -575,37 +712,83 @@ define <4 x float> @add_ps_007_2(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_008(<4 x float> %x) {
-; SSE-LABEL: add_ps_008:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-NEXT:    addps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_008:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_008:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_008:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_008:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_008:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_008:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_008:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   ret <4 x float> %add
 }
 
 define <4 x float> @add_ps_017(<4 x float> %x) {
-; SSE-LABEL: add_ps_017:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE-NEXT:    addps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_017:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE-SLOW-NEXT:    addps %xmm0, %xmm1
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_017:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_017:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_017:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_017:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_017:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_017:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = fadd <4 x float> %l, %x
   %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -613,21 +796,47 @@ define <4 x float> @add_ps_017(<4 x float> %x) {
 }
 
 define <4 x float> @add_ps_018(<4 x float> %x) {
-; SSE-LABEL: add_ps_018:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-NEXT:    addps %xmm1, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    retq
+; SSE-SLOW-LABEL: add_ps_018:
+; SSE-SLOW:       # %bb.0:
+; SSE-SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0]
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: add_ps_018:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    retq
+; SSE-FAST-LABEL: add_ps_018:
+; SSE-FAST:       # %bb.0:
+; SSE-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-FAST-NEXT:    retq
+;
+; AVX1-SLOW-LABEL: add_ps_018:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT:    retq
+;
+; AVX1-FAST-LABEL: add_ps_018:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: add_ps_018:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX2-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: add_ps_018:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-FAST-NEXT:    retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = fadd <4 x float> %l, %r
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index 030de9c7f14de4ada391f3573b7c0627cc8b324d..6221d4e43bcfe469d89e3145b9b1883afa104b26 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3           | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx            | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST
 
 define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
 ; SSE3-LABEL: haddpd1:
@@ -35,15 +37,29 @@ define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
 }
 
 define <2 x double> @haddpd3(<2 x double> %x) {
-; SSE3-LABEL: haddpd3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddpd %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddpd3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddpd3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddpd3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddpd3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddpd3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fadd <2 x double> %a, %b
@@ -83,15 +99,30 @@ define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @haddps3(<4 x float> %x) {
-; SSE3-LABEL: haddps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fadd <4 x float> %a, %b
@@ -99,15 +130,30 @@ define <4 x float> @haddps3(<4 x float> %x) {
 }
 
 define <4 x float> @haddps4(<4 x float> %x) {
-; SSE3-LABEL: haddps4:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps4:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps4:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -115,15 +161,30 @@ define <4 x float> @haddps4(<4 x float> %x) {
 }
 
 define <4 x float> @haddps5(<4 x float> %x) {
-; SSE3-LABEL: haddps5:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps5:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps5:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps5:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps5:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -131,15 +192,27 @@ define <4 x float> @haddps5(<4 x float> %x) {
 }
 
 define <4 x float> @haddps6(<4 x float> %x) {
-; SSE3-LABEL: haddps6:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps6:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps6:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -147,15 +220,30 @@ define <4 x float> @haddps6(<4 x float> %x) {
 }
 
 define <4 x float> @haddps7(<4 x float> %x) {
-; SSE3-LABEL: haddps7:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: haddps7:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: haddps7:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: haddps7:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: haddps7:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: haddps7:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
   %r = fadd <4 x float> %a, %b
@@ -179,15 +267,28 @@ define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
 }
 
 define <2 x double> @hsubpd2(<2 x double> %x) {
-; SSE3-LABEL: hsubpd2:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubpd %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubpd2:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubpd2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubpd2:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubpd2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubpd2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
   %r = fsub <2 x double> %a, %b
@@ -211,15 +312,31 @@ define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
 }
 
 define <4 x float> @hsubps2(<4 x float> %x) {
-; SSE3-LABEL: hsubps2:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubps2:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubps2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubps2:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubps2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubps2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = fsub <4 x float> %a, %b
@@ -227,15 +344,31 @@ define <4 x float> @hsubps2(<4 x float> %x) {
 }
 
 define <4 x float> @hsubps3(<4 x float> %x) {
-; SSE3-LABEL: hsubps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -243,15 +376,27 @@ define <4 x float> @hsubps3(<4 x float> %x) {
 }
 
 define <4 x float> @hsubps4(<4 x float> %x) {
-; SSE3-LABEL: hsubps4:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: hsubps4:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: hsubps4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: hsubps4:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: hsubps4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: hsubps4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = fsub <4 x float> %a, %b
@@ -293,16 +438,35 @@ define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
 }
 
 define <8 x float> @vhaddps3(<8 x float> %x) {
-; SSE3-LABEL: vhaddps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    haddps %xmm0, %xmm0
-; SSE3-NEXT:    haddps %xmm1, %xmm1
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: vhaddps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    addps %xmm3, %xmm0
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: vhaddps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: vhaddps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: vhaddps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX-SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: vhaddps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fadd <8 x float> %a, %b
@@ -327,16 +491,37 @@ define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
 }
 
 define <8 x float> @vhsubps3(<8 x float> %x) {
-; SSE3-LABEL: vhsubps3:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    hsubps %xmm0, %xmm0
-; SSE3-NEXT:    hsubps %xmm1, %xmm1
-; SSE3-NEXT:    retq
+; SSE3-SLOW-LABEL: vhsubps3:
+; SSE3-SLOW:       # %bb.0:
+; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
+; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE3-SLOW-NEXT:    subps %xmm1, %xmm2
+; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE3-SLOW-NEXT:    subps %xmm0, %xmm3
+; SSE3-SLOW-NEXT:    movaps %xmm3, %xmm0
+; SSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
+; SSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: vhsubps3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; SSE3-FAST-LABEL: vhsubps3:
+; SSE3-FAST:       # %bb.0:
+; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
+; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm1
+; SSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: vhsubps3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX-SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: vhsubps3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
   %r = fsub <8 x float> %a, %b
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
index a4888e1cd3f5b2cb109a5cc171de476a818e8728..88f6b01131a2c8e32f528099284af229c037487c 100644
--- a/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -230,15 +230,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -273,15 +272,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -832,20 +830,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -896,20 +893,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1670,35 +1666,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1748,35 +1739,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i16:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X64-SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
index 3ce01cfdf4dc74c4535db37830afde177b785304..482d08260374e4c94426b7a744961e6f32abb09f 100644
--- a/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -232,15 +232,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -269,15 +268,14 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -772,20 +770,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    psrld $16, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -827,20 +824,19 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) {
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    psrld $16, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1574,35 +1570,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X86-SSE2-LABEL: test_reduce_v32i16:
 ; X86-SSE2:       ## %bb.0:
 ; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pminsw %xmm3, %xmm1
-; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm2
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X86-SSE2-NEXT:    pminsw %xmm2, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    psrld $16, %xmm0
-; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
 ; X86-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X86-SSE2-NEXT:    retl
 ;
@@ -1643,35 +1634,30 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) {
 ; X64-SSE2-LABEL: test_reduce_v32i16:
 ; X64-SSE2:       ## %bb.0:
 ; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pminsw %xmm3, %xmm1
-; X64-SSE2-NEXT:    movdqa %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm2
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; X64-SSE2-NEXT:    pminsw %xmm2, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
-; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE2-NEXT:    psrld $16, %xmm0
-; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
-; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
 ; X64-SSE2-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; X64-SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll
index 85a885a4315b9a871f433d11a58945da49df2308..23973bca7d56264c4c106bedde85e009c179a510 100644
--- a/test/CodeGen/X86/i256-add.ll
+++ b/test/CodeGen/X86/i256-add.ll
@@ -12,7 +12,7 @@ define void @add(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    subl $8, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 24(%eax), %ecx
 ; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 20(%eax), %esi
@@ -30,7 +30,7 @@ define void @add(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    adcl %esi, 20(%eax)
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, 24(%eax)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    adcl %ecx, 28(%eax)
 ; X32-NEXT:    addl $8, %esp
 ; X32-NEXT:    popl %esi
@@ -66,7 +66,7 @@ define void @sub(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    subl $8, %esp
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl 28(%eax), %ecx
-; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT:    movl 24(%eax), %ecx
 ; X32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X32-NEXT:    movl 20(%eax), %esi
@@ -84,7 +84,7 @@ define void @sub(i256* %p, i256* %q) nounwind {
 ; X32-NEXT:    sbbl %esi, 20(%eax)
 ; X32-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X32-NEXT:    sbbl %ecx, 24(%eax)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X32-NEXT:    sbbl %ecx, 28(%eax)
 ; X32-NEXT:    addl $8, %esp
 ; X32-NEXT:    popl %esi
diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll
index 8a5b92a82fb221bbe03dc5af638074b9417056ed..495ead223b2e40a471c47232ade4c0197739a3f5 100644
--- a/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -56,7 +56,7 @@ target triple = "i386-apple-macosx10.5"
 ;
 ; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]]
 ; CHECK-NEXT: movb %dl, ([[E]])
-; CHECK-NEXT: movsbl ([[E]]), [[CONV:%[a-z]+]]
+; CHECK-NEXT: movzbl %dl, [[CONV:%[a-z]+]]
 ; CHECK-NEXT: movl $6, [[CONV:%[a-z]+]]
 ; The eflags is used in the next instruction.
 ; If that instruction disappear, we are not exercising the bug
diff --git a/test/CodeGen/X86/icall-branch-funnel.ll b/test/CodeGen/X86/icall-branch-funnel.ll
index 010734cd8565f7358d4e475e484dc152cf3e8cb4..6d7e0c3d2c491b85ff2f007f2601ee79dab1cd5d 100644
--- a/test/CodeGen/X86/icall-branch-funnel.ll
+++ b/test/CodeGen/X86/icall-branch-funnel.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-linux < %s | FileCheck %s
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
+; RUN: llc -mtriple=x86_64-unknown-linux -verify-machineinstrs=0 < %s | FileCheck %s
 
 @g = external global i8
 
diff --git a/test/CodeGen/X86/indirect-branch-tracking.ll b/test/CodeGen/X86/indirect-branch-tracking.ll
index 99d8085260257eb6a851a68d2a54b7cd935190f5..dc738bb7b5433da9853cc5246befefabca0a767e 100644
--- a/test/CodeGen/X86/indirect-branch-tracking.ll
+++ b/test/CodeGen/X86/indirect-branch-tracking.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=X86_64
 ; RUN: llc -mtriple=i386-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=X86
-; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj < %s | FileCheck %s --check-prefix=SJLJ
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39439.
+; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -verify-machineinstrs=0 < %s | FileCheck %s --check-prefix=SJLJ
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Test1
diff --git a/test/CodeGen/X86/insert-into-constant-vector.ll b/test/CodeGen/X86/insert-into-constant-vector.ll
index 3c8fbc5819e0e157e35fd7a7e804f748969399f3..9a70bc8fffd30235570b11c46c966ea53d61a0fd 100644
--- a/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -167,40 +167,40 @@ define <4 x float> @elt1_v4f32(float %x) {
 ; X32SSE2-LABEL: elt1_v4f32:
 ; X32SSE2:       # %bb.0:
 ; X32SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; X32SSE2-NEXT:    retl
 ;
 ; X64SSE2-LABEL: elt1_v4f32:
 ; X64SSE2:       # %bb.0:
-; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
 ; X64SSE2-NEXT:    retq
 ;
 ; X32SSE4-LABEL: elt1_v4f32:
 ; X32SSE4:       # %bb.0:
-; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = <42,u,2,3>
+; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32SSE4-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X32SSE4-NEXT:    retl
 ;
 ; X64SSE4-LABEL: elt1_v4f32:
 ; X64SSE4:       # %bb.0:
-; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64SSE4-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
 ; X64SSE4-NEXT:    movaps %xmm1, %xmm0
 ; X64SSE4-NEXT:    retq
 ;
 ; X32AVX-LABEL: elt1_v4f32:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <42,u,2,3>
+; X32AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; X32AVX-NEXT:    retl
 ;
 ; X64AVX-LABEL: elt1_v4f32:
 ; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; X64AVX-NEXT:    retq
    %ins = insertelement <4 x float> <float 42.0, float 1.0, float 2.0, float 3.0>, float %x, i32 1
@@ -210,26 +210,26 @@ define <4 x float> @elt1_v4f32(float %x) {
 define <2 x double> @elt1_v2f64(double %x) {
 ; X32SSE-LABEL: elt1_v2f64:
 ; X32SSE:       # %bb.0:
-; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <42,u>
+; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <4.2E+1,u>
 ; X32SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32SSE-NEXT:    retl
 ;
 ; X64SSE-LABEL: elt1_v2f64:
 ; X64SSE:       # %bb.0:
-; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = <42,u>
+; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = <4.2E+1,u>
 ; X64SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; X64SSE-NEXT:    movaps %xmm1, %xmm0
 ; X64SSE-NEXT:    retq
 ;
 ; X32AVX-LABEL: elt1_v2f64:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovapd {{.*#+}} xmm0 = <42,u>
+; X32AVX-NEXT:    vmovapd {{.*#+}} xmm0 = <4.2E+1,u>
 ; X32AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; X32AVX-NEXT:    retl
 ;
 ; X64AVX-LABEL: elt1_v2f64:
 ; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <42,u>
+; X64AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <4.2E+1,u>
 ; X64AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64AVX-NEXT:    retq
    %ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
@@ -292,37 +292,37 @@ define <8 x float> @elt6_v8f32(float %x) {
 ; X32SSE2-LABEL: elt6_v8f32:
 ; X32SSE2:       # %bb.0:
 ; X32SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X32SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
 ; X32SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; X32SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X32SSE2-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X32SSE2-NEXT:    retl
 ;
 ; X64SSE2-LABEL: elt6_v8f32:
 ; X64SSE2:       # %bb.0:
-; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X64SSE2-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
 ; X64SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; X64SSE2-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X64SSE2-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X64SSE2-NEXT:    retq
 ;
 ; X32SSE4-LABEL: elt6_v8f32:
 ; X32SSE4:       # %bb.0:
-; X32SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X32SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X32SSE4-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
-; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X32SSE4-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X32SSE4-NEXT:    retl
 ;
 ; X64SSE4-LABEL: elt6_v8f32:
 ; X64SSE4:       # %bb.0:
-; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X64SSE4-NEXT:    movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0>
 ; X64SSE4-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
-; X64SSE4-NEXT:    movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X64SSE4-NEXT:    movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X64SSE4-NEXT:    retq
 ;
 ; X32AVX-LABEL: elt6_v8f32:
 ; X32AVX:       # %bb.0:
-; X32AVX-NEXT:    vmovaps {{.*#+}} ymm0 = <42,1,2,3,4,5,u,7>
+; X32AVX-NEXT:    vmovaps {{.*#+}} ymm0 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
 ; X32AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X32AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; X32AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -330,7 +330,7 @@ define <8 x float> @elt6_v8f32(float %x) {
 ;
 ; X64AVX-LABEL: elt6_v8f32:
 ; X64AVX:       # %bb.0:
-; X64AVX-NEXT:    vmovaps {{.*#+}} ymm1 = <42,1,2,3,4,5,u,7>
+; X64AVX-NEXT:    vmovaps {{.*#+}} ymm1 = <4.2E+1,1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,u,7.0E+0>
 ; X64AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; X64AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
 ; X64AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -413,49 +413,49 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
 define <8 x double> @elt1_v8f64(double %x) {
 ; X32SSE-LABEL: elt1_v8f64:
 ; X32SSE:       # %bb.0:
-; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <42,u>
+; X32SSE-NEXT:    movapd {{.*#+}} xmm0 = <4.2E+1,u>
 ; X32SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
-; X32SSE-NEXT:    movaps {{.*#+}} xmm2 = [4,5]
-; X32SSE-NEXT:    movaps {{.*#+}} xmm3 = [6,7]
+; X32SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
+; X32SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
+; X32SSE-NEXT:    movaps {{.*#+}} xmm3 = [6.0E+0,7.0E+0]
 ; X32SSE-NEXT:    retl
 ;
 ; X64SSE-LABEL: elt1_v8f64:
 ; X64SSE:       # %bb.0:
-; X64SSE-NEXT:    movaps {{.*#+}} xmm4 = <42,u>
+; X64SSE-NEXT:    movaps {{.*#+}} xmm4 = <4.2E+1,u>
 ; X64SSE-NEXT:    movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
-; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
-; X64SSE-NEXT:    movaps {{.*#+}} xmm2 = [4,5]
-; X64SSE-NEXT:    movaps {{.*#+}} xmm3 = [6,7]
+; X64SSE-NEXT:    movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0]
+; X64SSE-NEXT:    movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0]
+; X64SSE-NEXT:    movaps {{.*#+}} xmm3 = [6.0E+0,7.0E+0]
 ; X64SSE-NEXT:    movaps %xmm4, %xmm0
 ; X64SSE-NEXT:    retq
 ;
 ; X32AVX2-LABEL: elt1_v8f64:
 ; X32AVX2:       # %bb.0:
-; X32AVX2-NEXT:    vmovapd {{.*#+}} ymm0 = <42,u,2,3>
+; X32AVX2-NEXT:    vmovapd {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X32AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
 ; X32AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; X32AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,6,7]
+; X32AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X32AVX2-NEXT:    retl
 ;
 ; X64AVX2-LABEL: elt1_v8f64:
 ; X64AVX2:       # %bb.0:
-; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <42,u,2,3>
+; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0>
 ; X64AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,6,7]
+; X64AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; X64AVX2-NEXT:    retq
 ;
 ; X32AVX512F-LABEL: elt1_v8f64:
 ; X32AVX512F:       # %bb.0:
-; X32AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <42,u,2,3,4,5,6,7>
+; X32AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
 ; X32AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
 ; X32AVX512F-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; X32AVX512F-NEXT:    retl
 ;
 ; X64AVX512F-LABEL: elt1_v8f64:
 ; X64AVX512F:       # %bb.0:
-; X64AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = <42,u,2,3,4,5,6,7>
+; X64AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0>
 ; X64AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64AVX512F-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; X64AVX512F-NEXT:    retq
diff --git a/test/CodeGen/X86/is-constant.ll b/test/CodeGen/X86/is-constant.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d02bbae2085fc3d83b8704985b98a641f899c7bb
--- /dev/null
+++ b/test/CodeGen/X86/is-constant.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O2 < %s | FileCheck %s --check-prefix=CHECK-O2 --check-prefix=CHECK
+; RUN: llc -O0 -fast-isel < %s | FileCheck %s --check-prefix=CHECK-O0 --check-prefix=CHECK
+; RUN: llc -O0 -fast-isel=0 < %s | FileCheck %s --check-prefix=CHECK-O0 --check-prefix=CHECK
+; RUN: llc -O0 -global-isel < %s | FileCheck %s --check-prefix=CHECK-O0 --check-prefix=CHECK
+
+;; Ensure that an unfoldable is.constant gets lowered reasonably in
+;; optimized codegen, in particular, that the "true" branch is
+;; eliminated.
+;;
+;; This isn't asserting any specific output from non-optimized runs,
+;; (e.g., currently the not-taken branch does not get eliminated). But
+;; it does ensure that lowering succeeds in all 3 codegen paths.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i1 @llvm.is.constant.i32(i32 %a) nounwind readnone
+declare i1 @llvm.is.constant.i64(i64 %a) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1) nounwind readnone
+
+declare i32 @subfun_1()
+declare i32 @subfun_2()
+
+define i32 @test_branch(i32 %in) nounwind {
+; CHECK-LABEL:    test_branch:
+; CHECK-O2:       %bb.0:
+; CHECK-O2-NEXT:  jmp subfun_2
+  %v = call i1 @llvm.is.constant.i32(i32 %in)
+  br i1 %v, label %True, label %False
+
+True:
+  %call1 = tail call i32 @subfun_1()
+  ret i32 %call1
+
+False:
+  %call2 = tail call i32 @subfun_2()
+  ret i32 %call2
+}
+
+;; llvm.objectsize is another tricky case which gets folded to -1 very
+;; late in the game. We'd like to ensure that llvm.is.constant of
+;; llvm.objectsize is true.
+define i1 @test_objectsize(i8* %obj) nounwind {
+; CHECK-LABEL:    test_objectsize:
+; CHECK-O2:       %bb.0:
+; CHECK-O2:       movb $1, %al
+; CHECK-O2-NEXT:  retq
+  %os = call i64 @llvm.objectsize.i64.p0i8(i8* %obj, i1 false, i1 false)
+  %v = call i1 @llvm.is.constant.i64(i64 %os)
+  ret i1 %v
+}
diff --git a/test/CodeGen/X86/known-bits.ll b/test/CodeGen/X86/known-bits.ll
index 5d574391c50b5534fff648f226967616cf5fba77..5066e4777cc6947198454f6922262ed12340030f 100644
--- a/test/CodeGen/X86/known-bits.ll
+++ b/test/CodeGen/X86/known-bits.ll
@@ -19,7 +19,7 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
 ; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm1
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X32-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; X32-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    movzbl %cl, %eax
 ; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
@@ -69,7 +69,7 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
 ; X64-NEXT:    movzbl %cl, %ecx
 ; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm1
-; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; X64-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    movzbl %al, %eax
 ; X64-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index 679e068b9658a867d7fafd924873efcf85220d26..64bca7330689598cb0857000d7e5b38ec6aaca32 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -91,17 +91,14 @@ define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_extract_sitofp_1:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $63, %xmm1, %xmm2
-; X32-NEXT:    vpsrlq $32, %xmm1, %xmm1
-; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; X32-NEXT:    vpsrlq $63, %xmm0, %xmm2
+; X32-NEXT:    vpsrlq $63, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,32768,0,0,1,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -128,18 +125,15 @@ define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
 ; X32-LABEL: signbits_ashr_shl_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $60, %xmm1, %xmm2
-; X32-NEXT:    vpsrlq $61, %xmm1, %xmm1
-; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm1
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vpsllq $20, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -240,21 +234,13 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
 define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X32:       # %bb.0:
-; X32-NEXT:    vpsrad $16, %xmm0, %xmm1
-; X32-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; X32-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X32-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X64:       # %bb.0:
-; X64-NEXT:    vpsrad $16, %xmm0, %xmm1
-; X64-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; X64-NEXT:    vpsrlq $16, %xmm0, %xmm0
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; X64-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; X64-NEXT:    retq
   %1 = ashr <2 x i64> %a0, <i64 16, i64 16>
@@ -271,13 +257,10 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $60, %xmm2, %xmm3
-; X32-NEXT:    vpsrlq $61, %xmm2, %xmm2
-; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm3
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
@@ -289,7 +272,7 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; X32-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm4, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -328,13 +311,10 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
 ; X32-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
 ; X32:       # %bb.0:
 ; X32-NEXT:    pushl %eax
-; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-NEXT:    vpsrlq $60, %xmm2, %xmm3
-; X32-NEXT:    vpsrlq $61, %xmm2, %xmm2
-; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
-; X32-NEXT:    vpsrlq $60, %xmm0, %xmm3
+; X32-NEXT:    vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT:    vpsrlq $61, %xmm0, %xmm0
-; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
 ; X32-NEXT:    vpmovsxdq %xmm1, %xmm1
@@ -342,7 +322,7 @@ define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4
 ; X32-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; X32-NEXT:    vpxor %xmm0, %xmm1, %xmm0
 ; X32-NEXT:    vmovd %xmm0, %eax
-; X32-NEXT:    vcvtsi2ssl %eax, %xmm4, %xmm0
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm3, %xmm0
 ; X32-NEXT:    vmovss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -383,22 +363,19 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
 ; X32-NEXT:    subl $16, %esp
 ; X32-NEXT:    vpmovsxdq 16(%ebp), %xmm3
 ; X32-NEXT:    vpmovsxdq 8(%ebp), %xmm4
-; X32-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
+; X32-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; X32-NEXT:    vpsrlq $63, %xmm5, %xmm6
 ; X32-NEXT:    vpsrlq $33, %xmm5, %xmm5
 ; X32-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
-; X32-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; X32-NEXT:    vpsrlq $63, %xmm6, %xmm7
-; X32-NEXT:    vpsrlq $33, %xmm6, %xmm6
-; X32-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4,5,6,7]
-; X32-NEXT:    vpxor %xmm5, %xmm6, %xmm6
-; X32-NEXT:    vpsubq %xmm5, %xmm6, %xmm6
+; X32-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,16384,0,0,1,0,0,0]
+; X32-NEXT:    vpxor %xmm6, %xmm5, %xmm5
+; X32-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
 ; X32-NEXT:    vpsrlq $63, %xmm2, %xmm7
 ; X32-NEXT:    vpsrlq $33, %xmm2, %xmm2
 ; X32-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4,5,6,7]
-; X32-NEXT:    vpxor %xmm5, %xmm2, %xmm2
-; X32-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; X32-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; X32-NEXT:    vpxor %xmm6, %xmm2, %xmm2
+; X32-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
+; X32-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
 ; X32-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; X32-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; X32-NEXT:    vextractf128 $1, %ymm0, %xmm5
diff --git a/test/CodeGen/X86/large-pic-string.ll b/test/CodeGen/X86/large-pic-string.ll
index be8a629c31c20cdb35f53851cfede5786039f8fe..e677ed85c66324f5066111da02945284d40b1d63 100644
--- a/test/CodeGen/X86/large-pic-string.ll
+++ b/test/CodeGen/X86/large-pic-string.ll
@@ -1,19 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 
-; RUN: llc < %s -code-model=large -relocation-model=pic -mtriple=x86_64--linux | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -code-model=large -relocation-model=pic -mtriple=x86_64--linux | FileCheck %s
 
 @.str = private unnamed_addr constant [2 x i8] c"a\00", align 1
 
 define void @pr38385() {
 ; CHECK-LABEL: pr38385:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:  .Ltmp0:
-; CHECK-NEXT:    leaq {{.*}}(%rip), %rax
-; CHECK-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.Ltmp0, %rcx
-; CHECK-NEXT:    addq %rcx, %rax
-; CHECK-NEXT:    movabsq $.L.str@GOTOFF, %rcx
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:  .L0$pb:
+; CHECK-NEXT:    leaq .L0${{.*}}(%rip), %rax
+; CHECK-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
+; CHECK-NEXT:    addq %rax, %rcx
+; CHECK-NEXT:    movabsq $.L.str@GOTOFF, %rax
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
   %p = alloca i8, align 1
   store i8 ptrtoint ([2 x i8]* @.str to i8), i8* %p, align 1
diff --git a/test/CodeGen/X86/lea-opt-with-debug.mir b/test/CodeGen/X86/lea-opt-with-debug.mir
index 34525d73ea7746fbed0510dc1ef0806c53299b67..a1cf2041db6ad2bdcce68cc48436456b2fb25463 100644
--- a/test/CodeGen/X86/lea-opt-with-debug.mir
+++ b/test/CodeGen/X86/lea-opt-with-debug.mir
@@ -98,7 +98,7 @@ body:             |
     ; CHECK: %3:gr64_nosp = LEA64r %2, 2, %2, 0, $noreg, debug-location !13
     ; CHECK-NEXT: %4:gr64 = LEA64r %1, 4, %3, 0, $noreg, debug-location !13
     ; CHECK-NOT: %0:gr64 = LEA64r %1, 4, %3, 8, $noreg, debug-location !14
-    ; CHECK: DBG_VALUE debug-use %4, debug-use $noreg, !11, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value), debug-location !15
+    ; CHECK: DBG_VALUE %4, $noreg, !11, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value), debug-location !15
 
     %1 = MOV64rm $rip, 1, $noreg, @c, $noreg, debug-location !13 :: (dereferenceable load 8 from @c)
     %2 = MOVSX64rm32 $rip, 1, $noreg, @a, $noreg, debug-location !13 :: (dereferenceable load 4 from @a)
@@ -107,7 +107,7 @@ body:             |
     %5 = COPY %4.sub_32bit, debug-location !13
     MOV32mr $rip, 1, $noreg, @d, $noreg, killed %5, debug-location !13 :: (store 4 into @d)
     %0 = LEA64r %1, 4, %3, 8, $noreg, debug-location !14
-    DBG_VALUE debug-use %0, debug-use $noreg, !11, !DIExpression(), debug-location !15
+    DBG_VALUE %0, $noreg, !11, !DIExpression(), debug-location !15
 
     ; CHECK-LABEL: bb.1 (%ir-block.8):
     ; CHECK: %6:gr32 = MOV32rm %4, 1, $noreg, 8, $noreg, debug-location !17 :: (load 4 from %ir.7)
diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll
index a9608f0bd8cad05077007b3a25975f161bda8467..1e8ebfb766b386175291ec59017b7c336eb7d8a7 100644
--- a/test/CodeGen/X86/lea32-schedule.ll
+++ b/test/CodeGen/X86/lea32-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -60,6 +61,12 @@ define i32 @test_lea_offset(i32) {
 ; SKYLAKE-NEXT:    leal -24(%rdi), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -24(%rdi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -124,6 +131,12 @@ define i32 @test_lea_offset_big(i32) {
 ; SKYLAKE-NEXT:    leal 1024(%rdi), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 1024(%rdi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -196,6 +209,13 @@ define i32 @test_lea_add(i32, i32) {
 ; SKYLAKE-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -274,6 +294,13 @@ define i32 @test_lea_add_offset(i32, i32) {
 ; SKYLAKE-NEXT:    addl $16, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 16(%rdi,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -358,6 +385,13 @@ define i32 @test_lea_add_offset_big(i32, i32) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -4096(%rdi,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -425,6 +459,12 @@ define i32 @test_lea_mul(i32) {
 ; SKYLAKE-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -494,6 +534,12 @@ define i32 @test_lea_mul_offset(i32) {
 ; SKYLAKE-NEXT:    addl $-32, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -569,6 +615,12 @@ define i32 @test_lea_mul_offset_big(i32) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $edi killed $edi def $rdi
@@ -641,6 +693,13 @@ define i32 @test_lea_add_scale(i32, i32) {
 ; SKYLAKE-NEXT:    leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -720,6 +779,13 @@ define i32 @test_lea_add_scale_offset(i32, i32) {
 ; SKYLAKE-NEXT:    addl $96, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
@@ -805,6 +871,13 @@ define i32 @test_lea_add_scale_offset_big(i32, i32) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    # kill: def $esi killed $esi def $rsi
+; BDVER2-NEXT:    # kill: def $edi killed $edi def $rdi
+; BDVER2-NEXT:    leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    # kill: def $esi killed $esi def $rsi
diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll
index df9df9b21ef33e05cbd9854aa330612aefa5d11d..cac9d2b506267664d77edb0683e5c813a4965eba 100644
--- a/test/CodeGen/X86/lea64-schedule.ll
+++ b/test/CodeGen/X86/lea64-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -53,6 +54,11 @@ define i64 @test_lea_offset(i64) {
 ; SKYLAKE-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -24(%rdi), %rax # sched: [1:0.50]
@@ -108,6 +114,11 @@ define i64 @test_lea_offset_big(i64) {
 ; SKYLAKE-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 1024(%rdi), %rax # sched: [1:0.50]
@@ -164,6 +175,11 @@ define i64 @test_lea_add(i64, i64) {
 ; SKYLAKE-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq (%rdi,%rsi), %rax # sched: [1:0.50]
@@ -224,6 +240,11 @@ define i64 @test_lea_add_offset(i64, i64) {
 ; SKYLAKE-NEXT:    addq $16, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 16(%rdi,%rsi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 16(%rdi,%rsi), %rax # sched: [2:1.00]
@@ -290,6 +311,11 @@ define i64 @test_lea_add_offset_big(i64, i64) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -4096(%rdi,%rsi), %rax # sched: [2:1.00]
@@ -346,6 +372,11 @@ define i64 @test_lea_mul(i64) {
 ; SKYLAKE-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [2:1.00]
@@ -406,6 +437,11 @@ define i64 @test_lea_mul_offset(i64) {
 ; SKYLAKE-NEXT:    addq $-32, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -32(%rdi,%rdi,2), %rax # sched: [2:1.00]
@@ -472,6 +508,11 @@ define i64 @test_lea_mul_offset_big(i64) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_mul_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_mul_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 10000(%rdi,%rdi,8), %rax # sched: [2:1.00]
@@ -528,6 +569,11 @@ define i64 @test_lea_add_scale(i64, i64) {
 ; SKYLAKE-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq (%rdi,%rsi,2), %rax # sched: [2:1.00]
@@ -589,6 +635,11 @@ define i64 @test_lea_add_scale_offset(i64, i64) {
 ; SKYLAKE-NEXT:    addq $96, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq 96(%rdi,%rsi,4), %rax # sched: [2:1.00]
@@ -656,6 +707,11 @@ define i64 @test_lea_add_scale_offset_big(i64, i64) {
 ; SKYLAKE-NEXT:    # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lea_add_scale_offset_big:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lea_add_scale_offset_big:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    leaq -1200(%rdi,%rsi,8), %rax # sched: [2:1.00]
diff --git a/test/CodeGen/X86/llc-print-machineinstrs.mir b/test/CodeGen/X86/llc-print-machineinstrs.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a890840a478ae8983c5697279e5ac740a712e7b9
--- /dev/null
+++ b/test/CodeGen/X86/llc-print-machineinstrs.mir
@@ -0,0 +1,12 @@
+# Check that -print-machineinstrs doesn't assert when it's passed an unknown pass name.
+# RUN: llc -mtriple=x86_64-- -start-before=greedy -print-machineinstrs=greedy %s -o /dev/null
+# RUN: not llc -mtriple=x86_64-- -start-before=greedy -print-machineinstrs=unknown %s -o /dev/null 2>&1 | FileCheck %s
+# CHECK: LLVM ERROR: "unknown" pass is not registered.
+
+...
+---
+name: fun
+tracksRegLiveness: true
+body: |
+  bb.0:
+    RET 0
diff --git a/test/CodeGen/X86/lower-vec-shift.ll b/test/CodeGen/X86/lower-vec-shift.ll
index 31059c40648992a9c53c9c0d25df369ea14c4abc..4480642afb24f0509cdc0834daf1727d9a423726 100644
--- a/test/CodeGen/X86/lower-vec-shift.ll
+++ b/test/CodeGen/X86/lower-vec-shift.ll
@@ -234,8 +234,6 @@ define <8 x i32> @test10(<8 x i32>* %a) {
 ; SSE-LABEL: test10:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movdqa (%rdi), %xmm0
-; SSE-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE-NEXT:    psrad %xmm0, %xmm1
 ; SSE-NEXT:    psrad $1, %xmm0
 ; SSE-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index 0b5ce8a4ffb4c894e5aea2d4b3ebe9ac7813792a..7a26623510944785cfb81844e07ef2ba09abf782 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -97,8 +97,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    pushq %r14
 ; ATOM-NEXT:    pushq %rbx
 ; ATOM-NEXT:    ## kill: def $ecx killed $ecx def $rcx
-; ATOM-NEXT:    movl 4(%rdx), %eax
 ; ATOM-NEXT:    movl (%rdx), %r15d
+; ATOM-NEXT:    movl 4(%rdx), %eax
 ; ATOM-NEXT:    leaq 20(%rdx), %r14
 ; ATOM-NEXT:    movq _Te0@{{.*}}(%rip), %r9
 ; ATOM-NEXT:    movq _Te1@{{.*}}(%rip), %r8
@@ -116,8 +116,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movzbl %bl, %eax
 ; ATOM-NEXT:    movl (%r10,%rax,4), %eax
 ; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
-; ATOM-NEXT:    xorl -4(%r14), %r15d
 ; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
+; ATOM-NEXT:    xorl -4(%r14), %r15d
 ; ATOM-NEXT:    xorl (%r14), %eax
 ; ATOM-NEXT:    addq $16, %r14
 ; ATOM-NEXT:  LBB0_1: ## %bb
@@ -130,14 +130,14 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movzbl %dil, %edi
 ; ATOM-NEXT:    movl (%r8,%rdi,4), %ebx
 ; ATOM-NEXT:    movzbl %r15b, %edi
-; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
 ; ATOM-NEXT:    xorl (%r9,%rbp,4), %ebx
+; ATOM-NEXT:    movl (%r10,%rdi,4), %edi
 ; ATOM-NEXT:    xorl -12(%r14), %ebx
 ; ATOM-NEXT:    xorl (%r9,%rax,4), %edi
 ; ATOM-NEXT:    movl %ebx, %eax
+; ATOM-NEXT:    xorl -8(%r14), %edi
 ; ATOM-NEXT:    shrl $24, %eax
 ; ATOM-NEXT:    movl (%r9,%rax,4), %r15d
-; ATOM-NEXT:    xorl -8(%r14), %edi
 ; ATOM-NEXT:    testq %r11, %r11
 ; ATOM-NEXT:    movl %edi, %eax
 ; ATOM-NEXT:    jne LBB0_2
diff --git a/test/CodeGen/X86/lwp-intrinsics.ll b/test/CodeGen/X86/lwp-intrinsics.ll
index 2d293651bcb6e74f937465f364c481fb0bc1a24a..a9b8a65d2dd8630d66e864b0eb3b5b08339ed17f 100644
--- a/test/CodeGen/X86/lwp-intrinsics.ll
+++ b/test/CodeGen/X86/lwp-intrinsics.ll
@@ -40,14 +40,41 @@ define i8* @test_slwpcb(i8 *%a0) nounwind {
 }
 
 define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
-; X86-LABEL: test_lwpins32_rri:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
-; X86-NEXT:    setb %al
-; X86-NEXT:    retl
+; X86_BDVER1-LABEL: test_lwpins32_rri:
+; X86_BDVER1:       # %bb.0:
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER1-NEXT:    addl %ecx, %ecx
+; X86_BDVER1-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER1-NEXT:    setb %al
+; X86_BDVER1-NEXT:    retl
+;
+; X86_BDVER2-LABEL: test_lwpins32_rri:
+; X86_BDVER2:       # %bb.0:
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER2-NEXT:    addl %ecx, %ecx
+; X86_BDVER2-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER2-NEXT:    setb %al
+; X86_BDVER2-NEXT:    retl
+;
+; X86_BDVER3-LABEL: test_lwpins32_rri:
+; X86_BDVER3:       # %bb.0:
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER3-NEXT:    addl %ecx, %ecx
+; X86_BDVER3-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER3-NEXT:    setb %al
+; X86_BDVER3-NEXT:    retl
+;
+; X86_BDVER4-LABEL: test_lwpins32_rri:
+; X86_BDVER4:       # %bb.0:
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER4-NEXT:    addl %ecx, %ecx
+; X86_BDVER4-NEXT:    lwpins $-1985229329, %ecx, %eax # imm = 0x89ABCDEF
+; X86_BDVER4-NEXT:    setb %al
+; X86_BDVER4-NEXT:    retl
 ;
 ; X64-LABEL: test_lwpins32_rri:
 ; X64:       # %bb.0:
@@ -80,13 +107,37 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
 }
 
 define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
-; X86-LABEL: test_lwpval32_rri:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl %ecx, %ecx
-; X86-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
-; X86-NEXT:    retl
+; X86_BDVER1-LABEL: test_lwpval32_rri:
+; X86_BDVER1:       # %bb.0:
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER1-NEXT:    addl %ecx, %ecx
+; X86_BDVER1-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER1-NEXT:    retl
+;
+; X86_BDVER2-LABEL: test_lwpval32_rri:
+; X86_BDVER2:       # %bb.0:
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER2-NEXT:    addl %ecx, %ecx
+; X86_BDVER2-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER2-NEXT:    retl
+;
+; X86_BDVER3-LABEL: test_lwpval32_rri:
+; X86_BDVER3:       # %bb.0:
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER3-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER3-NEXT:    addl %ecx, %ecx
+; X86_BDVER3-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER3-NEXT:    retl
+;
+; X86_BDVER4-LABEL: test_lwpval32_rri:
+; X86_BDVER4:       # %bb.0:
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86_BDVER4-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86_BDVER4-NEXT:    addl %ecx, %ecx
+; X86_BDVER4-NEXT:    lwpval $-19088744, %ecx, %eax # imm = 0xFEDCBA98
+; X86_BDVER4-NEXT:    retl
 ;
 ; X64-LABEL: test_lwpval32_rri:
 ; X64:       # %bb.0:
diff --git a/test/CodeGen/X86/lwp-schedule.ll b/test/CodeGen/X86/lwp-schedule.ll
index 9e517ac62da984229bbd30f1e79c3eb16a9fd2d4..c10282cfb8e9700caecee65bb99c96ffae1c3254 100644
--- a/test/CodeGen/X86/lwp-schedule.ll
+++ b/test/CodeGen/X86/lwp-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -11,10 +11,20 @@ define void @test_llwpcb(i8 *%a0) nounwind {
 ; GENERIC-NEXT:    llwpcb %rdi # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_llwpcb:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    llwpcb %rdi
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_llwpcb:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    llwpcb %rdi # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_llwpcb:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    llwpcb %rdi
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_llwpcb:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    llwpcb %rdi
+; BDVER4-NEXT:    retq
   tail call void @llvm.x86.llwpcb(i8 *%a0)
   ret void
 }
@@ -25,10 +35,20 @@ define i8* @test_slwpcb(i8 *%a0) nounwind {
 ; GENERIC-NEXT:    slwpcb %rax # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_slwpcb:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    slwpcb %rax
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_slwpcb:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    slwpcb %rax # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_slwpcb:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    slwpcb %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_slwpcb:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    slwpcb %rax
+; BDVER4-NEXT:    retq
   %1 = tail call i8* @llvm.x86.slwpcb()
   ret i8 *%1
 }
@@ -42,12 +62,27 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins32_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    addl %esi, %esi
-; BDVER-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins32_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.50]
+; BDVER12-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpins32_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    addl %esi, %esi
+; BDVER3-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins32_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    addl %esi, %esi
+; BDVER4-NEXT:    lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %1 = add i32 %a1, %a1
   %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %1, i32 2309737967)
   ret i8 %2
@@ -61,11 +96,24 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins32_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins32_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpins32_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins32_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   %1 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 1985229328)
   ret i8 %1
@@ -79,11 +127,24 @@ define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins64_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins64_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpins64_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins64_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2309737967)
   ret i8 %1
 }
@@ -96,11 +157,24 @@ define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    setb %al # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpins64_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
-; BDVER-NEXT:    setb %al
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpins64_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    setb %al # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpins64_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER3-NEXT:    setb %al
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpins64_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER4-NEXT:    setb %al
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 1985229328)
   ret i8 %1
@@ -114,11 +188,24 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval32_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    addl %esi, %esi
-; BDVER-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval32_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    addl %esi, %esi # sched: [1:0.50]
+; BDVER12-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpval32_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    addl %esi, %esi
+; BDVER3-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval32_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    addl %esi, %esi
+; BDVER4-NEXT:    lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER4-NEXT:    retq
   %1 = add i32 %a1, %a1
   tail call void @llvm.x86.lwpval32(i32 %a0, i32 %1, i32 4275878552)
   ret void
@@ -131,10 +218,21 @@ define void @test_lwpval32_rmi(i32 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval32_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval32_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpval32_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval32_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 305419896)
   ret void
@@ -147,10 +245,21 @@ define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval64_rri:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval64_rri:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpval64_rri:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval64_rri:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER4-NEXT:    retq
   tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 4275878552)
   ret void
 }
@@ -162,10 +271,21 @@ define void @test_lwpval64_rmi(i64 %a0, i32 *%p1) nounwind {
 ; GENERIC-NEXT:    # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_lwpval64_rmi:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_lwpval64_rmi:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER12-NEXT:    # sched: [100:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_lwpval64_rmi:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_lwpval64_rmi:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32 *%p1
   tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 305419896)
   ret void
diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll
index 001bb0be397b3b61716046bd7b2a9d12e2e1365f..d8f9416b92ba5a98911a7c4f54b12b00596c2d95 100644
--- a/test/CodeGen/X86/lzcnt-schedule.ll
+++ b/test/CodeGen/X86/lzcnt-schedule.ll
@@ -4,6 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake   | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl       | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2    | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2    | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1    | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -40,6 +41,14 @@ define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
 ; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctlz_i16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lzcntw (%rsi), %cx # sched: [6:0.50]
+; BDVER2-NEXT:    lzcntw %di, %ax # sched: [2:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ctlz_i16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    lzcntw (%rsi), %cx # sched: [4:1.00]
@@ -92,6 +101,13 @@ define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctlz_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lzcntl (%rsi), %ecx # sched: [6:0.50]
+; BDVER2-NEXT:    lzcntl %edi, %eax # sched: [2:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ctlz_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    lzcntl (%rsi), %ecx # sched: [4:1.00]
@@ -142,6 +158,13 @@ define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctlz_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lzcntq (%rsi), %rcx # sched: [6:0.50]
+; BDVER2-NEXT:    lzcntq %rdi, %rax # sched: [2:0.50]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ctlz_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    lzcntq (%rsi), %rcx # sched: [4:1.00]
diff --git a/test/CodeGen/X86/machine-cp-debug.mir b/test/CodeGen/X86/machine-cp-debug.mir
index a7fcd9801e79c3e4761ecd59cb0709acfdbf96d3..a3230e8910cb3783730fa81f3621ff0e972f943f 100644
--- a/test/CodeGen/X86/machine-cp-debug.mir
+++ b/test/CodeGen/X86/machine-cp-debug.mir
@@ -19,5 +19,5 @@ body: |
   bb.0:
     liveins: $eax
     $ebx = COPY $eax
-    DBG_VALUE debug-use $ebx, debug-use _, !1, !1
+    DBG_VALUE $ebx, _, !1, !1
 ...
diff --git a/test/CodeGen/X86/machine-trace-metrics-crash.ll b/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 6369ee4eb0ef9da68ce333c4daffc3b58d4254a8..c9e8c6361860765e181be5b72560fb7c685d6c70 100644
--- a/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s
 
 ; The debug info in this test case was causing a crash because machine trace metrics
@@ -6,9 +7,41 @@
 ; used machine trace metrics.
 
 define void @PR24199() {
-; CHECK-LABEL:	PR24199:
-; CHECK:	addss	%xmm1, %xmm0
-; CHECK:	addss	%xmm2, %xmm0
+; CHECK-LABEL: PR24199:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # %if.then
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:  .LBB0_3: # %if.end
+; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
+; CHECK-NEXT:    # xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss %xmm0, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    addss %xmm2, %xmm0
+; CHECK-NEXT:    movss %xmm0, (%rax)
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %if.end
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:  .LBB0_5: # %if.end
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm0, %xmm0
+; CHECK-NEXT:    addss %xmm1, %xmm0
+; CHECK-NEXT:    callq bar
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 
 entry:
   %i = alloca %struct.A, align 8
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 30320a750e008e504e19e549e6b7bf476641cd7c..bf46887b0748243f4e2d71040cf84aaa06cd6b44 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -50,7 +50,8 @@ define i32 @_Z10test_shortPsS_i_128(i16* nocapture readonly, i16* nocapture read
 ; AVX-NEXT:  # %bb.2: # %middle.block
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 entry:
@@ -129,7 +130,8 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -153,7 +155,8 @@ define i32 @_Z10test_shortPsS_i_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -252,7 +255,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -278,7 +282,8 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -305,7 +310,7 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -437,7 +442,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -469,7 +475,8 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -501,7 +508,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -530,7 +537,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -620,7 +627,8 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
 ; AVX-NEXT:  # %bb.2: # %middle.block
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 entry:
@@ -704,7 +712,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -729,7 +738,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -836,7 +846,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -863,7 +874,8 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -891,7 +903,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -942,38 +954,38 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB7_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm6
 ; SSE2-NEXT:    movq {{.*#+}} xmm7 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm7
 ; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm5, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm9
-; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    pmaddwd %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    movq {{.*#+}} xmm6 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm6
+; SSE2-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm5
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm7, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm9
 ; SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psraw $8, %xmm2
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmaddwd %xmm6, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    addq $32, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB7_1
@@ -1039,7 +1051,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1073,7 +1086,8 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1107,7 +1121,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1137,7 +1151,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1222,7 +1236,8 @@ define i32 @test_unsigned_short_128(i16* nocapture readonly, i16* nocapture read
 ; AVX-NEXT:  # %bb.2: # %middle.block
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 entry:
@@ -1313,7 +1328,8 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1338,7 +1354,8 @@ define i32 @test_unsigned_short_256(i16* nocapture readonly, i16* nocapture read
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
@@ -1460,7 +1477,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1491,7 +1509,8 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1519,7 +1538,7 @@ define i32 @test_unsigned_short_512(i16* nocapture readonly, i16* nocapture read
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1699,7 +1718,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1742,7 +1762,8 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1776,7 +1797,7 @@ define i32 @test_unsigned_short_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -2040,12 +2061,11 @@ define <4 x i32> @pmaddwd_negative1(<8 x i16> %A, <8 x i16> %B) {
 ;
 ; AVX1-LABEL: pmaddwd_negative1:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
@@ -2692,7 +2712,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
@@ -2707,7 +2728,8 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX256-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX256-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX256-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vmovd %xmm0, %eax
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
diff --git a/test/CodeGen/X86/memset.ll b/test/CodeGen/X86/memset.ll
index 6d5c4cd0f8a59b06844e3493dd9be1afd9debe93..02dfb34e10091d174a908dad10a5ed5dc7ec1da9 100644
--- a/test/CodeGen/X86/memset.ll
+++ b/test/CodeGen/X86/memset.ll
@@ -22,7 +22,6 @@ define void @t() nounwind  {
 ; X86-NEXT:    calll _foo
 ; X86-NEXT:    addl $44, %esp
 ; X86-NEXT:    retl
-; X86-NEXT:    ## -- End function
 ;
 ; XMM-LABEL: t:
 ; XMM:       ## %bb.0: ## %entry
@@ -35,7 +34,6 @@ define void @t() nounwind  {
 ; XMM-NEXT:    calll _foo
 ; XMM-NEXT:    addl $60, %esp
 ; XMM-NEXT:    retl
-; XMM-NEXT:    ## -- End function
 ;
 ; YMM-LABEL: t:
 ; YMM:       ## %bb.0: ## %entry
@@ -43,16 +41,15 @@ define void @t() nounwind  {
 ; YMM-NEXT:    movl %esp, %ebp
 ; YMM-NEXT:    andl $-32, %esp
 ; YMM-NEXT:    subl $96, %esp
+; YMM-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; YMM-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; YMM-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; YMM-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; YMM-NEXT:    movl %eax, (%esp)
 ; YMM-NEXT:    vzeroupper
 ; YMM-NEXT:    calll _foo
 ; YMM-NEXT:    movl %ebp, %esp
 ; YMM-NEXT:    popl %ebp
 ; YMM-NEXT:    retl
-; YMM-NEXT:    ## -- End function
 entry:
 	%up_mvd = alloca [8 x %struct.x]		; <[8 x %struct.x]*> [#uses=2]
 	%up_mvd116 = getelementptr [8 x %struct.x], [8 x %struct.x]* %up_mvd, i32 0, i32 0		; <%struct.x*> [#uses=1]
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
index f421d41f886cf4584f94414e6259041e8433e7b2..2feb9742c60952dd885e2d3559b64b5254dc362c 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -237,33 +237,35 @@ define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
 define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
 ; AVX1-LABEL: merge_8f32_2f32_23z5:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vmovups 16(%rdi), %xmm1
-; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vmovups 16(%rdi), %xmm0
+; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: merge_8f32_2f32_23z5:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovupd 16(%rdi), %xmm0
+; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: merge_8f32_2f32_23z5:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vmovdqu 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vmovupd 16(%rdi), %xmm0
+; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X32-AVX:       # %bb.0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
+; X32-AVX-NEXT:    vmovups 16(%eax), %xmm0
+; X32-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
diff --git a/test/CodeGen/X86/required-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll
similarity index 94%
rename from test/CodeGen/X86/required-vector-width.ll
rename to test/CodeGen/X86/min-legal-vector-width.ll
index 368c8acd4f8cca807af90c826b6272ced424890b..9fc12e6a094035c5649cba592a0cdb782b7f717b 100644
--- a/test/CodeGen/X86/required-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -3,7 +3,7 @@
 
 ; This file primarily contains tests for specific places in X86ISelLowering.cpp that needed be made aware of the legalizer not allowing 512-bit vectors due to prefer-256-bit even though AVX512 is enabled.
 
-define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="256" {
+define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: add256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
@@ -21,7 +21,7 @@ define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-ve
   ret void
 }
 
-define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-vector-width"="512" {
+define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: add512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
@@ -36,7 +36,7 @@ define void @add512(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "required-ve
   ret void
 }
 
-define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="256" {
+define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: avg_v64i8_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rsi), %ymm0
@@ -60,7 +60,7 @@ define void @avg_v64i8_256(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"
 }
 
 
-define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"="512" {
+define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: avg_v64i8_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rsi), %zmm0
@@ -80,7 +80,7 @@ define void @avg_v64i8_512(<64 x i8>* %a, <64 x i8>* %b) "required-vector-width"
   ret void
 }
 
-define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="256" {
+define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: pmaddwd_32_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
@@ -103,7 +103,7 @@ define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %C
    ret void
 }
 
-define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "required-vector-width"="512" {
+define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %CPtr) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: pmaddwd_32_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
@@ -123,7 +123,7 @@ define void @pmaddwd_32_512(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %C
    ret void
 }
 
-define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="256" {
+define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: psubus_64i8_max_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
@@ -143,7 +143,7 @@ define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>*
   ret void
 }
 
-define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "required-vector-width"="512" {
+define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* %zptr) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: psubus_64i8_max_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
@@ -160,7 +160,7 @@ define void @psubus_64i8_max_512(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>*
   ret void
 }
 
-define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="256" {
+define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: _Z9test_charPcS_i_256:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl %edx, %eax
@@ -190,7 +190,8 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -230,7 +231,7 @@ middle.block:
   ret i32 %13
 }
 
-define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "required-vector-width"="512" {
+define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly, i32) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: _Z9test_charPcS_i_512:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl %edx, %eax
@@ -256,7 +257,7 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -299,7 +300,7 @@ middle.block:
 @a = global [1024 x i8] zeroinitializer, align 16
 @b = global [1024 x i8] zeroinitializer, align 16
 
-define i32 @sad_16i8_256() "required-vector-width"="256" {
+define i32 @sad_16i8_256() "min-legal-vector-width"="256" {
 ; CHECK-LABEL: sad_16i8_256:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -319,7 +320,8 @@ define i32 @sad_16i8_256() "required-vector-width"="256" {
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -360,7 +362,7 @@ middle.block:
   ret i32 %12
 }
 
-define i32 @sad_16i8_512() "required-vector-width"="512" {
+define i32 @sad_16i8_512() "min-legal-vector-width"="512" {
 ; CHECK-LABEL: sad_16i8_512:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
@@ -381,7 +383,7 @@ define i32 @sad_16i8_512() "required-vector-width"="512" {
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -422,7 +424,7 @@ middle.block:
   ret i32 %12
 }
 
-define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
+define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: sbto16f32_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -441,7 +443,7 @@ define void @sbto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
+define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: sbto16f32_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -456,7 +458,7 @@ define void @sbto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="256" {
+define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "min-legal-vector-width"="256" {
 ; CHECK-LABEL: sbto16f64_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -481,7 +483,7 @@ define void @sbto16f64_256(<16 x i16> %a, <16 x double>* %res)  "required-vector
   ret void
 }
 
-define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "required-vector-width"="512" {
+define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "min-legal-vector-width"="512" {
 ; CHECK-LABEL: sbto16f64_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -499,7 +501,7 @@ define void @sbto16f64_512(<16 x i16> %a, <16 x double>* %res)  "required-vector
   ret void
 }
 
-define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="256" {
+define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: ubto16f32_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -520,7 +522,7 @@ define void @ubto16f32_256(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-width"="512" {
+define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: ubto16f32_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -536,7 +538,7 @@ define void @ubto16f32_512(<16 x i16> %a, <16 x float>* %res) "required-vector-w
   ret void
 }
 
-define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="256" {
+define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: ubto16f64_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -563,7 +565,7 @@ define void @ubto16f64_256(<16 x i16> %a, <16 x double>* %res) "required-vector-
   ret void
 }
 
-define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-width"="512" {
+define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: ubto16f64_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0
@@ -582,7 +584,7 @@ define void @ubto16f64_512(<16 x i16> %a, <16 x double>* %res) "required-vector-
   ret void
 }
 
-define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
+define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: test_16f32toub_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
@@ -600,7 +602,7 @@ define <16 x i16> @test_16f32toub_256(<16 x float>* %ptr, <16 x i16> %passthru)
   ret <16 x i16> %select
 }
 
-define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
+define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: test_16f32toub_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
@@ -614,7 +616,7 @@ define <16 x i16> @test_16f32toub_512(<16 x float>* %ptr, <16 x i16> %passthru)
   ret <16 x i16> %select
 }
 
-define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="256" {
+define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="256" {
 ; CHECK-LABEL: test_16f32tosb_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %ymm1
@@ -632,7 +634,7 @@ define <16 x i16> @test_16f32tosb_256(<16 x float>* %ptr, <16 x i16> %passthru)
   ret <16 x i16> %select
 }
 
-define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "required-vector-width"="512" {
+define <16 x i16> @test_16f32tosb_512(<16 x float>* %ptr, <16 x i16> %passthru) "min-legal-vector-width"="512" {
 ; CHECK-LABEL: test_16f32tosb_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvttps2dq (%rdi), %zmm1
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 2d24cb8df35ef40974bda689d28a7e28c5a551ee..60735fba4cd71f63f754cc096a19be4ce06945d4 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -213,29 +213,24 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    movdqa %xmm1, %xmm2
-; X32-NEXT:    pmuludq %xmm0, %xmm2
-; X32-NEXT:    psrlq $32, %xmm1
-; X32-NEXT:    pmuludq %xmm0, %xmm1
-; X32-NEXT:    psllq $32, %xmm1
-; X32-NEXT:    paddq %xmm2, %xmm1
+; X32-NEXT:    pmuludq %xmm1, %xmm0
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; X32-NEXT:    movq %xmm1, (%eax)
+; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X32-NEXT:    andps %xmm0, %xmm1
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    andps %xmm1, %xmm0
+; X32-NEXT:    orps %xmm1, %xmm0
 ; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X32-NEXT:    orps %xmm0, %xmm1
+; X32-NEXT:    xorps %xmm0, %xmm1
 ; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X32-NEXT:    movq %xmm0, (%eax)
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    xorps %xmm1, %xmm0
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    emms
 ; X32-NEXT:    retl
 ;
@@ -250,29 +245,24 @@ define void @test1(x86_mmx* %A, x86_mmx* %B) {
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT:    movdqa %xmm1, %xmm2
-; X64-NEXT:    pmuludq %xmm0, %xmm2
-; X64-NEXT:    psrlq $32, %xmm1
-; X64-NEXT:    pmuludq %xmm0, %xmm1
-; X64-NEXT:    psllq $32, %xmm1
-; X64-NEXT:    paddq %xmm2, %xmm1
+; X64-NEXT:    pmuludq %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X64-NEXT:    pand %xmm0, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; X64-NEXT:    por %xmm0, %xmm1
+; X64-NEXT:    pxor %xmm0, %xmm1
 ; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; X64-NEXT:    movq %xmm0, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X64-NEXT:    pxor %xmm1, %xmm0
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/mmx-coalescing.ll b/test/CodeGen/X86/mmx-coalescing.ll
index 8f9204a4a854e18d42c03f0ac2a98eb28675927d..8cd57aa8c534ede1f77f5039291140722babeaed 100644
--- a/test/CodeGen/X86/mmx-coalescing.ll
+++ b/test/CodeGen/X86/mmx-coalescing.ll
@@ -16,16 +16,17 @@ define i32 @test(%SA* %pSA, i16* %A, i32 %B, i32 %C, i32 %D, i8* %E) {
 ; CHECK-NEXT:  # %bb.2: # %if.B
 ; CHECK-NEXT:    pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
 ; CHECK-NEXT:    movq %mm0, %rax
-; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jne .LBB0_4
 ; CHECK-NEXT:  .LBB0_1: # %if.A
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movd %edx, %mm1
 ; CHECK-NEXT:    psllq %mm1, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    testq %rax, %rax
 ; CHECK-NEXT:    jne .LBB0_4
-; CHECK-NEXT:  .LBB0_3: # %if.C
-; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:  # %bb.3: # %if.C
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: # %merge
diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll
index f4e047cd6863f102692648456e435f83785e3dbc..6a8a487d7c1e35df9aa9d51674c1e2914d55a560 100644
--- a/test/CodeGen/X86/mmx-schedule.ll
+++ b/test/CodeGen/X86/mmx-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -76,6 +77,14 @@ define i64 @test_cvtpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpd2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtpd2pi (%rdi), %mm1 # sched: [13:1.00]
+; BDVER2-NEXT:    cvtpd2pi %xmm0, %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtpd2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtpd2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -157,6 +166,13 @@ define <2 x double> @test_cvtpi2pd(x86_mmx %a0, x86_mmx* %a1) optsize {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpi2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtpi2pd (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    cvtpi2pd %mm0, %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtpi2pd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtpi2pd (%rdi), %xmm1 # sched: [8:1.00]
@@ -235,6 +251,13 @@ define <4 x float> @test_cvtpi2ps(x86_mmx %a0, x86_mmx* %a1, <4 x float> %a2, <4
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtpi2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    cvtpi2ps %mm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtpi2ps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtpi2ps (%rdi), %xmm1 # sched: [8:1.00]
@@ -321,6 +344,14 @@ define i64 @test_cvtps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvtps2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvtps2pi (%rdi), %mm1 # sched: [9:1.00]
+; BDVER2-NEXT:    cvtps2pi %xmm0, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvtps2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvtps2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -410,6 +441,14 @@ define i64 @test_cvttpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttpd2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvttpd2pi (%rdi), %mm1 # sched: [13:1.00]
+; BDVER2-NEXT:    cvttpd2pi %xmm0, %mm0 # sched: [6:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvttpd2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvttpd2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -499,6 +538,14 @@ define i64 @test_cvttps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
 ; SKX-NEXT:    movq %mm1, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cvttps2pi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    cvttps2pi (%rdi), %mm1 # sched: [9:1.00]
+; BDVER2-NEXT:    cvttps2pi %xmm0, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    por %mm0, %mm1 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm1, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cvttps2pi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    cvttps2pi (%rdi), %mm1 # sched: [8:1.00]
@@ -564,6 +611,11 @@ define void @test_emms() optsize {
 ; SKX-NEXT:    emms # sched: [10:4.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_emms:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    emms # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_emms:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    emms # sched: [2:0.50]
@@ -619,6 +671,11 @@ define void @test_maskmovq(x86_mmx %a0, x86_mmx %a1, i8* %a2) optsize {
 ; SKX-NEXT:    maskmovq %mm1, %mm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_maskmovq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    maskmovq %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_maskmovq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    maskmovq %mm1, %mm0 # sched: [1:0.50]
@@ -722,6 +779,17 @@ define i32 @test_movd(x86_mmx %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    movl %ecx, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movd %edi, %mm1 # sched: [10:0.50]
+; BDVER2-NEXT:    movd (%rsi), %mm2 # sched: [5:0.50]
+; BDVER2-NEXT:    paddd %mm1, %mm2 # sched: [2:0.50]
+; BDVER2-NEXT:    paddd %mm2, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movd %mm2, %ecx # sched: [10:1.00]
+; BDVER2-NEXT:    movd %mm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    movl %ecx, (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movd %edi, %mm1 # sched: [8:0.50]
@@ -815,6 +883,13 @@ define i64 @test_movdq2q(<2 x i64> %a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movdq2q:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movdq2q %xmm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movdq2q:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movdq2q %xmm0, %mm0 # sched: [1:0.50]
@@ -876,6 +951,11 @@ define void @test_movntq(x86_mmx* %a0, x86_mmx %a1) optsize {
 ; SKX-NEXT:    movntq %mm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movntq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movntq %mm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movntq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movntq %mm0, (%rdi) # sched: [2:1.00]
@@ -949,6 +1029,13 @@ define void @test_movq(i64 *%a0) {
 ; SKX-NEXT:    movq %mm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq (%rdi), %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    paddd %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq (%rdi), %mm0 # sched: [5:1.00]
@@ -1011,6 +1098,11 @@ define <2 x i64> @test_movq2dq(x86_mmx %a0) optsize {
 ; SKX-NEXT:    movq2dq %mm0, %xmm0 # sched: [2:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movq2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq2dq %mm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movq2dq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq2dq %mm0, %xmm0 # sched: [1:0.50]
@@ -1082,6 +1174,13 @@ define i64 @test_pabsb(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pabsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pabsb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    pabsb %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pabsb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pabsb (%rdi), %mm0 # sched: [6:1.00]
@@ -1160,6 +1259,13 @@ define i64 @test_pabsd(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pabsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pabsd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    pabsd %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pabsd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pabsd (%rdi), %mm0 # sched: [6:1.00]
@@ -1238,6 +1344,13 @@ define i64 @test_pabsw(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pabsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pabsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    pabsw %mm0, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pabsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pabsw (%rdi), %mm0 # sched: [6:1.00]
@@ -1316,6 +1429,13 @@ define i64 @test_packssdw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_packssdw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    packssdw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    packssdw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_packssdw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    packssdw %mm1, %mm0 # sched: [1:0.50]
@@ -1394,6 +1514,13 @@ define i64 @test_packsswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_packsswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    packsswb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    packsswb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_packsswb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    packsswb %mm1, %mm0 # sched: [1:0.50]
@@ -1472,6 +1599,13 @@ define i64 @test_packuswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_packuswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    packuswb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    packuswb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_packuswb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    packuswb %mm1, %mm0 # sched: [1:0.50]
@@ -1550,6 +1684,13 @@ define i64 @test_paddb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddb %mm1, %mm0 # sched: [1:0.50]
@@ -1628,6 +1769,13 @@ define i64 @test_paddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddd %mm1, %mm0 # sched: [1:0.50]
@@ -1706,6 +1854,13 @@ define i64 @test_paddq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddq %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddq (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddq %mm1, %mm0 # sched: [1:0.50]
@@ -1784,6 +1939,13 @@ define i64 @test_paddsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddsb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddsb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddsb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddsb %mm1, %mm0 # sched: [1:0.50]
@@ -1862,6 +2024,13 @@ define i64 @test_paddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddsw %mm1, %mm0 # sched: [1:0.50]
@@ -1940,6 +2109,13 @@ define i64 @test_paddusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddusb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddusb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddusb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddusb %mm1, %mm0 # sched: [1:0.50]
@@ -2018,6 +2194,13 @@ define i64 @test_paddusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddusw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddusw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddusw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddusw %mm1, %mm0 # sched: [1:0.50]
@@ -2096,6 +2279,13 @@ define i64 @test_paddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_paddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    paddw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    paddw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_paddw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    paddw %mm1, %mm0 # sched: [1:0.50]
@@ -2174,6 +2364,13 @@ define i64 @test_palignr(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_palignr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    palignr $1, %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    palignr $1, (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_palignr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    palignr $1, %mm1, %mm0 # sched: [1:0.50]
@@ -2252,6 +2449,13 @@ define i64 @test_pand(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pand:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pand %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pand (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pand:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pand %mm1, %mm0 # sched: [1:0.50]
@@ -2330,6 +2534,13 @@ define i64 @test_pandn(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pandn:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pandn %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pandn (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pandn:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pandn %mm1, %mm0 # sched: [1:0.50]
@@ -2408,6 +2619,13 @@ define i64 @test_pavgb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pavgb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pavgb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pavgb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pavgb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pavgb %mm1, %mm0 # sched: [1:0.50]
@@ -2486,6 +2704,13 @@ define i64 @test_pavgw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pavgw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pavgw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pavgw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pavgw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pavgw %mm1, %mm0 # sched: [1:0.50]
@@ -2564,6 +2789,13 @@ define i64 @test_pcmpeqb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpeqb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpeqb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpeqb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pcmpeqb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpeqb %mm1, %mm0 # sched: [1:0.50]
@@ -2642,6 +2874,13 @@ define i64 @test_pcmpeqd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpeqd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpeqd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpeqd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pcmpeqd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpeqd %mm1, %mm0 # sched: [1:0.50]
@@ -2720,6 +2959,13 @@ define i64 @test_pcmpeqw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpeqw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpeqw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpeqw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pcmpeqw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpeqw %mm1, %mm0 # sched: [1:0.50]
@@ -2798,6 +3044,13 @@ define i64 @test_pcmpgtb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpgtb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpgtb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpgtb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pcmpgtb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpgtb %mm1, %mm0 # sched: [1:0.50]
@@ -2876,6 +3129,13 @@ define i64 @test_pcmpgtd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpgtd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpgtd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpgtd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pcmpgtd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpgtd %mm1, %mm0 # sched: [1:0.50]
@@ -2954,6 +3214,13 @@ define i64 @test_pcmpgtw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pcmpgtw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pcmpgtw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pcmpgtw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pcmpgtw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pcmpgtw %mm1, %mm0 # sched: [1:0.50]
@@ -3016,6 +3283,11 @@ define i32 @test_pextrw(x86_mmx %a0) optsize {
 ; SKX-NEXT:    pextrw $0, %mm0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pextrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pextrw $0, %mm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pextrw $0, %mm0, %eax # sched: [3:1.00]
@@ -3087,6 +3359,13 @@ define i64 @test_phaddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phaddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phaddd %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phaddd (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phaddd %mm1, %mm0 # sched: [1:0.50]
@@ -3165,6 +3444,13 @@ define i64 @test_phaddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phaddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phaddsw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [1:0.50]
@@ -3243,6 +3529,13 @@ define i64 @test_phaddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phaddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phaddw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phaddw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phaddw %mm1, %mm0 # sched: [1:0.50]
@@ -3321,6 +3614,13 @@ define i64 @test_phsubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phsubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phsubd %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phsubd (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phsubd %mm1, %mm0 # sched: [1:0.50]
@@ -3399,6 +3699,13 @@ define i64 @test_phsubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phsubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phsubsw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [1:0.50]
@@ -3477,6 +3784,13 @@ define i64 @test_phsubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_phsubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    phsubw %mm1, %mm0 # sched: [5:0.50]
+; BDVER2-NEXT:    phsubw (%rdi), %mm0 # sched: [10:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    phsubw %mm1, %mm0 # sched: [1:0.50]
@@ -3563,6 +3877,14 @@ define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pinsrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movswl (%rsi), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    pinsrw $0, %edi, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pinsrw $1, %eax, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pinsrw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pinsrw $0, %edi, %mm0 # sched: [7:0.50]
@@ -3644,6 +3966,13 @@ define i64 @test_pmaddwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaddwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaddwd %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmaddwd (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmaddwd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaddwd %mm1, %mm0 # sched: [2:1.00]
@@ -3722,6 +4051,13 @@ define i64 @test_pmaddubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaddubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaddubsw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmaddubsw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmaddubsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaddubsw %mm1, %mm0 # sched: [2:1.00]
@@ -3800,6 +4136,13 @@ define i64 @test_pmaxsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaxsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaxsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pmaxsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmaxsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaxsw %mm1, %mm0 # sched: [1:0.50]
@@ -3878,6 +4221,13 @@ define i64 @test_pmaxub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmaxub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmaxub %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pmaxub (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmaxub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmaxub %mm1, %mm0 # sched: [1:0.50]
@@ -3956,6 +4306,13 @@ define i64 @test_pminsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pminsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pminsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pminsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pminsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pminsw %mm1, %mm0 # sched: [1:0.50]
@@ -4034,6 +4391,13 @@ define i64 @test_pminub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pminub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pminub %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pminub (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pminub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pminub %mm1, %mm0 # sched: [1:0.50]
@@ -4096,6 +4460,11 @@ define i32 @test_pmovmskb(x86_mmx %a0) optsize {
 ; SKX-NEXT:    pmovmskb %mm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmovmskb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmovmskb %mm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmovmskb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmovmskb %mm0, %eax # sched: [3:1.00]
@@ -4167,6 +4536,13 @@ define i64 @test_pmulhrsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmulhrsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmulhrsw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmulhrsw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmulhrsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmulhrsw %mm1, %mm0 # sched: [2:1.00]
@@ -4245,6 +4621,13 @@ define i64 @test_pmulhw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmulhw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmulhw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmulhw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmulhw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmulhw %mm1, %mm0 # sched: [2:1.00]
@@ -4323,6 +4706,13 @@ define i64 @test_pmulhuw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmulhuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmulhuw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmulhuw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmulhuw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmulhuw %mm1, %mm0 # sched: [2:1.00]
@@ -4401,6 +4791,13 @@ define i64 @test_pmullw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmullw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmullw %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmullw (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmullw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmullw %mm1, %mm0 # sched: [2:1.00]
@@ -4479,6 +4876,13 @@ define i64 @test_pmuludq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pmuludq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pmuludq %mm1, %mm0 # sched: [4:1.00]
+; BDVER2-NEXT:    pmuludq (%rdi), %mm0 # sched: [9:1.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pmuludq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pmuludq %mm1, %mm0 # sched: [2:1.00]
@@ -4557,6 +4961,13 @@ define i64 @test_por(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_por:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    por %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    por (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_por:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    por %mm1, %mm0 # sched: [1:0.50]
@@ -4635,6 +5046,13 @@ define i64 @test_psadbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psadbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psadbw %mm1, %mm0 # sched: [4:0.50]
+; BDVER2-NEXT:    psadbw (%rdi), %mm0 # sched: [9:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psadbw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psadbw %mm1, %mm0 # sched: [2:0.50]
@@ -4713,6 +5131,13 @@ define i64 @test_pshufb(x86_mmx %a0, x86_mmx %a1, x86_mmx *%a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pshufb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pshufb %mm1, %mm0 # sched: [3:2.00]
+; BDVER2-NEXT:    pshufb (%rdi), %mm0 # sched: [8:2.00]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pshufb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pshufb %mm1, %mm0 # sched: [2:0.50]
@@ -4791,6 +5216,13 @@ define i64 @test_pshufw(x86_mmx *%a0) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pshufw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [7:0.50]
+; BDVER2-NEXT:    pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pshufw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
@@ -4869,6 +5301,13 @@ define i64 @test_psignb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psignb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psignb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psignb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psignb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psignb %mm1, %mm0 # sched: [1:0.50]
@@ -4947,6 +5386,13 @@ define i64 @test_psignd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psignd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psignd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psignd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psignd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psignd %mm1, %mm0 # sched: [1:0.50]
@@ -5025,6 +5471,13 @@ define i64 @test_psignw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psignw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psignw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psignw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psignw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psignw %mm1, %mm0 # sched: [1:0.50]
@@ -5111,6 +5564,14 @@ define i64 @test_pslld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pslld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pslld %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    pslld (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    pslld $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pslld:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pslld %mm1, %mm0 # sched: [1:0.50]
@@ -5201,6 +5662,14 @@ define i64 @test_psllq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psllq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psllq %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psllq (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psllq $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psllq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psllq %mm1, %mm0 # sched: [1:0.50]
@@ -5291,6 +5760,14 @@ define i64 @test_psllw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psllw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psllw %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psllw (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psllw $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psllw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psllw %mm1, %mm0 # sched: [1:0.50]
@@ -5381,6 +5858,14 @@ define i64 @test_psrad(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrad:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrad %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrad (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrad $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psrad:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrad %mm1, %mm0 # sched: [1:0.50]
@@ -5471,6 +5956,14 @@ define i64 @test_psraw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psraw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psraw %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psraw (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psraw $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psraw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psraw %mm1, %mm0 # sched: [1:0.50]
@@ -5561,6 +6054,14 @@ define i64 @test_psrld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrld %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrld (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrld $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psrld:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrld %mm1, %mm0 # sched: [1:0.50]
@@ -5651,6 +6152,14 @@ define i64 @test_psrlq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrlq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrlq %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrlq (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrlq $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psrlq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrlq %mm1, %mm0 # sched: [1:0.50]
@@ -5741,6 +6250,14 @@ define i64 @test_psrlw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psrlw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psrlw %mm1, %mm0 # sched: [3:0.50]
+; BDVER2-NEXT:    psrlw (%rdi), %mm0 # sched: [8:0.50]
+; BDVER2-NEXT:    psrlw $7, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psrlw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psrlw %mm1, %mm0 # sched: [1:0.50]
@@ -5823,6 +6340,13 @@ define i64 @test_psubb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubb %mm1, %mm0 # sched: [1:0.50]
@@ -5901,6 +6425,13 @@ define i64 @test_psubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubd %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubd (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubd %mm1, %mm0 # sched: [1:0.50]
@@ -5979,6 +6510,13 @@ define i64 @test_psubq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubq %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubq (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubq %mm1, %mm0 # sched: [1:0.50]
@@ -6057,6 +6595,13 @@ define i64 @test_psubsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubsb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubsb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubsb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubsb %mm1, %mm0 # sched: [1:0.50]
@@ -6135,6 +6680,13 @@ define i64 @test_psubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubsw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubsw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubsw %mm1, %mm0 # sched: [1:0.50]
@@ -6213,6 +6765,13 @@ define i64 @test_psubusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubusb %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubusb (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubusb:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubusb %mm1, %mm0 # sched: [1:0.50]
@@ -6291,6 +6850,13 @@ define i64 @test_psubusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubusw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubusw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubusw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubusw %mm1, %mm0 # sched: [1:0.50]
@@ -6369,6 +6935,13 @@ define i64 @test_psubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_psubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    psubw %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    psubw (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_psubw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    psubw %mm1, %mm0 # sched: [1:0.50]
@@ -6447,6 +7020,13 @@ define i64 @test_punpckhbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckhbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [2:0.50]
+; BDVER2-NEXT:    punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_punpckhbw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:0.50]
@@ -6525,6 +7105,13 @@ define i64 @test_punpckhdq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckhdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_punpckhdq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:0.50]
@@ -6603,6 +7190,13 @@ define i64 @test_punpckhwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckhwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_punpckhwd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
@@ -6681,6 +7275,13 @@ define i64 @test_punpcklbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpcklbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_punpcklbw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
@@ -6759,6 +7360,13 @@ define i64 @test_punpckldq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpckldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_punpckldq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:0.50]
@@ -6837,6 +7445,13 @@ define i64 @test_punpcklwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_punpcklwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_punpcklwd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:0.50]
@@ -6915,6 +7530,13 @@ define i64 @test_pxor(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ; SKX-NEXT:    movq %mm0, %rax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pxor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pxor %mm1, %mm0 # sched: [2:0.50]
+; BDVER2-NEXT:    pxor (%rdi), %mm0 # sched: [7:0.50]
+; BDVER2-NEXT:    movq %mm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pxor:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pxor %mm1, %mm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll
index 9f55ca31b1be69ca37dd9e5b29fdb30b52339b57..452676e19db4ffc522bc5bda8308fc18631b16dd 100644
--- a/test/CodeGen/X86/movmsk-cmp.ll
+++ b/test/CodeGen/X86/movmsk-cmp.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s  --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s  --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s  --check-prefix=KNL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s  --check-prefix=SKX
 
 define i1 @allones_v16i8_sign(<16 x i8> %arg) {
@@ -19,6 +20,17 @@ define i1 @allones_v16i8_sign(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
@@ -46,6 +58,17 @@ define i1 @allzeros_v16i8_sign(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
@@ -92,6 +115,24 @@ define i1 @allones_v32i8_sign(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
@@ -137,6 +178,23 @@ define i1 @allzeros_v32i8_sign(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
@@ -202,6 +260,36 @@ define i1 @allones_v64i8_sign(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v64i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    cmpq $-1, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v64i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
@@ -264,6 +352,35 @@ define i1 @allzeros_v64i8_sign(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v64i8_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v64i8_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
@@ -298,6 +415,18 @@ define i1 @allones_v8i16_sign(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
@@ -331,6 +460,18 @@ define i1 @allzeros_v8i16_sign(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
@@ -381,6 +522,17 @@ define i1 @allones_v16i16_sign(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %ymm0, %k0
@@ -432,6 +584,17 @@ define i1 @allzeros_v16i16_sign(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %ymm0, %k0
@@ -499,6 +662,24 @@ define i1 @allones_v32i16_sign(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %zmm0, %k0
@@ -564,6 +745,23 @@ define i1 @allzeros_v32i16_sign(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i16_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i16_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovw2m %zmm0, %k0
@@ -592,6 +790,18 @@ define i1 @allones_v4i32_sign(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
@@ -621,6 +831,17 @@ define i1 @allzeros_v4i32_sign(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
@@ -656,6 +877,17 @@ define i1 @allones_v8i32_sign(<8 x i32> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %ymm0, %k0
@@ -691,6 +923,17 @@ define i1 @allzeros_v8i32_sign(<8 x i32> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %ymm0, %k0
@@ -756,6 +999,15 @@ define i1 @allones_v16i32_sign(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %zmm0, %k0
@@ -821,6 +1073,15 @@ define i1 @allzeros_v16i32_sign(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i32_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i32_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %zmm0, %k0
@@ -870,6 +1131,18 @@ define i1 @allones_v4i64_sign(<4 x i64> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %ymm0, %k0
@@ -921,6 +1194,17 @@ define i1 @allzeros_v4i64_sign(<4 x i64> %arg) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %ymm0, %k0
@@ -1015,6 +1299,16 @@ define i1 @allones_v8i64_sign(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %zmm0, %k0
@@ -1108,6 +1402,16 @@ define i1 @allzeros_v8i64_sign(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i64_sign:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i64_sign:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %zmm0, %k0
@@ -1138,6 +1442,18 @@ define i1 @allones_v16i8_and1(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -1168,6 +1484,18 @@ define i1 @allzeros_v16i8_and1(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -1222,6 +1550,25 @@ define i1 @allones_v32i8_and1(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -1275,6 +1622,24 @@ define i1 @allzeros_v32i8_and1(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -1355,6 +1720,38 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v64i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    cmpq $-1, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v64i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -1432,6 +1829,37 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v64i8_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v64i8_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -1469,6 +1897,19 @@ define i1 @allones_v8i16_and1(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -1505,6 +1946,19 @@ define i1 @allzeros_v8i16_and1(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -1562,6 +2016,18 @@ define i1 @allones_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -1641,6 +2107,26 @@ define i1 @allones_v32i16_and1(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -1718,6 +2204,25 @@ define i1 @allzeros_v32i16_and1(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -1776,6 +2281,18 @@ define i1 @allzeros_v16i16_and1(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i16_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i16_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -1807,10 +2324,21 @@ define i1 @allones_v4i32_and1(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andb $15, %al
 ; SKX-NEXT:    cmpb $15, %al
@@ -1840,10 +2368,20 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testb $15, %al
 ; SKX-NEXT:    sete %al
@@ -1891,10 +2429,20 @@ define i1 @allones_v8i32_and1(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -1942,10 +2490,20 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -2020,10 +2578,17 @@ define i1 @allones_v16i32_and1(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -2098,10 +2663,17 @@ define i1 @allzeros_v16i32_and1(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i32_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i32_and1:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -2130,6 +2702,18 @@ define i1 @allones_v2i64_and1(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v2i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $3, %al
+; KNL-NEXT:    cmpb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v2i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -2162,6 +2746,17 @@ define i1 @allzeros_v2i64_and1(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v2i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v2i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -2215,6 +2810,18 @@ define i1 @allones_v4i64_and1(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -2270,6 +2877,17 @@ define i1 @allzeros_v4i64_and1(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -2355,6 +2973,15 @@ define i1 @allones_v8i64_and1(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -2439,6 +3066,15 @@ define i1 @allzeros_v8i64_and1(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i64_and1:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i64_and1:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -2470,6 +3106,18 @@ define i1 @allones_v16i8_and4(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -2500,6 +3148,18 @@ define i1 @allzeros_v16i8_and4(<16 x i8> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %xmm0, %k0
@@ -2554,6 +3214,25 @@ define i1 @allones_v32i8_and4(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -2607,6 +3286,24 @@ define i1 @allzeros_v32i8_and4(<32 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %ymm0, %k0
@@ -2687,6 +3384,38 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v64i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    cmpq $-1, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v64i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -2764,6 +3493,37 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v64i8_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm3
+; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    shll $16, %edx
+; KNL-NEXT:    orl %eax, %edx
+; KNL-NEXT:    shlq $32, %rdx
+; KNL-NEXT:    orq %rcx, %rdx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v64i8_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmb {{.*}}(%rip), %zmm0, %k0
@@ -2801,6 +3561,19 @@ define i1 @allones_v8i16_and4(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -2837,6 +3610,19 @@ define i1 @allzeros_v8i16_and4(<8 x i16> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
+; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %xmm0, %k0
@@ -2894,6 +3680,18 @@ define i1 @allones_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -2973,6 +3771,26 @@ define i1 @allones_v32i16_and4(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v32i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    cmpl $-1, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v32i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -3050,6 +3868,25 @@ define i1 @allzeros_v32i16_and4(<32 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v32i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpcmpeqw %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    shll $16, %ecx
+; KNL-NEXT:    orl %eax, %ecx
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v32i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %zmm0, %k0
@@ -3108,6 +3945,18 @@ define i1 @allzeros_v16i16_and4(<16 x i16> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i16_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i16_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmw {{.*}}(%rip), %ymm0, %k0
@@ -3139,10 +3988,21 @@ define i1 @allones_v4i32_and4(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andb $15, %al
 ; SKX-NEXT:    cmpb $15, %al
@@ -3172,10 +4032,20 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; SKX-NEXT:    vptestmd %xmm1, %xmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testb $15, %al
 ; SKX-NEXT:    sete %al
@@ -3223,10 +4093,20 @@ define i1 @allones_v8i32_and4(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -3274,10 +4154,20 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
+; KNL-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %ymm1, %ymm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to8}, %ymm0, %k0
 ; SKX-NEXT:    kortestb %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -3352,10 +4242,17 @@ define i1 @allones_v16i32_and4(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v16i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v16i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    vzeroupper
@@ -3430,10 +4327,17 @@ define i1 @allzeros_v16i32_and4(<16 x i32> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v16i32_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
+; KNL-NEXT:    kortestw %k0, %k0
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v16i32_and4:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
-; SKX-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; SKX-NEXT:    vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
 ; SKX-NEXT:    kortestw %k0, %k0
 ; SKX-NEXT:    sete %al
 ; SKX-NEXT:    vzeroupper
@@ -3462,6 +4366,18 @@ define i1 @allones_v2i64_and4(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allones_v2i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $3, %al
+; KNL-NEXT:    cmpb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v2i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -3494,6 +4410,17 @@ define i1 @allzeros_v2i64_and4(<2 x i64> %arg) {
 ; AVX-NEXT:    sete %al
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v2i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $3, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v2i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip), %xmm0, %k0
@@ -3547,6 +4474,18 @@ define i1 @allones_v4i64_and4(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v4i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andb $15, %al
+; KNL-NEXT:    cmpb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v4i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -3602,6 +4541,17 @@ define i1 @allzeros_v4i64_and4(<4 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v4i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
+; KNL-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb $15, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v4i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0
@@ -3687,6 +4637,15 @@ define i1 @allones_v8i64_and4(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allones_v8i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    cmpb $-1, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allones_v8i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -3771,6 +4730,15 @@ define i1 @allzeros_v8i64_and4(<8 x i64> %arg) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: allzeros_v8i64_and4:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: allzeros_v8i64_and4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
@@ -3799,6 +4767,16 @@ define i32 @movmskpd(<2 x double> %x) {
 ; AVX-NEXT:    vmovmskpd %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskpd:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $3, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskpd:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %xmm0, %k0
@@ -3823,6 +4801,16 @@ define i32 @movmskps(<4 x float> %x) {
 ; AVX-NEXT:    vmovmskps %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskps:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $15, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskps:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
@@ -3868,6 +4856,16 @@ define i32 @movmskpd256(<4 x double> %x) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskpd256:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $15, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskpd256:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovq2m %ymm0, %k0
@@ -3901,6 +4899,16 @@ define i32 @movmskps256(<8 x float> %x) {
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskps256:
+; KNL:       # %bb.0:
+; KNL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskps256:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovd2m %ymm0, %k0
@@ -3925,6 +4933,16 @@ define i32 @movmskb(<16 x i8> %x) {
 ; AVX-NEXT:    vpmovmskb %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
+; KNL-LABEL: movmskb:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskb:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
@@ -3964,6 +4982,22 @@ define i32 @movmskb256(<32 x i8> %x) {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; KNL-LABEL: movmskb256:
+; KNL:       # %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    shll $16, %eax
+; KNL-NEXT:    orl %ecx, %eax
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
 ; SKX-LABEL: movmskb256:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
diff --git a/test/CodeGen/X86/mulvi32.ll b/test/CodeGen/X86/mulvi32.ll
index 86bd96f88dbc56910eb097d5946ef213f30c62cf..6c6737a614b0dadffc9319451448c9e87496f175 100644
--- a/test/CodeGen/X86/mulvi32.ll
+++ b/test/CodeGen/X86/mulvi32.ll
@@ -9,28 +9,12 @@
 define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) {
 ; SSE-LABEL: _mul2xi32a:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrlq $32, %xmm2
-; SSE-NEXT:    pmuludq %xmm1, %xmm2
-; SSE-NEXT:    movdqa %xmm1, %xmm3
-; SSE-NEXT:    psrlq $32, %xmm3
-; SSE-NEXT:    pmuludq %xmm0, %xmm3
-; SSE-NEXT:    paddq %xmm2, %xmm3
-; SSE-NEXT:    psllq $32, %xmm3
 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
-; SSE-NEXT:    paddq %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _mul2xi32a:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
-; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
-; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %r = mul <2 x i32> %0, %1
   ret <2 x i32> %r
diff --git a/test/CodeGen/X86/musttail-indirect.ll b/test/CodeGen/X86/musttail-indirect.ll
index 7bb71c3fb038439c9daac9b80e6a26478fdd9121..c142ffae69d0bae678457f0795dbd42add2af00e 100644
--- a/test/CodeGen/X86/musttail-indirect.ll
+++ b/test/CodeGen/X86/musttail-indirect.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
-; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-win32 -O0 | FileCheck %s
 
 ; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
 
diff --git a/test/CodeGen/X86/musttail-thiscall.ll b/test/CodeGen/X86/musttail-thiscall.ll
index 454c66cd675e2f12837f9f537c52a9e41528467e..a1ddbd5d1cbcb78b3ce19d2605178c1939a79b42 100644
--- a/test/CodeGen/X86/musttail-thiscall.ll
+++ b/test/CodeGen/X86/musttail-thiscall.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i686-- < %s | FileCheck %s
-; RUN: llc -mtriple=i686-- -O0 < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i686-- < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i686-- -O0 < %s | FileCheck %s
 
 ; CHECK-LABEL: t1:
 ; CHECK: jmp {{_?}}t1_callee
diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll
index 080e5e5b1e0db1bac107cf8f19afeb67b3afcdf3..6a338c5c7dae55a792f0f7c636fee94da0f34449 100644
--- a/test/CodeGen/X86/musttail-varargs.ll
+++ b/test/CodeGen/X86/musttail-varargs.ll
@@ -83,7 +83,6 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-NEXT:    movq %rbp, %rdx
 ; LINUX-NEXT:    movq %r13, %rcx
 ; LINUX-NEXT:    movq %r12, %r8
-; LINUX-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-NEXT:    movq %r15, %r9
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -93,6 +92,7 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; LINUX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; LINUX-NEXT:    movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-NEXT:    addq $360, %rsp # imm = 0x168
 ; LINUX-NEXT:    .cfi_def_cfa_offset 56
 ; LINUX-NEXT:    popq %rbx
@@ -177,7 +177,6 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-X32-NEXT:    movq %rbp, %rdx
 ; LINUX-X32-NEXT:    movq %r13, %rcx
 ; LINUX-X32-NEXT:    movq %r12, %r8
-; LINUX-X32-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-X32-NEXT:    movq %r15, %r9
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -187,6 +186,7 @@ define void @f_thunk(i8* %this, ...) {
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload
 ; LINUX-X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
+; LINUX-X32-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload
 ; LINUX-X32-NEXT:    addl $344, %esp # imm = 0x158
 ; LINUX-X32-NEXT:    .cfi_def_cfa_offset 56
 ; LINUX-X32-NEXT:    popq %rbx
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
index 37ff7115ac9fea38a238a9c1c6cbd305b5e00e96..efc08ca1718d7e92a6bb74511db3befad212d83d 100644
--- a/test/CodeGen/X86/nontemporal-loads.ll
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -1800,35 +1800,23 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
 define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
 ; SSE2-LABEL: test_masked_v16i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm10
-; SSE2-NEXT:    pxor %xmm12, %xmm12
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    pxor %xmm0, %xmm8
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, %xmm9
-; SSE2-NEXT:    pxor %xmm0, %xmm9
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, %xmm11
-; SSE2-NEXT:    pxor %xmm0, %xmm11
-; SSE2-NEXT:    pcmpeqd %xmm12, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm7
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm6
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm8, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
 ; SSE2-NEXT:    pandn (%rdi), %xmm4
-; SSE2-NEXT:    pandn %xmm10, %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm1
 ; SSE2-NEXT:    pandn 16(%rdi), %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm11
-; SSE2-NEXT:    por %xmm5, %xmm11
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm6, %xmm2
 ; SSE2-NEXT:    pandn 32(%rdi), %xmm6
-; SSE2-NEXT:    pandn %xmm2, %xmm9
-; SSE2-NEXT:    por %xmm6, %xmm9
+; SSE2-NEXT:    por %xmm6, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm3
 ; SSE2-NEXT:    pandn 48(%rdi), %xmm7
-; SSE2-NEXT:    pandn %xmm3, %xmm8
-; SSE2-NEXT:    por %xmm7, %xmm8
-; SSE2-NEXT:    movdqa %xmm11, %xmm1
-; SSE2-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm7, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test_masked_v16i32:
@@ -1911,4 +1899,36 @@ define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %m
   ret <16 x i32>%res
 }
 
+; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=10895
+define i32 @PR39256(float* %ptr) {
+; SSE-LABEL: PR39256:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    ucomiss {{.*}}(%rip), %xmm0
+; SSE-NEXT:    setb (%rax)
+; SSE-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39256:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vucomiss {{.*}}(%rip), %xmm0
+; AVX-NEXT:    setb (%rax)
+; AVX-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: PR39256:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vucomiss {{.*}}(%rip), %xmm0
+; AVX512-NEXT:    setb (%rax)
+; AVX512-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
+; AVX512-NEXT:    retq
+entry:
+  %l = load float, float* %ptr, !nontemporal !1
+  %C = fcmp ult float %l, 0x36A0000000000000
+  store i1 %C, i1* undef
+  ret i32 -2147483648
+}
+
 !1 = !{i32 1}
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 6affef3393279e16f6e83843a434fdb9241ed748..9216cad5882782db55ecc752783503cd9564dd95 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -1630,7 +1630,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT:    vbroadcastsd 24(%rsi), %ymm5
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1654,19 +1654,19 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
 ; AVX2-FAST-NEXT:    vbroadcastsd %xmm2, %ymm4
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7]
-; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm5 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm5, %ymm1
 ; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX2-FAST-NEXT:    vbroadcastsd 24(%rsi), %ymm2
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FAST-NEXT:    vmovups %ymm1, 64(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FAST-NEXT:    vmovups %ymm0, 64(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm4, 32(%rdi)
 ; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
diff --git a/test/CodeGen/X86/opt_phis.mir b/test/CodeGen/X86/opt_phis.mir
index e282a92e20166b15a5f22fa27004c0bfa654829d..f00ee76385dc5cb763e91bfcc8b59be4c83d8878 100644
--- a/test/CodeGen/X86/opt_phis.mir
+++ b/test/CodeGen/X86/opt_phis.mir
@@ -27,7 +27,7 @@ body:             |
 
   bb.1:
     %1:gr32 = PHI %0, %bb.0, %2, %bb.1
-    DBG_VALUE debug-use %1, debug-use _, !7, !DIExpression(), debug-location !6
+    DBG_VALUE %1, _, !7, !DIExpression(), debug-location !6
     %2:gr32 = IMPLICIT_DEF
     JMP_1 %bb.1
 ...
diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll
index 2a4ee1f783f3610d93fdae656e02f08e2e837519..3feb0d04f04d530d7979aa73f1c88e6d16492d02 100644
--- a/test/CodeGen/X86/packss.ll
+++ b/test/CodeGen/X86/packss.ll
@@ -166,7 +166,7 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) {
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
 ; X86-SSE-NEXT:    psrlq $63, %xmm4
 ; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm2 = [4.9406564584124654E-324,-0]
+; X86-SSE-NEXT:    movapd {{.*#+}} xmm2 = [4.9406564584124654E-324,-0.0E+0]
 ; X86-SSE-NEXT:    xorpd %xmm2, %xmm0
 ; X86-SSE-NEXT:    psubq %xmm2, %xmm0
 ; X86-SSE-NEXT:    psrlq $63, %xmm3
diff --git a/test/CodeGen/X86/paddus.ll b/test/CodeGen/X86/paddus.ll
index 75b0597d38951ac35d682b443681c55cdc009d0b..63ef093fdd90ad5a016dd3c16aea3671829c93d0 100644
--- a/test/CodeGen/X86/paddus.ll
+++ b/test/CodeGen/X86/paddus.ll
@@ -801,22 +801,20 @@ define <8 x i16> @test23(<8 x i16> %x) {
 ; SSE2-LABEL: test23:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
-; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test23:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    pxor %xmm2, %xmm0
 ; SSSE3-NEXT:    pxor %xmm0, %xmm2
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm1
-; SSSE3-NEXT:    por %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm2, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
@@ -1029,37 +1027,33 @@ define <16 x i16> @test28(<16 x i16> %x) {
 define <16 x i16> @test29(<16 x i16> %x) {
 ; SSE2-LABEL: test29:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtw %xmm4, %xmm3
-; SSE2-NEXT:    por %xmm0, %xmm3
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm4, %xmm1
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test29:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm3
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
-; SSSE3-NEXT:    pxor %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pcmpgtw %xmm4, %xmm3
-; SSSE3-NEXT:    por %xmm0, %xmm3
-; SSSE3-NEXT:    por %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm4, %xmm1
+; SSSE3-NEXT:    por %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    retq
 ;
@@ -1343,66 +1337,58 @@ define <32 x i16> @test34(<32 x i16> %x) {
 define <32 x i16> @test35(<32 x i16> %x) {
 ; SSE2-LABEL: test35:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm3, %xmm8
-; SSE2-NEXT:    pcmpgtw %xmm5, %xmm8
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    pcmpgtw %xmm6, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm5, %xmm6
 ; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    pcmpgtw %xmm7, %xmm6
-; SSE2-NEXT:    pxor %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    pcmpgtw %xmm4, %xmm7
-; SSE2-NEXT:    por %xmm0, %xmm7
-; SSE2-NEXT:    por %xmm1, %xmm6
-; SSE2-NEXT:    por %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm3, %xmm8
-; SSE2-NEXT:    movdqa %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm6, %xmm1
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    pxor %xmm5, %xmm8
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
 ; SSE2-NEXT:    movdqa %xmm8, %xmm3
+; SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm7, %xmm2
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    por %xmm7, %xmm2
+; SSE2-NEXT:    por %xmm8, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test35:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT:    pxor %xmm4, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm1
-; SSSE3-NEXT:    pxor %xmm4, %xmm2
-; SSSE3-NEXT:    pxor %xmm4, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm5
-; SSSE3-NEXT:    pxor %xmm4, %xmm5
-; SSSE3-NEXT:    movdqa %xmm3, %xmm8
-; SSSE3-NEXT:    pcmpgtw %xmm5, %xmm8
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm4, %xmm6
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    pcmpgtw %xmm6, %xmm5
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    pxor %xmm5, %xmm6
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm7
-; SSSE3-NEXT:    pxor %xmm4, %xmm7
-; SSSE3-NEXT:    movdqa %xmm1, %xmm6
-; SSSE3-NEXT:    pcmpgtw %xmm7, %xmm6
-; SSSE3-NEXT:    pxor %xmm0, %xmm4
-; SSSE3-NEXT:    movdqa %xmm0, %xmm7
-; SSSE3-NEXT:    pcmpgtw %xmm4, %xmm7
-; SSSE3-NEXT:    por %xmm0, %xmm7
-; SSSE3-NEXT:    por %xmm1, %xmm6
-; SSSE3-NEXT:    por %xmm2, %xmm5
-; SSSE3-NEXT:    por %xmm3, %xmm8
-; SSSE3-NEXT:    movdqa %xmm7, %xmm0
-; SSSE3-NEXT:    movdqa %xmm6, %xmm1
-; SSSE3-NEXT:    movdqa %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa %xmm2, %xmm8
+; SSSE3-NEXT:    pxor %xmm5, %xmm8
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtw %xmm3, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm8, %xmm3
+; SSSE3-NEXT:    pcmpgtw %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa %xmm7, %xmm2
+; SSSE3-NEXT:    pcmpgtw %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa %xmm6, %xmm1
+; SSSE3-NEXT:    pcmpgtw %xmm0, %xmm1
+; SSSE3-NEXT:    por %xmm6, %xmm1
+; SSSE3-NEXT:    por %xmm7, %xmm2
+; SSSE3-NEXT:    por %xmm8, %xmm3
+; SSSE3-NEXT:    por %xmm5, %xmm4
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    movdqa %xmm4, %xmm3
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: test35:
diff --git a/test/CodeGen/X86/patchable-prologue.ll b/test/CodeGen/X86/patchable-prologue.ll
index c8daff33181c6e734884a95a99cbb5f43b769ec7..3779bc39531ee9d91eba7d45e80054932504716d 100644
--- a/test/CodeGen/X86/patchable-prologue.ll
+++ b/test/CodeGen/X86/patchable-prologue.ll
@@ -1,5 +1,5 @@
-; RUN: llc -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump -triple x86_64-apple-macosx -disassemble - | FileCheck %s
-; RUN: llc -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
+; RUN: llc -verify-machineinstrs -filetype=obj -o - -mtriple=x86_64-apple-macosx < %s | llvm-objdump -triple x86_64-apple-macosx -disassemble - | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx < %s | FileCheck %s --check-prefix=CHECK-ALIGN
 
 declare void @callee(i64*)
 
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 5d7c77b9a815958156248d3f6250d97bc308f701..b4ff08cd2543c50b2def89bb9bd66ccce1438db1 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST
 
 define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
 ; SSSE3-LABEL: phaddw1:
@@ -67,15 +69,29 @@ define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i32> @phaddd3(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = add <4 x i32> %a, %b
@@ -83,15 +99,29 @@ define <4 x i32> @phaddd3(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd4(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -99,15 +129,29 @@ define <4 x i32> @phaddd4(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd5(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd5:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd5:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd5:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd5:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd5:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -115,15 +159,27 @@ define <4 x i32> @phaddd5(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd6:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd6:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd6:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -131,15 +187,29 @@ define <4 x i32> @phaddd6(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd7(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd7:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phaddd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd7:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd7:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd7:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd7:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd7:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
   %r = add <4 x i32> %a, %b
@@ -179,15 +249,30 @@ define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i32> @phsubd2(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd2:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phsubd2:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSSE3-SLOW-NEXT:    psubd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phsubd2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phsubd2:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phsubd2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phsubd2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
   %r = sub <4 x i32> %a, %b
@@ -195,15 +280,30 @@ define <4 x i32> @phsubd2(<4 x i32> %x) {
 }
 
 define <4 x i32> @phsubd3(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phsubd3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSSE3-SLOW-NEXT:    psubd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phsubd3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phsubd3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phsubd3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-SLOW-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phsubd3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
   %r = sub <4 x i32> %a, %b
@@ -211,15 +311,27 @@ define <4 x i32> @phsubd3(<4 x i32> %x) {
 }
 
 define <4 x i32> @phsubd4(<4 x i32> %x) {
-; SSSE3-LABEL: phsubd4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    phsubd %xmm0, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phsubd4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phsubd4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phsubd4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phsubd4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phsubd4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
   %r = sub <4 x i32> %a, %b
@@ -284,19 +396,29 @@ define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source1:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source1:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source1:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source1:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source1:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = add <4 x i32> %l, %r
@@ -304,21 +426,33 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source2:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source2:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source2:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
   %add = add <4 x i32> %l, %r
@@ -327,19 +461,29 @@ define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = add <4 x i32> %l, %r
@@ -347,36 +491,58 @@ define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = add <4 x i32> %l, %x
   ret <4 x i32> %add
 }
 
 define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source5:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; SSSE3-NEXT:    paddd %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source5:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source5:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source5:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2]
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source5:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
   %add = add <4 x i32> %l, %x
   %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
@@ -384,21 +550,33 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
 }
 
 define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source6:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-NEXT:    paddd %xmm1, %xmm0
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddd_single_source6:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddd_single_source6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddd_single_source6:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddd_single_source6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = add <4 x i32> %l, %r
@@ -407,20 +585,30 @@ define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
 }
 
 define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source1:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source1:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
+; SSSE3-SLOW-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source1:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source1:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
+; AVX-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source1:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
   %add = add <8 x i16> %l, %r
@@ -428,27 +616,41 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
 }
 
 define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source2:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
-; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source2:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source2:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source2:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source2:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
   %add = add <8 x i16> %l, %r
@@ -457,23 +659,33 @@ define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
 }
 
 define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source3:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source3:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source3:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source3:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
+; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source3:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
@@ -481,41 +693,63 @@ define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
 }
 
 define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source4:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pslld $16, %xmm1
-; SSSE3-NEXT:    paddw %xmm0, %xmm1
-; SSSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source4:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
+; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpslld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source4:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source4:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source4:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
   %add = add <8 x i16> %l, %x
   ret <8 x i16> %add
 }
 
 define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source6:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
-; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: phaddw_single_source6:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSSE3-SLOW-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
+; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: phaddw_single_source6:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
+; SSSE3-FAST-LABEL: phaddw_single_source6:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: phaddw_single_source6:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX-SLOW-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: phaddw_single_source6:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-FAST-NEXT:    retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index a5453b9e1f8c9452e38f7c0eb22a7603b705d51d..2a129bc643b36383a2d619e1fa9f1eed5bb6cef6 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -20,9 +20,9 @@ entry:
 ; On Intel Atom the scheduler moves a movl instruction
 ; used for the printf call to follow movl 24(%esp), %eax
 ; ATOM: movl 24(%esp), %eax
-; ATOM: movl
-; ATOM: movl   %eax, 36(%esp)
 ; ATOM-NOT: movl
+; ATOM: movl   %eax, 36(%esp)
+; ATOM: movl
 ; ATOM: movl 28(%esp), %ebx
 ; ATOM-NOT: movl
 ; ATOM: movl   %ebx, 40(%esp)
diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll
index 4f590bd96bbdd8c9b20f19858c417b1851ff0157..a039ba01a230a8de37d4a1ed84e87259509bfade 100644
--- a/test/CodeGen/X86/popcnt-schedule.ll
+++ b/test/CodeGen/X86/popcnt-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell   | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake     | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl         | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2      | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1      | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -60,6 +61,14 @@ define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
 ; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctpop_i16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    popcntw (%rsi), %cx # sched: [8:0.50]
+; BDVER2-NEXT:    popcntw %di, %ax # sched: [4:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ctpop_i16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    popcntw (%rsi), %cx # sched: [4:1.00]
@@ -126,6 +135,13 @@ define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) {
 ; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctpop_i32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    popcntl (%rsi), %ecx # sched: [8:0.50]
+; BDVER2-NEXT:    popcntl %edi, %eax # sched: [4:0.50]
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ctpop_i32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    popcntl (%rsi), %ecx # sched: [4:1.00]
@@ -190,6 +206,13 @@ define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) {
 ; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ctpop_i64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    popcntq (%rsi), %rcx # sched: [8:0.50]
+; BDVER2-NEXT:    popcntq %rdi, %rax # sched: [4:0.50]
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ctpop_i64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    popcntq (%rsi), %rcx # sched: [4:1.00]
diff --git a/test/CodeGen/X86/post-ra-sched-with-debug.mir b/test/CodeGen/X86/post-ra-sched-with-debug.mir
index f4f69b6027913b4ff03fb8202c5678f3e3c7b630..079374752b1a742b87fff5f89a2e31529d667a32 100644
--- a/test/CodeGen/X86/post-ra-sched-with-debug.mir
+++ b/test/CodeGen/X86/post-ra-sched-with-debug.mir
@@ -251,8 +251,8 @@ body:             |
     liveins: $esi, $rdi, $r14, $rbx, $rbp
 
     ; CHECK:      [[REGISTER:\$r[a-z0-9]+]] = LEA64r {{\$r[a-z0-9]+}}, 1, $noreg, -20, $noreg
-    ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use $noreg, ![[J_VAR]], !DIExpression(), debug-location ![[J_LOC]]
-    ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
+    ; CHECK-NEXT: DBG_VALUE [[REGISTER]], $noreg, ![[J_VAR]], !DIExpression(), debug-location ![[J_LOC]]
+    ; CHECK-NEXT: DBG_VALUE [[REGISTER]], $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
 
     frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
@@ -268,8 +268,8 @@ body:             |
     $rbx = MOV64rr $rdi
     CALL64pcrel32 @_ZN1lC2Ei, csr_64, implicit $rsp, implicit $rdi, implicit $esi, implicit-def $rsp
     $rdi = LEA64r $rbx, 1, $noreg, 8, $noreg
-    DBG_VALUE debug-use $rdi, debug-use $noreg, !20, !17, debug-location !27
-    DBG_VALUE debug-use $rdi, debug-use $noreg, !10, !17, debug-location !18
+    DBG_VALUE $rdi, $noreg, !20, !17, debug-location !27
+    DBG_VALUE $rdi, $noreg, !10, !17, debug-location !18
     $rax = MOV64rm $rbx, 1, $noreg, 16, $noreg :: (load 8)
     MOV64mr $rbx, 1, $noreg, 8, $noreg, killed $rax :: (store 8)
     MOV64mr $rbx, 1, $noreg, 24, $noreg, $rdi :: (store 8)
@@ -286,9 +286,9 @@ body:             |
     $rsi = CMOVNE64rr killed $rsi, $rdx, implicit killed $eflags
     $rsi = OR64rr killed $rsi, killed $rcx, implicit-def $eflags
     $rcx = LEA64r $rbp, 1, $noreg, -20, $noreg
-    DBG_VALUE debug-use $rcx, debug-use $noreg, !46, !17, debug-location !48
-    DBG_VALUE debug-use $rcx, debug-use $noreg, !39, !17, debug-location !44
-    DBG_VALUE debug-use $rbp, -20, !29, !17, debug-location !36
+    DBG_VALUE $rcx, $noreg, !46, !17, debug-location !48
+    DBG_VALUE $rcx, $noreg, !39, !17, debug-location !44
+    DBG_VALUE $rbp, -20, !29, !17, debug-location !36
     $rcx = CMOVNE64rr killed $rcx, killed $rdx, implicit killed $eflags
     $rcx = OR64rr killed $rcx, killed $rsi, implicit-def dead $eflags
     $rdx = MOVSX64rm32 $rbx, 1, $noreg, 0, $noreg :: (load 4, align 8)
diff --git a/test/CodeGen/X86/postra-ignore-dbg-instrs.mir b/test/CodeGen/X86/postra-ignore-dbg-instrs.mir
index 0286e6e68bf7066e7a4beac659f5151017aa965d..25e6992b7b7ab85b5dab626821f5d2590d0678fa 100644
--- a/test/CodeGen/X86/postra-ignore-dbg-instrs.mir
+++ b/test/CodeGen/X86/postra-ignore-dbg-instrs.mir
@@ -62,7 +62,7 @@
 # CHECK-NOT: $eax = COPY $edi
 # CHECK: bb.1:
 # CHECK: renamable $eax = COPY $edi
-# CHECK-NEXT: DBG_VALUE debug-use $eax,
+# CHECK-NEXT: DBG_VALUE $eax,
 # CHECK: bb.2:
 name:            x1
 alignment:       4
@@ -71,9 +71,9 @@ body: |
   bb.0:
     successors: %bb.2, %bb.1; %bb.2, %bb.1
     liveins: $edi
-    DBG_VALUE debug-use $edi, debug-use $noreg, !14, !DIExpression(), debug-location !16
+    DBG_VALUE $edi, $noreg, !14, !DIExpression(), debug-location !16
     renamable $eax = COPY $edi
-    DBG_VALUE debug-use $eax, debug-use $noreg, !14, !DIExpression(), debug-location !16
+    DBG_VALUE $eax, $noreg, !14, !DIExpression(), debug-location !16
     CMP32mi8 $rip, 1, $noreg, @x0, $noreg, 0, implicit-def $eflags, debug-location !16
     JE_1 %bb.2, implicit killed $eflags, debug-location !16
     JMP_1 %bb.1, debug-location !16
diff --git a/test/CodeGen/X86/pow.ll b/test/CodeGen/X86/pow.ll
index f170488cb2fa4ae062274ec5cc9f5c28dd8bd752..456005402896916c43e653ef809daafd869c006e 100644
--- a/test/CodeGen/X86/pow.ll
+++ b/test/CodeGen/X86/pow.ll
@@ -56,11 +56,11 @@ define <4 x float> @pow_v4f32_one_fourth_fmf(<4 x float> %x) nounwind {
 ; CHECK-NEXT:    rsqrtps %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm0, %xmm2
 ; CHECK-NEXT:    mulps %xmm1, %xmm2
-; CHECK-NEXT:    movaps {{.*#+}} xmm3 = [-0.5,-0.5,-0.5,-0.5]
+; CHECK-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; CHECK-NEXT:    movaps %xmm2, %xmm4
 ; CHECK-NEXT:    mulps %xmm3, %xmm4
 ; CHECK-NEXT:    mulps %xmm1, %xmm2
-; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [-3,-3,-3,-3]
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; CHECK-NEXT:    addps %xmm1, %xmm2
 ; CHECK-NEXT:    mulps %xmm4, %xmm2
 ; CHECK-NEXT:    xorps %xmm4, %xmm4
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index dc1fd88e42567f34292a677b0a9ddbc32d9930ec..53d1ea79f486dd51234962221a695587dd1a6bd4 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -19,7 +19,7 @@ define void @foo(%struct.anon* byval %p) nounwind {
 ; CHECK-NEXT:    subl $28, %esp
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [-0,-0,-0,-0]
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; CHECK-NEXT:    xorps %xmm2, %xmm0
 ; CHECK-NEXT:    cvtss2sd %xmm0, %xmm0
 ; CHECK-NEXT:    xorps %xmm2, %xmm1
diff --git a/test/CodeGen/X86/pr28489.ll b/test/CodeGen/X86/pr28489.ll
index 898b0870b65d5c9d4570a02638ac4f930832c93a..8ab3fbb9d91c8d7b57853dac4e73aacd3eb97a4f 100644
--- a/test/CodeGen/X86/pr28489.ll
+++ b/test/CodeGen/X86/pr28489.ll
@@ -3,8 +3,8 @@ declare void @g(i32, i1)
 
 ;CHECK-LABEL: f:
 ;CHECK: cmpxchg8b
-;CHECK: sete %cl
-;CHECK: movzbl %cl
+;CHECK: sete [[REG:%[abcd]l]]
+;CHECK: movzbl [[REG]]
 define void @f(i64* %arg, i64 %arg1) {
 entry:
   %tmp5 = cmpxchg i64* %arg, i64 %arg1, i64 %arg1 seq_cst seq_cst
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 94deca3a292aefc5b0b81f3078f8f91c2cbaa853..a81e26c51a17273e1b1f86bcd3948044252aad68 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -116,14 +116,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm24, %zmm24
 ; CHECK-NEXT:    vmovaps %zmm24, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
-; CHECK-NEXT:    vmovss %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    vmovss %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    vmovss %xmm14, (%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm15, (%rsp) # 4-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/pr30813.ll b/test/CodeGen/X86/pr30813.ll
index b830f1e04b72b9ae60cf9345f71980e4c49bf955..e3e096bda6c2883bbc0941f8ffcdb07797f7fa61 100644
--- a/test/CodeGen/X86/pr30813.ll
+++ b/test/CodeGen/X86/pr30813.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=x86_64-linux-gnu -O0 %s -o - | FileCheck %s
 ; CHECK: patatino:
 ; CHECK:         .cfi_startproc
-; CHECK:         movzwl  (%rax), %ecx
-; CHECK:         movl    %ecx, %eax
-; CHECK:         movq    %rax, (%rdx)
+; CHECK:         movzwl  (%rax), [[REG0:%e[abcd]x]]
+; CHECK:         movl    [[REG0]], %e[[REG1C:[abcd]]]x
+; CHECK:         movq    %r[[REG1C]]x, ({{%r[abcd]x}})
 ; CHECK:         retq
 
 define void @patatino() {
diff --git a/test/CodeGen/X86/pr32108.ll b/test/CodeGen/X86/pr32108.ll
index bde5daff2857e31f5054386eb116ec6ceab635f2..dc14746440ae5f57fcb93fc30dae11b93b2bdc58 100644
--- a/test/CodeGen/X86/pr32108.ll
+++ b/test/CodeGen/X86/pr32108.ll
@@ -4,7 +4,6 @@
 define void @pr32108() {
 ; CHECK-LABEL: pr32108:
 ; CHECK:       # %bb.0: # %BB
-; CHECK-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_1: # %CF244
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index 7ec6a5dbf7fdd60ef02273b82d8a96cec620bcac..6b08e2fa4139c99e23dd1137d616b056d191bedd 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -9,11 +9,10 @@
 @d = common global i64 zeroinitializer, align 8
 @e = common global i64 zeroinitializer, align 8
 
-define void @foo() {
+define void @foo(i64 %x) nounwind {
 ; X86-LABEL: foo:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movl d, %eax
 ; X86-NEXT:    notl %eax
 ; X86-NEXT:    movl d+4, %ecx
@@ -26,40 +25,35 @@ define void @foo() {
 ; X86-NEXT:    addl $7, %eax
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    pushl %ecx
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $0
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NEXT:    pushl $0
-; X86-NEXT:    .cfi_adjust_cfa_offset 4
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divdi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    .cfi_adjust_cfa_offset -16
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    setne {{[0-9]+}}(%esp)
 ; X86-NEXT:    popl %eax
-; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq {{.*}}(%rip), %rax
-; X64-NEXT:    movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000
-; X64-NEXT:    andnq %rcx, %rax, %rcx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movq {{.*}}(%rip), %rcx
+; X64-NEXT:    movabsq $3013716102212485120, %rdx # imm = 0x29D2DED3DE400000
+; X64-NEXT:    andnq %rdx, %rcx, %rcx
 ; X64-NEXT:    shrq $21, %rcx
 ; X64-NEXT:    addq $7, %rcx
-; X64-NEXT:    movabsq $4393751543808, %rax # imm = 0x3FF00000000
-; X64-NEXT:    testq %rax, %rcx
+; X64-NEXT:    movq %rdi, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    shrq $32, %rdx
 ; X64-NEXT:    je .LBB0_1
 ; X64-NEXT:  # %bb.2:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    divq %rcx
+; X64-NEXT:    cqto
+; X64-NEXT:    idivq %rcx
 ; X64-NEXT:    jmp .LBB0_3
 ; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    divl %ecx
 ; X64-NEXT:    # kill: def $eax killed $eax def $rax
 ; X64-NEXT:  .LBB0_3:
@@ -86,7 +80,7 @@ define void @foo() {
   %18 = ashr i64 %4, %17
   %19 = and i64 %18, 9223372036854775806
   %20 = add nsw i64 7, %19
-  %21 = sdiv i64 0, %20
+  %21 = sdiv i64 %x, %20
   %22 = icmp ne i64 %21, 0
   %23 = zext i1 %22 to i8
   store i8 %23, i8* %1, align 1
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index ab6680cf45a791cc9169454cb99280ae301e0556..3998fcec9c7816dfe40fa3c4a6772166255bf7fe 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -222,8 +222,8 @@ define void @f1() {
 ; 686-O0-NEXT:    movl %ebp, _ZN8struct_210member_2_0E
 ; 686-O0-NEXT:    movl $0, _ZN8struct_210member_2_0E+4
 ; 686-O0-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; 686-O0-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 686-O0-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; 686-O0-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 686-O0-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 686-O0-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; 686-O0-NEXT:    addl $24, %esp
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
index 3a2db27727a1950e723d76e2fda339ae24b7889e..65fcf055f284cabe63a75e41a01de9eeceb290d4 100644
--- a/test/CodeGen/X86/pr32345.ll
+++ b/test/CodeGen/X86/pr32345.ll
@@ -77,8 +77,8 @@ define void @foo() {
 ; 6860-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; 6860-NEXT:    shrdl %cl, %edi, %esi
 ; 6860-NEXT:    testb $32, %bl
-; 6860-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 6860-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; 6860-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; 6860-NEXT:    jne .LBB0_2
 ; 6860-NEXT:  # %bb.1: # %bb
 ; 6860-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/test/CodeGen/X86/pr34592.ll b/test/CodeGen/X86/pr34592.ll
index 34e80fb23c4505f51f27a0438500e24486f9976c..b010429d973e6aa7e70dc7ff477954ae5f32775d 100644
--- a/test/CodeGen/X86/pr34592.ll
+++ b/test/CodeGen/X86/pr34592.ll
@@ -53,12 +53,12 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-NEXT:    vmovaps %ymm5, %ymm1
 ; CHECK-NEXT:    vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm9, %ymm3
-; CHECK-NEXT:    vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-NEXT:    vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; CHECK-NEXT:    vmovaps %ymm14, (%rsp) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    vmovaps %ymm4, (%rsp) # 32-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
index 858e0f46c3a725a18373172bd91b767dc2cb6b3b..3578806596f5987cfeb50e28e86d6665136812d6 100644
--- a/test/CodeGen/X86/pr34653.ll
+++ b/test/CodeGen/X86/pr34653.ll
@@ -12,7 +12,7 @@ define void @pr34653() {
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    .cfi_def_cfa_register %rbp
 ; CHECK-NEXT:    andq $-512, %rsp # imm = 0xFE00
-; CHECK-NEXT:    subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT:    subq $1536, %rsp # imm = 0x600
 ; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
 ; CHECK-NEXT:    callq test
 ; CHECK-NEXT:    vmovupd {{[0-9]+}}(%rsp), %xmm0
@@ -32,53 +32,48 @@ define void @pr34653() {
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm10, %xmm13
 ; CHECK-NEXT:    vmovaps %xmm13, %xmm14
 ; CHECK-NEXT:    vmovaps %xmm10, %xmm15
-; CHECK-NEXT:    vmovaps %xmm15, %xmm2
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vextractf32x4 $3, %zmm9, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %zmm15, %zmm16
+; CHECK-NEXT:    vextractf32x4 $3, %zmm9, %xmm2
+; CHECK-NEXT:    vmovaps %zmm2, %zmm17
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm18
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm9, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm19
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm20
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm21
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm22
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm7, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm23
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm7, %xmm0
+; CHECK-NEXT:    vmovaps %zmm0, %zmm24
 ; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps %xmm7, %xmm0
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    vmovaps %zmm0, %zmm25
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
 ; CHECK-NEXT:    # kill: def $ymm10 killed $ymm10 killed $zmm10
 ; CHECK-NEXT:    vextractf128 $1, %ymm10, %xmm10
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm10, %xmm0
+; CHECK-NEXT:    vmovaps %zmm10, %zmm26
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm9 killed $ymm9 killed $zmm9
 ; CHECK-NEXT:    vextractf128 $1, %ymm9, %xmm9
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm9, %xmm0
+; CHECK-NEXT:    vmovaps %zmm9, %zmm27
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -90,8 +85,7 @@ define void @pr34653() {
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm8 killed $ymm8 killed $zmm8
 ; CHECK-NEXT:    vextractf128 $1, %ymm8, %xmm8
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm8, %xmm0
+; CHECK-NEXT:    vmovaps %zmm8, %zmm28
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -103,8 +97,7 @@ define void @pr34653() {
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; CHECK-NEXT:    # kill: def $ymm7 killed $ymm7 killed $zmm7
 ; CHECK-NEXT:    vextractf128 $1, %ymm7, %xmm7
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovaps %xmm7, %xmm0
+; CHECK-NEXT:    vmovaps %zmm7, %zmm29
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -112,6 +105,10 @@ define void @pr34653() {
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload
+; CHECK-NEXT:    # xmm30 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload
+; CHECK-NEXT:    # xmm31 = mem[0],zero
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
@@ -133,70 +130,38 @@ define void @pr34653() {
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT:    # xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    vmovsd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    vmovsd %xmm7, (%rsp) # 8-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
diff --git a/test/CodeGen/X86/pr35918.ll b/test/CodeGen/X86/pr35918.ll
index f53bb86ee48b6b305dad987016ba891d73907b88..5c84bd946fddccffbc6ed71c034d38822752566b 100644
--- a/test/CodeGen/X86/pr35918.ll
+++ b/test/CodeGen/X86/pr35918.ll
@@ -11,9 +11,9 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X86-SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKYLAKE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SKYLAKE-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X86-SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X86-SKYLAKE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X86-SKYLAKE-NEXT:    vpsrad $16, %xmm0, %xmm0
+; X86-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X86-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X86-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
@@ -29,7 +29,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X86-SKX-NEXT:    subl $8, %esp
 ; X86-SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
+; X86-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
 ; X86-SKX-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X86-SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X86-SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
@@ -50,9 +50,9 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X64-SKYLAKE-LABEL: fetch_r16g16_snorm_unorm8:
 ; X64-SKYLAKE:       # %bb.0: # %entry
 ; X64-SKYLAKE-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X64-SKYLAKE-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; X64-SKYLAKE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; X64-SKYLAKE-NEXT:    vpsrad $16, %xmm0, %xmm0
+; X64-SKYLAKE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-SKYLAKE-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; X64-SKYLAKE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
 ; X64-SKYLAKE-NEXT:    vpsrld $7, %xmm0, %xmm0
@@ -65,7 +65,7 @@ define void @fetch_r16g16_snorm_unorm8(<4 x i8>*, i8*, i32, i32, { [2048 x i32],
 ; X64-SKX-LABEL: fetch_r16g16_snorm_unorm8:
 ; X64-SKX:       # %bb.0: # %entry
 ; X64-SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[u,u],zero,zero,xmm0[u,u]
+; X64-SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,u,u,2,3,u,u,u,u,u,u,u,u]
 ; X64-SKX-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X64-SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/pr38533.ll b/test/CodeGen/X86/pr38533.ll
index 96d003ba1a84ddc247e889a5379afc1ce29638dc..59c67acc9be65b13544f668e61d40f6b8afbeb8f 100644
--- a/test/CodeGen/X86/pr38533.ll
+++ b/test/CodeGen/X86/pr38533.ll
@@ -19,8 +19,6 @@ define void @pr38533_2(half %x) {
 ; SSE-NEXT:    pushq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    callq __gnu_f2h_ieee
-; SSE-NEXT:    movw %ax, {{[0-9]+}}(%rsp)
-; SSE-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    movw %ax, (%rax)
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 8
@@ -30,8 +28,6 @@ define void @pr38533_2(half %x) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
 ; AVX512-NEXT:    movw %ax, (%rax)
 ; AVX512-NEXT:    retq
   %a = bitcast half %x to <4 x i4>
@@ -46,8 +42,6 @@ define void @pr38533_3(half %x) {
 ; SSE-NEXT:    pushq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 16
 ; SSE-NEXT:    callq __gnu_f2h_ieee
-; SSE-NEXT:    movw %ax, (%rsp)
-; SSE-NEXT:    movzwl (%rsp), %eax
 ; SSE-NEXT:    movw %ax, (%rax)
 ; SSE-NEXT:    popq %rax
 ; SSE-NEXT:    .cfi_def_cfa_offset 8
diff --git a/test/CodeGen/X86/pr38539.ll b/test/CodeGen/X86/pr38539.ll
index 9e16f7ca40662d19d2ce56c04a3ea29a9114c0bd..215d908a03f4da0607a5efb8975df058ede20173 100644
--- a/test/CodeGen/X86/pr38539.ll
+++ b/test/CodeGen/X86/pr38539.ll
@@ -6,68 +6,13 @@
 define void @f() {
 ; X64-LABEL: f:
 ; X64:       # %bb.0: # %BB
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    pushq %r14
-; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    subq $16, %rsp
-; X64-NEXT:    .cfi_def_cfa_offset 48
-; X64-NEXT:    .cfi_offset %rbx, -32
-; X64-NEXT:    .cfi_offset %r14, -24
-; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; X64-NEXT:    movq (%rsp), %rbx
 ; X64-NEXT:    movb (%rax), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    # kill: def $eax killed $eax def $ax
-; X64-NEXT:    divb (%rax)
-; X64-NEXT:    movl %eax, %r14d
-; X64-NEXT:    movq %rbp, %rcx
-; X64-NEXT:    shlq $62, %rcx
-; X64-NEXT:    sarq $62, %rcx
-; X64-NEXT:    xorl %edi, %edi
-; X64-NEXT:    xorl %esi, %esi
-; X64-NEXT:    movq %rbx, %rdx
-; X64-NEXT:    callq __modti3
-; X64-NEXT:    andl $3, %edx
+; X64-NEXT:    movb (%rax), %al
 ; X64-NEXT:    testb %al, %al
 ; X64-NEXT:    setne (%rax)
-; X64-NEXT:    cmpq %rax, %rbx
-; X64-NEXT:    sbbq %rdx, %rbp
-; X64-NEXT:    setae %dl
-; X64-NEXT:    sbbb %cl, %cl
-; X64-NEXT:    testb %al, %al
-; X64-NEXT:    setne %bl
-; X64-NEXT:    negb %dl
-; X64-NEXT:    cmpb %r14b, %al
-; X64-NEXT:    setle %al
-; X64-NEXT:    negb %al
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %dl
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    andl $1, %eax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    negq %rax
-; X64-NEXT:    negb %bl
-; X64-NEXT:    leaq -16(%rsp,%rax), %rax
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq %rax, (%rax)
-; X64-NEXT:    movl %ecx, %eax
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %bl
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    andb $1, %al
-; X64-NEXT:    movb %al, (%rax)
-; X64-NEXT:    addq $16, %rsp
-; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    popq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    popq %r14
-; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    popq %rbp
-; X64-NEXT:    .cfi_def_cfa_offset 8
+; X64-NEXT:    movb $0, (%rax)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: f:
@@ -77,75 +22,16 @@ define void @f() {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $48, %esp
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movb (%eax), %al
 ; X86-NEXT:    movb (%eax), %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    # kill: def $eax killed $eax def $ax
-; X86-NEXT:    divb (%eax)
-; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll $30, %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $30, %ecx
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    setae %dl
-; X86-NEXT:    sbbb %cl, %cl
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    setne %ch
 ; X86-NEXT:    setne (%eax)
-; X86-NEXT:    negb %ch
-; X86-NEXT:    negb %dl
-; X86-NEXT:    cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
-; X86-NEXT:    setle %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %dl
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    leal -4(%esp,%eax,4), %eax
+; X86-NEXT:    leal -{{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%eax)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %ch
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    movb %al, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movb $0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
@@ -177,50 +63,13 @@ BB:
 define void @g() {
 ; X64-LABEL: g:
 ; X64:       # %bb.0: # %BB
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; X64-NEXT:    shlq $32, %rsi
-; X64-NEXT:    orq %rax, %rsi
-; X64-NEXT:    movq %rsi, %rdi
-; X64-NEXT:    shlq $30, %rdi
-; X64-NEXT:    sarq $30, %rdi
 ; X64-NEXT:    movb (%rax), %al
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    # kill: def $eax killed $eax def $ax
-; X64-NEXT:    divb (%rax)
-; X64-NEXT:    movl %eax, %r8d
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    idivq %rdi
-; X64-NEXT:    movabsq $17179869183, %rax # imm = 0x3FFFFFFFF
-; X64-NEXT:    andq %rdx, %rax
+; X64-NEXT:    movb (%rax), %al
 ; X64-NEXT:    testb %al, %al
-; X64-NEXT:    setne %dil
 ; X64-NEXT:    setne (%rax)
-; X64-NEXT:    cmpq %rsi, %rax
-; X64-NEXT:    seta %dl
-; X64-NEXT:    setbe %cl
-; X64-NEXT:    negb %cl
-; X64-NEXT:    cmpb %r8b, %al
-; X64-NEXT:    setle %al
-; X64-NEXT:    negb %al
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %cl
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    andl $1, %eax
-; X64-NEXT:    shlq $3, %rax
-; X64-NEXT:    negq %rax
-; X64-NEXT:    negb %dil
-; X64-NEXT:    negb %dl
-; X64-NEXT:    leaq -16(%rsp,%rax), %rax
+; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; X64-NEXT:    movq %rax, (%rax)
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:    cbtw
-; X64-NEXT:    idivb %dil
-; X64-NEXT:    movsbl %ah, %eax
-; X64-NEXT:    andb $1, %al
-; X64-NEXT:    movb %al, (%rax)
+; X64-NEXT:    movb $0, (%rax)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: g:
@@ -230,63 +79,16 @@ define void @g() {
 ; X86-NEXT:    .cfi_offset %ebp, -8
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    .cfi_def_cfa_register %ebp
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_offset %esi, -20
-; X86-NEXT:    .cfi_offset %edi, -16
-; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esp), %edi
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movb (%eax), %al
 ; X86-NEXT:    movb (%eax), %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    # kill: def $eax killed $eax def $ax
-; X86-NEXT:    divb (%eax)
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    shll $30, %eax
-; X86-NEXT:    sarl $30, %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    calll __moddi3
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    andl $3, %edx
 ; X86-NEXT:    testb %al, %al
 ; X86-NEXT:    setne (%eax)
-; X86-NEXT:    cmpl %eax, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    setae %dl
-; X86-NEXT:    sbbb %cl, %cl
-; X86-NEXT:    testb %al, %al
-; X86-NEXT:    setne %ch
-; X86-NEXT:    negb %dl
-; X86-NEXT:    cmpb %bl, %al
-; X86-NEXT:    setle %al
-; X86-NEXT:    negb %al
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %dl
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    andl $1, %eax
-; X86-NEXT:    shll $3, %eax
-; X86-NEXT:    negl %eax
-; X86-NEXT:    negb %ch
-; X86-NEXT:    leal -8(%esp,%eax), %eax
+; X86-NEXT:    leal -{{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, (%eax)
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    cbtw
-; X86-NEXT:    idivb %ch
-; X86-NEXT:    movsbl %ah, %eax
-; X86-NEXT:    andb $1, %al
-; X86-NEXT:    movb %al, (%eax)
-; X86-NEXT:    leal -12(%ebp), %esp
-; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    movb $0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
diff --git a/test/CodeGen/X86/pr38639.ll b/test/CodeGen/X86/pr38639.ll
index 4218db41185a3c760d50f52ab81b18570884f1aa..bea6c84279f71f2d186138a9463ed41543c0fb43 100644
--- a/test/CodeGen/X86/pr38639.ll
+++ b/test/CodeGen/X86/pr38639.ll
@@ -4,11 +4,11 @@
 define <8 x double> @test(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: test:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <u,0.82071743224100002,0.82071743224100002,0.82071743224100002>
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = <u,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1>
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [0.82071743224100002,0.82071743224100002]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1]
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
 ; CHECK-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> <double undef, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C, double 0x3FEA435134576E1C>, <8 x i32> <i32 6, i32 5, i32 2, i32 3, i32 5, i32 1, i32 3, i32 7>
diff --git a/test/CodeGen/X86/pr38762.ll b/test/CodeGen/X86/pr38762.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dc4d535511ceb0cf580c306dc6d6a025ef1be410
--- /dev/null
+++ b/test/CodeGen/X86/pr38762.ll
@@ -0,0 +1,101 @@
+; RUN: opt < %s -S -simplifycfg | FileCheck %s
+
+; Note: This patch is a complement to pr38763.
+;
+; When SimplifyCFG changes the PHI node into a select instruction, the debug
+; information becomes ambiguous. It causes the debugger to display unreached
+; lines and invalid variable values.
+;
+; When in the debugger, on the line "if (read1 > 3)", and we step from the
+; 'if' condition, onto the addition, then back to the 'if' again, which is
+; misleading because that addition doesn't really "happen" (it's speculated).
+
+; IR generated with:
+; clang -S -g -gno-column-info -O2 -emit-llvm pr38762.cpp -o pr38762.ll -mllvm -opt-bisect-limit=10
+
+; // pr38762.cpp
+; int main() {
+;   volatile int foo = 0;
+;   int read1 = foo;
+;   int brains = foo;
+; 
+;   if (read1 > 3) {
+;     brains *= 2;
+;     brains += 1;
+;   }
+; 
+;   return brains;
+; }
+
+; Change the debug locations associated with the PHI nodes being promoted, to
+; the debug locations from the insertion point in the dominant block.
+
+; CHECK-LABEL: entry
+; CHECK:  %cmp = icmp sgt i32 %foo.0., 3, !dbg !14
+; CHECK:  %mul = shl nsw i32 %foo.0.5, 1, !dbg !16
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 %mul, metadata !15, metadata !DIExpression()), !dbg !25
+; CHECK:  %add = or i32 %mul, 1, !dbg !16
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 %add, metadata !15, metadata !DIExpression()), !dbg !25
+; CHECK:  %brains.0 = select i1 %cmp, i32 %add, i32 %foo.0.5, !dbg !16
+
+; ModuleID = 'pr38762.cpp'
+source_filename = "pr38762.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
+entry:
+  %foo = alloca i32, align 4
+  %foo.0..sroa_cast = bitcast i32* %foo to i8*
+  store volatile i32 0, i32* %foo, align 4
+  %foo.0. = load volatile i32, i32* %foo, align 4
+  %foo.0.5 = load volatile i32, i32* %foo, align 4
+  call void @llvm.dbg.value(metadata i32 %foo.0.5, metadata !15, metadata !DIExpression()), !dbg !25
+  %cmp = icmp sgt i32 %foo.0., 3, !dbg !26
+  br i1 %cmp, label %if.then, label %if.end, !dbg !28
+
+if.then:                                          ; preds = %entry
+  %mul = shl nsw i32 %foo.0.5, 1, !dbg !29
+  call void @llvm.dbg.value(metadata i32 %mul, metadata !15, metadata !DIExpression()), !dbg !25
+  %add = or i32 %mul, 1, !dbg !31
+  call void @llvm.dbg.value(metadata i32 %add, metadata !15, metadata !DIExpression()), !dbg !25
+  br label %if.end, !dbg !32
+
+if.end:                                           ; preds = %if.then, %entry
+  %brains.0 = phi i32 [ %add, %if.then ], [ %foo.0.5, %entry ], !dbg !33
+  call void @llvm.dbg.value(metadata i32 %brains.0, metadata !15, metadata !DIExpression()), !dbg !25
+  ret i32 %brains.0, !dbg !35
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 343753)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "pr38762.cpp", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 343753)"}
+!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!15}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !10)
+!15 = !DILocalVariable(name: "brains", scope: !7, file: !1, line: 4, type: !10)
+!25 = !DILocation(line: 4, scope: !7)
+!26 = !DILocation(line: 6, scope: !27)
+!27 = distinct !DILexicalBlock(scope: !7, file: !1, line: 6)
+!28 = !DILocation(line: 6, scope: !7)
+!29 = !DILocation(line: 7, scope: !30)
+!30 = distinct !DILexicalBlock(scope: !27, file: !1, line: 6)
+!31 = !DILocation(line: 8, scope: !30)
+!32 = !DILocation(line: 9, scope: !30)
+!33 = !DILocation(line: 0, scope: !7)
+!34 = !DILocation(line: 12, scope: !7)
+!35 = !DILocation(line: 11, scope: !7)
diff --git a/test/CodeGen/X86/pr38763.ll b/test/CodeGen/X86/pr38763.ll
index b36e1efd9278806789d8ea81048884dbeda31a7d..ee0872748d6b085bd5d3f6e458f59f317f652101 100644
--- a/test/CodeGen/X86/pr38763.ll
+++ b/test/CodeGen/X86/pr38763.ll
@@ -30,13 +30,13 @@
 ; branches, as they becomes ambiguous.
 
 ; CHECK-LABEL: entry
-; CHECK:  %cmp = icmp eq i32 %foo.0., 4
-; CHECK:  %add = add nsw i32 %foo.0.4, 2, !dbg !18
+; CHECK:  %cmp = icmp eq i32 %foo.0., 4, !dbg !14
+; CHECK:  %add = add nsw i32 %foo.0.4, 2, !dbg !16
 ; CHECK-NOT: @llvm.dbg.value(metadata i32 %add
-; CHECK:  %sub = add nsw i32 %foo.0.4, -2, !dbg !21
+; CHECK:  %sub = add nsw i32 %foo.0.4, -2, !dbg !16
 ; CHECK-NOT: @llvm.dbg.value(metadata i32 %sub
 ; CHECK:  %result.0 = select i1 %cmp, i32 %add, i32 %sub
-; CHECK:  call void @llvm.dbg.value(metadata i32 %result.0, metadata !12, metadata !DIExpression()), !dbg !17
+; CHECK:  call void @llvm.dbg.value(metadata i32 %result.0, metadata !12, metadata !DIExpression()), !dbg !13
 
 ; ModuleID = 'pr38763.cpp'
 source_filename = "pr38763.cpp"
@@ -48,12 +48,12 @@ define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
 entry:
   %foo = alloca i32, align 4
   %foo.0..sroa_cast = bitcast i32* %foo to i8*
-  store volatile i32 4, i32* %foo, align 4, !tbaa !19
+  store volatile i32 4, i32* %foo, align 4
   %foo.0. = load volatile i32, i32* %foo, align 4
   %foo.0.4 = load volatile i32, i32* %foo, align 4
   call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !27
-  %cmp = icmp eq i32 %foo.0., 4
-  br i1 %cmp, label %if.then, label %if.else
+  %cmp = icmp eq i32 %foo.0., 4, !dbg !28
+  br i1 %cmp, label %if.then, label %if.else, !dbg !30
 
 if.then:                                          ; preds = %entry
   %add = add nsw i32 %foo.0.4, 2, !dbg !31
@@ -91,12 +91,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) #2
 !10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
 !11 = !{!16}
 !16 = !DILocalVariable(name: "result", scope: !7, file: !1, line: 6, type: !10)
-!19 = !{!20, !20, i64 0}
-!20 = !{!"int", !21, i64 0}
-!21 = !{!"omnipotent char", !22, i64 0}
-!22 = !{!"Simple C++ TBAA"}
 !27 = !DILocation(line: 6, column: 7, scope: !7)
+!28 = !DILocation(line: 7, column: 12, scope: !29)
 !29 = distinct !DILexicalBlock(scope: !7, file: !1, line: 7, column: 7)
+!30 = !DILocation(line: 7, column: 7, scope: !7)
 !31 = !DILocation(line: 8, column: 20, scope: !32)
 !32 = distinct !DILexicalBlock(scope: !29, file: !1, line: 7, column: 18)
 !34 = !DILocation(line: 10, column: 20, scope: !35)
diff --git a/test/CodeGen/X86/pr38771.ll b/test/CodeGen/X86/pr38771.ll
deleted file mode 100644
index 2a9ee66f7ef500752d356431d6d2ba9f5011bc63..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/pr38771.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-
-define void @function() nounwind {
-; CHECK-LABEL: function:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movabsq $281474976710656, %rax # imm = 0x1000000000000
-; CHECK-NEXT:    notq %rax
-; CHECK-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-NEXT:    shldq $65, %rax, %rcx
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movb $64, %dl
-; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    cmoveq %rcx, %rax
-; CHECK-NEXT:    movq %rax, (%rax)
-; CHECK-NEXT:    movl $0, (%rax)
-; CHECK-NEXT:    retq
-entry:
-  %B68 = sub i96 39614081257132168796771975167, 281474976710656
-  %B49 = or i96 39614081257132168796771975167, 39614081257132168796771975167
-  %B33 = lshr i96 %B68, %B68
-  store i96 %B33, i96* undef
-  ret void
-}
diff --git a/test/CodeGen/X86/pr38795.ll b/test/CodeGen/X86/pr38795.ll
index 5603f056c67ba37a60edd9c7ddff5705383ce33e..6cb2a0859e34817e06f95538af6767dc87b18837 100644
--- a/test/CodeGen/X86/pr38795.ll
+++ b/test/CodeGen/X86/pr38795.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc %s -O2 -mtriple=i386-unknown-linux-gnu -o - | FileCheck %s
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39440.
+; RUN: llc %s -O2 -mtriple=i386-unknown-linux-gnu -o - -verify-machineinstrs=0 | FileCheck %s
 @.str = external dso_local unnamed_addr constant [6 x i8], align 1
 @a = external dso_local local_unnamed_addr global i32, align 4
 @h = external dso_local local_unnamed_addr global i32, align 4
diff --git a/test/CodeGen/X86/pr38952.mir b/test/CodeGen/X86/pr38952.mir
new file mode 100644
index 0000000000000000000000000000000000000000..57cdc017f9eca93b8d5bd38f14c0b30bed93a025
--- /dev/null
+++ b/test/CodeGen/X86/pr38952.mir
@@ -0,0 +1,103 @@
+# RUN: llc %s -run-pass=postra-machine-sink -o - | FileCheck %s
+--- |
+  ; Module stripped of everything, MIR below is what's interesting
+  ; ModuleID = '<stdin>'
+  source_filename = "justacall.cpp"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+  
+  ; Function Attrs: noinline norecurse nounwind uwtable
+  define dso_local i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 {
+  entry:
+    br label %if.end
+  if.end:
+    br label %return
+  return:
+    ret i32 0
+  }
+
+  !0 = !{!"dummy metadata"}
+  !2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+  !3 = !DIFile(filename: "justacall.cpp", directory: "/tmp")
+  !4 = !{}
+  !5 = !{!0}
+  !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !14 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 7, type: !15, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !2, retainedNodes: !20)
+  !15 = !DISubroutineType(types: !16)
+  !16 = !{!7, !7}
+  !20 = !{!21}
+  !21 = !DILocalVariable(name: "argc", arg: 1, scope: !14, file: !3, line: 7, type: !7)
+
+...
+---
+name:            main
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+liveins:
+  - { reg: '$edi', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $edi
+  
+  ; Test that the DBG_VALUE on ebx below is sunk with the def of ebx, despite
+  ; not being adjacent to the def, see PR38952
+
+    DBG_VALUE $edi, $noreg, !21, !DIExpression()
+    renamable $ebx = COPY $edi
+    renamable $eax = MOV32r0 implicit-def dead $eflags
+    DBG_VALUE $ebx, $noreg, !21, !DIExpression()
+    CMP32ri $edi, 255, implicit-def $eflags
+    JG_1 %bb.2, implicit killed $eflags
+    JMP_1 %bb.1
+  
+  bb.1.if.end:
+  ; CHECK-LABEL: bb.1.if.end
+    successors: %bb.2(0x80000000)
+    liveins: $ebx
+  
+  ; CHECK: $ebx = COPY $edi
+  ; CHECK-NEXT: DBG_VALUE $ebx
+    renamable $rdx = MOVSX64rr32 renamable $ebx
+    renamable $rdx = nsw SHL64ri killed renamable $rdx, 2, implicit-def dead $eflags
+    ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+    $rdi = MOV32ri64 0
+    $esi = MOV32r0 implicit-def dead $eflags
+    CALL64pcrel32 &memset, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit killed $esi, implicit $rdx, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax
+    ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  
+  bb.2.return:
+    liveins: $eax
+  
+    RET 0, $eax
+
+...
diff --git a/test/CodeGen/X86/pr39243.ll b/test/CodeGen/X86/pr39243.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a901e291eca3025e51d8f90eb2a7155bdd2b0311
--- /dev/null
+++ b/test/CodeGen/X86/pr39243.ll
@@ -0,0 +1,132 @@
+; RUN: opt < %s -S -simplifycfg | FileCheck %s
+
+; Note: This patch fixes the regression introduced by pr38762.
+;
+; When SimplifyCFG changes the PHI node into a select instruction, the debug
+; information becomes ambiguous. It causes the debugger to display unreached
+; lines and invalid variable values.
+;
+; When the function 'hoistAllInstructionsInto' hoist a basic block:
+; - Remove their dbg.values.
+; - Set their debug locations to the values from the insertion point.
+;
+; But, if one of the instructions being hoisted is a debug intrinsic from an
+; inlined function, assigning it the debug location from the insertion point
+; will create a mismatch between the intrinsic's subprogram and the location's
+; subprogram, causing the assertion "Expected inlined-at fields to agree" in
+; SelectionDAG".
+
+; IR generated with:
+; clang -S -g -gno-column-info -O2 -emit-llvm pr39243.cpp -o pr39243.ll -mllvm -opt-bisect-limit=103
+
+; // pr39243.cpp
+; union onion {
+;   double dd;
+;   int ii[2];
+; };
+;
+; int alpha;
+; int bravo();
+;
+; int charlie() {
+;   int delta = 0;
+;   return bravo();
+; }
+;
+; int echo(onion foxtrot) {
+;   alpha = foxtrot.ii[0];
+;   if (alpha) {
+;     int golf = bravo();
+;     return -golf;
+;   }
+;   alpha = foxtrot.ii[1];
+;   return -charlie();
+; }
+
+; Change the debug locations associated with the PHI nodes being promoted, to
+; the debug locations from the insertion point in the dominant block.
+
+; CHECK-LABEL: entry
+; CHECK:  %foxtrot.sroa.0.0.extract.trunc = trunc i64 %foxtrot.coerce to i32
+; CHECK:  %tobool = icmp eq i32 %foxtrot.sroa.0.0.extract.trunc, 0
+; CHECK:  %foxtrot.sroa.2.0.extract.shift = lshr i64 %foxtrot.coerce, 32
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 %foxtrot.sroa.2.0.extract.trunc, metadata !30, metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32)), !dbg !34
+; CHECK:  %foxtrot.sroa.2.0.extract.trunc = trunc i64 %foxtrot.sroa.2.0.extract.shift to i32
+; CHECK:  store i32 %foxtrot.sroa.2.0.extract.trunc, i32* @alpha, align 4, !dbg !25
+; CHECK-NOT:  call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !43
+
+; ModuleID = 'pr39243.cpp'
+source_filename = "pr39243.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+@alpha = dso_local local_unnamed_addr global i32 0, align 4, !dbg !0
+
+define dso_local i32 @_Z7charliev() local_unnamed_addr #0 {
+entry:
+  %call = tail call i32 @_Z5bravov()
+  ret i32 %call
+}
+
+declare dso_local i32 @_Z5bravov() local_unnamed_addr #1
+
+define dso_local i32 @_Z4echo5onion(i64 %foxtrot.coerce) local_unnamed_addr #0 !dbg !18 {
+entry:
+  %foxtrot.sroa.0.0.extract.trunc = trunc i64 %foxtrot.coerce to i32
+  store i32 %foxtrot.sroa.0.0.extract.trunc, i32* @alpha, align 4
+  %tobool = icmp eq i32 %foxtrot.sroa.0.0.extract.trunc, 0
+  br i1 %tobool, label %if.end, label %return
+
+if.end:                                           ; preds = %entry
+  %foxtrot.sroa.2.0.extract.shift = lshr i64 %foxtrot.coerce, 32
+  %foxtrot.sroa.2.0.extract.trunc = trunc i64 %foxtrot.sroa.2.0.extract.shift to i32
+  call void @llvm.dbg.value(metadata i32 %foxtrot.sroa.2.0.extract.trunc, metadata !30, metadata !DIExpression(DW_OP_LLVM_fragment, 32, 32)), !dbg !34
+  store i32 %foxtrot.sroa.2.0.extract.trunc, i32* @alpha, align 4, !dbg !42
+  call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !43
+  br label %return
+
+return:                                           ; preds = %entry, %if.end
+  %call.i = tail call i32 @_Z5bravov()
+  %retval.0 = sub nsw i32 0, %call.i
+  ret i32 %retval.0
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "alpha", scope: !2, file: !3, line: 6, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 (trunk 344502)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+!3 = !DIFile(filename: "pr39243.cpp", directory: ".")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{!"clang version 8.0.0 (trunk 344502)"}
+!11 = distinct !DISubprogram(name: "charlie", linkageName: "_Z7charliev", scope: !3, file: !3, line: 9, type: !12, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !2, retainedNodes: !14)
+!12 = !DISubroutineType(types: !13)
+!13 = !{!6}
+!14 = !{!15}
+!15 = !DILocalVariable(name: "delta", scope: !11, file: !3, line: 10, type: !6)
+!18 = distinct !DISubprogram(name: "echo", linkageName: "_Z4echo5onion", scope: !3, file: !3, line: 14, type: !19, isLocal: false, isDefinition: true, scopeLine: 14, flags: DIFlagPrototyped, isOptimized: true, unit: !2, retainedNodes: !29)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!6, !21}
+!21 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "onion", file: !3, line: 1, size: 64, flags: DIFlagTypePassByValue | DIFlagTrivial, elements: !22, identifier: "_ZTS5onion")
+!22 = !{!23, !25}
+!23 = !DIDerivedType(tag: DW_TAG_member, name: "dd", scope: !21, file: !3, line: 2, baseType: !24, size: 64)
+!24 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!25 = !DIDerivedType(tag: DW_TAG_member, name: "ii", scope: !21, file: !3, line: 3, baseType: !26, size: 64)
+!26 = !DICompositeType(tag: DW_TAG_array_type, baseType: !6, size: 64, elements: !27)
+!27 = !{!28}
+!28 = !DISubrange(count: 2)
+!29 = !{!30}
+!30 = !DILocalVariable(name: "foxtrot", arg: 1, scope: !18, file: !3, line: 14, type: !21)
+!34 = !DILocation(line: 14, scope: !18)
+!42 = !DILocation(line: 20, scope: !18)
+!43 = !DILocation(line: 10, scope: !11, inlinedAt: !44)
+!44 = distinct !DILocation(line: 21, scope: !18)
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
index 7da7c299791e9557d080b08cb18ba3e18120f303..02e9b4c159327512bd7f9e72d345fb8989934d5d 100644
--- a/test/CodeGen/X86/pr5145.ll
+++ b/test/CodeGen/X86/pr5145.ll
@@ -1,31 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
 @sc8 = external global i8
 
 define void @atomic_maxmin_i8() {
-; CHECK: atomic_maxmin_i8
+; CHECK-LABEL: atomic_maxmin_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $4, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    jg .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.start
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    movb $5, %cl
+; CHECK-NEXT:  .LBB0_3: # %atomicrmw.start
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.4: # %atomicrmw.end
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start2
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $7, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    jl .LBB0_7
+; CHECK-NEXT:  # %bb.6: # %atomicrmw.start2
+; CHECK-NEXT:    # in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    movb $6, %cl
+; CHECK-NEXT:  .LBB0_7: # %atomicrmw.start2
+; CHECK-NEXT:    # in Loop: Header=BB0_5 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_5
+; CHECK-NEXT:  # %bb.8: # %atomicrmw.end1
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_9: # %atomicrmw.start8
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $7, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    ja .LBB0_11
+; CHECK-NEXT:  # %bb.10: # %atomicrmw.start8
+; CHECK-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; CHECK-NEXT:    movb $7, %cl
+; CHECK-NEXT:  .LBB0_11: # %atomicrmw.start8
+; CHECK-NEXT:    # in Loop: Header=BB0_9 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_9
+; CHECK-NEXT:  # %bb.12: # %atomicrmw.end7
+; CHECK-NEXT:    movb {{.*}}(%rip), %al
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_13: # %atomicrmw.start14
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpb $9, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    jb .LBB0_15
+; CHECK-NEXT:  # %bb.14: # %atomicrmw.start14
+; CHECK-NEXT:    # in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movb $8, %cl
+; CHECK-NEXT:  .LBB0_15: # %atomicrmw.start14
+; CHECK-NEXT:    # in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    lock cmpxchgb %cl, {{.*}}(%rip)
+; CHECK-NEXT:    jne .LBB0_13
+; CHECK-NEXT:  # %bb.16: # %atomicrmw.end13
+; CHECK-NEXT:    retq
   %1 = atomicrmw max  i8* @sc8, i8 5 acquire
-; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: jg
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL1]]
   %2 = atomicrmw min  i8* @sc8, i8 6 acquire
-; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: jl
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL3]]
   %3 = atomicrmw umax i8* @sc8, i8 7 acquire
-; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: ja
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL5]]
   %4 = atomicrmw umin i8* @sc8, i8 8 acquire
-; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
-; CHECK: cmpb
-; CHECK: jb
-; CHECK: lock cmpxchgb
-; CHECK: jne [[LABEL7]]
   ret void
 }
diff --git a/test/CodeGen/X86/pseudo_cmov_lower2.ll b/test/CodeGen/X86/pseudo_cmov_lower2.ll
index 1a61b0b970004fd06f3c6d9636fd092605cf0e73..5218e1f0cee9ee9012766b0699af900186a4695b 100644
--- a/test/CodeGen/X86/pseudo_cmov_lower2.ll
+++ b/test/CodeGen/X86/pseudo_cmov_lower2.ll
@@ -1,14 +1,29 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -o - | FileCheck %s
 
 ; This test checks that only a single jae gets generated in the final code
 ; for lowering the CMOV pseudos that get created for this IR.  The tricky part
 ; of this test is that it tests the special PHI operand rewriting code in
 ; X86TargetLowering::EmitLoweredSelect.
 ;
-; CHECK-LABEL: foo1:
-; CHECK: jae
-; CHECK-NOT: jae
 define double @foo1(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    ucomiss %xmm3, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    addsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    movapd %xmm1, %xmm2
+; CHECK-NEXT:  .LBB0_3: # %entry
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %d0 = fadd double %p2, 1.25e0
@@ -26,10 +41,24 @@ entry:
 ; of this test is that it tests the special PHI operand rewriting code in
 ; X86TargetLowering::EmitLoweredSelect.
 ;
-; CHECK-LABEL: foo2:
-; CHECK: jae
-; CHECK-NOT: jae
 define double @foo2(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    ucomiss %xmm3, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    addsd %xmm0, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm0
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:    jmp .LBB1_3
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    addsd %xmm1, %xmm0
+; CHECK-NEXT:  .LBB1_3: # %entry
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %d0 = fadd double %p2, 1.25e0
@@ -48,16 +77,17 @@ entry:
 ; X86TargetLowering::EmitLoweredSelect.  It also tests to make sure all
 ; the operands of the resulting instructions are from the proper places.
 ;
-; CHECK-LABEL: foo3:
-; CHECK:          js
-; CHECK-NOT: js
-; CHECK-LABEL: # %bb.1:
-; CHECK-DAG:      movapd  %xmm2, %xmm1
-; CHECK-DAG:      movapd  %xmm2, %xmm0
-; CHECK-LABEL:.LBB2_2:
-; CHECK:          divsd   %xmm1, %xmm0
-; CHECK:          ret
 define double @foo3(i32 %p1, double %p2, double %p3,
+; CHECK-LABEL: foo3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:    movapd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB2_2: # %entry
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
                              double %p4, double %p5) nounwind {
 entry:
   %c1 = icmp slt i32 %p1, 0
@@ -78,16 +108,17 @@ entry:
 ; condition code in the second two selects, but we also swap the operands
 ; of the selects to give the same actual computation.
 ;
-; CHECK-LABEL: foo4:
-; CHECK:          js
-; CHECK-NOT: js
-; CHECK-LABEL: # %bb.1:
-; CHECK-DAG:      movapd  %xmm2, %xmm1
-; CHECK-DAG:      movapd  %xmm2, %xmm0
-; CHECK-LABEL:.LBB3_2:
-; CHECK:          divsd   %xmm1, %xmm0
-; CHECK:          ret
 define double @foo4(i32 %p1, double %p2, double %p3,
+; CHECK-LABEL: foo4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB3_2
+; CHECK-NEXT:  # %bb.1: # %entry
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:    movapd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB3_2: # %entry
+; CHECK-NEXT:    divsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
                              double %p4, double %p5) nounwind {
 entry:
   %c1 = icmp slt i32 %p1, 0
@@ -103,10 +134,24 @@ entry:
 ; for lowering the CMOV pseudos that get created for this IR.  The tricky part
 ; of this test is that it tests the special code in CodeGenPrepare.
 ;
-; CHECK-LABEL: foo5:
-; CHECK: jae
-; CHECK-NOT: jae
 define double @foo5(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm3, %xmm3
+; CHECK-NEXT:    ucomiss %xmm3, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB4_1
+; CHECK-NEXT:  # %bb.2: # %select.false
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB4_3: # %select.end
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB4_1:
+; CHECK-NEXT:    addsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    movapd %xmm1, %xmm2
+; CHECK-NEXT:    jmp .LBB4_3
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %d0 = fadd double %p2, 1.25e0
@@ -122,11 +167,35 @@ entry:
 ; We should expand select instructions into 3 conditional branches as their
 ; condtions are different.
 ;
-; CHECK-LABEL: foo6:
-; CHECK: jae
-; CHECK: jae
-; CHECK: jae
 define double @foo6(float %p1, double %p2, double %p3) nounwind {
+; CHECK-LABEL: foo6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movaps %xmm0, %xmm3
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    ucomiss %xmm0, %xmm3
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    jae .LBB5_1
+; CHECK-NEXT:  # %bb.2: # %select.false
+; CHECK-NEXT:    addsd %xmm2, %xmm0
+; CHECK-NEXT:  .LBB5_3: # %select.end
+; CHECK-NEXT:    ucomiss {{.*}}(%rip), %xmm3
+; CHECK-NEXT:    movapd %xmm0, %xmm4
+; CHECK-NEXT:    jae .LBB5_5
+; CHECK-NEXT:  # %bb.4: # %select.false2
+; CHECK-NEXT:    movapd %xmm1, %xmm4
+; CHECK-NEXT:  .LBB5_5: # %select.end1
+; CHECK-NEXT:    ucomiss {{.*}}(%rip), %xmm3
+; CHECK-NEXT:    movapd %xmm4, %xmm1
+; CHECK-NEXT:    jae .LBB5_7
+; CHECK-NEXT:  # %bb.6: # %select.false4
+; CHECK-NEXT:    movapd %xmm2, %xmm1
+; CHECK-NEXT:  .LBB5_7: # %select.end3
+; CHECK-NEXT:    subsd %xmm4, %xmm0
+; CHECK-NEXT:    addsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB5_1:
+; CHECK-NEXT:    addsd %xmm1, %xmm0
+; CHECK-NEXT:    jmp .LBB5_3
 entry:
   %c1 = fcmp oge float %p1, 0.000000e+00
   %c2 = fcmp oge float %p1, 1.000000e+00
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 0900fdccb49b8f1d5dd436bb107c180f0f7d3d17..d0ed99f92f3a79891bd31ed0046edcb92b564edd 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -57,9 +57,9 @@ define <16 x i8> @test5(<16 x i8> %V) {
 ; CHECK-NEXT:    movl $1, %eax
 ; CHECK-NEXT:    movq %rax, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1]
-; CHECK-NEXT:    movdqa %xmm1, (%rax)
-; CHECK-NEXT:    pshufb %xmm1, %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1,1]
+; CHECK-NEXT:    movaps %xmm1, (%rax)
+; CHECK-NEXT:    pshufb (%rax), %xmm0
 ; CHECK-NEXT:    retq
   store <2 x i64> <i64 1, i64 0>, <2 x i64>* undef, align 16
   %l = load <2 x i64>, <2 x i64>* undef, align 16
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index e2089f6b0d259828c3201692d082e035a4f8e89e..6e2e97980c7bd5c987e8c5a4798853b98698cc13 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -531,43 +531,41 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE41-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT:    pshufb %xmm6, %xmm0
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    pmaxud %xmm2, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm6, %xmm7
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pmaxud %xmm2, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm0
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    psubd %xmm1, %xmm4
-; SSE41-NEXT:    pshufb %xmm6, %xmm4
-; SSE41-NEXT:    pshufb %xmm6, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm4
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
 ; SSE41-NEXT:    pandn %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test13:
 ; AVX1:       # %bb.0: # %vector.ph
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpmaxud %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm2, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -792,7 +790,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
 ; AVX1-NEXT:    vpsubd %xmm9, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm11, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
@@ -916,43 +914,41 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT:    pshufb %xmm6, %xmm4
-; SSE41-NEXT:    movdqa %xmm3, %xmm7
-; SSE41-NEXT:    pminud %xmm2, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm6, %xmm7
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; SSE41-NEXT:    movdqa %xmm3, %xmm6
+; SSE41-NEXT:    pminud %xmm2, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm4
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE41-NEXT:    pand %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test15:
 ; AVX1:       # %bb.0: # %vector.ph
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpminud %xmm5, %xmm2, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm2, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpminud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1052,43 +1048,41 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
 ; SSE41-NEXT:    pcmpeqd %xmm1, %xmm4
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
 ; SSE41-NEXT:    pxor %xmm5, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT:    pshufb %xmm6, %xmm4
-; SSE41-NEXT:    movdqa %xmm2, %xmm7
-; SSE41-NEXT:    pmaxud %xmm3, %xmm7
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm7
-; SSE41-NEXT:    pxor %xmm5, %xmm7
-; SSE41-NEXT:    pshufb %xmm6, %xmm7
-; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
+; SSE41-NEXT:    movdqa %xmm2, %xmm6
+; SSE41-NEXT:    pmaxud %xmm3, %xmm6
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm6
+; SSE41-NEXT:    pxor %xmm5, %xmm6
+; SSE41-NEXT:    packssdw %xmm6, %xmm4
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    psubd %xmm1, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm0
-; SSE41-NEXT:    pshufb %xmm6, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT:    pshufb %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm1, %xmm3
 ; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; SSE41-NEXT:    pand %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: test16:
 ; AVX1:       # %bb.0: # %vector.ph
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpmaxud %xmm2, %xmm5, %xmm6
-; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm5, %xmm6
-; AVX1-NEXT:    vpxor %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm3, %xmm4
+; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm0, %xmm1, %xmm6
+; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 47484865693ff0012903f405253ae94b86284368..a68940eb11a3b6414f1a145acce3432b0077d05a 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
@@ -37,6 +38,12 @@ define float @f32_no_estimate(float %x) #0 {
 ; FMA-RECIP-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_no_estimate:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -105,6 +112,13 @@ define float @f32_one_step(float %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_one_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -202,6 +216,16 @@ define float @f32_two_step(float %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_two_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -283,56 +307,62 @@ define float @f32_two_step(float %x) #2 {
 define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ; SSE-LABEL: v4f32_no_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    divps %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-RECIP-LABEL: v4f32_no_estimate:
 ; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v4f32_no_estimate:
 ; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v4f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: v4f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [14:14.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v4f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v4f32_no_estimate:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [13:7.00]
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v4f32_no_estimate:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SKX-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [11:3.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -344,7 +374,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    subps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps %xmm2, %xmm1
@@ -355,7 +385,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -368,9 +398,16 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v4f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
@@ -382,7 +419,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -391,7 +428,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
@@ -400,7 +437,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -409,7 +446,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; KNL-LABEL: v4f32_one_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; KNL-NEXT:    retq # sched: [7:1.00]
@@ -430,7 +467,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -446,7 +483,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
 ; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
 ; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
@@ -459,7 +496,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v4f32_two_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
@@ -467,9 +504,19 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm3) + xmm3
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v4f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
@@ -485,7 +532,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -498,7 +545,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; HASWELL-LABEL: v4f32_two_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -510,7 +557,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
@@ -523,7 +570,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; KNL-LABEL: v4f32_two_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -534,7 +581,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SKX-LABEL: v4f32_two_step:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
@@ -548,7 +595,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; SSE-LABEL: v8f32_no_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    divps %xmm0, %xmm3
 ; SSE-NEXT:    divps %xmm1, %xmm2
@@ -558,49 +605,55 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ;
 ; AVX-RECIP-LABEL: v8f32_no_estimate:
 ; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v8f32_no_estimate:
 ; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: v8f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [29:28.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v8f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v8f32_no_estimate:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [21:14.00]
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v8f32_no_estimate:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SKX-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [11:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -612,7 +665,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm4, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    subps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm4, %xmm3
@@ -630,7 +683,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -643,9 +696,16 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
@@ -657,7 +717,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -666,7 +726,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
@@ -675,7 +735,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -684,7 +744,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; KNL-LABEL: v8f32_one_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; KNL-NEXT:    retq # sched: [7:1.00]
@@ -706,7 +766,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm3
 ; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
 ; SSE-NEXT:    mulps %xmm3, %xmm5
@@ -734,7 +794,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -747,7 +807,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v8f32_two_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
@@ -755,9 +815,19 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm3) + ymm3
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
@@ -773,7 +843,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -786,7 +856,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; HASWELL-LABEL: v8f32_two_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -798,7 +868,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -811,7 +881,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; KNL-LABEL: v8f32_two_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -822,7 +892,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SKX-LABEL: v8f32_two_step:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
@@ -836,7 +906,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; SSE-LABEL: v16f32_no_estimate:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm4, %xmm5
 ; SSE-NEXT:    divps %xmm0, %xmm5
 ; SSE-NEXT:    movaps %xmm4, %xmm6
@@ -852,55 +922,62 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ;
 ; AVX-RECIP-LABEL: v16f32_no_estimate:
 ; AVX-RECIP:       # %bb.0:
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; AVX-RECIP-NEXT:    retq
 ;
 ; FMA-RECIP-LABEL: v16f32_no_estimate:
 ; FMA-RECIP:       # %bb.0:
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; FMA-RECIP-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_no_estimate:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [9:19.00]
+; BDVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [9:19.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_no_estimate:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [38:38.00]
 ; BTVER2-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [38:38.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: v16f32_no_estimate:
 ; SANDY:       # %bb.0:
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [29:28.00]
 ; SANDY-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [29:28.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: v16f32_no_estimate:
 ; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vdivps %ymm0, %ymm2, %ymm0 # sched: [21:14.00]
 ; HASWELL-NEXT:    vdivps %ymm1, %ymm2, %ymm1 # sched: [21:14.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-NO-FMA-LABEL: v16f32_no_estimate:
 ; HASWELL-NO-FMA:       # %bb.0:
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; HASWELL-NO-FMA-NEXT:    retq
 ;
 ; KNL-LABEL: v16f32_no_estimate:
 ; KNL:       # %bb.0:
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
 ; KNL-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [21:14.00]
 ; KNL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: v16f32_no_estimate:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
 ; SKX-NEXT:    vdivps %zmm0, %zmm1, %zmm0 # sched: [18:10.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -914,7 +991,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm0, %xmm5
 ; SSE-NEXT:    rcpps %xmm0, %xmm6
 ; SSE-NEXT:    mulps %xmm6, %xmm5
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    subps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm6, %xmm0
@@ -944,7 +1021,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
@@ -958,7 +1035,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; FMA-RECIP-LABEL: v16f32_one_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
@@ -966,9 +1043,20 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_one_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm4, %ymm1, %ymm4, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_one_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm4 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
@@ -985,7 +1073,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
@@ -999,7 +1087,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; HASWELL-LABEL: v16f32_one_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
@@ -1011,7 +1099,7 @@ define <16 x float> @v16f32_one_step(<16 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
@@ -1048,7 +1136,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm0
 ; SSE-NEXT:    movaps %xmm1, %xmm6
 ; SSE-NEXT:    mulps %xmm0, %xmm6
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm7
 ; SSE-NEXT:    subps %xmm6, %xmm7
 ; SSE-NEXT:    mulps %xmm0, %xmm7
@@ -1100,7 +1188,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
 ; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
 ; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
@@ -1122,7 +1210,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v16f32_two_step:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
@@ -1136,9 +1224,24 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm4) + ymm4
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_two_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_two_step:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
@@ -1163,7 +1266,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
@@ -1185,7 +1288,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; HASWELL-LABEL: v16f32_two_step:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
@@ -1203,7 +1306,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
@@ -1225,7 +1328,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; KNL-LABEL: v16f32_two_step:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
 ; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
@@ -1236,7 +1339,7 @@ define <16 x float> @v16f32_two_step(<16 x float> %x) #2 {
 ; SKX-LABEL: v16f32_two_step:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index fdd441581dcbbd93ab3c96490de371c08dafd62c..dbe2689077e1a1cf95db5326fbfdde2e3a0a69a3 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -print-schedule       | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -print-schedule  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -print-schedule      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -print-schedule     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
@@ -30,6 +31,12 @@ define float @f32_no_step_2(float %x) #3 {
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_no_step_2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_no_step_2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
@@ -101,6 +108,14 @@ define float @f32_one_step_2(float %x) #1 {
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_one_step_2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_one_step_2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -196,6 +211,15 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -309,6 +333,17 @@ define float @f32_two_step_2(float %x) #2 {
 ; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: f32_two_step_2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -398,7 +433,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    subps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps %xmm2, %xmm1
@@ -410,7 +445,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -425,9 +460,17 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_one_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v4f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
@@ -440,7 +483,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -450,7 +493,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; HASWELL-LABEL: v4f32_one_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
@@ -460,7 +503,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -470,7 +513,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; KNL-LABEL: v4f32_one_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
@@ -492,11 +535,11 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    subps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm1, %xmm2
 ; SSE-NEXT:    addps %xmm1, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,3,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    mulps %xmm2, %xmm0
 ; SSE-NEXT:    retq
@@ -505,7 +548,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -522,9 +565,18 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v4f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
@@ -538,7 +590,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -549,7 +601,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL-LABEL: v4f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
@@ -560,7 +612,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
@@ -571,7 +623,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; KNL-LABEL: v4f32_one_step_2_divs:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
@@ -597,7 +649,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -614,7 +666,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
 ; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
 ; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
 ; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
@@ -628,7 +680,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v4f32_two_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1
@@ -637,9 +689,20 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v4f32_two_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm3, %xmm1, %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %xmm1, %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v4f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %xmm0, %xmm1 # sched: [2:1.00]
 ; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
 ; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
@@ -656,7 +719,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -670,7 +733,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; HASWELL-LABEL: v4f32_two_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -683,7 +746,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -697,7 +760,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; KNL-LABEL: v4f32_two_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; KNL-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [5:0.50]
@@ -709,7 +772,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SKX-LABEL: v4f32_two_step2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %xmm0, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [6:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} xmm3 = -(xmm0 * xmm3) + xmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} xmm3 = (xmm3 * xmm1) + xmm1 sched: [4:0.50]
@@ -726,7 +789,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm1, %xmm4
 ; SSE-NEXT:    mulps %xmm4, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm2, %xmm3
 ; SSE-NEXT:    subps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm4, %xmm3
@@ -746,7 +809,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -761,9 +824,17 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_one_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
@@ -776,7 +847,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -786,7 +857,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; HASWELL-LABEL: v8f32_one_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
@@ -796,7 +867,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -806,7 +877,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; KNL-LABEL: v8f32_one_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
@@ -828,7 +899,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm4
 ; SSE-NEXT:    subps %xmm0, %xmm4
 ; SSE-NEXT:    mulps %xmm2, %xmm4
@@ -838,9 +909,9 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SSE-NEXT:    subps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm0, %xmm3
 ; SSE-NEXT:    addps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5,6,7,8]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
 ; SSE-NEXT:    mulps %xmm3, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,3,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; SSE-NEXT:    mulps %xmm4, %xmm0
 ; SSE-NEXT:    mulps %xmm4, %xmm0
 ; SSE-NEXT:    mulps %xmm3, %xmm1
@@ -850,7 +921,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -867,9 +938,18 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm0 # sched: [10:1.00]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:2.00]
@@ -883,7 +963,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -894,7 +974,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL-LABEL: v8f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
@@ -905,7 +985,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
@@ -916,7 +996,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; KNL-LABEL: v8f32_one_step_2_divs:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm1 sched: [5:0.50]
 ; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
@@ -943,7 +1023,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SSE-NEXT:    rcpps %xmm1, %xmm3
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    mulps %xmm3, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm0, %xmm5
 ; SSE-NEXT:    subps %xmm4, %xmm5
 ; SSE-NEXT:    mulps %xmm3, %xmm5
@@ -973,7 +1053,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -987,7 +1067,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v8f32_two_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1
@@ -996,9 +1076,20 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_two_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm3 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm3, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm1, %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm1 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:2.00]
@@ -1015,7 +1106,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm1 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -1029,7 +1120,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; HASWELL-LABEL: v8f32_two_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -1042,7 +1133,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -1056,7 +1147,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; KNL-LABEL: v8f32_two_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcpps %ymm0, %ymm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; KNL-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [5:0.50]
@@ -1068,7 +1159,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SKX-LABEL: v8f32_two_step2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcpps %ymm0, %ymm1 # sched: [4:1.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} ymm3 = -(ymm0 * ymm3) + ymm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm1) + ymm1 sched: [4:0.50]
@@ -1097,6 +1188,11 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_no_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_no_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
@@ -1151,6 +1247,12 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v8f32_no_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v8f32_no_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
@@ -1198,7 +1300,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm0, %xmm6
 ; SSE-NEXT:    rcpps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    subps %xmm4, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm3
@@ -1231,7 +1333,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
 ; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
 ; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
@@ -1247,7 +1349,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; FMA-RECIP-LABEL: v16f32_one_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
@@ -1257,9 +1359,22 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_one_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm4, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm4, %ymm0, %ymm4, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_one_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm4 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [2:2.00]
@@ -1278,7 +1393,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
@@ -1294,7 +1409,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; HASWELL-LABEL: v16f32_one_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm4 # sched: [11:2.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * ymm2) + ymm2 sched: [5:0.50]
@@ -1308,7 +1423,7 @@ define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm2, %ymm1 # sched: [3:1.00]
@@ -1345,7 +1460,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rcpps %xmm0, %xmm6
 ; SSE-NEXT:    mulps %xmm6, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm4, %xmm5
 ; SSE-NEXT:    subps %xmm0, %xmm5
 ; SSE-NEXT:    mulps %xmm6, %xmm5
@@ -1367,13 +1482,13 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; SSE-NEXT:    subps %xmm3, %xmm4
 ; SSE-NEXT:    mulps %xmm0, %xmm4
 ; SSE-NEXT:    addps %xmm0, %xmm4
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [13,14,15,16]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
 ; SSE-NEXT:    mulps %xmm4, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9,10,11,12]
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
 ; SSE-NEXT:    mulps %xmm7, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5,6,7,8]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
 ; SSE-NEXT:    mulps %xmm6, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,3,4]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; SSE-NEXT:    mulps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm6, %xmm1
@@ -1385,7 +1500,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
 ; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
@@ -1403,7 +1518,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
@@ -1415,9 +1530,24 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_one_step_2_divs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm3 # sched: [10:2.00]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm2 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_one_step_2_divs:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:2.00]
@@ -1438,7 +1568,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm0, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; SANDY-NEXT:    vrcpps %ymm1, %ymm4 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:1.00]
@@ -1456,7 +1586,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; HASWELL-LABEL: v16f32_one_step_2_divs:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 sched: [5:0.50]
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
@@ -1472,7 +1602,7 @@ define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
@@ -1517,7 +1647,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    rcpps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm2, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm0, %xmm7
 ; SSE-NEXT:    subps %xmm3, %xmm7
 ; SSE-NEXT:    mulps %xmm2, %xmm7
@@ -1573,7 +1703,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; AVX-RECIP:       # %bb.0:
 ; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
 ; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
-; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1]
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
 ; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
 ; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
@@ -1597,7 +1727,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; FMA-RECIP-LABEL: v16f32_two_step2:
 ; FMA-RECIP:       # %bb.0:
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
-; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
 ; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3
 ; FMA-RECIP-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2
@@ -1613,9 +1743,26 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_two_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm1, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm1, %ymm2, %ymm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm2 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 # sched: [5:0.50]
+; BDVER2-NEXT:    vfnmaddps %ymm3, %ymm2, %ymm0, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vfmaddps %ymm2, %ymm0, %ymm2, %ymm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_two_step2:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [5:1.00]
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm2 # sched: [2:2.00]
 ; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [2:2.00]
 ; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:2.00]
@@ -1642,7 +1789,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vrcpps %ymm1, %ymm2 # sched: [7:2.00]
 ; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:1.00]
-; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
 ; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:1.00]
 ; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
@@ -1666,7 +1813,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; HASWELL-LABEL: v16f32_two_step2:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
-; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NEXT:    vmovaps %ymm2, %ymm4 # sched: [1:1.00]
 ; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm4 = -(ymm1 * ymm4) + ymm3 sched: [5:0.50]
 ; HASWELL-NEXT:    vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 sched: [5:0.50]
@@ -1686,7 +1833,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; HASWELL-NO-FMA:       # %bb.0:
 ; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm2 # sched: [11:2.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [7:0.50]
 ; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3 # sched: [3:1.00]
 ; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3 # sched: [5:0.50]
 ; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2 # sched: [3:1.00]
@@ -1710,7 +1857,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; KNL-LABEL: v16f32_two_step2:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [11:2.00]
-; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [10:1.00]
+; KNL-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [10:1.00]
 ; KNL-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:1.00]
 ; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [5:0.50]
 ; KNL-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [5:0.50]
@@ -1722,7 +1869,7 @@ define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
 ; SKX-LABEL: v16f32_two_step2:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vrcp14ps %zmm0, %zmm1 # sched: [4:2.00]
-; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] sched: [8:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] sched: [8:0.50]
 ; SKX-NEXT:    vmovaps %zmm1, %zmm3 # sched: [1:0.33]
 ; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm3 = -(zmm0 * zmm3) + zmm2 sched: [4:0.50]
 ; SKX-NEXT:    vfmadd132ps {{.*#+}} zmm3 = (zmm3 * zmm1) + zmm1 sched: [4:0.50]
@@ -1755,6 +1902,12 @@ define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_no_step:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_no_step:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [2:2.00]
@@ -1821,6 +1974,14 @@ define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
 ; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
 ; FMA-RECIP-NEXT:    retq
 ;
+; BDVER2-LABEL: v16f32_no_step2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [5:2.00]
+; BDVER2-NEXT:    vrcpps %ymm0, %ymm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [10:2.00]
+; BDVER2-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: v16f32_no_step2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vrcpps %ymm1, %ymm1 # sched: [2:2.00]
diff --git a/test/CodeGen/X86/retpoline-external.ll b/test/CodeGen/X86/retpoline-external.ll
index 308a1a3181bbd2204e144c4f66aa497ad809a8d0..849660cdedbd4a8118a2e303d94f91931dce2204 100644
--- a/test/CodeGen/X86/retpoline-external.ll
+++ b/test/CodeGen/X86/retpoline-external.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
-; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
 
-; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
-; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
 
 declare void @bar(i32)
 
diff --git a/test/CodeGen/X86/retpoline-regparm.ll b/test/CodeGen/X86/retpoline-regparm.ll
index 472cf0b1f0d173761eaf18fea3ae4434831f375f..668047c3891741cf89698238d6ce7986ee7de63b 100644
--- a/test/CodeGen/X86/retpoline-regparm.ll
+++ b/test/CodeGen/X86/retpoline-regparm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s
+; RUN: llc -verify-machineinstrs -mtriple=i686-linux < %s | FileCheck --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" %s
 
 ; Test 32-bit retpoline when -mregparm=3 is used. This case is interesting
 ; because there are no available scratch registers.  The Linux kernel builds
diff --git a/test/CodeGen/X86/retpoline.ll b/test/CodeGen/X86/retpoline.ll
index 2625435ab8c17573b1663bf4dacdc6b376f2ec6f..9a1673e8a5672ccd1785e60a8cfcc1da575d7119 100644
--- a/test/CodeGen/X86/retpoline.ll
+++ b/test/CodeGen/X86/retpoline.ll
@@ -1,8 +1,8 @@
-; RUN: llc -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
-; RUN: llc -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64
+; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X64FAST
 
-; RUN: llc -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
-; RUN: llc -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86
+; RUN: llc -verify-machineinstrs -mtriple=i686-unknown -O0 < %s | FileCheck %s --implicit-check-not="jmp.*\*" --implicit-check-not="call.*\*" --check-prefix=X86FAST
 
 declare void @bar(i32)
 
@@ -428,8 +428,9 @@ latch:
 ; X64-NEXT:          lfence
 ; X64-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X64-NEXT:          .p2align        4, 0x90
-; X64-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X64-NEXT:  {{.*}}                                  # Block address taken
 ; X64-NEXT:                                          # %entry
+; X64-NEXT:  [[CALL_TARGET]]:
 ; X64-NEXT:          movq    %r11, (%rsp)
 ; X64-NEXT:          retq
 ;
@@ -446,8 +447,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %eax, (%esp)
 ; X86-NEXT:          retl
 ;
@@ -464,8 +466,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %ecx, (%esp)
 ; X86-NEXT:          retl
 ;
@@ -482,8 +485,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %edx, (%esp)
 ; X86-NEXT:          retl
 ;
@@ -500,8 +504,9 @@ latch:
 ; X86-NEXT:          lfence
 ; X86-NEXT:          jmp     [[CAPTURE_SPEC]]
 ; X86-NEXT:          .p2align        4, 0x90
-; X86-NEXT:  [[CALL_TARGET]]:                        # Block address taken
+; X86-NEXT:  {{.*}}                                  # Block address taken
 ; X86-NEXT:                                          # %entry
+; X86-NEXT:  [[CALL_TARGET]]:
 ; X86-NEXT:          movl    %edi, (%esp)
 ; X86-NEXT:          retl
 
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 314b0c74f9f059166c3fde02ec9333b8dfb58805..51ac1d5caeac74919d917831c39b739b0b453c77 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -56,7 +56,8 @@ define i32 @sad_16i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -80,7 +81,8 @@ define i32 @sad_16i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -105,7 +107,7 @@ define i32 @sad_16i8() nounwind {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -152,16 +154,16 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    pxor %xmm12, %xmm12
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm6, %xmm6
 ; SSE2-NEXT:    pxor %xmm13, %xmm13
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm15, %xmm15
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm14, %xmm14
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB1_1: # %vector.body
@@ -219,17 +221,17 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm6
 ; SSE2-NEXT:    paddd %xmm6, %xmm7
 ; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm7, %xmm6
-; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm4, %xmm6
 ; SSE2-NEXT:    psrad $31, %xmm6
 ; SSE2-NEXT:    paddd %xmm6, %xmm4
 ; SSE2-NEXT:    pxor %xmm6, %xmm4
 ; SSE2-NEXT:    movdqa %xmm10, %xmm6
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm4, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
 ; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
@@ -244,9 +246,9 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm5
 ; SSE2-NEXT:    pxor %xmm1, %xmm5
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
@@ -256,9 +258,9 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm8, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm8
@@ -267,13 +269,13 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm15, %xmm0
 ; SSE2-NEXT:    paddd %xmm14, %xmm13
 ; SSE2-NEXT:    paddd %xmm0, %xmm13
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm13, %xmm6
 ; SSE2-NEXT:    paddd %xmm0, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
@@ -317,7 +319,8 @@ define i32 @sad_32i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -343,7 +346,8 @@ define i32 @sad_32i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -370,7 +374,7 @@ define i32 @sad_32i8() nounwind {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -420,42 +424,42 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    pxor %xmm14, %xmm14
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB2_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT:    movaps a+1040(%rax), %xmm0
-; SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm12
 ; SSE2-NEXT:    movdqa a+1056(%rax), %xmm15
 ; SSE2-NEXT:    movdqa a+1072(%rax), %xmm4
@@ -516,7 +520,7 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
 ; SSE2-NEXT:    psubd %xmm3, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
 ; SSE2-NEXT:    psubd %xmm0, %xmm15
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
@@ -524,8 +528,8 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
 ; SSE2-NEXT:    psubd %xmm3, %xmm9
-; SSE2-NEXT:    movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm9
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
@@ -534,7 +538,7 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
 ; SSE2-NEXT:    psubd %xmm0, %xmm13
-; SSE2-NEXT:    movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm9, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7]
@@ -563,16 +567,16 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm3
 ; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm1
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm6, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm6
 ; SSE2-NEXT:    pxor %xmm1, %xmm6
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm6, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm5
@@ -584,118 +588,118 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm4
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm8, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm8
 ; SSE2-NEXT:    pxor %xmm1, %xmm8
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm8, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm11, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm11
 ; SSE2-NEXT:    pxor %xmm1, %xmm11
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm11, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm15, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm15
 ; SSE2-NEXT:    pxor %xmm1, %xmm15
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm15, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm10, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm10
 ; SSE2-NEXT:    pxor %xmm1, %xmm10
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm10, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm12, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm12
 ; SSE2-NEXT:    pxor %xmm1, %xmm12
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm12, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm9, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm9
 ; SSE2-NEXT:    pxor %xmm0, %xmm9
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm9, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm7
 ; SSE2-NEXT:    pxor %xmm0, %xmm7
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm7, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm13, %xmm1
 ; SSE2-NEXT:    movdqa %xmm13, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB2_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm1, %xmm4
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE2-NEXT:    paddd (%rsp), %xmm1 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd %xmm4, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
@@ -737,30 +741,30 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpsubd %xmm6, %xmm5, %xmm4
@@ -803,27 +807,27 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vpabsd %xmm4, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm13, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm13, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm13
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm8, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm9, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm9
 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddd %xmm10, %xmm1, %xmm1
-; AVX1-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm10
 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -858,7 +862,8 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm14, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    addq $24, %rsp
 ; AVX1-NEXT:    vzeroupper
@@ -886,10 +891,10 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm8, %ymm8
-; AVX2-NEXT:    vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX2-NEXT:    vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm9, %ymm9
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -903,9 +908,9 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm14, %ymm14
 ; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-NEXT:    vpsubd %ymm15, %ymm8, %ymm15
-; AVX2-NEXT:    vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT:    vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX2-NEXT:    vpaddd %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vpabsd %ymm9, %ymm8
 ; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5
@@ -935,7 +940,8 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -983,7 +989,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1012,7 +1018,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1430,7 +1436,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
@@ -1448,7 +1455,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1470,7 +1478,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1533,7 +1541,8 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    retq
 ;
@@ -1548,7 +1557,8 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1567,7 +1577,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
diff --git a/test/CodeGen/X86/sadd_sat.ll b/test/CodeGen/X86/sadd_sat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..39788e86cc7b17165fed06f39cefb618c70ca39c
--- /dev/null
+++ b/test/CodeGen/X86/sadd_sat.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.sadd.sat.i4   (i4,  i4)
+declare  i32 @llvm.sadd.sat.i32  (i32, i32)
+declare  i64 @llvm.sadd.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    addl %esi, %ecx
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    cmovnol %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    .cfi_offset %esi, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    movl %eax, %esi
+; CHECK32-NEXT:    addl %edx, %esi
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %edx, %eax
+; CHECK32-NEXT:    cmovol %ecx, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.sadd.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    cmovnoq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl %ebx, %ebp
+; CHECK32-NEXT:    adcl %esi, %ebp
+; CHECK32-NEXT:    movl %ebp, %eax
+; CHECK32-NEXT:    sarl $31, %eax
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    testl %ebp, %ebp
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    movl %ecx, %edx
+; CHECK32-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    testl %ebx, %ebx
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    cmpb %cl, %bl
+; CHECK32-NEXT:    setne %cl
+; CHECK32-NEXT:    testl %esi, %esi
+; CHECK32-NEXT:    setns %ch
+; CHECK32-NEXT:    cmpb %ch, %bl
+; CHECK32-NEXT:    sete %ch
+; CHECK32-NEXT:    testb %cl, %ch
+; CHECK32-NEXT:    cmovel %ebp, %edx
+; CHECK32-NEXT:    cmovel %edi, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.sadd.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    addb %sil, %cl
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    addb %sil, %al
+; CHECK-NEXT:    jno .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addb $127, %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sarb $4, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK32-NEXT:    shlb $4, %dl
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    movl %eax, %ecx
+; CHECK32-NEXT:    addb %dl, %cl
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    addb %dl, %al
+; CHECK32-NEXT:    jno .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    addb $127, %cl
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    sarb $4, %al
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.sadd.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %r8d
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %r8d, %esi
+; CHECK-NEXT:    addl %ecx, %esi
+; CHECK-NEXT:    setns %dl
+; CHECK-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %ecx, %r8d
+; CHECK-NEXT:    cmovol %edx, %r8d
+; CHECK-NEXT:    movd %xmm1, %edx
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    addl %edx, %edi
+; CHECK-NEXT:    setns %sil
+; CHECK-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    cmovol %esi, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %edx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    cmovol %edi, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %r9d
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    addl %r9d, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    addl %r9d, %edx
+; CHECK-NEXT:    cmovol %edi, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    movd %r8d, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %ecx, %esi
+; CHECK32-NEXT:    addl %edx, %esi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %edx, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovol %eax, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %edx, %edi
+; CHECK32-NEXT:    addl %esi, %edi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %esi, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovol %eax, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %esi, %ebx
+; CHECK32-NEXT:    addl %edi, %ebx
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %edi, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    cmovol %eax, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    xorl %ebx, %ebx
+; CHECK32-NEXT:    movl %edi, %ebp
+; CHECK32-NEXT:    addl %eax, %ebp
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    addl %eax, %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    cmovol %ebx, %edi
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
diff --git a/test/CodeGen/X86/sat-add.ll b/test/CodeGen/X86/sat-add.ll
index 3cb11b11ec36de6498a9b89249c5fe02a2c5db01..f0989e8b081af7d8e23b541c1fc4ab53ef3c5bdd 100644
--- a/test/CodeGen/X86/sat-add.ll
+++ b/test/CodeGen/X86/sat-add.ll
@@ -679,13 +679,12 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16
 define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v8i16_using_min:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pminsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    pminsw %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    paddw %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -717,15 +716,12 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i
 define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8 x i16> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    paddw %xmm1, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtw %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    paddw %xmm1, %xmm2
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval:
@@ -751,15 +747,14 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32>
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT:    pxor %xmm1, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm3, %xmm2
+; SSE2-NEXT:    pandn %xmm4, %xmm2
 ; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
@@ -809,15 +804,12 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i
 define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm3, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
diff --git a/test/CodeGen/X86/schedule-x86-64-shld.ll b/test/CodeGen/X86/schedule-x86-64-shld.ll
index 46388d7b4fd1f2ada6a474227fadcfc52721b2cb..0e66329f7b457895d9567708739892a9e8068685 100644
--- a/test/CodeGen/X86/schedule-x86-64-shld.ll
+++ b/test/CodeGen/X86/schedule-x86-64-shld.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER1
 
 
 ; uint64_t lshift10(uint64_t a, uint64_t b)
@@ -16,17 +17,17 @@ define i64 @lshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift10_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shldq $10, %rsi, %rax # sched: [4:3.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shldq $10, %rsi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift10_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    shldq $10, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, 10
   %shr = lshr i64 %b, 54
@@ -41,19 +42,19 @@ define i64 @lshift10(i64 %a, i64 %b) nounwind readnone {
 ; GENERIC-NEXT:    shldq $10, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift10:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    shrq $54, %rsi # sched: [1:0.50]
+; BDVER12-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift10:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    shlq $10, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    shrq $54, %rsi # sched: [1:0.50]
 ; BTVER2-NEXT:    leaq (%rsi,%rdi), %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift10:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shlq $10, %rdi
-; BDVER1-NEXT:    shrq $54, %rsi
-; BDVER1-NEXT:    leaq (%rsi,%rdi), %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, 10
   %shr = lshr i64 %b, 54
@@ -74,17 +75,17 @@ define i64 @rshift10_optsize(i64 %a, i64 %b) nounwind readnone optsize {
 ; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift10_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shrdq $62, %rsi, %rax # sched: [4:3.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: rshift10_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    shrdq $62, %rsi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift10_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    shrdq $62, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = lshr i64 %a, 62
   %shr = shl i64 %b, 2
@@ -100,17 +101,17 @@ define i64 @rshift10(i64 %a, i64 %b) nounwind readnone {
 ; GENERIC-NEXT:    shrdq $62, %rsi, %rax # sched: [2:0.67]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift10:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    shrq $62, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: rshift10:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    shrq $62, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    leaq (%rdi,%rsi,4), %rax # sched: [2:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift10:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shrq $62, %rdi
-; BDVER1-NEXT:    leaq (%rdi,%rsi,4), %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = lshr i64 %a, 62
   %shr = shl i64 %b, 2
@@ -132,21 +133,21 @@ define i64 @lshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_cl_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shldq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_cl_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shldq %cl, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -164,6 +165,17 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; GENERIC-NEXT:    shldq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_cl:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shrq %cl, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
@@ -174,17 +186,6 @@ define i64 @lshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    shrq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_cl:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rsi, %rax
-; BDVER1-NEXT:    shlq %cl, %rdi
-; BDVER1-NEXT:    negl %ecx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrq %cl, %rax
-; BDVER1-NEXT:    orq %rdi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shl = shl i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -208,21 +209,21 @@ define i64 @rshift_cl_optsize(i64 %a, i64 %b, i64 %c) nounwind readnone optsize
 ; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift_cl_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: rshift_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
-; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:4.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift_cl_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rdi, %rax
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrdq %cl, %rsi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shr = lshr i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -240,6 +241,17 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; GENERIC-NEXT:    shrdq %cl, %rsi, %rax # sched: [4:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: rshift_cl:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rsi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: rshift_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rdx, %rcx # sched: [1:0.50]
@@ -250,17 +262,6 @@ define i64 @rshift_cl(i64 %a, i64 %b, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    shlq %cl, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: rshift_cl:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rdx, %rcx
-; BDVER1-NEXT:    movq %rsi, %rax
-; BDVER1-NEXT:    shrq %cl, %rdi
-; BDVER1-NEXT:    negl %ecx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shlq %cl, %rax
-; BDVER1-NEXT:    orq %rdi, %rax
-; BDVER1-NEXT:    retq
 entry:
   %shr = lshr i64 %a, %c
   %sub = sub nsw i64 64, %c
@@ -284,19 +285,19 @@ define void @lshift_mem_cl_optsize(i64 %a, i64 %c) nounwind readnone optsize {
 ; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_cl_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [4:11.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_cl_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
 ; BTVER2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; BTVER2-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_cl_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rsi, %rcx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shldq %cl, %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, %c
@@ -315,6 +316,18 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; GENERIC-NEXT:    shldq %cl, %rdi, {{.*}}(%rip) # sched: [10:1.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_cl:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    movq %rsi, %rcx # sched: [1:0.50]
+; BDVER12-NEXT:    shlq %cl, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    negl %ecx # sched: [1:0.50]
+; BDVER12-NEXT:    # kill: def $cl killed $cl killed $rcx
+; BDVER12-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_cl:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
@@ -326,18 +339,6 @@ define void @lshift_mem_cl(i64 %a, i64 %c) nounwind readnone {
 ; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_cl:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq %rsi, %rcx
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shlq %cl, %rax
-; BDVER1-NEXT:    negl %ecx
-; BDVER1-NEXT:    # kill: def $cl killed $cl killed $rcx
-; BDVER1-NEXT:    shrq %cl, %rdi
-; BDVER1-NEXT:    orq %rax, %rdi
-; BDVER1-NEXT:    movq %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, %c
@@ -354,6 +355,15 @@ define void @lshift_mem(i64 %a) nounwind readnone {
 ; GENERIC-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shrq $54, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    shlq $10, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rax, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_mem:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
@@ -362,15 +372,6 @@ define void @lshift_mem(i64 %a) nounwind readnone {
 ; BTVER2-NEXT:    orq %rax, %rdi # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rdi, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shlq $10, %rax
-; BDVER1-NEXT:    shrq $54, %rdi
-; BDVER1-NEXT:    orq %rax, %rdi
-; BDVER1-NEXT:    movq %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, 10
@@ -386,15 +387,15 @@ define void @lshift_mem_optsize(i64 %a) nounwind readnone optsize {
 ; GENERIC-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [8:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [4:11.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    shldq $10, %rdi, {{.*}}(%rip) # sched: [9:11.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    shldq $10, %rdi, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %b = load i64, i64* @x
   %shl = shl i64 %b, 10
@@ -412,6 +413,15 @@ define void @lshift_mem_b(i64 %b) nounwind readnone {
 ; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_b:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shlq $10, %rdi # sched: [1:0.50]
+; BDVER12-NEXT:    shrq $54, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    orq %rdi, %rax # sched: [1:0.50]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_b:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
@@ -420,15 +430,6 @@ define void @lshift_mem_b(i64 %b) nounwind readnone {
 ; BTVER2-NEXT:    orq %rdi, %rax # sched: [1:0.50]
 ; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_b:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shlq $10, %rdi
-; BDVER1-NEXT:    shrq $54, %rax
-; BDVER1-NEXT:    orq %rdi, %rax
-; BDVER1-NEXT:    movq %rax, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %a = load i64, i64* @x
   %shl = shl i64 %b, 10
@@ -446,19 +447,19 @@ define void @lshift_mem_b_optsize(i64 %b) nounwind readnone optsize {
 ; GENERIC-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER12-LABEL: lshift_mem_b_optsize:
+; BDVER12:       # %bb.0: # %entry
+; BDVER12-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:0.50]
+; BDVER12-NEXT:    shrdq $54, %rdi, %rax # sched: [4:3.00]
+; BDVER12-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:0.50]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: lshift_mem_b_optsize:
 ; BTVER2:       # %bb.0: # %entry
 ; BTVER2-NEXT:    movq {{.*}}(%rip), %rax # sched: [5:1.00]
 ; BTVER2-NEXT:    shrdq $54, %rdi, %rax # sched: [3:3.00]
 ; BTVER2-NEXT:    movq %rax, {{.*}}(%rip) # sched: [1:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; BDVER1-LABEL: lshift_mem_b_optsize:
-; BDVER1:       # %bb.0: # %entry
-; BDVER1-NEXT:    movq {{.*}}(%rip), %rax
-; BDVER1-NEXT:    shrdq $54, %rdi, %rax
-; BDVER1-NEXT:    movq %rax, {{.*}}(%rip)
-; BDVER1-NEXT:    retq
 entry:
   %a = load i64, i64* @x
   %shl = shl i64 %b, 10
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
index 873d6a679b0056d2bf10aa1309dc18f22f07ccd1..6b8ad906fec6788c6bf70778aa1e847cdde96631 100644
--- a/test/CodeGen/X86/schedule-x86_32.ll
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -76,6 +77,14 @@ define i8 @test_aaa(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aaa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aaa # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_aaa:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -168,6 +177,15 @@ define void @test_aad(i16 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aad:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aad # sched: [100:0.50]
+; BDVER2-NEXT:    aad $16 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_aad:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -262,6 +280,15 @@ define void @test_aam(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aam:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aam # sched: [100:0.50]
+; BDVER2-NEXT:    aam $16 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_aam:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -348,6 +375,14 @@ define i8 @test_aas(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_aas:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    aas # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_aas:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -440,6 +475,15 @@ define void @test_arpl(i16 %a0, i16 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_arpl:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    arpl %ax, (%ecx) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_arpl:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -451,8 +495,8 @@ define void @test_arpl(i16 %a0, i16 *%a1) optsize {
 ;
 ; ZNVER1-LABEL: test_arpl:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    arpl %ax, (%ecx) # sched: [100:0.25]
 ; ZNVER1-NEXT:    #NO_APP
@@ -598,6 +642,23 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ; SKX-NEXT:    .cfi_def_cfa_offset 4
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_bound:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    pushl %esi # sched: [1:0.50]
+; BDVER2-NEXT:    .cfi_def_cfa_offset 8
+; BDVER2-NEXT:    .cfi_offset %esi, -8
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bound %ax, (%esi) # sched: [100:0.50]
+; BDVER2-NEXT:    bound %ecx, (%edx) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    popl %esi # sched: [5:0.50]
+; BDVER2-NEXT:    .cfi_def_cfa_offset 4
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bound:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    pushl %esi # sched: [1:1.00]
@@ -620,10 +681,10 @@ define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
 ; ZNVER1-NEXT:    pushl %esi # sched: [1:0.50]
 ; ZNVER1-NEXT:    .cfi_def_cfa_offset 8
 ; ZNVER1-NEXT:    .cfi_offset %esi, -8
+; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
 ; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %esi # sched: [8:0.50]
-; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    bound %ax, (%esi) # sched: [100:0.25]
 ; ZNVER1-NEXT:    bound %ecx, (%edx) # sched: [100:0.25]
@@ -702,6 +763,14 @@ define i8 @test_daa(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_daa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    daa # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_daa:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -786,6 +855,14 @@ define i8 @test_das(i8 %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_das:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    das # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_das:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
@@ -886,6 +963,16 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_dec16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    decw (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dec16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -898,8 +985,8 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ;
 ; ZNVER1-LABEL: test_dec16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    decw %ax # sched: [1:0.25]
 ; ZNVER1-NEXT:    decw (%ecx) # sched: [5:0.50]
@@ -989,6 +1076,16 @@ define void @test_dec32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_dec32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    decl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dec32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1093,6 +1190,16 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_inc16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    incw (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_inc16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -1105,8 +1212,8 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ;
 ; ZNVER1-LABEL: test_inc16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    incw %ax # sched: [1:0.25]
 ; ZNVER1-NEXT:    incw (%ecx) # sched: [5:0.50]
@@ -1196,6 +1303,16 @@ define void @test_inc32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_inc32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    incl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_inc32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1276,6 +1393,13 @@ define void @test_into() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_into:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    into # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_into:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1368,6 +1492,15 @@ define void @test_jcxz_jecxz() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_jcxz_jecxz:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  JXTGT:
+; BDVER2-NEXT:    jcxz JXTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jecxz JXTGT # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_jcxz_jecxz:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1448,6 +1581,13 @@ define void @test_leave() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_leave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    leave # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_leave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1604,6 +1744,23 @@ define void @test_pop_push() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_pop_push:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popl %ds # sched: [100:0.50]
+; BDVER2-NEXT:    popl %es # sched: [100:0.50]
+; BDVER2-NEXT:    popl %ss # sched: [100:0.50]
+; BDVER2-NEXT:    popl %fs # sched: [100:0.50]
+; BDVER2-NEXT:    popl %gs # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %cs # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %ds # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %es # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %ss # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %fs # sched: [100:0.50]
+; BDVER2-NEXT:    pushl %gs # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pop_push:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1760,6 +1917,21 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_pop_push_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
+; BDVER2-NEXT:    popw (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    pushw %ax # sched: [1:0.50]
+; BDVER2-NEXT:    pushw (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushw $7 # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
@@ -1777,8 +1949,8 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ;
 ; ZNVER1-LABEL: test_pop_push_16:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
 ; ZNVER1-NEXT:    #APP
 ; ZNVER1-NEXT:    popw %ax # sched: [8:0.50]
 ; ZNVER1-NEXT:    popw (%ecx) # sched: [5:0.50]
@@ -1912,6 +2084,21 @@ define i32 @test_pop_push_32(i32 %a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_pop_push_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popl %eax # sched: [5:0.50]
+; BDVER2-NEXT:    popl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    pushl %eax # sched: [1:0.50]
+; BDVER2-NEXT:    pushl (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    pushl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushl $7 # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2026,6 +2213,16 @@ define void @test_popa_popf_pusha_pushf() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_popa_popf_pusha_pushf:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popal # sched: [5:0.50]
+; BDVER2-NEXT:    popfl # sched: [5:0.50]
+; BDVER2-NEXT:    pushal # sched: [1:0.50]
+; BDVER2-NEXT:    pushfl # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_popa_popf_pusha_pushf:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2144,6 +2341,18 @@ define void @test_ret() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ret:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+; BDVER2-NEXT:    retl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    lretl # sched: [5:1.00]
+; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ret:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2228,6 +2437,13 @@ define i8 @test_salc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_salc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    salc # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_salc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2345,6 +2561,18 @@ define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_xchg_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgl %eax, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %ecx, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %eax, (%edx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xchg_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
index e903ff510537f6114d59e94b7b7d563586d43674..18541184eb8486afed092feb1258b8b33505796f 100644
--- a/test/CodeGen/X86/schedule-x86_64.ll
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -108,6 +109,18 @@ define void @test_adc_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcb $7, %al # sched: [1:1.00]
+; BDVER2-NEXT:    adcb $7, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    adcb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcb %dl, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    adcb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcb (%rsi), %dil # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_adc_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -271,6 +284,23 @@ define void @test_adc_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    adcw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    adcw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    adcw $7, %di # sched: [1:1.00]
+; BDVER2-NEXT:    adcw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcw %dx, %di # sched: [1:1.00]
+; BDVER2-NEXT:    adcw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcw (%rsi), %di # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_adc_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -444,6 +474,23 @@ define void @test_adc_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    adcl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    adcl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    adcl $7, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    adcl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcl %edx, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    adcl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcl (%rsi), %edi # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_adc_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -617,6 +664,23 @@ define void @test_adc_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_adc_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    adcq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    adcq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    adcq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    adcq $7, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    adcq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcq %rdx, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    adcq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    adcq (%rsi), %rdi # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_adc_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -751,6 +815,18 @@ define void @test_add_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    addb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    addb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    addb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addb (%rsi), %dil # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_add_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -914,6 +990,23 @@ define void @test_add_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    addw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    addw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    addw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    addw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    addw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addw (%rsi), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_add_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1087,6 +1180,23 @@ define void @test_add_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    addl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    addl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    addl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    addl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    addl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addl (%rsi), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_add_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1260,6 +1370,23 @@ define void @test_add_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_add_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    addq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    addq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    addq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    addq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    addq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    addq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    addq (%rsi), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_add_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1394,6 +1521,18 @@ define void @test_and_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    andb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    andb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    andb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andb (%rsi), %dil # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_and_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1557,6 +1696,23 @@ define void @test_and_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    andw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    andw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    andw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    andw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    andw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andw (%rsi), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_and_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1730,6 +1886,23 @@ define void @test_and_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    andl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    andl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    andl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    andl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    andl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andl (%rsi), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_and_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1903,6 +2076,23 @@ define void @test_and_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_and_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    andq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    andq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    andq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    andq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    andq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    andq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    andq (%rsi), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_and_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2021,6 +2211,16 @@ define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsf16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsfw %di, %ax # sched: [3:2.00]
+; BDVER2-NEXT:    bsfw (%rsi), %cx # sched: [7:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bsf16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2119,6 +2319,15 @@ define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsf32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsfl %edi, %eax # sched: [3:2.00]
+; BDVER2-NEXT:    bsfl (%rsi), %ecx # sched: [7:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bsf32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2215,6 +2424,15 @@ define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsf64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsfq %rdi, %rax # sched: [3:2.00]
+; BDVER2-NEXT:    bsfq (%rsi), %rcx # sched: [7:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bsf64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2320,6 +2538,16 @@ define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsr16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsrw %di, %ax # sched: [4:2.00]
+; BDVER2-NEXT:    bsrw (%rsi), %cx # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bsr16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2418,6 +2646,15 @@ define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    orl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsr32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsrl %edi, %eax # sched: [4:2.00]
+; BDVER2-NEXT:    bsrl (%rsi), %ecx # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bsr32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2514,6 +2751,15 @@ define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bsr64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    bsrq %rdi, %rax # sched: [4:2.00]
+; BDVER2-NEXT:    bsrq (%rsi), %rcx # sched: [8:2.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bsr64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2587,6 +2833,12 @@ define i32 @test_bswap32(i32 %a0) optsize {
 ; SKX-NEXT:    bswapl %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bswap32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    bswapl %eax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bswap32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -2650,6 +2902,12 @@ define i64 @test_bswap64(i64 %a0) optsize {
 ; SKX-NEXT:    bswapq %rax # sched: [2:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bswap64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    bswapq %rax # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bswap64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
@@ -2842,6 +3100,28 @@ define void @test_bt_btc_btr_bts_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bt_btc_btr_bts_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    btw %si, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btcw %si, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btrw %si, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btsw %si, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btw %si, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcw %si, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrw %si, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsw %si, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    btcw $7, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btrw $7, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btsw $7, %di # sched: [2:0.50]
+; BDVER2-NEXT:    btw $7, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsw $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3065,6 +3345,28 @@ define void @test_bt_btc_btr_bts_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bt_btc_btr_bts_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    btl %esi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btcl %esi, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btrl %esi, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btsl %esi, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btl %esi, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcl %esi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrl %esi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsl %esi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    btcl $7, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btrl $7, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btsl $7, %edi # sched: [2:0.50]
+; BDVER2-NEXT:    btl $7, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsl $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3288,6 +3590,28 @@ define void @test_bt_btc_btr_bts_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_bt_btc_btr_bts_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    btq %rsi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btcq %rsi, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btrq %rsi, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btsq %rsi, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btq %rsi, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcq %rsi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrq %rsi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsq %rsi, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    btcq $7, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btrq $7, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btsq $7, %rdi # sched: [2:0.50]
+; BDVER2-NEXT:    btq $7, (%rdx) # sched: [5:0.50]
+; BDVER2-NEXT:    btcq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btrq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    btsq $7, (%rdx) # sched: [7:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_bt_btc_btr_bts_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3434,6 +3758,18 @@ define void @test_cbw_cdq_cdqe_cqo_cwd_cwde() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cbtw # sched: [1:0.50]
+; BDVER2-NEXT:    cltd # sched: [1:0.50]
+; BDVER2-NEXT:    cltq # sched: [1:0.50]
+; BDVER2-NEXT:    cqto # sched: [1:0.50]
+; BDVER2-NEXT:    cwtd # sched: [1:0.50]
+; BDVER2-NEXT:    cwtl # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3534,6 +3870,15 @@ define void @test_clc_cld_cmc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_clc_cld_cmc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    clc # sched: [1:0.50]
+; BDVER2-NEXT:    cld # sched: [1:0.50]
+; BDVER2-NEXT:    cmc # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_clc_cld_cmc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3652,6 +3997,18 @@ define void @test_cmp_8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    cmpb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    cmpb $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpb %dil, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    cmpb %dil, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpb (%rsi), %dil # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmp_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3815,6 +4172,23 @@ define void @test_cmp_16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    cmpw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    cmpw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    cmpw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmpw $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpw %di, %di # sched: [1:0.50]
+; BDVER2-NEXT:    cmpw %di, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpw (%rsi), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmp_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3988,6 +4362,23 @@ define void @test_cmp_32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    cmpl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    cmpl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    cmpl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpl $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpl %edi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpl %edi, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpl (%rsi), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmp_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4161,6 +4552,23 @@ define void @test_cmp_64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmp_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    cmpq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    cmpq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    cmpq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpq $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpq %rdi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    cmpq %rdi, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    cmpq (%rsi), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmp_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4279,6 +4687,16 @@ define void @test_cmps() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpsb %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    cmpsw %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    cmpsl %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    cmpsq %es:(%rdi), (%rsi) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmps:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4367,6 +4785,14 @@ define void @test_cmpxchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgb %dil, %sil # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgb %dil, (%rdx) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4450,6 +4876,14 @@ define void @test_cmpxchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgw %di, %si # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgw %di, (%rdx) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4533,6 +4967,14 @@ define void @test_cmpxchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgl %edi, %esi # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgl %edi, (%rdx) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4616,6 +5058,14 @@ define void @test_cmpxchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchgq %rdi, %rsi # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchgq %rdi, (%rdx) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4699,6 +5149,14 @@ define void @test_cmpxchg8b_cmpxchg16b(i8 *%a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cmpxchg8b (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    cmpxchg16b (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4775,6 +5233,13 @@ define void @test_cpuid() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_cpuid:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    cpuid # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_cpuid:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4857,6 +5322,14 @@ define void @test_dec8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    decb (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dec8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4940,6 +5413,14 @@ define void @test_dec16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decw %di # sched: [1:0.50]
+; BDVER2-NEXT:    decw (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dec16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5023,6 +5504,14 @@ define void @test_dec32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    decl (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dec32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5106,6 +5595,14 @@ define void @test_dec64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_dec64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    decq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    decq (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_dec64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5254,6 +5751,22 @@ define void @test_div(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_div:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    divb %dil # sched: [12:12.00]
+; BDVER2-NEXT:    divb (%r8) # sched: [16:12.00]
+; BDVER2-NEXT:    divw %si # sched: [15:15.00]
+; BDVER2-NEXT:    divw (%r9) # sched: [19:15.00]
+; BDVER2-NEXT:    divl %edx # sched: [14:14.00]
+; BDVER2-NEXT:    divl (%rax) # sched: [18:14.00]
+; BDVER2-NEXT:    divq %rcx # sched: [14:14.00]
+; BDVER2-NEXT:    divq (%r10) # sched: [18:14.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_div:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -5354,6 +5867,14 @@ define void @test_enter() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_enter:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    enter $7, $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_enter:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5502,6 +6023,22 @@ define void @test_idiv(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_idiv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    idivb %dil # sched: [12:12.00]
+; BDVER2-NEXT:    idivb (%r8) # sched: [16:12.00]
+; BDVER2-NEXT:    idivw %si # sched: [15:17.00]
+; BDVER2-NEXT:    idivw (%r9) # sched: [19:17.00]
+; BDVER2-NEXT:    idivl %edx # sched: [14:25.00]
+; BDVER2-NEXT:    idivl (%rax) # sched: [18:25.00]
+; BDVER2-NEXT:    idivq %rcx # sched: [14:14.00]
+; BDVER2-NEXT:    idivq (%r10) # sched: [18:14.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_idiv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -5602,6 +6139,14 @@ define void @test_imul_8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imulb %dil # sched: [4:1.00]
+; BDVER2-NEXT:    imulb (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_imul_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5749,6 +6294,22 @@ define void @test_imul_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imulw %di # sched: [4:1.00]
+; BDVER2-NEXT:    imulw (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    imulw %dx, %di # sched: [4:1.00]
+; BDVER2-NEXT:    imulw (%rsi), %di # sched: [8:1.00]
+; BDVER2-NEXT:    imulw $511, %di, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    imulw $511, (%rsi), %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [9:1.00]
+; BDVER2-NEXT:    imulw $7, %di, %di # sched: [5:1.00]
+; BDVER2-NEXT:    imulw $7, (%rsi), %di # sched: [9:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_imul_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5912,6 +6473,22 @@ define void @test_imul_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imull %edi # sched: [4:1.00]
+; BDVER2-NEXT:    imull (%rsi) # sched: [8:1.00]
+; BDVER2-NEXT:    imull %edx, %edi # sched: [4:1.00]
+; BDVER2-NEXT:    imull (%rsi), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    imull $665536, %edi, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [4:1.00]
+; BDVER2-NEXT:    imull $665536, (%rsi), %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [8:1.00]
+; BDVER2-NEXT:    imull $7, %edi, %edi # sched: [4:1.00]
+; BDVER2-NEXT:    imull $7, (%rsi), %edi # sched: [8:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_imul_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6075,6 +6652,22 @@ define void @test_imul_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_imul_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    imulq %rdi # sched: [6:4.00]
+; BDVER2-NEXT:    imulq (%rsi) # sched: [10:4.00]
+; BDVER2-NEXT:    imulq %rdx, %rdi # sched: [6:4.00]
+; BDVER2-NEXT:    imulq (%rsi), %rdi # sched: [10:4.00]
+; BDVER2-NEXT:    imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:4.00]
+; BDVER2-NEXT:    imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [10:4.00]
+; BDVER2-NEXT:    imulq $7, %rdi, %rdi # sched: [6:4.00]
+; BDVER2-NEXT:    imulq $7, (%rsi), %rdi # sched: [10:4.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_imul_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6207,6 +6800,18 @@ define void @test_in() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_in:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    inb $7, %al # sched: [100:0.50]
+; BDVER2-NEXT:    inw $7, %ax # sched: [100:0.50]
+; BDVER2-NEXT:    inl $7, %eax # sched: [100:0.50]
+; BDVER2-NEXT:    inb %dx, %al # sched: [100:0.50]
+; BDVER2-NEXT:    inw %dx, %ax # sched: [100:0.50]
+; BDVER2-NEXT:    inl %dx, %eax # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_in:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6299,6 +6904,14 @@ define void @test_inc8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    incb (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_inc8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6382,6 +6995,14 @@ define void @test_inc16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incw %di # sched: [1:0.50]
+; BDVER2-NEXT:    incw (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_inc16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6465,6 +7086,14 @@ define void @test_inc32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    incl (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_inc32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6548,6 +7177,14 @@ define void @test_inc64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_inc64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    incq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    incq (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_inc64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6640,6 +7277,15 @@ define void @test_ins() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ins:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    insb %dx, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    insw %dx, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    insl %dx, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ins:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6718,6 +7364,13 @@ define void @test_int() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_int:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    int $7 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_int:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -6800,6 +7453,14 @@ define void @test_invlpg_invlpga(i8 *%a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_invlpg_invlpga:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    invlpg (%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    invlpga %rax, %ecx # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_invlpg_invlpga:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7116,6 +7777,43 @@ define void @test_jcc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_jcc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  JCCTGT:
+; BDVER2-NEXT:    jo JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jno JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jb JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jb JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jb JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jae JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jae JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jae JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    je JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    je JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jne JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jne JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jbe JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jbe JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    ja JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    ja JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    js JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jns JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jnp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jnp JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jl JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jl JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jge JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jge JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jle JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jle JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jg JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jg JCCTGT # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_jcc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7266,6 +7964,15 @@ define void @test_jecxz_jrcxz() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_jecxz_jrcxz:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  JXTGT:
+; BDVER2-NEXT:    jecxz JXTGT # sched: [1:1.00]
+; BDVER2-NEXT:    jrcxz JXTGT # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_jecxz_jrcxz:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7354,6 +8061,14 @@ define void @test_lahf_sahf() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lahf_sahf:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    lahf # sched: [2:0.50]
+; BDVER2-NEXT:    sahf # sched: [2:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lahf_sahf:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7438,6 +8153,13 @@ define void @test_leave() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_leave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    leave # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_leave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7536,6 +8258,16 @@ define void @test_lods() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_lods:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    lodsb (%rsi), %al # sched: [100:0.50]
+; BDVER2-NEXT:    lodsw (%rsi), %ax # sched: [100:0.50]
+; BDVER2-NEXT:    lodsl (%rsi), %eax # sched: [100:0.50]
+; BDVER2-NEXT:    lodsq (%rsi), %rax # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_lods:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7640,6 +8372,16 @@ define void @test_loop() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_loop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:  LTGT:
+; BDVER2-NEXT:    loop LTGT # sched: [1:1.00]
+; BDVER2-NEXT:    loope LTGT # sched: [1:1.00]
+; BDVER2-NEXT:    loopne LTGT # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_loop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7730,6 +8472,14 @@ define void @test_movnti(i32 %a0, i32 *%a1, i64 %a2, i64 *%a3) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movnti:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    movntil %edi, (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    movntiq %rdx, (%rcx) # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movnti:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7830,6 +8580,16 @@ define void @test_movs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    movsb (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    movsw (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    movsl (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    movsq (%rsi), %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -7929,6 +8689,15 @@ define i64 @test_movslq(i32 %a0, i32 *%a1) optsize {
 ; SKX-NEXT:    orq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_movslq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    movslq %edi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    movslq (%rsi), %rcx # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movslq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -8082,6 +8851,22 @@ define void @test_mul(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_mul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    mulb %dil # sched: [4:1.00]
+; BDVER2-NEXT:    mulb (%r8) # sched: [8:1.00]
+; BDVER2-NEXT:    mulw %si # sched: [4:1.00]
+; BDVER2-NEXT:    mulw (%r9) # sched: [8:1.00]
+; BDVER2-NEXT:    mull %edx # sched: [4:1.00]
+; BDVER2-NEXT:    mull (%rax) # sched: [8:1.00]
+; BDVER2-NEXT:    mulq %rcx # sched: [6:4.00]
+; BDVER2-NEXT:    mulq (%r10) # sched: [10:4.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_mul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -8246,6 +9031,22 @@ define void @test_neg(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_neg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    negb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    negb (%r8) # sched: [6:1.00]
+; BDVER2-NEXT:    negw %si # sched: [1:0.50]
+; BDVER2-NEXT:    negw (%r9) # sched: [6:1.00]
+; BDVER2-NEXT:    negl %edx # sched: [1:0.50]
+; BDVER2-NEXT:    negl (%rax) # sched: [6:1.00]
+; BDVER2-NEXT:    negq %rcx # sched: [1:0.50]
+; BDVER2-NEXT:    negq (%r10) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_neg:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -8386,6 +9187,19 @@ define void @test_nop(i16 %a0, i32 %a1, i64 %a2, i16 *%p0, i32 *%p1, i64 *%p2) o
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_nop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    nop # sched: [1:0.50]
+; BDVER2-NEXT:    nopw %di # sched: [1:0.50]
+; BDVER2-NEXT:    nopw (%rcx) # sched: [1:0.50]
+; BDVER2-NEXT:    nopl %esi # sched: [1:0.50]
+; BDVER2-NEXT:    nopl (%r8) # sched: [1:0.50]
+; BDVER2-NEXT:    nopq %rdx # sched: [1:0.50]
+; BDVER2-NEXT:    nopq (%r9) # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_nop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -8544,6 +9358,22 @@ define void @test_not(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_not:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BDVER2-NEXT:    movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    notb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    notb (%r8) # sched: [6:1.00]
+; BDVER2-NEXT:    notw %si # sched: [1:0.50]
+; BDVER2-NEXT:    notw (%r9) # sched: [6:1.00]
+; BDVER2-NEXT:    notl %edx # sched: [1:0.50]
+; BDVER2-NEXT:    notl (%rax) # sched: [6:1.00]
+; BDVER2-NEXT:    notq %rcx # sched: [1:0.50]
+; BDVER2-NEXT:    notq (%r10) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_not:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
@@ -8676,6 +9506,18 @@ define void @test_or_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    orb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    orb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    orb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orb (%rsi), %dil # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_or_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -8839,6 +9681,23 @@ define void @test_or_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    orw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    orw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    orw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    orw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    orw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orw (%rsi), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_or_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9012,6 +9871,23 @@ define void @test_or_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    orl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    orl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    orl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    orl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    orl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orl (%rsi), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_or_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9185,6 +10061,23 @@ define void @test_or_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_or_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    orq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    orq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    orq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    orq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    orq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    orq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    orq (%rsi), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_or_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9319,6 +10212,18 @@ define void @test_out() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_out:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    outb %al, $7 # sched: [100:0.50]
+; BDVER2-NEXT:    outw %ax, $7 # sched: [100:0.50]
+; BDVER2-NEXT:    outl %eax, $7 # sched: [100:0.50]
+; BDVER2-NEXT:    outb %al, %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outw %ax, %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outl %eax, %dx # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_out:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9419,6 +10324,15 @@ define void @test_outs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_outs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    outsb (%rsi), %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outsw (%rsi), %dx # sched: [100:0.50]
+; BDVER2-NEXT:    outsl (%rsi), %dx # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_outs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9497,6 +10411,13 @@ define void @test_pause() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pause:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    pause # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pause:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9595,6 +10516,16 @@ define void @test_pop_push() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pop_push:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popq %fs # sched: [100:0.50]
+; BDVER2-NEXT:    popq %gs # sched: [100:0.50]
+; BDVER2-NEXT:    pushq %fs # sched: [100:0.50]
+; BDVER2-NEXT:    pushq %gs # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pop_push:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9722,6 +10653,19 @@ define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pop_push_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popw %ax # sched: [5:0.50]
+; BDVER2-NEXT:    popw (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    pushw %di # sched: [1:0.50]
+; BDVER2-NEXT:    pushw (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    pushw $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushw $7 # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9855,6 +10799,19 @@ define i64 @test_pop_push_64(i64 %a0, i64 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_pop_push_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popq %rax # sched: [5:0.50]
+; BDVER2-NEXT:    popq (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    pushq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    pushq (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    pushq $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    pushq $7 # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_pop_push_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -9949,6 +10906,14 @@ define void @test_popf_pushf() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_popf_pushf:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    popfq # sched: [5:0.50]
+; BDVER2-NEXT:    pushfq # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_popf_pushf:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10113,6 +11078,24 @@ define void @test_rcl_rcr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rclb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rcrb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rclb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclb $7, %dil # sched: [13:0.50]
+; BDVER2-NEXT:    rcrb $7, %dil # sched: [12:0.50]
+; BDVER2-NEXT:    rclb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclb %cl, %dil # sched: [12:0.50]
+; BDVER2-NEXT:    rcrb %cl, %dil # sched: [11:0.50]
+; BDVER2-NEXT:    rclb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10296,6 +11279,24 @@ define void @test_rcl_rcr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rclw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rcrw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rclw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclw $7, %di # sched: [11:0.50]
+; BDVER2-NEXT:    rcrw $7, %di # sched: [10:0.50]
+; BDVER2-NEXT:    rclw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclw %cl, %di # sched: [10:0.50]
+; BDVER2-NEXT:    rcrw %cl, %di # sched: [9:0.50]
+; BDVER2-NEXT:    rclw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10479,6 +11480,24 @@ define void @test_rcl_rcr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rcll %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rcrl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rcll (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcll $7, %edi # sched: [8:0.50]
+; BDVER2-NEXT:    rcrl $7, %edi # sched: [7:0.50]
+; BDVER2-NEXT:    rcll $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcll %cl, %edi # sched: [7:0.50]
+; BDVER2-NEXT:    rcrl %cl, %edi # sched: [7:0.50]
+; BDVER2-NEXT:    rcll %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrl %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10662,6 +11681,24 @@ define void @test_rcl_rcr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rcl_rcr_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rclq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rcrq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rclq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclq $7, %rdi # sched: [8:0.50]
+; BDVER2-NEXT:    rcrq $7, %rdi # sched: [7:0.50]
+; BDVER2-NEXT:    rclq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rclq %cl, %rdi # sched: [7:0.50]
+; BDVER2-NEXT:    rcrq %cl, %rdi # sched: [7:0.50]
+; BDVER2-NEXT:    rclq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rcrq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rcl_rcr_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10766,6 +11803,14 @@ define void @test_rdmsr_wrmsr() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rdmsr_wrmsr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rdmsr # sched: [100:0.50]
+; BDVER2-NEXT:    wrmsr # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rdmsr_wrmsr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10842,6 +11887,13 @@ define void @test_rdpmc() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rdpmc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rdpmc # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rdpmc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -10924,6 +11976,14 @@ define void @test_rdtsc_rdtscp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rdtsc_rdtscp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rdtsc # sched: [100:0.50]
+; BDVER2-NEXT:    rdtscp # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rdtsc_rdtscp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11040,6 +12100,18 @@ define void @test_ret() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ret:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+; BDVER2-NEXT:    retq $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    lretl # sched: [5:1.00]
+; BDVER2-NEXT:    lretl $4095 # imm = 0xFFF
+; BDVER2-NEXT:    # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ret:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11212,6 +12284,24 @@ define void @test_rol_ror_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rolb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rorb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rolb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rorb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rolb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rorb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    rolb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11395,6 +12485,24 @@ define void @test_rol_ror_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rolw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rorw %di # sched: [1:0.50]
+; BDVER2-NEXT:    rolw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rorw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rolw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rorw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    rolw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11578,6 +12686,24 @@ define void @test_rol_ror_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    roll %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rorl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    roll (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    roll $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rorl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    roll $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    roll %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    rorl %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    roll %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorl %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11761,6 +12887,24 @@ define void @test_rol_ror_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_rol_ror_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    rolq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rorq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rolq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rorq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rolq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rolq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rorq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    rolq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    rorq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_rol_ror_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -11993,6 +13137,30 @@ define void @test_sar_shl_shr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shlb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shrb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sarb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrb (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shlb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shrb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sarb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrb $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shlb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    shrb %cl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sarb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrb %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12236,6 +13404,30 @@ define void @test_sar_shl_shr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarw %di # sched: [1:0.50]
+; BDVER2-NEXT:    shlw %di # sched: [1:0.50]
+; BDVER2-NEXT:    shrw %di # sched: [1:0.50]
+; BDVER2-NEXT:    sarw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrw (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shlw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shrw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    sarw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrw $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shlw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    shrw %cl, %di # sched: [1:0.50]
+; BDVER2-NEXT:    sarw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrw %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12479,6 +13671,30 @@ define void @test_sar_shl_shr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shll %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shrl %edi # sched: [1:0.50]
+; BDVER2-NEXT:    sarl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shll (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrl (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shll $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shrl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    sarl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shll $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrl $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarl %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shll %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    shrl %cl, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    sarl %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shll %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrl %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12722,6 +13938,30 @@ define void @test_sar_shl_shr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sar_shl_shr_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sarq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shlq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shrq %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    sarq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrq (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shlq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shrq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    sarq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrq $7, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    sarq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shlq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    shrq %cl, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    sarq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shlq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    shrq %cl, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sar_shl_shr_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -12870,6 +14110,18 @@ define void @test_sbb_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbb $7, %al # sched: [1:1.00]
+; BDVER2-NEXT:    sbbb $7, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    sbbb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbb %dl, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    sbbb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbb (%rsi), %dil # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sbb_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13033,6 +14285,23 @@ define void @test_sbb_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    sbbw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    sbbw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    sbbw $7, %di # sched: [1:1.00]
+; BDVER2-NEXT:    sbbw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbw %dx, %di # sched: [1:1.00]
+; BDVER2-NEXT:    sbbw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbw (%rsi), %di # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sbb_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13206,6 +14475,23 @@ define void @test_sbb_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    sbbl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    sbbl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    sbbl $7, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbl %edx, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbl (%rsi), %edi # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sbb_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13379,6 +14665,23 @@ define void @test_sbb_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sbb_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    sbbq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    sbbq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:1.00]
+; BDVER2-NEXT:    sbbq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    sbbq $7, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbq %rdx, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    sbbq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    sbbq (%rsi), %rdi # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sbb_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13497,6 +14800,16 @@ define void @test_scas() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_scas:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    scasb %es:(%rdi), %al # sched: [100:0.50]
+; BDVER2-NEXT:    scasw %es:(%rdi), %ax # sched: [100:0.50]
+; BDVER2-NEXT:    scasl %es:(%rdi), %eax # sched: [100:0.50]
+; BDVER2-NEXT:    scasq %es:(%rdi), %rax # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_scas:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -13825,6 +15138,44 @@ define void @test_setcc(i8 %a0, i8 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_setcc:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    seto %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setno %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setb %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setae %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setne %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setbe %dil # sched: [1:0.50]
+; BDVER2-NEXT:    seta %dil # sched: [1:0.50]
+; BDVER2-NEXT:    sets %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setns %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setp %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setnp %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setl %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setge %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setle %dil # sched: [1:0.50]
+; BDVER2-NEXT:    setg %dil # sched: [1:0.50]
+; BDVER2-NEXT:    seto (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setno (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setb (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setae (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    sete (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setne (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setbe (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    seta (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    sets (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setns (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setp (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setnp (%rsi) # sched: [1:0.50]
+; BDVER2-NEXT:    setl (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setge (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setle (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    setg (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_setcc:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14019,6 +15370,20 @@ define void @test_shld_shrd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shld_shrd_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    shldw %cl, %si, %di # sched: [4:4.00]
+; BDVER2-NEXT:    shrdw %cl, %si, %di # sched: [4:4.00]
+; BDVER2-NEXT:    shldw %cl, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdw %cl, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shldw $7, %si, %di # sched: [4:3.00]
+; BDVER2-NEXT:    shrdw $7, %si, %di # sched: [3:3.00]
+; BDVER2-NEXT:    shldw $7, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdw $7, %si, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_shld_shrd_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14162,6 +15527,20 @@ define void @test_shld_shrd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shld_shrd_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    shldl %cl, %esi, %edi # sched: [4:4.00]
+; BDVER2-NEXT:    shrdl %cl, %esi, %edi # sched: [4:4.00]
+; BDVER2-NEXT:    shldl %cl, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdl %cl, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shldl $7, %esi, %edi # sched: [3:3.00]
+; BDVER2-NEXT:    shrdl $7, %esi, %edi # sched: [4:3.00]
+; BDVER2-NEXT:    shldl $7, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdl $7, %esi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_shld_shrd_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14305,6 +15684,20 @@ define void @test_shld_shrd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_shld_shrd_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BDVER2-NEXT:    shrdq %cl, %rsi, %rdi # sched: [4:4.00]
+; BDVER2-NEXT:    shldq %cl, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdq %cl, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shldq $7, %rsi, %rdi # sched: [4:3.00]
+; BDVER2-NEXT:    shrdq $7, %rsi, %rdi # sched: [4:3.00]
+; BDVER2-NEXT:    shldq $7, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    shrdq $7, %rsi, (%rdx) # sched: [4:11.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_shld_shrd_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14405,6 +15798,14 @@ define void @test_stc_std() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_stc_std:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    stc # sched: [1:0.50]
+; BDVER2-NEXT:    std # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_stc_std:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14508,6 +15909,16 @@ define void @test_stos() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_stos:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    stosb %al, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    stosw %ax, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    stosl %eax, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    stosq %rax, %es:(%rdi) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_stos:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14630,6 +16041,18 @@ define void @test_sub_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    subb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    subb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    subb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subb (%rsi), %dil # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sub_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14793,6 +16216,23 @@ define void @test_sub_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    subw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    subw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    subw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    subw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    subw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subw (%rsi), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sub_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -14966,6 +16406,23 @@ define void @test_sub_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    subl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    subl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    subl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    subl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    subl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subl (%rsi), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sub_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15139,6 +16596,23 @@ define void @test_sub_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_sub_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    subq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    subq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    subq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    subq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    subq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    subq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    subq (%rsi), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_sub_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15271,6 +16745,17 @@ define void @test_test_8(i8 %a0, i8* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    testb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    testb $7, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    testb %dil, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    testb %dil, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_test_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15408,6 +16893,20 @@ define void @test_test_16(i16 %a0, i16* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    testw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    testw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    testw %di, %di # sched: [1:0.50]
+; BDVER2-NEXT:    testw %di, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_test_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15551,6 +17050,20 @@ define void @test_test_32(i32 %a0, i32* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    testl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    testl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    testl %edi, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    testl %edi, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_test_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15694,6 +17207,20 @@ define void @test_test_64(i64 %a0, i64* %a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_test_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    testq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    testq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    testq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [5:0.50]
+; BDVER2-NEXT:    testq %rdi, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    testq %rdi, (%rsi) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_test_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15783,6 +17310,13 @@ define void @test_ud2() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_ud2:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ud2 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ud2:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15873,6 +17407,14 @@ define void @test_xadd_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddb %dil, %sil # sched: [2:1.00]
+; BDVER2-NEXT:    xaddb %dil, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xadd_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -15956,6 +17498,14 @@ define void @test_xadd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddw %di, %si # sched: [2:1.00]
+; BDVER2-NEXT:    xaddw %di, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xadd_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16039,6 +17589,14 @@ define void @test_xadd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddl %edi, %esi # sched: [2:1.00]
+; BDVER2-NEXT:    xaddl %edi, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xadd_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16122,6 +17680,14 @@ define void @test_xadd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xadd_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xaddq %rdi, %rsi # sched: [2:1.00]
+; BDVER2-NEXT:    xaddq %rdi, (%rdx) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xadd_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16206,6 +17772,14 @@ define void @test_xchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgb %sil, %dil # sched: [1:1.00]
+; BDVER2-NEXT:    xchgb %dil, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xchg_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16297,6 +17871,15 @@ define void @test_xchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgw %di, %ax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgw %si, %di # sched: [2:1.00]
+; BDVER2-NEXT:    xchgw %di, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xchg_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16390,6 +17973,15 @@ define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgl %edi, %eax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %esi, %edi # sched: [1:1.00]
+; BDVER2-NEXT:    xchgl %edi, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xchg_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16483,6 +18075,15 @@ define void @test_xchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xchg_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xchgq %rdi, %rax # sched: [1:1.00]
+; BDVER2-NEXT:    xchgq %rsi, %rdi # sched: [1:1.00]
+; BDVER2-NEXT:    xchgq %rdi, (%rdx) # sched: [5:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xchg_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16561,6 +18162,13 @@ define void @test_xlat() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xlat:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xlatb # sched: [6:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xlat:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16675,6 +18283,18 @@ define void @test_xor_8(i8 %a0, i8* %a1, i8 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorb $7, %al # sched: [1:0.50]
+; BDVER2-NEXT:    xorb $7, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    xorb $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorb %dl, %dil # sched: [1:0.50]
+; BDVER2-NEXT:    xorb %dil, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorb (%rsi), %dil # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xor_8:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -16838,6 +18458,23 @@ define void @test_xor_16(i16 %a0, i16* %a1, i16 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorw $511, %ax # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    xorw $511, %di # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    xorw $511, (%rsi) # imm = 0x1FF
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    xorw $7, %di # sched: [1:0.50]
+; BDVER2-NEXT:    xorw $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorw %dx, %di # sched: [1:0.50]
+; BDVER2-NEXT:    xorw %di, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorw (%rsi), %di # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xor_16:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -17011,6 +18648,23 @@ define void @test_xor_32(i32 %a0, i32* %a1, i32 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorl $665536, %eax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    xorl $665536, %edi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    xorl $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    xorl $7, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    xorl $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorl %edx, %edi # sched: [1:0.50]
+; BDVER2-NEXT:    xorl %edi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorl (%rsi), %edi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xor_32:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -17184,6 +18838,23 @@ define void @test_xor_64(i64 %a0, i64* %a1, i64 %a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-LABEL: test_xor_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    xorq $665536, %rax # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    xorq $665536, %rdi # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [1:0.50]
+; BDVER2-NEXT:    xorq $665536, (%rsi) # imm = 0xA27C0
+; BDVER2-NEXT:    # sched: [6:1.00]
+; BDVER2-NEXT:    xorq $7, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    xorq $7, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorq %rdx, %rdi # sched: [1:0.50]
+; BDVER2-NEXT:    xorq %rdi, (%rsi) # sched: [6:1.00]
+; BDVER2-NEXT:    xorq (%rsi), %rdi # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_xor_64:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index d62f07fa0f7decb1a527167bd82bc619bd617bcb..811bd9bd031c2fc850cf4c7bab03bc7e50e5a03b 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -1,15 +1,463 @@
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-ilp    | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-hybrid | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=source      | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-burr   | FileCheck %s
-; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize   | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-ilp    | FileCheck %s --check-prefix=ILP
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-hybrid | FileCheck %s --check-prefix=HYBRID
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-burr   | FileCheck %s --check-prefix=BURR
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=source      | FileCheck %s --check-prefix=SRC
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39452.
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize -verify-machineinstrs=0 | FileCheck %s --check-prefix=LIN
 
 ; PR22304 https://llvm.org/bugs/show_bug.cgi?id=22304
 ; Tests checking backtracking in source scheduler. llc used to crash on them.
 
-; CHECK-LABEL: test1
-define i256 @test1(i256 %a) {
-  %b = add i256 %a, 1 
+define i256 @test1(i256 %a) nounwind {
+; ILP-LABEL: test1:
+; ILP:       # %bb.0:
+; ILP-NEXT:    pushq %rbp
+; ILP-NEXT:    pushq %r15
+; ILP-NEXT:    pushq %r14
+; ILP-NEXT:    pushq %r13
+; ILP-NEXT:    pushq %r12
+; ILP-NEXT:    pushq %rbx
+; ILP-NEXT:    movq %rcx, %r9
+; ILP-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; ILP-NEXT:    xorl %eax, %eax
+; ILP-NEXT:    addq $1, %rsi
+; ILP-NEXT:    adcq $0, %rdx
+; ILP-NEXT:    adcq $0, %r9
+; ILP-NEXT:    adcq $0, %r8
+; ILP-NEXT:    leal 1(%rsi,%rsi), %edi
+; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    xorl %r14d, %r14d
+; ILP-NEXT:    movl %edi, %ecx
+; ILP-NEXT:    shldq %cl, %rbp, %r14
+; ILP-NEXT:    movl $1, %r11d
+; ILP-NEXT:    shlq %cl, %r11
+; ILP-NEXT:    movb $-128, %r10b
+; ILP-NEXT:    subb %dil, %r10b
+; ILP-NEXT:    movq %r9, %r13
+; ILP-NEXT:    movl %r10d, %ecx
+; ILP-NEXT:    shlq %cl, %r13
+; ILP-NEXT:    movl $1, %r12d
+; ILP-NEXT:    shrdq %cl, %rax, %r12
+; ILP-NEXT:    xorl %r15d, %r15d
+; ILP-NEXT:    movl %edi, %ecx
+; ILP-NEXT:    shldq %cl, %r15, %r15
+; ILP-NEXT:    movq %rsi, %rbx
+; ILP-NEXT:    shrdq %cl, %rdx, %rbx
+; ILP-NEXT:    shrq %cl, %rdx
+; ILP-NEXT:    addb $-128, %cl
+; ILP-NEXT:    shrdq %cl, %r8, %r9
+; ILP-NEXT:    testb $64, %dil
+; ILP-NEXT:    cmovneq %r11, %r14
+; ILP-NEXT:    cmoveq %rbx, %rdx
+; ILP-NEXT:    cmovneq %rax, %r15
+; ILP-NEXT:    cmovneq %rax, %r11
+; ILP-NEXT:    testb $64, %r10b
+; ILP-NEXT:    cmovneq %rax, %r12
+; ILP-NEXT:    cmovneq %rax, %r13
+; ILP-NEXT:    movl $1, %ebx
+; ILP-NEXT:    shlq %cl, %rbx
+; ILP-NEXT:    orl %edx, %r13d
+; ILP-NEXT:    xorl %edx, %edx
+; ILP-NEXT:    movl $1, %ebp
+; ILP-NEXT:    shldq %cl, %rbp, %rdx
+; ILP-NEXT:    shrq %cl, %r8
+; ILP-NEXT:    testb $64, %cl
+; ILP-NEXT:    cmoveq %r9, %r8
+; ILP-NEXT:    cmovneq %rbx, %rdx
+; ILP-NEXT:    cmovneq %rax, %rbx
+; ILP-NEXT:    testb %dil, %dil
+; ILP-NEXT:    cmovsq %rax, %r14
+; ILP-NEXT:    cmovsq %rax, %r11
+; ILP-NEXT:    jns .LBB0_2
+; ILP-NEXT:  # %bb.1:
+; ILP-NEXT:    movl %r8d, %r13d
+; ILP-NEXT:  .LBB0_2:
+; ILP-NEXT:    je .LBB0_4
+; ILP-NEXT:  # %bb.3:
+; ILP-NEXT:    movl %r13d, %esi
+; ILP-NEXT:  .LBB0_4:
+; ILP-NEXT:    cmovnsq %r12, %rbx
+; ILP-NEXT:    cmoveq %rax, %rbx
+; ILP-NEXT:    cmovnsq %r15, %rdx
+; ILP-NEXT:    cmoveq %rax, %rdx
+; ILP-NEXT:    testb $1, %sil
+; ILP-NEXT:    cmovneq %rax, %rdx
+; ILP-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; ILP-NEXT:    movq %rdx, 24(%rax)
+; ILP-NEXT:    cmovneq %rax, %rbx
+; ILP-NEXT:    movq %rbx, 16(%rax)
+; ILP-NEXT:    cmovneq %rax, %r14
+; ILP-NEXT:    movq %r14, 8(%rax)
+; ILP-NEXT:    cmovneq %rax, %r11
+; ILP-NEXT:    movq %r11, (%rax)
+; ILP-NEXT:    popq %rbx
+; ILP-NEXT:    popq %r12
+; ILP-NEXT:    popq %r13
+; ILP-NEXT:    popq %r14
+; ILP-NEXT:    popq %r15
+; ILP-NEXT:    popq %rbp
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test1:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbp
+; HYBRID-NEXT:    pushq %r15
+; HYBRID-NEXT:    pushq %r14
+; HYBRID-NEXT:    pushq %r13
+; HYBRID-NEXT:    pushq %r12
+; HYBRID-NEXT:    pushq %rbx
+; HYBRID-NEXT:    movq %rcx, %r9
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    addq $1, %rsi
+; HYBRID-NEXT:    adcq $0, %rdx
+; HYBRID-NEXT:    adcq $0, %r9
+; HYBRID-NEXT:    adcq $0, %r8
+; HYBRID-NEXT:    xorl %r10d, %r10d
+; HYBRID-NEXT:    leal 1(%rsi,%rsi), %edi
+; HYBRID-NEXT:    xorl %r14d, %r14d
+; HYBRID-NEXT:    movl %edi, %ecx
+; HYBRID-NEXT:    shldq %cl, %r14, %r14
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    cmovneq %r10, %r14
+; HYBRID-NEXT:    movl $1, %ebp
+; HYBRID-NEXT:    movl $1, %r12d
+; HYBRID-NEXT:    shlq %cl, %r12
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    movq %r12, %r11
+; HYBRID-NEXT:    cmovneq %r10, %r11
+; HYBRID-NEXT:    movq %rsi, %rbx
+; HYBRID-NEXT:    shrdq %cl, %rdx, %rbx
+; HYBRID-NEXT:    shrq %cl, %rdx
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    cmoveq %rbx, %rdx
+; HYBRID-NEXT:    xorl %r15d, %r15d
+; HYBRID-NEXT:    shldq %cl, %rbp, %r15
+; HYBRID-NEXT:    testb $64, %dil
+; HYBRID-NEXT:    cmovneq %r12, %r15
+; HYBRID-NEXT:    movb $-128, %cl
+; HYBRID-NEXT:    subb %dil, %cl
+; HYBRID-NEXT:    movq %r9, %r13
+; HYBRID-NEXT:    shlq %cl, %r13
+; HYBRID-NEXT:    movl $1, %r12d
+; HYBRID-NEXT:    shrdq %cl, %r10, %r12
+; HYBRID-NEXT:    testb $64, %cl
+; HYBRID-NEXT:    cmovneq %r10, %r12
+; HYBRID-NEXT:    cmovneq %r10, %r13
+; HYBRID-NEXT:    orl %edx, %r13d
+; HYBRID-NEXT:    movl %edi, %ecx
+; HYBRID-NEXT:    addb $-128, %cl
+; HYBRID-NEXT:    shrdq %cl, %r8, %r9
+; HYBRID-NEXT:    shrq %cl, %r8
+; HYBRID-NEXT:    xorl %edx, %edx
+; HYBRID-NEXT:    shldq %cl, %rbp, %rdx
+; HYBRID-NEXT:    shlq %cl, %rbp
+; HYBRID-NEXT:    testb $64, %cl
+; HYBRID-NEXT:    cmovneq %rbp, %rdx
+; HYBRID-NEXT:    cmoveq %r9, %r8
+; HYBRID-NEXT:    cmovneq %r10, %rbp
+; HYBRID-NEXT:    testb %dil, %dil
+; HYBRID-NEXT:    jns .LBB0_2
+; HYBRID-NEXT:  # %bb.1:
+; HYBRID-NEXT:    movl %r8d, %r13d
+; HYBRID-NEXT:  .LBB0_2:
+; HYBRID-NEXT:    je .LBB0_4
+; HYBRID-NEXT:  # %bb.3:
+; HYBRID-NEXT:    movl %r13d, %esi
+; HYBRID-NEXT:  .LBB0_4:
+; HYBRID-NEXT:    cmovsq %r10, %r15
+; HYBRID-NEXT:    cmovnsq %r12, %rbp
+; HYBRID-NEXT:    cmoveq %r10, %rbp
+; HYBRID-NEXT:    cmovnsq %r14, %rdx
+; HYBRID-NEXT:    cmoveq %r10, %rdx
+; HYBRID-NEXT:    cmovsq %r10, %r11
+; HYBRID-NEXT:    testb $1, %sil
+; HYBRID-NEXT:    cmovneq %rax, %rdx
+; HYBRID-NEXT:    movq %rdx, 24(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %rbp
+; HYBRID-NEXT:    movq %rbp, 16(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r15
+; HYBRID-NEXT:    movq %r15, 8(%rax)
+; HYBRID-NEXT:    cmovneq %rax, %r11
+; HYBRID-NEXT:    movq %r11, (%rax)
+; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    popq %r12
+; HYBRID-NEXT:    popq %r13
+; HYBRID-NEXT:    popq %r14
+; HYBRID-NEXT:    popq %r15
+; HYBRID-NEXT:    popq %rbp
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test1:
+; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbp
+; BURR-NEXT:    pushq %r15
+; BURR-NEXT:    pushq %r14
+; BURR-NEXT:    pushq %r13
+; BURR-NEXT:    pushq %r12
+; BURR-NEXT:    pushq %rbx
+; BURR-NEXT:    movq %rcx, %r9
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    addq $1, %rsi
+; BURR-NEXT:    adcq $0, %rdx
+; BURR-NEXT:    adcq $0, %r9
+; BURR-NEXT:    adcq $0, %r8
+; BURR-NEXT:    xorl %r10d, %r10d
+; BURR-NEXT:    leal 1(%rsi,%rsi), %edi
+; BURR-NEXT:    xorl %r14d, %r14d
+; BURR-NEXT:    movl %edi, %ecx
+; BURR-NEXT:    shldq %cl, %r14, %r14
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    cmovneq %r10, %r14
+; BURR-NEXT:    movl $1, %ebp
+; BURR-NEXT:    movl $1, %r12d
+; BURR-NEXT:    shlq %cl, %r12
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    movq %r12, %r11
+; BURR-NEXT:    cmovneq %r10, %r11
+; BURR-NEXT:    movq %rsi, %rbx
+; BURR-NEXT:    shrdq %cl, %rdx, %rbx
+; BURR-NEXT:    shrq %cl, %rdx
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    cmoveq %rbx, %rdx
+; BURR-NEXT:    xorl %r15d, %r15d
+; BURR-NEXT:    shldq %cl, %rbp, %r15
+; BURR-NEXT:    testb $64, %dil
+; BURR-NEXT:    cmovneq %r12, %r15
+; BURR-NEXT:    movb $-128, %cl
+; BURR-NEXT:    subb %dil, %cl
+; BURR-NEXT:    movq %r9, %r13
+; BURR-NEXT:    shlq %cl, %r13
+; BURR-NEXT:    movl $1, %r12d
+; BURR-NEXT:    shrdq %cl, %r10, %r12
+; BURR-NEXT:    testb $64, %cl
+; BURR-NEXT:    cmovneq %r10, %r12
+; BURR-NEXT:    cmovneq %r10, %r13
+; BURR-NEXT:    orl %edx, %r13d
+; BURR-NEXT:    movl %edi, %ecx
+; BURR-NEXT:    addb $-128, %cl
+; BURR-NEXT:    shrdq %cl, %r8, %r9
+; BURR-NEXT:    xorl %edx, %edx
+; BURR-NEXT:    shldq %cl, %rbp, %rdx
+; BURR-NEXT:    shrq %cl, %r8
+; BURR-NEXT:    shlq %cl, %rbp
+; BURR-NEXT:    testb $64, %cl
+; BURR-NEXT:    cmovneq %rbp, %rdx
+; BURR-NEXT:    cmoveq %r9, %r8
+; BURR-NEXT:    cmovneq %r10, %rbp
+; BURR-NEXT:    testb %dil, %dil
+; BURR-NEXT:    jns .LBB0_2
+; BURR-NEXT:  # %bb.1:
+; BURR-NEXT:    movl %r8d, %r13d
+; BURR-NEXT:  .LBB0_2:
+; BURR-NEXT:    je .LBB0_4
+; BURR-NEXT:  # %bb.3:
+; BURR-NEXT:    movl %r13d, %esi
+; BURR-NEXT:  .LBB0_4:
+; BURR-NEXT:    cmovsq %r10, %r15
+; BURR-NEXT:    cmovnsq %r12, %rbp
+; BURR-NEXT:    cmoveq %r10, %rbp
+; BURR-NEXT:    cmovnsq %r14, %rdx
+; BURR-NEXT:    cmoveq %r10, %rdx
+; BURR-NEXT:    cmovsq %r10, %r11
+; BURR-NEXT:    testb $1, %sil
+; BURR-NEXT:    cmovneq %rax, %rdx
+; BURR-NEXT:    movq %rdx, 24(%rax)
+; BURR-NEXT:    cmovneq %rax, %rbp
+; BURR-NEXT:    movq %rbp, 16(%rax)
+; BURR-NEXT:    cmovneq %rax, %r15
+; BURR-NEXT:    movq %r15, 8(%rax)
+; BURR-NEXT:    cmovneq %rax, %r11
+; BURR-NEXT:    movq %r11, (%rax)
+; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    popq %r12
+; BURR-NEXT:    popq %r13
+; BURR-NEXT:    popq %r14
+; BURR-NEXT:    popq %r15
+; BURR-NEXT:    popq %rbp
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test1:
+; SRC:       # %bb.0:
+; SRC-NEXT:    pushq %rbp
+; SRC-NEXT:    pushq %r15
+; SRC-NEXT:    pushq %r14
+; SRC-NEXT:    pushq %r13
+; SRC-NEXT:    pushq %r12
+; SRC-NEXT:    pushq %rbx
+; SRC-NEXT:    movq %rcx, %r9
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    addq $1, %rsi
+; SRC-NEXT:    adcq $0, %rdx
+; SRC-NEXT:    adcq $0, %r9
+; SRC-NEXT:    adcq $0, %r8
+; SRC-NEXT:    leal 1(%rsi,%rsi), %r11d
+; SRC-NEXT:    movb $-128, %r10b
+; SRC-NEXT:    subb %r11b, %r10b
+; SRC-NEXT:    movq %r9, %r12
+; SRC-NEXT:    movl %r10d, %ecx
+; SRC-NEXT:    shlq %cl, %r12
+; SRC-NEXT:    movq %rsi, %rbp
+; SRC-NEXT:    movl %r11d, %ecx
+; SRC-NEXT:    shrdq %cl, %rdx, %rbp
+; SRC-NEXT:    shrq %cl, %rdx
+; SRC-NEXT:    xorl %r15d, %r15d
+; SRC-NEXT:    movl $1, %edi
+; SRC-NEXT:    xorl %r14d, %r14d
+; SRC-NEXT:    shldq %cl, %rdi, %r14
+; SRC-NEXT:    xorl %r13d, %r13d
+; SRC-NEXT:    shldq %cl, %r13, %r13
+; SRC-NEXT:    movl $1, %ebx
+; SRC-NEXT:    shlq %cl, %rbx
+; SRC-NEXT:    testb $64, %r11b
+; SRC-NEXT:    cmoveq %rbp, %rdx
+; SRC-NEXT:    cmovneq %rbx, %r14
+; SRC-NEXT:    cmovneq %r15, %rbx
+; SRC-NEXT:    cmovneq %r15, %r13
+; SRC-NEXT:    movl $1, %ebp
+; SRC-NEXT:    movl %r10d, %ecx
+; SRC-NEXT:    shrdq %cl, %r15, %rbp
+; SRC-NEXT:    testb $64, %r10b
+; SRC-NEXT:    cmovneq %r15, %r12
+; SRC-NEXT:    cmovneq %r15, %rbp
+; SRC-NEXT:    orl %edx, %r12d
+; SRC-NEXT:    movl %r11d, %ecx
+; SRC-NEXT:    addb $-128, %cl
+; SRC-NEXT:    shrdq %cl, %r8, %r9
+; SRC-NEXT:    shrq %cl, %r8
+; SRC-NEXT:    xorl %edx, %edx
+; SRC-NEXT:    shldq %cl, %rdi, %rdx
+; SRC-NEXT:    shlq %cl, %rdi
+; SRC-NEXT:    testb $64, %cl
+; SRC-NEXT:    cmoveq %r9, %r8
+; SRC-NEXT:    cmovneq %rdi, %rdx
+; SRC-NEXT:    cmovneq %r15, %rdi
+; SRC-NEXT:    testb %r11b, %r11b
+; SRC-NEXT:    jns .LBB0_2
+; SRC-NEXT:  # %bb.1:
+; SRC-NEXT:    movl %r8d, %r12d
+; SRC-NEXT:  .LBB0_2:
+; SRC-NEXT:    je .LBB0_4
+; SRC-NEXT:  # %bb.3:
+; SRC-NEXT:    movl %r12d, %esi
+; SRC-NEXT:  .LBB0_4:
+; SRC-NEXT:    cmovnsq %r13, %rdx
+; SRC-NEXT:    cmoveq %r15, %rdx
+; SRC-NEXT:    cmovnsq %rbp, %rdi
+; SRC-NEXT:    cmoveq %r15, %rdi
+; SRC-NEXT:    cmovsq %r15, %r14
+; SRC-NEXT:    cmovsq %r15, %rbx
+; SRC-NEXT:    testb $1, %sil
+; SRC-NEXT:    cmovneq %rax, %rbx
+; SRC-NEXT:    cmovneq %rax, %r14
+; SRC-NEXT:    cmovneq %rax, %rdi
+; SRC-NEXT:    cmovneq %rax, %rdx
+; SRC-NEXT:    movq %rdx, 24(%rax)
+; SRC-NEXT:    movq %rdi, 16(%rax)
+; SRC-NEXT:    movq %r14, 8(%rax)
+; SRC-NEXT:    movq %rbx, (%rax)
+; SRC-NEXT:    popq %rbx
+; SRC-NEXT:    popq %r12
+; SRC-NEXT:    popq %r13
+; SRC-NEXT:    popq %r14
+; SRC-NEXT:    popq %r15
+; SRC-NEXT:    popq %rbp
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test1:
+; LIN:       # %bb.0:
+; LIN-NEXT:    pushq %rbp
+; LIN-NEXT:    pushq %r15
+; LIN-NEXT:    pushq %r14
+; LIN-NEXT:    pushq %r12
+; LIN-NEXT:    pushq %rbx
+; LIN-NEXT:    movq %rcx, %r9
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    xorl %r15d, %r15d
+; LIN-NEXT:    movl $1, %r14d
+; LIN-NEXT:    addq $1, %rsi
+; LIN-NEXT:    leal 1(%rsi,%rsi), %ebp
+; LIN-NEXT:    movl $1, %r12d
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shlq %cl, %r12
+; LIN-NEXT:    testb $64, %bpl
+; LIN-NEXT:    movq %r12, %rbx
+; LIN-NEXT:    cmovneq %r15, %rbx
+; LIN-NEXT:    testb %bpl, %bpl
+; LIN-NEXT:    cmovsq %r15, %rbx
+; LIN-NEXT:    adcq $0, %rdx
+; LIN-NEXT:    adcq $0, %r9
+; LIN-NEXT:    adcq $0, %r8
+; LIN-NEXT:    movl %ebp, %r10d
+; LIN-NEXT:    addb $-128, %r10b
+; LIN-NEXT:    movq %r9, %rdi
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shrdq %cl, %r8, %rdi
+; LIN-NEXT:    shrq %cl, %r8
+; LIN-NEXT:    testb $64, %r10b
+; LIN-NEXT:    cmoveq %rdi, %r8
+; LIN-NEXT:    movq %rsi, %rdi
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shrdq %cl, %rdx, %rdi
+; LIN-NEXT:    shrq %cl, %rdx
+; LIN-NEXT:    cmoveq %rdi, %rdx
+; LIN-NEXT:    movb $-128, %r11b
+; LIN-NEXT:    subb %bpl, %r11b
+; LIN-NEXT:    movl %r11d, %ecx
+; LIN-NEXT:    shlq %cl, %r9
+; LIN-NEXT:    testb $64, %r11b
+; LIN-NEXT:    cmovneq %r15, %r9
+; LIN-NEXT:    orl %edx, %r9d
+; LIN-NEXT:    jns .LBB0_2
+; LIN-NEXT:  # %bb.1:
+; LIN-NEXT:    movl %r8d, %r9d
+; LIN-NEXT:  .LBB0_2:
+; LIN-NEXT:    je .LBB0_4
+; LIN-NEXT:  # %bb.3:
+; LIN-NEXT:    movl %r9d, %esi
+; LIN-NEXT:  .LBB0_4:
+; LIN-NEXT:    testb $1, %sil
+; LIN-NEXT:    cmovneq %rax, %rbx
+; LIN-NEXT:    movq %rbx, (%rax)
+; LIN-NEXT:    xorl %edx, %edx
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shldq %cl, %r14, %rdx
+; LIN-NEXT:    cmovneq %r12, %rdx
+; LIN-NEXT:    cmovsq %r15, %rdx
+; LIN-NEXT:    cmovneq %rax, %rdx
+; LIN-NEXT:    movq %rdx, 8(%rax)
+; LIN-NEXT:    movl $1, %edx
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shlq %cl, %rdx
+; LIN-NEXT:    movq %rdx, %rsi
+; LIN-NEXT:    cmovneq %r15, %rsi
+; LIN-NEXT:    movl $1, %edi
+; LIN-NEXT:    movl %r11d, %ecx
+; LIN-NEXT:    shrdq %cl, %r15, %rdi
+; LIN-NEXT:    cmovneq %r15, %rdi
+; LIN-NEXT:    cmovsq %rsi, %rdi
+; LIN-NEXT:    cmoveq %r15, %rdi
+; LIN-NEXT:    cmovneq %rax, %rdi
+; LIN-NEXT:    movq %rdi, 16(%rax)
+; LIN-NEXT:    xorl %esi, %esi
+; LIN-NEXT:    movl %r10d, %ecx
+; LIN-NEXT:    shldq %cl, %r14, %rsi
+; LIN-NEXT:    cmovneq %rdx, %rsi
+; LIN-NEXT:    xorl %edx, %edx
+; LIN-NEXT:    movl %ebp, %ecx
+; LIN-NEXT:    shldq %cl, %rdx, %rdx
+; LIN-NEXT:    cmovneq %r15, %rdx
+; LIN-NEXT:    cmovsq %rsi, %rdx
+; LIN-NEXT:    cmoveq %r15, %rdx
+; LIN-NEXT:    cmovneq %rax, %rdx
+; LIN-NEXT:    movq %rdx, 24(%rax)
+; LIN-NEXT:    popq %rbx
+; LIN-NEXT:    popq %r12
+; LIN-NEXT:    popq %r14
+; LIN-NEXT:    popq %r15
+; LIN-NEXT:    popq %rbp
+; LIN-NEXT:    retq
+  %b = add i256 %a, 1
   %m = shl i256 %b, 1
   %p = add i256 %m, 1
   %v = lshr i256 %b, %p
@@ -19,16 +467,436 @@ define i256 @test1(i256 %a) {
   ret i256 %f
 }
 
-; CHECK-LABEL: test2
-define i256 @test2(i256 %a) {
+define i256 @test2(i256 %a) nounwind {
+; ILP-LABEL: test2:
+; ILP:       # %bb.0:
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %edi, %edi
+; ILP-NEXT:    movq %rsi, %r11
+; ILP-NEXT:    negq %r11
+; ILP-NEXT:    movl $0, %r10d
+; ILP-NEXT:    sbbq %rdx, %r10
+; ILP-NEXT:    movl $0, %r9d
+; ILP-NEXT:    sbbq %rcx, %r9
+; ILP-NEXT:    sbbq %r8, %rdi
+; ILP-NEXT:    andq %rcx, %r9
+; ILP-NEXT:    bsrq %r9, %rcx
+; ILP-NEXT:    xorq $63, %rcx
+; ILP-NEXT:    andq %r8, %rdi
+; ILP-NEXT:    bsrq %rdi, %r8
+; ILP-NEXT:    andq %rdx, %r10
+; ILP-NEXT:    bsrq %r10, %rdx
+; ILP-NEXT:    xorq $63, %r8
+; ILP-NEXT:    addq $64, %rcx
+; ILP-NEXT:    testq %rdi, %rdi
+; ILP-NEXT:    movq $0, 24(%rax)
+; ILP-NEXT:    movq $0, 16(%rax)
+; ILP-NEXT:    movq $0, 8(%rax)
+; ILP-NEXT:    cmovneq %r8, %rcx
+; ILP-NEXT:    xorq $63, %rdx
+; ILP-NEXT:    andq %rsi, %r11
+; ILP-NEXT:    movl $127, %r8d
+; ILP-NEXT:    bsrq %r11, %rsi
+; ILP-NEXT:    cmoveq %r8, %rsi
+; ILP-NEXT:    xorq $63, %rsi
+; ILP-NEXT:    addq $64, %rsi
+; ILP-NEXT:    testq %r10, %r10
+; ILP-NEXT:    cmovneq %rdx, %rsi
+; ILP-NEXT:    subq $-128, %rsi
+; ILP-NEXT:    orq %r9, %rdi
+; ILP-NEXT:    cmovneq %rcx, %rsi
+; ILP-NEXT:    movq %rsi, (%rax)
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test2:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    xorl %r9d, %r9d
+; HYBRID-NEXT:    movq %rsi, %r11
+; HYBRID-NEXT:    negq %r11
+; HYBRID-NEXT:    movl $0, %r10d
+; HYBRID-NEXT:    sbbq %rdx, %r10
+; HYBRID-NEXT:    movl $0, %edi
+; HYBRID-NEXT:    sbbq %rcx, %rdi
+; HYBRID-NEXT:    sbbq %r8, %r9
+; HYBRID-NEXT:    andq %r8, %r9
+; HYBRID-NEXT:    bsrq %r9, %r8
+; HYBRID-NEXT:    xorq $63, %r8
+; HYBRID-NEXT:    andq %rcx, %rdi
+; HYBRID-NEXT:    bsrq %rdi, %rcx
+; HYBRID-NEXT:    xorq $63, %rcx
+; HYBRID-NEXT:    addq $64, %rcx
+; HYBRID-NEXT:    testq %r9, %r9
+; HYBRID-NEXT:    cmovneq %r8, %rcx
+; HYBRID-NEXT:    andq %rdx, %r10
+; HYBRID-NEXT:    bsrq %r10, %rdx
+; HYBRID-NEXT:    xorq $63, %rdx
+; HYBRID-NEXT:    andq %rsi, %r11
+; HYBRID-NEXT:    movl $127, %r8d
+; HYBRID-NEXT:    bsrq %r11, %rsi
+; HYBRID-NEXT:    cmoveq %r8, %rsi
+; HYBRID-NEXT:    xorq $63, %rsi
+; HYBRID-NEXT:    addq $64, %rsi
+; HYBRID-NEXT:    testq %r10, %r10
+; HYBRID-NEXT:    cmovneq %rdx, %rsi
+; HYBRID-NEXT:    subq $-128, %rsi
+; HYBRID-NEXT:    orq %r9, %rdi
+; HYBRID-NEXT:    cmovneq %rcx, %rsi
+; HYBRID-NEXT:    movq %rsi, (%rax)
+; HYBRID-NEXT:    movq $0, 24(%rax)
+; HYBRID-NEXT:    movq $0, 16(%rax)
+; HYBRID-NEXT:    movq $0, 8(%rax)
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test2:
+; BURR:       # %bb.0:
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    xorl %r9d, %r9d
+; BURR-NEXT:    movq %rsi, %r11
+; BURR-NEXT:    negq %r11
+; BURR-NEXT:    movl $0, %r10d
+; BURR-NEXT:    sbbq %rdx, %r10
+; BURR-NEXT:    movl $0, %edi
+; BURR-NEXT:    sbbq %rcx, %rdi
+; BURR-NEXT:    sbbq %r8, %r9
+; BURR-NEXT:    andq %r8, %r9
+; BURR-NEXT:    bsrq %r9, %r8
+; BURR-NEXT:    xorq $63, %r8
+; BURR-NEXT:    andq %rcx, %rdi
+; BURR-NEXT:    bsrq %rdi, %rcx
+; BURR-NEXT:    xorq $63, %rcx
+; BURR-NEXT:    addq $64, %rcx
+; BURR-NEXT:    testq %r9, %r9
+; BURR-NEXT:    cmovneq %r8, %rcx
+; BURR-NEXT:    andq %rdx, %r10
+; BURR-NEXT:    bsrq %r10, %rdx
+; BURR-NEXT:    xorq $63, %rdx
+; BURR-NEXT:    andq %rsi, %r11
+; BURR-NEXT:    movl $127, %r8d
+; BURR-NEXT:    bsrq %r11, %rsi
+; BURR-NEXT:    cmoveq %r8, %rsi
+; BURR-NEXT:    xorq $63, %rsi
+; BURR-NEXT:    addq $64, %rsi
+; BURR-NEXT:    testq %r10, %r10
+; BURR-NEXT:    cmovneq %rdx, %rsi
+; BURR-NEXT:    subq $-128, %rsi
+; BURR-NEXT:    orq %r9, %rdi
+; BURR-NEXT:    cmovneq %rcx, %rsi
+; BURR-NEXT:    movq %rsi, (%rax)
+; BURR-NEXT:    movq $0, 24(%rax)
+; BURR-NEXT:    movq $0, 16(%rax)
+; BURR-NEXT:    movq $0, 8(%rax)
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test2:
+; SRC:       # %bb.0:
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    xorl %edi, %edi
+; SRC-NEXT:    movq %rsi, %r11
+; SRC-NEXT:    negq %r11
+; SRC-NEXT:    movl $0, %r10d
+; SRC-NEXT:    sbbq %rdx, %r10
+; SRC-NEXT:    movl $0, %r9d
+; SRC-NEXT:    sbbq %rcx, %r9
+; SRC-NEXT:    sbbq %r8, %rdi
+; SRC-NEXT:    andq %rdx, %r10
+; SRC-NEXT:    andq %rcx, %r9
+; SRC-NEXT:    andq %r8, %rdi
+; SRC-NEXT:    andq %rsi, %r11
+; SRC-NEXT:    bsrq %rdi, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    bsrq %r9, %rdx
+; SRC-NEXT:    xorq $63, %rdx
+; SRC-NEXT:    addq $64, %rdx
+; SRC-NEXT:    testq %rdi, %rdi
+; SRC-NEXT:    cmovneq %rcx, %rdx
+; SRC-NEXT:    bsrq %r10, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    bsrq %r11, %r8
+; SRC-NEXT:    movl $127, %esi
+; SRC-NEXT:    cmovneq %r8, %rsi
+; SRC-NEXT:    xorq $63, %rsi
+; SRC-NEXT:    addq $64, %rsi
+; SRC-NEXT:    testq %r10, %r10
+; SRC-NEXT:    cmovneq %rcx, %rsi
+; SRC-NEXT:    subq $-128, %rsi
+; SRC-NEXT:    orq %r9, %rdi
+; SRC-NEXT:    cmovneq %rdx, %rsi
+; SRC-NEXT:    movq %rsi, (%rax)
+; SRC-NEXT:    movq $0, 24(%rax)
+; SRC-NEXT:    movq $0, 16(%rax)
+; SRC-NEXT:    movq $0, 8(%rax)
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test2:
+; LIN:       # %bb.0:
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    movq %rsi, %rdi
+; LIN-NEXT:    negq %rdi
+; LIN-NEXT:    andq %rsi, %rdi
+; LIN-NEXT:    bsrq %rdi, %rsi
+; LIN-NEXT:    movl $127, %edi
+; LIN-NEXT:    cmovneq %rsi, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    addq $64, %rdi
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $0, %esi
+; LIN-NEXT:    sbbq %rdx, %rsi
+; LIN-NEXT:    andq %rdx, %rsi
+; LIN-NEXT:    bsrq %rsi, %rdx
+; LIN-NEXT:    xorq $63, %rdx
+; LIN-NEXT:    testq %rsi, %rsi
+; LIN-NEXT:    cmoveq %rdi, %rdx
+; LIN-NEXT:    subq $-128, %rdx
+; LIN-NEXT:    movl $0, %esi
+; LIN-NEXT:    sbbq %rcx, %rsi
+; LIN-NEXT:    andq %rcx, %rsi
+; LIN-NEXT:    bsrq %rsi, %rcx
+; LIN-NEXT:    xorq $63, %rcx
+; LIN-NEXT:    addq $64, %rcx
+; LIN-NEXT:    sbbq %r8, %r9
+; LIN-NEXT:    andq %r8, %r9
+; LIN-NEXT:    bsrq %r9, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    testq %r9, %r9
+; LIN-NEXT:    cmoveq %rcx, %rdi
+; LIN-NEXT:    orq %rsi, %r9
+; LIN-NEXT:    cmoveq %rdx, %rdi
+; LIN-NEXT:    movq %rdi, (%rax)
+; LIN-NEXT:    movq $0, 8(%rax)
+; LIN-NEXT:    movq $0, 16(%rax)
+; LIN-NEXT:    movq $0, 24(%rax)
+; LIN-NEXT:    retq
   %b = sub i256 0, %a
   %c = and i256 %b, %a
   %d = call i256 @llvm.ctlz.i256(i256 %c, i1 false)
   ret i256 %d
 }
 
-; CHECK-LABEL: test3
-define i256 @test3(i256 %n) {
+define i256 @test3(i256 %n) nounwind {
+; ILP-LABEL: test3:
+; ILP:       # %bb.0:
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %r10d, %r10d
+; ILP-NEXT:    movq %rsi, %r9
+; ILP-NEXT:    negq %r9
+; ILP-NEXT:    movl $0, %r11d
+; ILP-NEXT:    sbbq %rdx, %r11
+; ILP-NEXT:    movl $0, %edi
+; ILP-NEXT:    sbbq %rcx, %rdi
+; ILP-NEXT:    sbbq %r8, %r10
+; ILP-NEXT:    notq %rcx
+; ILP-NEXT:    andq %rdi, %rcx
+; ILP-NEXT:    bsrq %rcx, %rdi
+; ILP-NEXT:    notq %rdx
+; ILP-NEXT:    andq %r11, %rdx
+; ILP-NEXT:    xorq $63, %rdi
+; ILP-NEXT:    notq %r8
+; ILP-NEXT:    andq %r10, %r8
+; ILP-NEXT:    bsrq %r8, %r10
+; ILP-NEXT:    xorq $63, %r10
+; ILP-NEXT:    addq $64, %rdi
+; ILP-NEXT:    bsrq %rdx, %r11
+; ILP-NEXT:    notq %rsi
+; ILP-NEXT:    testq %r8, %r8
+; ILP-NEXT:    movq $0, 24(%rax)
+; ILP-NEXT:    movq $0, 16(%rax)
+; ILP-NEXT:    movq $0, 8(%rax)
+; ILP-NEXT:    cmovneq %r10, %rdi
+; ILP-NEXT:    xorq $63, %r11
+; ILP-NEXT:    andq %r9, %rsi
+; ILP-NEXT:    movl $127, %r9d
+; ILP-NEXT:    bsrq %rsi, %rsi
+; ILP-NEXT:    cmoveq %r9, %rsi
+; ILP-NEXT:    xorq $63, %rsi
+; ILP-NEXT:    addq $64, %rsi
+; ILP-NEXT:    testq %rdx, %rdx
+; ILP-NEXT:    cmovneq %r11, %rsi
+; ILP-NEXT:    subq $-128, %rsi
+; ILP-NEXT:    orq %rcx, %r8
+; ILP-NEXT:    cmovneq %rdi, %rsi
+; ILP-NEXT:    movq %rsi, (%rax)
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test3:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbx
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    xorl %edi, %edi
+; HYBRID-NEXT:    movq %rsi, %r9
+; HYBRID-NEXT:    negq %r9
+; HYBRID-NEXT:    movl $0, %r10d
+; HYBRID-NEXT:    sbbq %rdx, %r10
+; HYBRID-NEXT:    movl $0, %r11d
+; HYBRID-NEXT:    sbbq %rcx, %r11
+; HYBRID-NEXT:    sbbq %r8, %rdi
+; HYBRID-NEXT:    notq %r8
+; HYBRID-NEXT:    andq %rdi, %r8
+; HYBRID-NEXT:    bsrq %r8, %rbx
+; HYBRID-NEXT:    xorq $63, %rbx
+; HYBRID-NEXT:    notq %rcx
+; HYBRID-NEXT:    andq %r11, %rcx
+; HYBRID-NEXT:    bsrq %rcx, %rdi
+; HYBRID-NEXT:    xorq $63, %rdi
+; HYBRID-NEXT:    addq $64, %rdi
+; HYBRID-NEXT:    testq %r8, %r8
+; HYBRID-NEXT:    cmovneq %rbx, %rdi
+; HYBRID-NEXT:    notq %rdx
+; HYBRID-NEXT:    andq %r10, %rdx
+; HYBRID-NEXT:    bsrq %rdx, %rbx
+; HYBRID-NEXT:    xorq $63, %rbx
+; HYBRID-NEXT:    notq %rsi
+; HYBRID-NEXT:    andq %r9, %rsi
+; HYBRID-NEXT:    movl $127, %r9d
+; HYBRID-NEXT:    bsrq %rsi, %rsi
+; HYBRID-NEXT:    cmoveq %r9, %rsi
+; HYBRID-NEXT:    xorq $63, %rsi
+; HYBRID-NEXT:    addq $64, %rsi
+; HYBRID-NEXT:    testq %rdx, %rdx
+; HYBRID-NEXT:    cmovneq %rbx, %rsi
+; HYBRID-NEXT:    subq $-128, %rsi
+; HYBRID-NEXT:    orq %r8, %rcx
+; HYBRID-NEXT:    cmovneq %rdi, %rsi
+; HYBRID-NEXT:    movq %rsi, (%rax)
+; HYBRID-NEXT:    movq $0, 24(%rax)
+; HYBRID-NEXT:    movq $0, 16(%rax)
+; HYBRID-NEXT:    movq $0, 8(%rax)
+; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test3:
+; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbx
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    xorl %edi, %edi
+; BURR-NEXT:    movq %rsi, %r9
+; BURR-NEXT:    negq %r9
+; BURR-NEXT:    movl $0, %r10d
+; BURR-NEXT:    sbbq %rdx, %r10
+; BURR-NEXT:    movl $0, %r11d
+; BURR-NEXT:    sbbq %rcx, %r11
+; BURR-NEXT:    sbbq %r8, %rdi
+; BURR-NEXT:    notq %r8
+; BURR-NEXT:    andq %rdi, %r8
+; BURR-NEXT:    bsrq %r8, %rbx
+; BURR-NEXT:    xorq $63, %rbx
+; BURR-NEXT:    notq %rcx
+; BURR-NEXT:    andq %r11, %rcx
+; BURR-NEXT:    bsrq %rcx, %rdi
+; BURR-NEXT:    xorq $63, %rdi
+; BURR-NEXT:    addq $64, %rdi
+; BURR-NEXT:    testq %r8, %r8
+; BURR-NEXT:    cmovneq %rbx, %rdi
+; BURR-NEXT:    notq %rdx
+; BURR-NEXT:    andq %r10, %rdx
+; BURR-NEXT:    bsrq %rdx, %rbx
+; BURR-NEXT:    xorq $63, %rbx
+; BURR-NEXT:    notq %rsi
+; BURR-NEXT:    andq %r9, %rsi
+; BURR-NEXT:    movl $127, %r9d
+; BURR-NEXT:    bsrq %rsi, %rsi
+; BURR-NEXT:    cmoveq %r9, %rsi
+; BURR-NEXT:    xorq $63, %rsi
+; BURR-NEXT:    addq $64, %rsi
+; BURR-NEXT:    testq %rdx, %rdx
+; BURR-NEXT:    cmovneq %rbx, %rsi
+; BURR-NEXT:    subq $-128, %rsi
+; BURR-NEXT:    orq %r8, %rcx
+; BURR-NEXT:    cmovneq %rdi, %rsi
+; BURR-NEXT:    movq %rsi, (%rax)
+; BURR-NEXT:    movq $0, 24(%rax)
+; BURR-NEXT:    movq $0, 16(%rax)
+; BURR-NEXT:    movq $0, 8(%rax)
+; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test3:
+; SRC:       # %bb.0:
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    movq %rsi, %r9
+; SRC-NEXT:    notq %r9
+; SRC-NEXT:    xorl %r10d, %r10d
+; SRC-NEXT:    negq %rsi
+; SRC-NEXT:    movl $0, %r11d
+; SRC-NEXT:    sbbq %rdx, %r11
+; SRC-NEXT:    notq %rdx
+; SRC-NEXT:    movl $0, %edi
+; SRC-NEXT:    sbbq %rcx, %rdi
+; SRC-NEXT:    notq %rcx
+; SRC-NEXT:    sbbq %r8, %r10
+; SRC-NEXT:    notq %r8
+; SRC-NEXT:    andq %r11, %rdx
+; SRC-NEXT:    andq %rdi, %rcx
+; SRC-NEXT:    andq %r10, %r8
+; SRC-NEXT:    andq %r9, %rsi
+; SRC-NEXT:    bsrq %r8, %r9
+; SRC-NEXT:    xorq $63, %r9
+; SRC-NEXT:    bsrq %rcx, %rdi
+; SRC-NEXT:    xorq $63, %rdi
+; SRC-NEXT:    addq $64, %rdi
+; SRC-NEXT:    testq %r8, %r8
+; SRC-NEXT:    cmovneq %r9, %rdi
+; SRC-NEXT:    bsrq %rdx, %r9
+; SRC-NEXT:    xorq $63, %r9
+; SRC-NEXT:    bsrq %rsi, %r10
+; SRC-NEXT:    movl $127, %esi
+; SRC-NEXT:    cmovneq %r10, %rsi
+; SRC-NEXT:    xorq $63, %rsi
+; SRC-NEXT:    addq $64, %rsi
+; SRC-NEXT:    testq %rdx, %rdx
+; SRC-NEXT:    cmovneq %r9, %rsi
+; SRC-NEXT:    subq $-128, %rsi
+; SRC-NEXT:    orq %rcx, %r8
+; SRC-NEXT:    cmovneq %rdi, %rsi
+; SRC-NEXT:    movq %rsi, (%rax)
+; SRC-NEXT:    movq $0, 24(%rax)
+; SRC-NEXT:    movq $0, 16(%rax)
+; SRC-NEXT:    movq $0, 8(%rax)
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test3:
+; LIN:       # %bb.0:
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    movq %rsi, %rdi
+; LIN-NEXT:    negq %rdi
+; LIN-NEXT:    notq %rsi
+; LIN-NEXT:    andq %rdi, %rsi
+; LIN-NEXT:    bsrq %rsi, %rsi
+; LIN-NEXT:    movl $127, %edi
+; LIN-NEXT:    cmovneq %rsi, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    addq $64, %rdi
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $0, %esi
+; LIN-NEXT:    sbbq %rdx, %rsi
+; LIN-NEXT:    notq %rdx
+; LIN-NEXT:    andq %rsi, %rdx
+; LIN-NEXT:    bsrq %rdx, %rsi
+; LIN-NEXT:    xorq $63, %rsi
+; LIN-NEXT:    testq %rdx, %rdx
+; LIN-NEXT:    cmoveq %rdi, %rsi
+; LIN-NEXT:    subq $-128, %rsi
+; LIN-NEXT:    movl $0, %edx
+; LIN-NEXT:    sbbq %rcx, %rdx
+; LIN-NEXT:    notq %rcx
+; LIN-NEXT:    andq %rdx, %rcx
+; LIN-NEXT:    bsrq %rcx, %rdx
+; LIN-NEXT:    xorq $63, %rdx
+; LIN-NEXT:    addq $64, %rdx
+; LIN-NEXT:    sbbq %r8, %r9
+; LIN-NEXT:    notq %r8
+; LIN-NEXT:    andq %r9, %r8
+; LIN-NEXT:    bsrq %r8, %rdi
+; LIN-NEXT:    xorq $63, %rdi
+; LIN-NEXT:    testq %r8, %r8
+; LIN-NEXT:    cmoveq %rdx, %rdi
+; LIN-NEXT:    orq %rcx, %r8
+; LIN-NEXT:    cmoveq %rsi, %rdi
+; LIN-NEXT:    movq %rdi, (%rax)
+; LIN-NEXT:    movq $0, 8(%rax)
+; LIN-NEXT:    movq $0, 16(%rax)
+; LIN-NEXT:    movq $0, 24(%rax)
+; LIN-NEXT:    retq
   %m = sub i256 -1, %n
   %x = sub i256 0, %n
   %y = and i256 %x, %m
@@ -38,8 +906,91 @@ define i256 @test3(i256 %n) {
 
 declare i256 @llvm.ctlz.i256(i256, i1) nounwind readnone
 
-; CHECK-LABEL: test4
-define i64 @test4(i64 %a, i64 %b) {
+define i64 @test4(i64 %a, i64 %b) nounwind {
+; ILP-LABEL: test4:
+; ILP:       # %bb.0:
+; ILP-NEXT:    xorl %ecx, %ecx
+; ILP-NEXT:    xorl %edx, %edx
+; ILP-NEXT:    addq $1, %rsi
+; ILP-NEXT:    setb %dl
+; ILP-NEXT:    movl $2, %eax
+; ILP-NEXT:    cmpq %rdi, %rsi
+; ILP-NEXT:    sbbq $0, %rdx
+; ILP-NEXT:    movl $0, %edx
+; ILP-NEXT:    sbbq $0, %rdx
+; ILP-NEXT:    sbbq $0, %rcx
+; ILP-NEXT:    setae %cl
+; ILP-NEXT:    movzbl %cl, %ecx
+; ILP-NEXT:    subq %rcx, %rax
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: test4:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    xorl %eax, %eax
+; HYBRID-NEXT:    xorl %ecx, %ecx
+; HYBRID-NEXT:    addq $1, %rsi
+; HYBRID-NEXT:    setb %cl
+; HYBRID-NEXT:    cmpq %rdi, %rsi
+; HYBRID-NEXT:    sbbq $0, %rcx
+; HYBRID-NEXT:    movl $0, %ecx
+; HYBRID-NEXT:    sbbq $0, %rcx
+; HYBRID-NEXT:    sbbq $0, %rax
+; HYBRID-NEXT:    setae %al
+; HYBRID-NEXT:    movzbl %al, %ecx
+; HYBRID-NEXT:    movl $2, %eax
+; HYBRID-NEXT:    subq %rcx, %rax
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: test4:
+; BURR:       # %bb.0:
+; BURR-NEXT:    xorl %eax, %eax
+; BURR-NEXT:    xorl %ecx, %ecx
+; BURR-NEXT:    addq $1, %rsi
+; BURR-NEXT:    setb %cl
+; BURR-NEXT:    cmpq %rdi, %rsi
+; BURR-NEXT:    sbbq $0, %rcx
+; BURR-NEXT:    movl $0, %ecx
+; BURR-NEXT:    sbbq $0, %rcx
+; BURR-NEXT:    sbbq $0, %rax
+; BURR-NEXT:    setae %al
+; BURR-NEXT:    movzbl %al, %ecx
+; BURR-NEXT:    movl $2, %eax
+; BURR-NEXT:    subq %rcx, %rax
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: test4:
+; SRC:       # %bb.0:
+; SRC-NEXT:    xorl %eax, %eax
+; SRC-NEXT:    addq $1, %rsi
+; SRC-NEXT:    setb %al
+; SRC-NEXT:    xorl %ecx, %ecx
+; SRC-NEXT:    cmpq %rdi, %rsi
+; SRC-NEXT:    sbbq $0, %rax
+; SRC-NEXT:    movl $0, %eax
+; SRC-NEXT:    sbbq $0, %rax
+; SRC-NEXT:    sbbq $0, %rcx
+; SRC-NEXT:    setae %al
+; SRC-NEXT:    movzbl %al, %ecx
+; SRC-NEXT:    movl $2, %eax
+; SRC-NEXT:    subq %rcx, %rax
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: test4:
+; LIN:       # %bb.0:
+; LIN-NEXT:    movl $2, %eax
+; LIN-NEXT:    xorl %ecx, %ecx
+; LIN-NEXT:    xorl %edx, %edx
+; LIN-NEXT:    addq $1, %rsi
+; LIN-NEXT:    setb %dl
+; LIN-NEXT:    cmpq %rdi, %rsi
+; LIN-NEXT:    sbbq $0, %rdx
+; LIN-NEXT:    movl $0, %edx
+; LIN-NEXT:    sbbq $0, %rdx
+; LIN-NEXT:    sbbq $0, %rcx
+; LIN-NEXT:    setae %cl
+; LIN-NEXT:    movzbl %cl, %ecx
+; LIN-NEXT:    subq %rcx, %rax
+; LIN-NEXT:    retq
   %r = zext i64 %b to i256
   %u = add i256 %r, 1
   %w = and i256 %u, 1461501637330902918203684832716283019655932542975
@@ -49,3 +1000,252 @@ define i64 @test4(i64 %a, i64 %b) {
   %z = add i64 %y, 1
   ret i64 %z
 }
+
+define i256 @PR25498(i256 %a) nounwind {
+; ILP-LABEL: PR25498:
+; ILP:       # %bb.0:
+; ILP-NEXT:    pushq %rbx
+; ILP-NEXT:    movq %rdi, %rax
+; ILP-NEXT:    xorl %r9d, %r9d
+; ILP-NEXT:    movq %rsi, %rbx
+; ILP-NEXT:    negq %rbx
+; ILP-NEXT:    movl $0, %r11d
+; ILP-NEXT:    sbbq %rdx, %r11
+; ILP-NEXT:    movl $0, %r10d
+; ILP-NEXT:    sbbq %rcx, %r10
+; ILP-NEXT:    movl $0, %edi
+; ILP-NEXT:    sbbq %r8, %rdi
+; ILP-NEXT:    orq %r8, %rdx
+; ILP-NEXT:    orq %rcx, %rsi
+; ILP-NEXT:    orq %rdx, %rsi
+; ILP-NEXT:    je .LBB4_1
+; ILP-NEXT:  # %bb.2: # %cond.false
+; ILP-NEXT:    bsrq %r11, %rdx
+; ILP-NEXT:    bsrq %rdi, %rcx
+; ILP-NEXT:    xorq $63, %rcx
+; ILP-NEXT:    bsrq %r10, %rsi
+; ILP-NEXT:    xorq $63, %rsi
+; ILP-NEXT:    addq $64, %rsi
+; ILP-NEXT:    testq %rdi, %rdi
+; ILP-NEXT:    cmovneq %rcx, %rsi
+; ILP-NEXT:    xorq $63, %rdx
+; ILP-NEXT:    bsrq %rbx, %rcx
+; ILP-NEXT:    xorq $63, %rcx
+; ILP-NEXT:    addq $64, %rcx
+; ILP-NEXT:    testq %r11, %r11
+; ILP-NEXT:    cmovneq %rdx, %rcx
+; ILP-NEXT:    subq $-128, %rcx
+; ILP-NEXT:    xorl %r9d, %r9d
+; ILP-NEXT:    orq %rdi, %r10
+; ILP-NEXT:    cmovneq %rsi, %rcx
+; ILP-NEXT:    jmp .LBB4_3
+; ILP-NEXT:  .LBB4_1:
+; ILP-NEXT:    movl $256, %ecx # imm = 0x100
+; ILP-NEXT:  .LBB4_3: # %cond.end
+; ILP-NEXT:    movq %rcx, (%rax)
+; ILP-NEXT:    movq %r9, 8(%rax)
+; ILP-NEXT:    movq %r9, 16(%rax)
+; ILP-NEXT:    movq %r9, 24(%rax)
+; ILP-NEXT:    popq %rbx
+; ILP-NEXT:    retq
+;
+; HYBRID-LABEL: PR25498:
+; HYBRID:       # %bb.0:
+; HYBRID-NEXT:    pushq %rbx
+; HYBRID-NEXT:    movq %rdi, %rax
+; HYBRID-NEXT:    xorl %r9d, %r9d
+; HYBRID-NEXT:    movq %rsi, %rbx
+; HYBRID-NEXT:    negq %rbx
+; HYBRID-NEXT:    movl $0, %r11d
+; HYBRID-NEXT:    sbbq %rdx, %r11
+; HYBRID-NEXT:    movl $0, %r10d
+; HYBRID-NEXT:    sbbq %rcx, %r10
+; HYBRID-NEXT:    movl $0, %edi
+; HYBRID-NEXT:    sbbq %r8, %rdi
+; HYBRID-NEXT:    orq %r8, %rdx
+; HYBRID-NEXT:    orq %rcx, %rsi
+; HYBRID-NEXT:    orq %rdx, %rsi
+; HYBRID-NEXT:    je .LBB4_1
+; HYBRID-NEXT:  # %bb.2: # %cond.false
+; HYBRID-NEXT:    bsrq %rdi, %rcx
+; HYBRID-NEXT:    xorq $63, %rcx
+; HYBRID-NEXT:    bsrq %r10, %rdx
+; HYBRID-NEXT:    xorq $63, %rdx
+; HYBRID-NEXT:    addq $64, %rdx
+; HYBRID-NEXT:    testq %rdi, %rdi
+; HYBRID-NEXT:    cmovneq %rcx, %rdx
+; HYBRID-NEXT:    bsrq %r11, %rsi
+; HYBRID-NEXT:    xorq $63, %rsi
+; HYBRID-NEXT:    bsrq %rbx, %rcx
+; HYBRID-NEXT:    xorq $63, %rcx
+; HYBRID-NEXT:    addq $64, %rcx
+; HYBRID-NEXT:    testq %r11, %r11
+; HYBRID-NEXT:    cmovneq %rsi, %rcx
+; HYBRID-NEXT:    subq $-128, %rcx
+; HYBRID-NEXT:    orq %rdi, %r10
+; HYBRID-NEXT:    cmovneq %rdx, %rcx
+; HYBRID-NEXT:    xorl %r9d, %r9d
+; HYBRID-NEXT:    jmp .LBB4_3
+; HYBRID-NEXT:  .LBB4_1:
+; HYBRID-NEXT:    movl $256, %ecx # imm = 0x100
+; HYBRID-NEXT:  .LBB4_3: # %cond.end
+; HYBRID-NEXT:    movq %rcx, (%rax)
+; HYBRID-NEXT:    movq %r9, 8(%rax)
+; HYBRID-NEXT:    movq %r9, 16(%rax)
+; HYBRID-NEXT:    movq %r9, 24(%rax)
+; HYBRID-NEXT:    popq %rbx
+; HYBRID-NEXT:    retq
+;
+; BURR-LABEL: PR25498:
+; BURR:       # %bb.0:
+; BURR-NEXT:    pushq %rbx
+; BURR-NEXT:    movq %rdi, %rax
+; BURR-NEXT:    xorl %r9d, %r9d
+; BURR-NEXT:    movq %rsi, %rbx
+; BURR-NEXT:    negq %rbx
+; BURR-NEXT:    movl $0, %r11d
+; BURR-NEXT:    sbbq %rdx, %r11
+; BURR-NEXT:    movl $0, %r10d
+; BURR-NEXT:    sbbq %rcx, %r10
+; BURR-NEXT:    movl $0, %edi
+; BURR-NEXT:    sbbq %r8, %rdi
+; BURR-NEXT:    orq %r8, %rdx
+; BURR-NEXT:    orq %rcx, %rsi
+; BURR-NEXT:    orq %rdx, %rsi
+; BURR-NEXT:    je .LBB4_1
+; BURR-NEXT:  # %bb.2: # %cond.false
+; BURR-NEXT:    bsrq %rdi, %rcx
+; BURR-NEXT:    xorq $63, %rcx
+; BURR-NEXT:    bsrq %r10, %rdx
+; BURR-NEXT:    xorq $63, %rdx
+; BURR-NEXT:    addq $64, %rdx
+; BURR-NEXT:    testq %rdi, %rdi
+; BURR-NEXT:    cmovneq %rcx, %rdx
+; BURR-NEXT:    bsrq %r11, %rsi
+; BURR-NEXT:    xorq $63, %rsi
+; BURR-NEXT:    bsrq %rbx, %rcx
+; BURR-NEXT:    xorq $63, %rcx
+; BURR-NEXT:    addq $64, %rcx
+; BURR-NEXT:    testq %r11, %r11
+; BURR-NEXT:    cmovneq %rsi, %rcx
+; BURR-NEXT:    subq $-128, %rcx
+; BURR-NEXT:    orq %rdi, %r10
+; BURR-NEXT:    cmovneq %rdx, %rcx
+; BURR-NEXT:    xorl %r9d, %r9d
+; BURR-NEXT:    jmp .LBB4_3
+; BURR-NEXT:  .LBB4_1:
+; BURR-NEXT:    movl $256, %ecx # imm = 0x100
+; BURR-NEXT:  .LBB4_3: # %cond.end
+; BURR-NEXT:    movq %rcx, (%rax)
+; BURR-NEXT:    movq %r9, 8(%rax)
+; BURR-NEXT:    movq %r9, 16(%rax)
+; BURR-NEXT:    movq %r9, 24(%rax)
+; BURR-NEXT:    popq %rbx
+; BURR-NEXT:    retq
+;
+; SRC-LABEL: PR25498:
+; SRC:       # %bb.0:
+; SRC-NEXT:    pushq %rbx
+; SRC-NEXT:    movq %rdi, %rax
+; SRC-NEXT:    xorl %r9d, %r9d
+; SRC-NEXT:    movq %rsi, %rbx
+; SRC-NEXT:    negq %rbx
+; SRC-NEXT:    movl $0, %r11d
+; SRC-NEXT:    sbbq %rdx, %r11
+; SRC-NEXT:    movl $0, %r10d
+; SRC-NEXT:    sbbq %rcx, %r10
+; SRC-NEXT:    movl $0, %edi
+; SRC-NEXT:    sbbq %r8, %rdi
+; SRC-NEXT:    orq %r8, %rdx
+; SRC-NEXT:    orq %rcx, %rsi
+; SRC-NEXT:    orq %rdx, %rsi
+; SRC-NEXT:    je .LBB4_1
+; SRC-NEXT:  # %bb.2: # %cond.false
+; SRC-NEXT:    bsrq %rdi, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    bsrq %r10, %rdx
+; SRC-NEXT:    xorq $63, %rdx
+; SRC-NEXT:    addq $64, %rdx
+; SRC-NEXT:    testq %rdi, %rdi
+; SRC-NEXT:    cmovneq %rcx, %rdx
+; SRC-NEXT:    bsrq %r11, %rsi
+; SRC-NEXT:    xorq $63, %rsi
+; SRC-NEXT:    bsrq %rbx, %rcx
+; SRC-NEXT:    xorq $63, %rcx
+; SRC-NEXT:    addq $64, %rcx
+; SRC-NEXT:    testq %r11, %r11
+; SRC-NEXT:    cmovneq %rsi, %rcx
+; SRC-NEXT:    subq $-128, %rcx
+; SRC-NEXT:    orq %rdi, %r10
+; SRC-NEXT:    cmovneq %rdx, %rcx
+; SRC-NEXT:    xorl %r9d, %r9d
+; SRC-NEXT:    jmp .LBB4_3
+; SRC-NEXT:  .LBB4_1:
+; SRC-NEXT:    movl $256, %ecx # imm = 0x100
+; SRC-NEXT:  .LBB4_3: # %cond.end
+; SRC-NEXT:    movq %rcx, (%rax)
+; SRC-NEXT:    movq %r9, 8(%rax)
+; SRC-NEXT:    movq %r9, 16(%rax)
+; SRC-NEXT:    movq %r9, 24(%rax)
+; SRC-NEXT:    popq %rbx
+; SRC-NEXT:    retq
+;
+; LIN-LABEL: PR25498:
+; LIN:       # %bb.0:
+; LIN-NEXT:    pushq %rbx
+; LIN-NEXT:    movq %rdi, %rax
+; LIN-NEXT:    movq %rsi, %rbx
+; LIN-NEXT:    negq %rbx
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    movl $0, %edi
+; LIN-NEXT:    sbbq %rdx, %rdi
+; LIN-NEXT:    movl $0, %r10d
+; LIN-NEXT:    sbbq %rcx, %r10
+; LIN-NEXT:    movl $0, %r11d
+; LIN-NEXT:    sbbq %r8, %r11
+; LIN-NEXT:    orq %rcx, %rsi
+; LIN-NEXT:    orq %r8, %rdx
+; LIN-NEXT:    orq %rsi, %rdx
+; LIN-NEXT:    je .LBB4_1
+; LIN-NEXT:  # %bb.2: # %cond.false
+; LIN-NEXT:    bsrq %rbx, %rcx
+; LIN-NEXT:    xorq $63, %rcx
+; LIN-NEXT:    addq $64, %rcx
+; LIN-NEXT:    bsrq %rdi, %rdx
+; LIN-NEXT:    xorq $63, %rdx
+; LIN-NEXT:    testq %rdi, %rdi
+; LIN-NEXT:    cmoveq %rcx, %rdx
+; LIN-NEXT:    subq $-128, %rdx
+; LIN-NEXT:    bsrq %r10, %rsi
+; LIN-NEXT:    xorq $63, %rsi
+; LIN-NEXT:    addq $64, %rsi
+; LIN-NEXT:    bsrq %r11, %rcx
+; LIN-NEXT:    xorq $63, %rcx
+; LIN-NEXT:    testq %r11, %r11
+; LIN-NEXT:    cmoveq %rsi, %rcx
+; LIN-NEXT:    orq %r11, %r10
+; LIN-NEXT:    cmoveq %rdx, %rcx
+; LIN-NEXT:    xorl %r9d, %r9d
+; LIN-NEXT:    jmp .LBB4_3
+; LIN-NEXT:  .LBB4_1:
+; LIN-NEXT:    movl $256, %ecx # imm = 0x100
+; LIN-NEXT:  .LBB4_3: # %cond.end
+; LIN-NEXT:    movq %rcx, (%rax)
+; LIN-NEXT:    movq %r9, 8(%rax)
+; LIN-NEXT:    movq %r9, 16(%rax)
+; LIN-NEXT:    movq %r9, 24(%rax)
+; LIN-NEXT:    popq %rbx
+; LIN-NEXT:    retq
+  %b = sub i256 0, %a
+  %cmpz = icmp eq i256 %b, 0
+  br i1 %cmpz, label %cond.end, label %cond.false
+
+cond.false:
+  %d = call i256 @llvm.ctlz.i256(i256 %b, i1 true)
+  br label %cond.end
+
+cond.end:
+  %ctz = phi i256 [ 256, %0 ], [ %d, %cond.false ]
+  ret i256 %ctz
+}
+
diff --git a/test/CodeGen/X86/section_mergeable_size.ll b/test/CodeGen/X86/section_mergeable_size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..73b70c47f036c35326991f73c3a2ee77f0c8bb98
--- /dev/null
+++ b/test/CodeGen/X86/section_mergeable_size.ll
@@ -0,0 +1,3 @@
+; RUN: llc -mtriple x86_64-linux-gnu < %s | FileCheck %s
+@a = internal unnamed_addr constant [1 x [1 x i32]] zeroinitializer, section ".init.rodata", align 4
+; CHECK: .init.rodata,"aM",{{[@%]}}progbits,4
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index e34ba5412f07a8bb63fd8ce5a84230de59de3bb2..bce51d6bc2373812289879cb3f6993e6cdfca595 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -25,12 +25,7 @@ false:
 ; X32-LABEL:      test_basic:
 
 ; X32:      cmpl %gs:48, %esp
-; X32-NEXT: ja      .LBB0_2
-
-; X32:      pushl $4
-; X32-NEXT: pushl $12
-; X32-NEXT: calll __morestack
-; X32-NEXT: ret
+; X32-NEXT: jbe	.LBB0_1
 
 ; X32:      movl %esp, %eax
 ; X32:      subl %ecx, %eax
@@ -43,15 +38,15 @@ false:
 ; X32-NEXT: calll __morestack_allocate_stack_space
 ; X32-NEXT: addl $16, %esp
 
+; X32:      pushl $4
+; X32-NEXT: pushl $12
+; X32-NEXT: calll __morestack
+; X32-NEXT: ret
+
 ; X64-LABEL:      test_basic:
 
 ; X64:      cmpq %fs:112, %rsp
-; X64-NEXT: ja      .LBB0_2
-
-; X64:      movabsq $24, %r10
-; X64-NEXT: movabsq $0, %r11
-; X64-NEXT: callq __morestack
-; X64-NEXT: ret
+; X64-NEXT: jbe      .LBB0_1
 
 ; X64:      movq %rsp, %[[RDI:rdi|rax]]
 ; X64:      subq %{{.*}}, %[[RDI]]
@@ -63,15 +58,15 @@ false:
 ; X64-NEXT: callq __morestack_allocate_stack_space
 ; X64:      movq %rax, %rdi
 
+; X64:      movabsq $24, %r10
+; X64-NEXT: movabsq $0, %r11
+; X64-NEXT: callq __morestack
+; X64-NEXT: ret
+
 ; X32ABI-LABEL:      test_basic:
 
 ; X32ABI:      cmpl %fs:64, %esp
-; X32ABI-NEXT: ja      .LBB0_2
-
-; X32ABI:      movl $24, %r10d
-; X32ABI-NEXT: movl $0, %r11d
-; X32ABI-NEXT: callq __morestack
-; X32ABI-NEXT: ret
+; X32ABI-NEXT: jbe      .LBB0_1
 
 ; X32ABI:      movl %esp, %[[EDI:edi|eax]]
 ; X32ABI:      subl %{{.*}}, %[[EDI]]
@@ -83,6 +78,11 @@ false:
 ; X32ABI-NEXT: callq __morestack_allocate_stack_space
 ; X32ABI:      movl %eax, %edi
 
+; X32ABI:      movl $24, %r10d
+; X32ABI-NEXT: movl $0, %r11d
+; X32ABI-NEXT: callq __morestack
+; X32ABI-NEXT: ret
+
 }
 
 attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 588262bbb39526c6700548287464319f275d6735..fac9a33394bcd0d1e763f1476d2284778b59e93f 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -41,7 +41,7 @@ define void @test_basic() #0 {
 ; X32-Linux-LABEL:       test_basic:
 
 ; X32-Linux:       cmpl %gs:48, %esp
-; X32-Linux-NEXT:  ja      .LBB0_2
+; X32-Linux-NEXT:  jbe	.LBB0_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $44
@@ -51,7 +51,7 @@ define void @test_basic() #0 {
 ; X64-Linux-LABEL:       test_basic:
 
 ; X64-Linux:       cmpq %fs:112, %rsp
-; X64-Linux-NEXT:  ja      .LBB0_2
+; X64-Linux-NEXT:  jbe	.LBB0_1
 
 ; X64-Linux:       movabsq $40, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -61,7 +61,7 @@ define void @test_basic() #0 {
 ; X64-Linux-Large-LABEL:       test_basic:
 
 ; X64-Linux-Large:       cmpq %fs:112, %rsp
-; X64-Linux-Large-NEXT:  ja      .LBB0_2
+; X64-Linux-Large-NEXT:  jbe	.LBB0_1
 
 ; X64-Linux-Large:       movabsq $40, %r10
 ; X64-Linux-Large-NEXT:  movabsq $0, %r11
@@ -71,7 +71,7 @@ define void @test_basic() #0 {
 ; X32ABI-LABEL:       test_basic:
 
 ; X32ABI:       cmpl %fs:64, %esp
-; X32ABI-NEXT:  ja      .LBB0_2
+; X32ABI-NEXT:  jbe	.LBB0_1
 
 ; X32ABI:       movl $40, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -82,7 +82,7 @@ define void @test_basic() #0 {
 
 ; X32-Darwin:      movl $432, %ecx
 ; X32-Darwin-NEXT: cmpl %gs:(%ecx), %esp
-; X32-Darwin-NEXT: ja      LBB0_2
+; X32-Darwin-NEXT: jbe	LBB0_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $60
@@ -92,7 +92,7 @@ define void @test_basic() #0 {
 ; X64-Darwin-LABEL:      test_basic:
 
 ; X64-Darwin:      cmpq %gs:816, %rsp
-; X64-Darwin-NEXT: ja      LBB0_2
+; X64-Darwin-NEXT: jbe	LBB0_1
 
 ; X64-Darwin:      movabsq $40, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -102,7 +102,7 @@ define void @test_basic() #0 {
 ; X32-MinGW-LABEL:       test_basic:
 
 ; X32-MinGW:       cmpl %fs:20, %esp
-; X32-MinGW-NEXT:  ja      LBB0_2
+; X32-MinGW-NEXT:  jbe      LBB0_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40
@@ -112,7 +112,7 @@ define void @test_basic() #0 {
 ; X64-MinGW-LABEL:       test_basic:
 
 ; X64-MinGW:       cmpq %gs:40, %rsp
-; X64-MinGW-NEXT:  ja      .LBB0_2
+; X64-MinGW-NEXT:  jbe      .LBB0_1
 
 ; X64-MinGW:       movabsq $72, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -122,7 +122,7 @@ define void @test_basic() #0 {
 ; X64-FreeBSD-LABEL:       test_basic:
 
 ; X64-FreeBSD:       cmpq %fs:24, %rsp
-; X64-FreeBSD-NEXT:  ja      .LBB0_2
+; X64-FreeBSD-NEXT:  jbe      .LBB0_1
 
 ; X64-FreeBSD:       movabsq $40, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -132,7 +132,7 @@ define void @test_basic() #0 {
 ; X32-DFlyBSD-LABEL:       test_basic:
 
 ; X32-DFlyBSD:       cmpl %fs:16, %esp
-; X32-DFlyBSD-NEXT:  ja      .LBB0_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB0_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40
@@ -142,7 +142,7 @@ define void @test_basic() #0 {
 ; X64-DFlyBSD-LABEL:       test_basic:
 
 ; X64-DFlyBSD:       cmpq %fs:32, %rsp
-; X64-DFlyBSD-NEXT:  ja      .LBB0_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB0_1
 
 ; X64-DFlyBSD:       movabsq $40, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -159,7 +159,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
        ret i32 %result
 
 ; X32-Linux:       cmpl %gs:48, %esp
-; X32-Linux-NEXT:  ja      .LBB1_2
+; X32-Linux-NEXT:  jbe	.LBB1_1
 
 ; X32-Linux:       pushl $4
 ; X32-Linux-NEXT:  pushl $44
@@ -167,7 +167,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X32-Linux-NEXT:  ret
 
 ; X64-Linux:       cmpq %fs:112, %rsp
-; X64-Linux-NEXT:  ja      .LBB1_2
+; X64-Linux-NEXT:  jbe	.LBB1_1
 
 ; X64-Linux:       movq %r10, %rax
 ; X64-Linux-NEXT:  movabsq $56, %r10
@@ -177,7 +177,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-Linux-NEXT:  movq %rax, %r10
 
 ; X32ABI:       cmpl %fs:64, %esp
-; X32ABI-NEXT:  ja      .LBB1_2
+; X32ABI-NEXT:  jbe	.LBB1_1
 
 ; X32ABI:       movl %r10d, %eax
 ; X32ABI-NEXT:  movl $56, %r10d
@@ -188,7 +188,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 
 ; X32-Darwin:      movl $432, %edx
 ; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp
-; X32-Darwin-NEXT: ja      LBB1_2
+; X32-Darwin-NEXT: jbe	LBB1_1
 
 ; X32-Darwin:      pushl $4
 ; X32-Darwin-NEXT: pushl $60
@@ -196,7 +196,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X32-Darwin-NEXT: ret
 
 ; X64-Darwin:      cmpq %gs:816, %rsp
-; X64-Darwin-NEXT: ja      LBB1_2
+; X64-Darwin-NEXT: jbe	LBB1_1
 
 ; X64-Darwin:      movq %r10, %rax
 ; X64-Darwin-NEXT: movabsq $56, %r10
@@ -206,7 +206,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-Darwin-NEXT: movq %rax, %r10
 
 ; X32-MinGW:       cmpl %fs:20, %esp
-; X32-MinGW-NEXT:  ja      LBB1_2
+; X32-MinGW-NEXT:  jbe      LBB1_1
 
 ; X32-MinGW:       pushl $4
 ; X32-MinGW-NEXT:  pushl $44
@@ -215,7 +215,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 
 ; X64-MinGW-LABEL: test_nested:
 ; X64-MinGW:       cmpq %gs:40, %rsp
-; X64-MinGW-NEXT:  ja      .LBB1_2
+; X64-MinGW-NEXT:  jbe      .LBB1_1
 
 ; X64-MinGW:       movq %r10, %rax
 ; X64-MinGW-NEXT:  movabsq $88, %r10
@@ -225,7 +225,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-MinGW-NEXT:  movq %rax, %r10
 
 ; X64-FreeBSD:       cmpq %fs:24, %rsp
-; X64-FreeBSD-NEXT:  ja      .LBB1_2
+; X64-FreeBSD-NEXT:  jbe      .LBB1_1
 
 ; X64-FreeBSD:       movq %r10, %rax
 ; X64-FreeBSD-NEXT:  movabsq $56, %r10
@@ -235,7 +235,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X64-FreeBSD-NEXT:  movq %rax, %r10
 
 ; X32-DFlyBSD:       cmpl %fs:16, %esp
-; X32-DFlyBSD-NEXT:  ja      .LBB1_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB1_1
 
 ; X32-DFlyBSD:       pushl $4
 ; X32-DFlyBSD-NEXT:  pushl $44
@@ -243,7 +243,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 ; X32-DFlyBSD-NEXT:  ret
 
 ; X64-DFlyBSD:       cmpq %fs:32, %rsp
-; X64-DFlyBSD-NEXT:  ja      .LBB1_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB1_1
 
 ; X64-DFlyBSD:       movq %r10, %rax
 ; X64-DFlyBSD-NEXT:  movabsq $56, %r10
@@ -256,12 +256,14 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
 
 define void @test_large() #0 {
         %mem = alloca i32, i32 10000
-        call void @dummy_use (i32* %mem, i32 0)
+        call void @dummy_use (i32* %mem, i32 3)
         ret void
 
+; X32-Linux-LABEL:       test_large:
+
 ; X32-Linux:       leal -40012(%esp), %ecx
 ; X32-Linux-NEXT:  cmpl %gs:48, %ecx
-; X32-Linux-NEXT:  ja      .LBB2_2
+; X32-Linux-NEXT:  jbe	.LBB2_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $40012
@@ -270,7 +272,7 @@ define void @test_large() #0 {
 
 ; X64-Linux:       leaq -40008(%rsp), %r11
 ; X64-Linux-NEXT:  cmpq %fs:112, %r11
-; X64-Linux-NEXT:  ja      .LBB2_2
+; X64-Linux-NEXT:  jbe	.LBB2_1
 
 ; X64-Linux:       movabsq $40008, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -279,7 +281,7 @@ define void @test_large() #0 {
 
 ; X32ABI:       leal -40008(%rsp), %r11d
 ; X32ABI-NEXT:  cmpl %fs:64, %r11d
-; X32ABI-NEXT:  ja      .LBB2_2
+; X32ABI-NEXT:  jbe	.LBB2_1
 
 ; X32ABI:       movl $40008, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -289,7 +291,7 @@ define void @test_large() #0 {
 ; X32-Darwin:      leal -40012(%esp), %ecx
 ; X32-Darwin-NEXT: movl $432, %eax
 ; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx
-; X32-Darwin-NEXT: ja      LBB2_2
+; X32-Darwin-NEXT: jbe	LBB2_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $40012
@@ -298,7 +300,7 @@ define void @test_large() #0 {
 
 ; X64-Darwin:      leaq -40008(%rsp), %r11
 ; X64-Darwin-NEXT: cmpq %gs:816, %r11
-; X64-Darwin-NEXT: ja      LBB2_2
+; X64-Darwin-NEXT: jbe      LBB2_1
 
 ; X64-Darwin:      movabsq $40008, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -307,7 +309,7 @@ define void @test_large() #0 {
 
 ; X32-MinGW:       leal -40000(%esp), %ecx
 ; X32-MinGW-NEXT:  cmpl %fs:20, %ecx
-; X32-MinGW-NEXT:  ja      LBB2_2
+; X32-MinGW-NEXT:  jbe      LBB2_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40000
@@ -317,7 +319,7 @@ define void @test_large() #0 {
 ; X64-MinGW-LABEL: test_large:
 ; X64-MinGW:       leaq -40040(%rsp), %r11
 ; X64-MinGW-NEXT:  cmpq %gs:40, %r11
-; X64-MinGW-NEXT:  ja      .LBB2_2
+; X64-MinGW-NEXT:  jbe      .LBB2_1
 
 ; X64-MinGW:       movabsq $40040, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -326,7 +328,7 @@ define void @test_large() #0 {
 
 ; X64-FreeBSD:       leaq -40008(%rsp), %r11
 ; X64-FreeBSD-NEXT:  cmpq %fs:24, %r11
-; X64-FreeBSD-NEXT:  ja      .LBB2_2
+; X64-FreeBSD-NEXT:  jbe      .LBB2_1
 
 ; X64-FreeBSD:       movabsq $40008, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -335,7 +337,7 @@ define void @test_large() #0 {
 
 ; X32-DFlyBSD:       leal -40000(%esp), %ecx
 ; X32-DFlyBSD-NEXT:  cmpl %fs:16, %ecx
-; X32-DFlyBSD-NEXT:  ja      .LBB2_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB2_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40000
@@ -344,7 +346,7 @@ define void @test_large() #0 {
 
 ; X64-DFlyBSD:       leaq -40008(%rsp), %r11
 ; X64-DFlyBSD-NEXT:  cmpq %fs:32, %r11
-; X64-DFlyBSD-NEXT:  ja      .LBB2_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB2_1
 
 ; X64-DFlyBSD:       movabsq $40008, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -361,7 +363,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32-Linux-LABEL:       test_fastcc:
 
 ; X32-Linux:       cmpl %gs:48, %esp
-; X32-Linux-NEXT:  ja      .LBB3_2
+; X32-Linux-NEXT:  jbe	.LBB3_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $44
@@ -371,7 +373,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-Linux-LABEL:       test_fastcc:
 
 ; X64-Linux:       cmpq %fs:112, %rsp
-; X64-Linux-NEXT:  ja      .LBB3_2
+; X64-Linux-NEXT:  jbe	.LBB3_1
 
 ; X64-Linux:       movabsq $40, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -381,7 +383,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32ABI-LABEL:       test_fastcc:
 
 ; X32ABI:       cmpl %fs:64, %esp
-; X32ABI-NEXT:  ja      .LBB3_2
+; X32ABI-NEXT:  jbe	.LBB3_1
 
 ; X32ABI:       movl $40, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -392,7 +394,7 @@ define fastcc void @test_fastcc() #0 {
 
 ; X32-Darwin:      movl $432, %eax
 ; X32-Darwin-NEXT: cmpl %gs:(%eax), %esp
-; X32-Darwin-NEXT: ja      LBB3_2
+; X32-Darwin-NEXT: jbe	LBB3_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $60
@@ -402,7 +404,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-Darwin-LABEL:      test_fastcc:
 
 ; X64-Darwin:      cmpq %gs:816, %rsp
-; X64-Darwin-NEXT: ja      LBB3_2
+; X64-Darwin-NEXT: jbe	LBB3_1
 
 ; X64-Darwin:      movabsq $40, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -412,7 +414,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32-MinGW-LABEL:       test_fastcc:
 
 ; X32-MinGW:       cmpl %fs:20, %esp
-; X32-MinGW-NEXT:  ja      LBB3_2
+; X32-MinGW-NEXT:  jbe      LBB3_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40
@@ -422,7 +424,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-MinGW-LABEL:       test_fastcc:
 
 ; X64-MinGW:       cmpq %gs:40, %rsp
-; X64-MinGW-NEXT:  ja      .LBB3_2
+; X64-MinGW-NEXT:  jbe      .LBB3_1
 
 ; X64-MinGW:       movabsq $72, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -432,7 +434,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-FreeBSD-LABEL:       test_fastcc:
 
 ; X64-FreeBSD:       cmpq %fs:24, %rsp
-; X64-FreeBSD-NEXT:  ja      .LBB3_2
+; X64-FreeBSD-NEXT:  jbe    .LBB3_1
 
 ; X64-FreeBSD:       movabsq $40, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -442,7 +444,7 @@ define fastcc void @test_fastcc() #0 {
 ; X32-DFlyBSD-LABEL:       test_fastcc:
 
 ; X32-DFlyBSD:       cmpl %fs:16, %esp
-; X32-DFlyBSD-NEXT:  ja      .LBB3_2
+; X32-DFlyBSD-NEXT:  jbe     .LBB3_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40
@@ -452,7 +454,7 @@ define fastcc void @test_fastcc() #0 {
 ; X64-DFlyBSD-LABEL:       test_fastcc:
 
 ; X64-DFlyBSD:       cmpq %fs:32, %rsp
-; X64-DFlyBSD-NEXT:  ja      .LBB3_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB3_1
 
 ; X64-DFlyBSD:       movabsq $40, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -463,14 +465,14 @@ define fastcc void @test_fastcc() #0 {
 
 define fastcc void @test_fastcc_large() #0 {
         %mem = alloca i32, i32 10000
-        call void @dummy_use (i32* %mem, i32 0)
+        call void @dummy_use (i32* %mem, i32 3)
         ret void
 
 ; X32-Linux-LABEL:       test_fastcc_large:
 
 ; X32-Linux:       leal -40012(%esp), %eax
 ; X32-Linux-NEXT:  cmpl %gs:48, %eax
-; X32-Linux-NEXT:  ja      .LBB4_2
+; X32-Linux-NEXT:  jbe	.LBB4_1
 
 ; X32-Linux:       pushl $0
 ; X32-Linux-NEXT:  pushl $40012
@@ -481,7 +483,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-Linux:       leaq -40008(%rsp), %r11
 ; X64-Linux-NEXT:  cmpq %fs:112, %r11
-; X64-Linux-NEXT:  ja      .LBB4_2
+; X64-Linux-NEXT:  jbe	.LBB4_1
 
 ; X64-Linux:       movabsq $40008, %r10
 ; X64-Linux-NEXT:  movabsq $0, %r11
@@ -492,7 +494,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X32ABI:       leal -40008(%rsp), %r11d
 ; X32ABI-NEXT:  cmpl %fs:64, %r11d
-; X32ABI-NEXT:  ja      .LBB4_2
+; X32ABI-NEXT:  jbe	.LBB4_1
 
 ; X32ABI:       movl $40008, %r10d
 ; X32ABI-NEXT:  movl $0, %r11d
@@ -504,7 +506,7 @@ define fastcc void @test_fastcc_large() #0 {
 ; X32-Darwin:      leal -40012(%esp), %eax
 ; X32-Darwin-NEXT: movl $432, %ecx
 ; X32-Darwin-NEXT: cmpl %gs:(%ecx), %eax
-; X32-Darwin-NEXT: ja      LBB4_2
+; X32-Darwin-NEXT: jbe	LBB4_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $40012
@@ -515,7 +517,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-Darwin:      leaq -40008(%rsp), %r11
 ; X64-Darwin-NEXT: cmpq %gs:816, %r11
-; X64-Darwin-NEXT: ja      LBB4_2
+; X64-Darwin-NEXT: jbe	LBB4_1
 
 ; X64-Darwin:      movabsq $40008, %r10
 ; X64-Darwin-NEXT: movabsq $0, %r11
@@ -526,7 +528,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X32-MinGW:       leal -40000(%esp), %eax
 ; X32-MinGW-NEXT:  cmpl %fs:20, %eax
-; X32-MinGW-NEXT:  ja      LBB4_2
+; X32-MinGW-NEXT:  jbe      LBB4_1
 
 ; X32-MinGW:       pushl $0
 ; X32-MinGW-NEXT:  pushl $40000
@@ -537,7 +539,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-MinGW:       leaq -40040(%rsp), %r11
 ; X64-MinGW-NEXT:  cmpq %gs:40, %r11
-; X64-MinGW-NEXT:  ja      .LBB4_2
+; X64-MinGW-NEXT:  jbe      .LBB4_1
 
 ; X64-MinGW:       movabsq $40040, %r10
 ; X64-MinGW-NEXT:  movabsq $32, %r11
@@ -548,7 +550,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-FreeBSD:       leaq -40008(%rsp), %r11
 ; X64-FreeBSD-NEXT:  cmpq %fs:24, %r11
-; X64-FreeBSD-NEXT:  ja      .LBB4_2
+; X64-FreeBSD-NEXT:  jbe     .LBB4_1
 
 ; X64-FreeBSD:       movabsq $40008, %r10
 ; X64-FreeBSD-NEXT:  movabsq $0, %r11
@@ -559,7 +561,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X32-DFlyBSD:       leal -40000(%esp), %eax
 ; X32-DFlyBSD-NEXT:  cmpl %fs:16, %eax
-; X32-DFlyBSD-NEXT:  ja      .LBB4_2
+; X32-DFlyBSD-NEXT:  jbe      .LBB4_1
 
 ; X32-DFlyBSD:       pushl $0
 ; X32-DFlyBSD-NEXT:  pushl $40000
@@ -570,7 +572,7 @@ define fastcc void @test_fastcc_large() #0 {
 
 ; X64-DFlyBSD:       leaq -40008(%rsp), %r11
 ; X64-DFlyBSD-NEXT:  cmpq %fs:32, %r11
-; X64-DFlyBSD-NEXT:  ja      .LBB4_2
+; X64-DFlyBSD-NEXT:  jbe      .LBB4_1
 
 ; X64-DFlyBSD:       movabsq $40008, %r10
 ; X64-DFlyBSD-NEXT:  movabsq $0, %r11
@@ -593,7 +595,7 @@ define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) #0 {
 ; X32-Darwin-NEXT: movl $432, %ecx
 ; X32-Darwin-NEXT: cmpl %gs:(%ecx), %eax
 ; X32-Darwin-NEXT: popl %ecx
-; X32-Darwin-NEXT: ja      LBB5_2
+; X32-Darwin-NEXT: jbe	LBB5_1
 
 ; X32-Darwin:      pushl $0
 ; X32-Darwin-NEXT: pushl $40012
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index dcd57d2f537d0398a77dad865a7ed9d760bb0334..68c83391e60d26466eacdd48abc6493679c91a38 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -468,10 +468,10 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
 ; CHECK-NEXT:    testb $1, %dil
 ; CHECK-NEXT:    jne .LBB37_1
 ; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [118.83,34.539999999999999]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.1883E+2,3.4539999999999999E+1]
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB37_1:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [-20.399999999999999,37.68]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [-2.0399999999999999E+1,3.768E+1]
 ; CHECK-NEXT:    retq
   %sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0>
   %bo = fmul <2 x double> %sel, <double 5.1, double 3.14>
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index ce057b28cc948415efb26767e19f6d2a5cc74fd5..100461d22c995ffccb3229071ae7a8c888b1a9b8 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -45,19 +45,17 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
 ; AVX-LABEL: pr26232:
 ; AVX:       # %bb.0: # %allocas
 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  .LBB1_1: # %for_loop599
 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT:    xorl %eax, %eax
 ; AVX-NEXT:    cmpq $65536, %rdi # imm = 0x10000
 ; AVX-NEXT:    setl %al
-; AVX-NEXT:    vmovd %eax, %xmm3
-; AVX-NEXT:    vpshufb %xmm1, %xmm3, %xmm3
-; AVX-NEXT:    vpand %xmm0, %xmm3, %xmm3
-; AVX-NEXT:    vpsllw $7, %xmm3, %xmm3
-; AVX-NEXT:    vpand %xmm2, %xmm3, %xmm3
-; AVX-NEXT:    vpmovmskb %xmm3, %eax
+; AVX-NEXT:    vmovd %eax, %xmm2
+; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX-NEXT:    vpand %xmm0, %xmm2, %xmm2
+; AVX-NEXT:    vpsllw $7, %xmm2, %xmm2
+; AVX-NEXT:    vpmovmskb %xmm2, %eax
 ; AVX-NEXT:    testw %ax, %ax
 ; AVX-NEXT:    jne .LBB1_1
 ; AVX-NEXT:  # %bb.2: # %for_exit600
diff --git a/test/CodeGen/X86/shift-i256.ll b/test/CodeGen/X86/shift-i256.ll
index 4fa3303baf0487e198cc228bff8b277fcb381d4e..9947d45649dc86f5ef6ca72f8567c3293ab2444f 100644
--- a/test/CodeGen/X86/shift-i256.ll
+++ b/test/CodeGen/X86/shift-i256.ll
@@ -15,7 +15,7 @@ define i256 @shift2(i256 %c) nounwind
 {
   %b = shl i256 1, %c  ; %c must not be a constant
   ; Special case when %c is 0:
-  ; CHECK-X64: testb [[REG:%r[0-9]+b]], [[REG]]
+  ; CHECK-X64: testb [[REG:%(bpl|r[0-9]+b)]], {{%(bpl|r[0-9]+b)}}
   ; CHECK-X64: cmoveq
   ret i256 %b
 }
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 018aee6ad0646dd9f9821178f4c0814e7b86deae..f190a4174197ba24fc4a444231ac54aeaea3e240 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -14,12 +14,10 @@
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -35,14 +33,11 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -53,7 +48,6 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8:
@@ -103,12 +97,10 @@ entry:
 ; %op2 = zext<4 x i32> %val2
 ; %rst = mul <4 x i32> %op1, %op2
 ;
-define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_4xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -123,14 +115,11 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm2
 ; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_4xi8:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -140,7 +129,6 @@ define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_4xi8:
@@ -188,12 +176,10 @@ entry:
 ; %op2 = zext<8 x i32> %val2
 ; %rst = mul <8 x i32> %op1, %op2
 ;
-define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_8xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -210,14 +196,11 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_8xi8:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -231,15 +214,12 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_8xi8:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -249,7 +229,6 @@ define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -315,12 +294,10 @@ entry:
 ; %op2 = zext<16 x i32> %val2
 ; %rst = mul <16 x i32> %op1, %op2
 ;
-define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi8:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -347,14 +324,11 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi8:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -376,15 +350,12 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi8:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -398,7 +369,6 @@ define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -486,12 +456,10 @@ entry:
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -504,14 +472,11 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -523,7 +488,6 @@ define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16:
@@ -571,12 +535,10 @@ entry:
 ; %op2 = zext<4 x i32> %val2
 ; %rst = mul <4 x i32> %op1, %op2
 ;
-define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_4xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -589,14 +551,11 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_4xi16:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -606,7 +565,6 @@ define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_4xi16:
@@ -652,12 +610,10 @@ entry:
 ; %op2 = zext<8 x i32> %val2
 ; %rst = mul <8 x i32> %op1, %op2
 ;
-define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_8xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -673,14 +629,11 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_8xi16:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -694,15 +647,12 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_8xi16:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -712,7 +662,6 @@ define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -777,12 +726,10 @@ entry:
 ; %op2 = zext<16 x i32> %val2
 ; %rst = mul <16 x i32> %op1, %op2
 ;
-define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi16:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -808,14 +755,11 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -837,15 +781,12 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi16:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -859,7 +800,6 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -946,12 +886,10 @@ entry:
 ; %op2 = sext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -969,14 +907,11 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-SSE-NEXT:    psrad $16, %xmm0
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_sext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -987,7 +922,6 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_sext:
@@ -1039,12 +973,10 @@ entry:
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1063,14 +995,11 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1081,7 +1010,6 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
@@ -1134,12 +1062,10 @@ entry:
 ; %op2 = sext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1152,14 +1078,11 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_sext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1170,7 +1093,6 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_sext:
@@ -1217,12 +1139,10 @@ entry:
 ; %op2 = zext<2 x i32> %val2
 ; %rst = mul <2 x i32> %op1, %op2
 ;
-define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1242,14 +1162,11 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
 ; X86-AVX:       # %bb.0: # %entry
 ; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1262,7 +1179,6 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
@@ -1318,12 +1234,10 @@ entry:
 ; %op2 = sext<16 x i32> %val2
 ; %rst = mul <16 x i32> %op1, %op2
 ;
-define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
+define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
 ; X86-SSE-LABEL: mul_16xi16_sext:
 ; X86-SSE:       # %bb.0: # %entry
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1349,14 +1263,11 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: mul_16xi16_sext:
 ; X86-AVX1:       # %bb.0: # %entry
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1378,15 +1289,12 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: mul_16xi16_sext:
 ; X86-AVX2:       # %bb.0: # %entry
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX2-NEXT:    .cfi_offset %esi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1400,7 +1308,6 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %
 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -2204,12 +2111,10 @@ entry:
 ; Illegal Types
 ;
 
-define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
+define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
 ; X86-SSE-LABEL: PR34947:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    .cfi_offset %esi, -8
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT:    movdqa (%eax), %xmm5
@@ -2303,31 +2208,21 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
 ; X86-SSE-NEXT:    movdqa %xmm4, (%eax)
 ; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: PR34947:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    pushl %ebp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX1-NEXT:    pushl %ebx
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
 ; X86-AVX1-NEXT:    pushl %edi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
-; X86-AVX1-NEXT:    subl $16, %esp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 36
-; X86-AVX1-NEXT:    .cfi_offset %esi, -20
-; X86-AVX1-NEXT:    .cfi_offset %edi, -16
-; X86-AVX1-NEXT:    .cfi_offset %ebx, -12
-; X86-AVX1-NEXT:    .cfi_offset %ebp, -8
+; X86-AVX1-NEXT:    subl $8, %esp
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm2
 ; X86-AVX1-NEXT:    vmovdqa (%ecx), %ymm1
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -2339,50 +2234,50 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-AVX1-NEXT:    movl %edx, %edi
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ecx
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-AVX1-NEXT:    movl %edx, %ebx
 ; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
 ; X86-AVX1-NEXT:    vmovd %xmm3, %eax
 ; X86-AVX1-NEXT:    xorl %edx, %edx
 ; X86-AVX1-NEXT:    divl %ecx
 ; X86-AVX1-NEXT:    movl %edx, %ebp
+; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
 ; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ebx
+; X86-AVX1-NEXT:    movl %edx, %ecx
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
 ; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
 ; X86-AVX1-NEXT:    divl %esi
 ; X86-AVX1-NEXT:    movl %edx, %esi
+; X86-AVX1-NEXT:    vmovd %ebp, %xmm2
 ; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; X86-AVX1-NEXT:    divl %edi
-; X86-AVX1-NEXT:    movl %edx, %edi
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
+; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %ebp
+; X86-AVX1-NEXT:    divl %ebp
+; X86-AVX1-NEXT:    movl %edx, %ebp
+; X86-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
 ; X86-AVX1-NEXT:    vmovd %xmm0, %eax
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    vmovd %edx, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    vpinsrd $2, %edi, %xmm2, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm1, %edi
+; X86-AVX1-NEXT:    vpinsrd $3, (%esp), %xmm0, %xmm0 # 4-byte Folded Reload
+; X86-AVX1-NEXT:    xorl %edx, %edx
+; X86-AVX1-NEXT:    divl %edi
+; X86-AVX1-NEXT:    vmovd %edx, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $1, %ebp, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
 ; X86-AVX1-NEXT:    vmovd {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 # 4-byte Folded Reload
 ; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
@@ -2390,31 +2285,22 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
 ; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmulld %xmm4, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
 ; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
-; X86-AVX1-NEXT:    addl $16, %esp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
+; X86-AVX1-NEXT:    addl $8, %esp
 ; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT:    popl %edi
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
 ; X86-AVX1-NEXT:    popl %ebx
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX1-NEXT:    popl %ebp
-; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX2-LABEL: PR34947:
 ; X86-AVX2:       # %bb.0:
 ; X86-AVX2-NEXT:    pushl %edi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 12
-; X86-AVX2-NEXT:    .cfi_offset %esi, -12
-; X86-AVX2-NEXT:    .cfi_offset %edi, -8
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    vmovdqa (%eax), %ymm2
@@ -2475,13 +2361,11 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X86-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X86-AVX2-NEXT:    vmovd %eax, %xmm2
-; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vmovd %xmm0, (%eax)
 ; X86-AVX2-NEXT:    vmovdqa %ymm1, (%eax)
 ; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
 ; X86-AVX2-NEXT:    popl %edi
-; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
 ; X86-AVX2-NEXT:    vzeroupper
 ; X86-AVX2-NEXT:    retl
 ;
@@ -2582,15 +2466,11 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-LABEL: PR34947:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    pushq %rbp
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X64-AVX1-NEXT:    pushq %rbx
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 24
-; X64-AVX1-NEXT:    .cfi_offset %rbx, -24
-; X64-AVX1-NEXT:    .cfi_offset %rbp, -16
 ; X64-AVX1-NEXT:    vmovdqa (%rdi), %ymm2
 ; X64-AVX1-NEXT:    vmovdqa (%rsi), %ymm1
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -2618,38 +2498,38 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %esi
+; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %edi
-; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ecx
 ; X64-AVX1-NEXT:    movl %edx, %ecx
-; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
 ; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
+; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebx
 ; X64-AVX1-NEXT:    movl %edx, %ebx
-; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
 ; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
 ; X64-AVX1-NEXT:    xorl %edx, %edx
 ; X64-AVX1-NEXT:    divl %ebp
-; X64-AVX1-NEXT:    vmovd %edx, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %esi, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovd %esi, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vmovd %edx, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm2, %xmm2
 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
 ; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX1-NEXT:    vmovd %eax, %xmm2
@@ -2657,9 +2537,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX1-NEXT:    vmovd %xmm1, (%rax)
 ; X64-AVX1-NEXT:    vmovaps %ymm0, (%rax)
 ; X64-AVX1-NEXT:    popq %rbx
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
 ; X64-AVX1-NEXT:    popq %rbp
-; X64-AVX1-NEXT:    .cfi_def_cfa_offset 8
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
@@ -2723,7 +2601,7 @@ define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) {
 ; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
 ; X64-AVX2-NEXT:    vmovd %eax, %xmm2
-; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vmovd %xmm0, (%rax)
 ; X64-AVX2-NEXT:    vmovdqa %ymm1, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/shrink_wrap_dbg_value.mir b/test/CodeGen/X86/shrink_wrap_dbg_value.mir
index 429ea72db8ec30e92b4f78d0cc900de55c39987a..6943033c565c1bea3068e37367c0b5d77fb29721 100644
--- a/test/CodeGen/X86/shrink_wrap_dbg_value.mir
+++ b/test/CodeGen/X86/shrink_wrap_dbg_value.mir
@@ -136,8 +136,8 @@ body:             |
     successors: %bb.4(0x40000000), %bb.1(0x40000000)
     liveins: $ecx, $edx
   
-    DBG_VALUE debug-use $edx, debug-use $noreg, !15, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use $ecx, debug-use $noreg, !16, !DIExpression(), debug-location !26
+    DBG_VALUE $edx, $noreg, !15, !DIExpression(), debug-location !25
+    DBG_VALUE $ecx, $noreg, !16, !DIExpression(), debug-location !26
     $eax = COPY $ecx
     DBG_VALUE %fixed-stack.0, 0, !16, !DIExpression(), debug-location !26
     DBG_VALUE %fixed-stack.1, 0, !15, !DIExpression(), debug-location !25
@@ -149,9 +149,9 @@ body:             |
     successors: %bb.2(0x80000000)
   
     $esi = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0)
-    DBG_VALUE debug-use $esi, debug-use $noreg, !13, !DIExpression(), debug-location !19
+    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !19
     $edi = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1)
-    DBG_VALUE debug-use $edi, debug-use $noreg, !14, !DIExpression(), debug-location !20
+    DBG_VALUE $edi, $noreg, !14, !DIExpression(), debug-location !20
     $edi = DEC32r killed $edi, implicit-def dead $eflags, debug-location !30
     $ebx = LEA32r %fixed-stack.1, 1, $noreg, 0, $noreg
   
diff --git a/test/CodeGen/X86/sibcall-2.ll b/test/CodeGen/X86/sibcall-2.ll
index 1b9d2db47c37143f39d3cd3eb9e783f432922349..6ed7b5a1505b0465e42ea9c90aaabdba2857dbcd 100644
--- a/test/CodeGen/X86/sibcall-2.ll
+++ b/test/CodeGen/X86/sibcall-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin   -disable-fp-elim | FileCheck %s -check-prefix=32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s -check-prefix=64
+; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin   -disable-fp-elim | FileCheck %s -check-prefix=32
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s -check-prefix=64
 
 ; Tail call should not use ebp / rbp after it's popped. Use esp / rsp.
 
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 784b10b3566ae54814204b834a1cd60f1d2ec1f5..2b4af2e5830d4866a2204a87b046c01e8cf00662 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux   -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mcpu=core2 -mattr=+sse2  | FileCheck %s --check-prefix=X32
+; RUN: llc -verify-machineinstrs < %s -mtriple=i686-linux   -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-linux -mcpu=core2 -mattr=+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-linux-gnux32 -mcpu=core2 -mattr=+sse2  | FileCheck %s --check-prefix=X32
 
 define void @t1(i32 %x) nounwind ssp {
 ; X86-LABEL: t1:
@@ -101,41 +101,62 @@ define void @t5(void ()* nocapture %x) nounwind ssp {
   ret void
 }
 
+; Basically the same test as t5, except pass the function pointer on the stack
+; for x86_64.
+
+define void @t5_x64(i32, i32, i32, i32, i32, i32, void ()* nocapture %x) nounwind ssp {
+; X86-LABEL: t5_x64:
+; X86:       # %bb.0:
+; X86-NEXT:    jmpl *{{[0-9]+}}(%esp) # TAILCALL
+;
+; X64-LABEL: t5_x64:
+; X64:       # %bb.0:
+; X64-NEXT:    jmpq *{{[0-9]+}}(%rsp) # TAILCALL
+;
+; X32-LABEL: t5_x64:
+; X32:       # %bb.0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    jmpq *%rax # TAILCALL
+  tail call void %x() nounwind
+  ret void
+}
+
+
 define i32 @t6(i32 %x) nounwind ssp {
 ; X86-LABEL: t6:
 ; X86:       # %bb.0:
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl $9, %eax
-; X86-NEXT:    jg .LBB5_2
+; X86-NEXT:    jg .LBB6_2
 ; X86-NEXT:  # %bb.1: # %bb
 ; X86-NEXT:    decl %eax
 ; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll t6
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB5_2: # %bb1
+; X86-NEXT:  .LBB6_2: # %bb1
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    jmp bar # TAILCALL
 ;
 ; X64-LABEL: t6:
 ; X64:       # %bb.0:
 ; X64-NEXT:    cmpl $9, %edi
-; X64-NEXT:    jg .LBB5_2
+; X64-NEXT:    jg .LBB6_2
 ; X64-NEXT:  # %bb.1: # %bb
 ; X64-NEXT:    decl %edi
 ; X64-NEXT:    jmp t6 # TAILCALL
-; X64-NEXT:  .LBB5_2: # %bb1
+; X64-NEXT:  .LBB6_2: # %bb1
 ; X64-NEXT:    jmp bar # TAILCALL
 ;
 ; X32-LABEL: t6:
 ; X32:       # %bb.0:
 ; X32-NEXT:    cmpl $9, %edi
-; X32-NEXT:    jg .LBB5_2
+; X32-NEXT:    jg .LBB6_2
 ; X32-NEXT:  # %bb.1: # %bb
 ; X32-NEXT:    decl %edi
 ; X32-NEXT:    jmp t6 # TAILCALL
-; X32-NEXT:  .LBB5_2: # %bb1
+; X32-NEXT:  .LBB6_2: # %bb1
 ; X32-NEXT:    jmp bar # TAILCALL
   %t0 = icmp slt i32 %x, 10
   br i1 %t0, label %bb, label %bb1
@@ -245,30 +266,30 @@ define i32 @t11(i32 %x, i32 %y, i32 %z.0, i32 %z.1, i32 %z.2) nounwind ssp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB10_1
+; X86-NEXT:    je .LBB11_1
 ; X86-NEXT:  # %bb.2: # %bb
 ; X86-NEXT:    jmp foo5 # TAILCALL
-; X86-NEXT:  .LBB10_1: # %bb6
+; X86-NEXT:  .LBB11_1: # %bb6
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t11:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB10_1
+; X64-NEXT:    je .LBB11_1
 ; X64-NEXT:  # %bb.2: # %bb
 ; X64-NEXT:    jmp foo5 # TAILCALL
-; X64-NEXT:  .LBB10_1: # %bb6
+; X64-NEXT:  .LBB11_1: # %bb6
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: t11:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    testl %edi, %edi
-; X32-NEXT:    je .LBB10_1
+; X32-NEXT:    je .LBB11_1
 ; X32-NEXT:  # %bb.2: # %bb
 ; X32-NEXT:    jmp foo5 # TAILCALL
-; X32-NEXT:  .LBB10_1: # %bb6
+; X32-NEXT:  .LBB11_1: # %bb6
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    retq
 entry:
@@ -292,30 +313,30 @@ define i32 @t12(i32 %x, i32 %y, %struct.t* byval align 4 %z) nounwind ssp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB11_1
+; X86-NEXT:    je .LBB12_1
 ; X86-NEXT:  # %bb.2: # %bb
 ; X86-NEXT:    jmp foo6 # TAILCALL
-; X86-NEXT:  .LBB11_1: # %bb2
+; X86-NEXT:  .LBB12_1: # %bb2
 ; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: t12:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB11_1
+; X64-NEXT:    je .LBB12_1
 ; X64-NEXT:  # %bb.2: # %bb
 ; X64-NEXT:    jmp foo6 # TAILCALL
-; X64-NEXT:  .LBB11_1: # %bb2
+; X64-NEXT:  .LBB12_1: # %bb2
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: t12:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    testl %edi, %edi
-; X32-NEXT:    je .LBB11_1
+; X32-NEXT:    je .LBB12_1
 ; X32-NEXT:  # %bb.2: # %bb
 ; X32-NEXT:    jmp foo6 # TAILCALL
-; X32-NEXT:  .LBB11_1: # %bb2
+; X32-NEXT:  .LBB12_1: # %bb2
 ; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/sjlj-eh.ll b/test/CodeGen/X86/sjlj-eh.ll
index 9a40b5932d497d34e49f42e45ab7faae35e868cb..8020e26234d9bef07aa721d7b3a18dd26c2260bf 100644
--- a/test/CodeGen/X86/sjlj-eh.ll
+++ b/test/CodeGen/X86/sjlj-eh.ll
@@ -1,6 +1,7 @@
-; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s
-; RUN: llc -mtriple x86_64-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-X64
-; RUN: llc -mtriple x86_64-linux -exception-model sjlj -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-X64-LINUX
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39439.
+; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s -verify-machineinstrs=0 | FileCheck %s
+; RUN: llc -mtriple x86_64-windows-gnu -exception-model sjlj -filetype asm -o - %s -verify-machineinstrs=0 | FileCheck %s -check-prefix CHECK-X64
+; RUN: llc -mtriple x86_64-linux -exception-model sjlj -filetype asm -o - %s -verify-machineinstrs=0 | FileCheck %s -check-prefix CHECK-X64-LINUX
 
 declare void @_Z20function_that_throwsv()
 declare i32 @__gxx_personality_sj0(...)
diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll
index c5c9a3d841628d949117acab6139eaf6f455ff85..12498123e3ac3ea29a12b2d44e6015f9944e547a 100644
--- a/test/CodeGen/X86/small-byval-memcpy.ll
+++ b/test/CodeGen/X86/small-byval-memcpy.ll
@@ -1,25 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)
 
 define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) {
+; CORE2-LABEL: copy16bytes:
+; CORE2:       ## %bb.0:
+; CORE2-NEXT:    movq (%rsi), %rax
+; CORE2-NEXT:    movq 8(%rsi), %rcx
+; CORE2-NEXT:    movq %rcx, 8(%rdi)
+; CORE2-NEXT:    movq %rax, (%rdi)
+; CORE2-NEXT:    retq
+;
+; NEHALEM-LABEL: copy16bytes:
+; NEHALEM:       ## %bb.0:
+; NEHALEM-NEXT:    movups (%rsi), %xmm0
+; NEHALEM-NEXT:    movups %xmm0, (%rdi)
+; NEHALEM-NEXT:    retq
+;
+; BDVER2-LABEL: copy16bytes:
+; BDVER2:       ## %bb.0:
+; BDVER2-NEXT:    movups (%rsi), %xmm0
+; BDVER2-NEXT:    movups %xmm0, (%rdi)
+; BDVER2-NEXT:    retq
+;
+; BTVER2-LABEL: copy16bytes:
+; BTVER2:       ## %bb.0:
+; BTVER2-NEXT:    vmovups (%rsi), %xmm0
+; BTVER2-NEXT:    vmovups %xmm0, (%rdi)
+; BTVER2-NEXT:    retq
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i1 false)
   ret void
 
   ; CHECK-LABEL: copy16bytes
-  ; CORE2: movq
-  ; CORE2-NEXT: movq
-  ; CORE2-NEXT: movq
-  ; CORE2-NEXT: movq
-  ; CORE2-NEXT: retq
 
-  ; NEHALEM: movups
-  ; NEHALEM-NEXT: movups
-  ; NEHALEM-NEXT: retq
 
-  ; BTVER2: movups
-  ; BTVER2-NEXT: movups
-  ; BTVER2-NEXT: retq
 }
diff --git a/test/CodeGen/X86/speculative-load-hardening-indirect.ll b/test/CodeGen/X86/speculative-load-hardening-indirect.ll
index 8761fcff5d92d4f595ba833a89733d76b19704fe..0d04a85d36790f78611a0ecd52fd2ec5087d5adc 100644
--- a/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ b/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -1,7 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic -data-sections | FileCheck %s --check-prefix=X64-PIC
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline | FileCheck %s --check-prefix=X64-RETPOLINE
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39451.
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic -data-sections -verify-machineinstrs=0 | FileCheck %s --check-prefix=X64-PIC
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -data-sections -mattr=+retpoline -verify-machineinstrs=0 | FileCheck %s --check-prefix=X64-RETPOLINE
 ;
 ; FIXME: Add support for 32-bit.
 
diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
index 5a98a00338bf163509ab7256b33519286041b1eb..7aae59080fd0e7d5951c825113b5d932a5c89f67 100644
--- a/test/CodeGen/X86/splat-for-size.ll
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -9,7 +9,8 @@
 define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
 ; CHECK-LABEL: splat_v2f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
+; CHECK-NEXT:    # xmm1 = mem[0,0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %add = fadd <2 x double> %x, <double 1.0, double 1.0>
@@ -19,7 +20,7 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
 define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 ; CHECK-LABEL: splat_v4f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastsd {{.*}}(%rip), %ymm1
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0>
@@ -29,7 +30,7 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
 define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: splat_v4f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -39,7 +40,7 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
 define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 ; CHECK-LABEL: splat_v8f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
@@ -51,13 +52,14 @@ define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
 define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
 ; AVX-LABEL: splat_v2i64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [2,2]
+; AVX-NEXT:    # xmm1 = mem[0,0]
 ; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: splat_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2,2]
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <2 x i64> %x, <i64 2, i64 2>
@@ -70,7 +72,8 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
 ; AVX-LABEL: splat_v4i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
+; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = [2,2]
+; AVX-NEXT:    # xmm2 = mem[0,0]
 ; AVX-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -78,7 +81,7 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2]
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <4 x i64> %x, <i64 2, i64 2, i64 2, i64 2>
@@ -89,13 +92,13 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
 define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
 ; AVX-LABEL: splat_v4i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: splat_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -107,7 +110,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
 ; AVX-LABEL: splat_v8i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; AVX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
 ; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -115,7 +118,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -131,7 +134,7 @@ define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
 ;
 ; AVX2-LABEL: splat_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <8 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -151,7 +154,7 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <16 x i16> %x, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -167,7 +170,7 @@ define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
 ;
 ; AVX2-LABEL: splat_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb {{.*}}(%rip), %xmm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %add = add <16 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -187,7 +190,7 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
 ;
 ; AVX2-LABEL: splat_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb {{.*}}(%rip), %ymm1
+; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
   %add = add <32 x i8> %x, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
@@ -201,6 +204,31 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
 @A = common global <3 x i64> zeroinitializer, align 32
 
 define <8 x i64> @pr23259() #1 {
+; AVX-LABEL: pr23259:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq $1
+; AVX-NEXT:    .cfi_adjust_cfa_offset 8
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    .cfi_adjust_cfa_offset -8
+; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX-NEXT:    vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
+; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX-NEXT:    retq
+;
+; AVX2-LABEL: pr23259:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vmovdqa {{.*}}(%rip), %ymm0
+; AVX2-NEXT:    pushq $1
+; AVX2-NEXT:    .cfi_adjust_cfa_offset 8
+; AVX2-NEXT:    popq %rax
+; AVX2-NEXT:    .cfi_adjust_cfa_offset -8
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,1,1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
+; AVX2-NEXT:    retq
 entry:
   %0 = load <4 x i64>, <4 x i64>* bitcast (<3 x i64>* @A to <4 x i64>*), align 32
   %1 = shufflevector <4 x i64> %0, <4 x i64> undef, <3 x i32> <i32 undef, i32 undef, i32 2>
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 78a0514b91dc5cf07d692b4efcb2688504e6ea12..6e0273d513fc6251cb065a0afb08871a34eb6496 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -178,7 +178,7 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    mulps %xmm2, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-0.5,-0.5,-0.5,-0.5]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT:    mulps %xmm1, %xmm3
 ; SSE-NEXT:    mulps %xmm2, %xmm1
 ; SSE-NEXT:    addps {{.*}}(%rip), %xmm1
@@ -208,9 +208,9 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3,-3,-3,-3]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.5,-0.5,-0.5,-0.5]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
@@ -282,21 +282,21 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ; SSE-LABEL: v4f32_no_estimate:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    sqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    divps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: v4f32_no_estimate:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1,1,1,1]
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX1-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: v4f32_no_estimate:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
@@ -331,9 +331,9 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3,-3,-3,-3]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-0.5,-0.5,-0.5,-0.5]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %xmm0, %xmm2, %xmm0
 ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
@@ -347,7 +347,7 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    sqrtps %xmm1, %xmm2
 ; SSE-NEXT:    sqrtps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    divps %xmm3, %xmm0
 ; SSE-NEXT:    divps %xmm2, %xmm1
@@ -356,14 +356,14 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; AVX1-LABEL: v8f32_no_estimate:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX1-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: v8f32_no_estimate:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
@@ -375,11 +375,11 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; SSE-LABEL: v8f32_estimate:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm3
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-0.5,-0.5,-0.5,-0.5]
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT:    movaps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm3, %xmm2
 ; SSE-NEXT:    mulps %xmm0, %xmm2
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [-3,-3,-3,-3]
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; SSE-NEXT:    addps %xmm0, %xmm2
 ; SSE-NEXT:    mulps %xmm4, %xmm2
 ; SSE-NEXT:    mulps %xmm3, %xmm2
@@ -408,9 +408,9 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3,-3,-3,-3,-3,-3,-3,-3]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX512-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
-; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm0
 ; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
@@ -426,7 +426,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; SSE-NEXT:    sqrtps %xmm2, %xmm5
 ; SSE-NEXT:    sqrtps %xmm1, %xmm2
 ; SSE-NEXT:    sqrtps %xmm0, %xmm1
-; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    divps %xmm1, %xmm0
 ; SSE-NEXT:    movaps %xmm3, %xmm1
@@ -440,7 +440,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
 ; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
@@ -448,7 +448,7 @@ define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
 ; AVX512-LABEL: v16f32_no_estimate:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
@@ -462,11 +462,11 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    movaps %xmm0, %xmm1
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm5
-; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-0.5,-0.5,-0.5,-0.5]
+; SSE-NEXT:    movaps {{.*#+}} xmm6 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm5, %xmm0
 ; SSE-NEXT:    mulps %xmm1, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm7 = [-3,-3,-3,-3]
+; SSE-NEXT:    movaps {{.*#+}} xmm7 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; SSE-NEXT:    addps %xmm7, %xmm0
 ; SSE-NEXT:    mulps %xmm6, %xmm0
 ; SSE-NEXT:    mulps %xmm5, %xmm0
@@ -498,10 +498,10 @@ define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
 ; AVX1-LABEL: v16f32_estimate:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX1-NEXT:    vmulps %ymm2, %ymm2, %ymm4
 ; AVX1-NEXT:    vmulps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [-3,-3,-3,-3,-3,-3,-3,-3]
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
 ; AVX1-NEXT:    vaddps %ymm4, %ymm0, %ymm0
 ; AVX1-NEXT:    vmulps %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vmulps %ymm0, %ymm2, %ymm0
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 90e31eb5fb3fe8f592b3db3fab029fd48335f4bf..2441a4cf40a313cd4321fa4e87389f8bbbd502b4 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -79,12 +79,15 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind
 ;
 ; AVX1-LABEL: test_mm_andnot_ps:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX512-LABEL: test_mm_andnot_ps:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <4 x float> %a0 to <4 x i32>
   %arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -1320,28 +1323,24 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
-; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-SSE-NEXT:    shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00]
+; X86-SSE-NEXT:    # xmm2 = xmm2[0,0],xmm1[0,0]
+; X86-SSE-NEXT:    shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24]
+; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],xmm2[2,0]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_loadh_pi:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX1-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadh_pi:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadh_pi:
@@ -1382,33 +1381,29 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
 ; X86-SSE-LABEL: test_mm_loadl_pi:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
-; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
+; X86-SSE-NEXT:    movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10]
 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
-; X86-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
+; X86-SSE-NEXT:    movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04]
+; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00]
+; X86-SSE-NEXT:    # xmm1 = xmm1[0,0],xmm2[0,0]
+; X86-SSE-NEXT:    shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2]
+; X86-SSE-NEXT:    # xmm1 = xmm1[2,0],xmm0[2,3]
 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_loadl_pi:
 ; X86-AVX1:       # %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadl_pi:
 ; X86-AVX512:       # %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
-; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadl_pi:
@@ -2060,16 +2055,16 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
 ; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
-; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
-; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X86-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
-; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
+; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
+; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
+; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
+; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_set_ps:
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index 662061d8c881b6d99f6213617471c0c591a18051..aca2ec8c5f59e2d10a01ee9414c596e78a4d3639 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -100,6 +102,18 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_addps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
@@ -208,6 +222,18 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addss (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_addss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [3:1.00]
@@ -320,6 +346,18 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_andps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andps %xmm1, %xmm0 # sched: [1:0.50]
@@ -436,6 +474,18 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andnotps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andnps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andnps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_andnotps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andnotps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andnps %xmm1, %xmm0 # sched: [1:0.50]
@@ -563,6 +613,20 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmpps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqps %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cmpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vorps %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmpps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqps %xmm0, %xmm1 # sched: [2:1.00]
@@ -679,6 +743,18 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmpss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqss (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cmpss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmpss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqss %xmm1, %xmm0 # sched: [2:1.00]
@@ -896,6 +972,34 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_comiss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    comiss %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    comiss (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_comiss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcomiss %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vcomiss (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_comiss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    comiss %xmm1, %xmm0 # sched: [3:1.00]
@@ -1051,6 +1155,20 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
 ; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2ss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2ssl (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssl %edi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2ss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2ss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2ssl (%rsi), %xmm0 # sched: [14:1.00]
@@ -1177,6 +1295,20 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
 ; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2ssq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2ssq %rdi, %xmm1 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2ssq (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2ssq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2ssq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2ssq (%rsi), %xmm0 # sched: [14:1.00]
@@ -1303,6 +1435,20 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtss2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtss2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtss2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtss2si (%rdi), %eax # sched: [12:1.00]
@@ -1432,6 +1578,20 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtss2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtss2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtss2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtss2si (%rdi), %rax # sched: [12:1.00]
@@ -1561,6 +1721,20 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttss2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvttss2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttss2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttss2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttss2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttss2si (%rdi), %eax # sched: [12:1.00]
@@ -1687,6 +1861,20 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttss2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvttss2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttss2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttss2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttss2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttss2si (%rdi), %rax # sched: [12:1.00]
@@ -1800,6 +1988,18 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [17:5.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divps %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divps (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_divps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divps %xmm1, %xmm0 # sched: [19:19.00]
@@ -1908,6 +2108,18 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [16:3.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divss %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divss (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_divss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divss %xmm1, %xmm0 # sched: [19:19.00]
@@ -2016,6 +2228,18 @@ define void @test_ldmxcsr(i32 %a0) {
 ; SKX-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ldmxcsr:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_ldmxcsr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ldmxcsr:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
@@ -2126,6 +2350,18 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxps %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_maxps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxps %xmm1, %xmm0 # sched: [2:1.00]
@@ -2235,6 +2471,18 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxss (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_maxss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxss %xmm1, %xmm0 # sched: [2:1.00]
@@ -2344,6 +2592,18 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minps %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minps (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_minps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minps %xmm1, %xmm0 # sched: [2:1.00]
@@ -2453,6 +2713,18 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minss %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minss (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_minss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minss %xmm1, %xmm0 # sched: [2:1.00]
@@ -2575,6 +2847,20 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movaps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movaps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movaps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps (%rdi), %xmm0 # sched: [5:1.00]
@@ -2682,6 +2968,16 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movhlps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movhlps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movhlps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
@@ -2712,8 +3008,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; GENERIC-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2723,16 +3018,14 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; ATOM-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; ATOM-NEXT:    addps %xmm1, %xmm2 # sched: [5:5.00]
 ; ATOM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
-; ATOM-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1] sched: [1:1.00]
-; ATOM-NEXT:    movlps %xmm2, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT:    movhps %xmm2, (%rdi) # sched: [1:1.00]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_movhps:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
 ; SLM-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; SLM-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SLM-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SLM-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2740,8 +3033,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SANDY-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SANDY-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2749,7 +3041,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SANDY-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2757,8 +3049,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; HASWELL-SSE:       # %bb.0:
 ; HASWELL-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; HASWELL-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; HASWELL-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2766,7 +3057,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2774,8 +3065,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BROADWELL-SSE:       # %bb.0:
 ; BROADWELL-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BROADWELL-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; BROADWELL-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2783,7 +3073,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BROADWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BROADWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2791,8 +3081,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKYLAKE-SSE:       # %bb.0:
 ; SKYLAKE-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKYLAKE-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [4:0.50]
-; SKYLAKE-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SKYLAKE-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2800,7 +3089,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKYLAKE-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2808,8 +3097,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX-SSE:       # %bb.0:
 ; SKX-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKX-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SKX-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2817,16 +3105,31 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movhps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movhps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BTVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50]
-; BTVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [2:1.00]
+; BTVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [2:1.00]
 ; BTVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2834,7 +3137,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [3:1.00]
+; BTVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
 ; BTVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2842,8 +3145,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; ZNVER1-SSE:       # %bb.0:
 ; ZNVER1-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
 ; ZNVER1-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50]
-; ZNVER1-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:0.50]
 ; ZNVER1-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
@@ -2851,7 +3153,7 @@ define <4 x float> @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
 ; ZNVER1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:3.00]
+; ZNVER1-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:0.50]
 ; ZNVER1-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = bitcast x86_mmx* %a2 to <2 x float>*
@@ -2945,6 +3247,18 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
 ; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movlhps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movlhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movlhps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
@@ -3079,6 +3393,22 @@ define <4 x float> @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2)
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movlps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movlps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovlps %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movlps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
@@ -3188,6 +3518,16 @@ define i32 @test_movmskps(<4 x float> %a0) {
 ; SKX-NEXT:    vmovmskps %xmm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movmskps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movmskps %xmm0, %eax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movmskps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskps %xmm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movmskps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movmskps %xmm0, %eax # sched: [3:1.00]
@@ -3284,6 +3624,16 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movntps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovntps %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movntps %xmm0, (%rdi) # sched: [3:1.00]
@@ -3399,6 +3749,20 @@ define void @test_movss_mem(float* %a0, float* %a1) {
 ; SKX-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movss_mem:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addss %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movss %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movss_mem:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovss %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movss_mem:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -3504,6 +3868,16 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
 ; SKX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movss_reg:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movss_reg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movss_reg:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
@@ -3619,6 +3993,20 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movups:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movups %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movups:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovups (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movups:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movups (%rdi), %xmm0 # sched: [5:1.00]
@@ -3731,6 +4119,18 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mulps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulps %xmm1, %xmm0 # sched: [2:1.00]
@@ -3839,6 +4239,18 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulss (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mulss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulss %xmm1, %xmm0 # sched: [2:1.00]
@@ -3951,6 +4363,18 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ; SKX-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_orps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    orps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_orps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_orps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    orps %xmm1, %xmm0 # sched: [1:0.50]
@@ -4115,6 +4539,26 @@ define void @test_prefetch(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_prefetch:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    #APP
+; BDVER2-SSE-NEXT:    prefetchnta (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    prefetcht0 (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    prefetcht1 (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    prefetcht2 (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    #NO_APP
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_prefetch:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    prefetchnta (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    prefetcht0 (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    prefetcht1 (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    prefetcht2 (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_prefetch:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    #APP
@@ -4252,6 +4696,20 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rcpps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    rcpps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    rcpps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_rcpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrcpps (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vrcpps %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rcpps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    rcpps %xmm0, %xmm1 # sched: [2:1.00]
@@ -4394,6 +4852,22 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rcpss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    rcpss %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    rcpss %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_rcpss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rcpss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -4529,6 +5003,20 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rsqrtps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    rsqrtps %xmm0, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    rsqrtps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_rsqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
+; BDVER2-NEXT:    vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rsqrtps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    rsqrtps %xmm0, %xmm1 # sched: [2:1.00]
@@ -4671,6 +5159,22 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_rsqrtss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    rsqrtss %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    rsqrtss %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_rsqrtss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_rsqrtss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -4784,6 +5288,16 @@ define void @test_sfence() {
 ; SKX-NEXT:    sfence # sched: [2:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sfence:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_sfence:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    sfence # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sfence:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    sfence # sched: [1:1.00]
@@ -4900,6 +5414,20 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_shufps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_shufps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [2:0.50]
+; BDVER2-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,3],mem[0,0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_shufps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
@@ -5027,6 +5555,20 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sqrtps %xmm0, %xmm1 # sched: [9:10.50]
+; BDVER2-SSE-NEXT:    sqrtps (%rdi), %xmm0 # sched: [14:10.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_sqrtps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [14:10.50]
+; BDVER2-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [9:10.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    sqrtps %xmm0, %xmm1 # sched: [21:21.00]
@@ -5169,6 +5711,22 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    sqrtss %xmm0, %xmm0 # sched: [9:10.50]
+; BDVER2-SSE-NEXT:    sqrtss %xmm1, %xmm1 # sched: [9:10.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_sqrtss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovaps (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:10.50]
+; BDVER2-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:10.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps (%rdi), %xmm1 # sched: [5:1.00]
@@ -5287,6 +5845,18 @@ define i32 @test_stmxcsr() {
 ; SKX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_stmxcsr:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_stmxcsr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; BDVER2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_stmxcsr:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
@@ -5397,6 +5967,18 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_subps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subps %xmm1, %xmm0 # sched: [3:1.00]
@@ -5505,6 +6087,18 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
 ; SKX-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subss (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_subss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subss %xmm1, %xmm0 # sched: [3:1.00]
@@ -5717,6 +6311,34 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ucomiss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    ucomiss %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ucomiss (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_ucomiss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vucomiss %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vucomiss (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ucomiss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    ucomiss %xmm1, %xmm0 # sched: [3:1.00]
@@ -5872,6 +6494,20 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpckhps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_unpckhps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpckhps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
@@ -5998,6 +6634,20 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpcklps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_unpcklps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpcklps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
@@ -6115,6 +6765,18 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
 ; SKX-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_xorps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    xorps %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    xorps (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_xorps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_xorps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    xorps %xmm1, %xmm0 # sched: [1:0.50]
@@ -6259,6 +6921,22 @@ define <4 x float> @test_fnop() nounwind {
 ; SKX-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_fnop:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    xorps %xmm0, %xmm0 # sched: [0:0.25]
+; BDVER2-SSE-NEXT:    #APP
+; BDVER2-SSE-NEXT:    nop # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    #NO_APP
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_fnop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [0:0.25]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    nop # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_fnop:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    xorps %xmm0, %xmm0 # sched: [0:0.50]
diff --git a/test/CodeGen/X86/sse1-fcopysign.ll b/test/CodeGen/X86/sse1-fcopysign.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ed7f31e444cf3d6c30472759404e0cda8366a661
--- /dev/null
+++ b/test/CodeGen/X86/sse1-fcopysign.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=-sse2,+sse | FileCheck %s --check-prefix=ALL --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse2,+sse | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+define float @f32_pos(float %a, float %b) nounwind {
+; X86-LABEL: f32_pos:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: f32_pos:
+; X64:       # %bb.0:
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call float @llvm.copysign.f32(float %a, float 1.0)
+  ret float %tmp
+}
+
+define float @f32_neg(float %a, float %b) nounwind {
+; X86-LABEL: f32_neg:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    orps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    movss %xmm0, (%esp)
+; X86-NEXT:    flds (%esp)
+; X86-NEXT:    popl %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: f32_neg:
+; X64:       # %bb.0:
+; X64-NEXT:    orps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call float @llvm.copysign.f32(float %a, float -1.0)
+  ret float %tmp
+}
+
+define <4 x float> @v4f32_pos(<4 x float> %a, <4 x float> %b) nounwind {
+; X86-LABEL: v4f32_pos:
+; X86:       # %bb.0:
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: v4f32_pos:
+; X64:       # %bb.0:
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  ret <4 x float> %tmp
+}
+
+define <4 x float> @v4f32_neg(<4 x float> %a, <4 x float> %b) nounwind {
+; X86-LABEL: v4f32_neg:
+; X86:       # %bb.0:
+; X86-NEXT:    orps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: v4f32_neg:
+; X64:       # %bb.0:
+; X64-NEXT:    orps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>)
+  ret <4 x float> %tmp
+}
+
+define <4 x float> @v4f32_const_mag(<4 x float> %a, <4 x float> %b) nounwind {
+; X86-LABEL: v4f32_const_mag:
+; X86:       # %bb.0:
+; X86-NEXT:    movaps %xmm1, %xmm0
+; X86-NEXT:    andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    orps {{\.LCPI.*}}, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: v4f32_const_mag:
+; X64:       # %bb.0:
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    andps {{.*}}(%rip), %xmm0
+; X64-NEXT:    orps {{.*}}(%rip), %xmm0
+; X64-NEXT:    retq
+  %tmp = tail call <4 x float> @llvm.copysign.v4f32(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> %b )
+  ret <4 x float> %tmp
+}
+
+declare float @llvm.copysign.f32(float, float)
+declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
diff --git a/test/CodeGen/X86/sse2-intrinsics-canonical.ll b/test/CodeGen/X86/sse2-intrinsics-canonical.ll
index 04cd7ec47a18d1c8c9c0aaa64d4d07a6eda58c0d..506fb9eb10003a8af2a33764f90a5df6ca800253 100644
--- a/test/CodeGen/X86/sse2-intrinsics-canonical.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-canonical.ll
@@ -198,9 +198,9 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
 ;
 ; AVX2-LABEL: test_x86_sse2_psubus_b_64:
 ; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4
+; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX2-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
 ; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm3 ## encoding: [0xc5,0xf1,0xdb,0xda]
 ; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0xc2]
 ; AVX2-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
@@ -209,9 +209,9 @@ define <8 x i8> @test_x86_sse2_psubus_b_64(<8 x i8> %a0, <8 x i8> %a1) {
 ;
 ; SKX-LABEL: test_x86_sse2_psubus_b_64:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vmovdqa LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SKX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x15,A,A,A,A]
-; SKX-NEXT:    ## fixup A - offset: 4, value: LCPI6_0, kind: FK_Data_4
+; SKX-NEXT:    vpbroadcastw LCPI6_0, %xmm2 ## EVEX TO VEX Compression xmm2 = [255,255,255,255,255,255,255,255]
+; SKX-NEXT:    ## encoding: [0xc4,0xe2,0x79,0x79,0x15,A,A,A,A]
+; SKX-NEXT:    ## fixup A - offset: 5, value: LCPI6_0, kind: FK_Data_4
 ; SKX-NEXT:    vpand %xmm2, %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xdb,0xda]
 ; SKX-NEXT:    vpand %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc2]
 ; SKX-NEXT:    vpmaxuw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc3]
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 83d3a0e0b9546415fbc7b2cac21b57236902f4b9..23d0d66acfbca50966e73886bb737ea21a3a488e 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -272,17 +272,22 @@ define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
 define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; SSE-LABEL: test_mm_andnot_pd:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
+; SSE-NEXT:    pcmpeqd %xmm2, %xmm2 # encoding: [0x66,0x0f,0x76,0xd2]
+; SSE-NEXT:    pxor %xmm2, %xmm0 # encoding: [0x66,0x0f,0xef,0xc2]
+; SSE-NEXT:    pand %xmm1, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc1]
 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX1-LABEL: test_mm_andnot_pd:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
 ;
 ; AVX512-LABEL: test_mm_andnot_pd:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %arg0 = bitcast <2 x double> %a0 to <4 x i32>
   %arg1 = bitcast <2 x double> %a1 to <4 x i32>
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index 068b0421a0bf62d30ab0f917fb4135d24717a902..8dedce5fc8b478dfbf5623aba68870d7126425af 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1418,6 +1418,45 @@ define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
 declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 
+define <8 x i16> @test_x86_sse2_psrl_w_load(<8 x i16> %a0, <8 x i16>* %p) {
+; X86-SSE-LABEL: test_x86_sse2_psrl_w_load:
+; X86-SSE:       ## %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-SSE-NEXT:    psrlw (%eax), %xmm0 ## encoding: [0x66,0x0f,0xd1,0x00]
+; X86-SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX1-LABEL: test_x86_sse2_psrl_w_load:
+; X86-AVX1:       ## %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX1-NEXT:    vpsrlw (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0x00]
+; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
+;
+; X86-AVX512-LABEL: test_x86_sse2_psrl_w_load:
+; X86-AVX512:       ## %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512-NEXT:    vpsrlw (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0x00]
+; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-SSE-LABEL: test_x86_sse2_psrl_w_load:
+; X64-SSE:       ## %bb.0:
+; X64-SSE-NEXT:    psrlw (%rdi), %xmm0 ## encoding: [0x66,0x0f,0xd1,0x07]
+; X64-SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX1-LABEL: test_x86_sse2_psrl_w_load:
+; X64-AVX1:       ## %bb.0:
+; X64-AVX1-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0x07]
+; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
+;
+; X64-AVX512-LABEL: test_x86_sse2_psrl_w_load:
+; X64-AVX512:       ## %bb.0:
+; X64-AVX512-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0x07]
+; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
+  %a1 = load <8 x i16>, <8 x i16>* %p
+  %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
 define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
 ; SSE-LABEL: test_x86_sse2_psrli_d:
 ; SSE:       ## %bb.0:
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index f66ccedc052058676b0a0e7ab0e670798e14f215..a833dcf0735058d04c1ed32d68c7998b4a3a2b46 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2,-xop | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -98,6 +100,18 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_addpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -206,6 +220,18 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addsd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_addsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [3:1.00]
@@ -327,6 +353,20 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_andpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andpd %xmm1, %xmm0 # sched: [1:0.50]
@@ -457,6 +497,20 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_andnotpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    andnpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    andnpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_andnotpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_andnotpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    andnpd %xmm1, %xmm0 # sched: [1:0.50]
@@ -569,6 +623,16 @@ define void @test_clflush(i8* %p){
 ; SKX-NEXT:    clflush (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_clflush:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    clflush (%rdi) # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_clflush:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    clflush (%rdi) # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_clflush:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    clflush (%rdi) # sched: [5:1.00]
@@ -685,6 +749,20 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmppd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqpd %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cmppd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    vorpd %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmppd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqpd %xmm0, %xmm1 # sched: [2:1.00]
@@ -800,6 +878,18 @@ define double @test_cmpsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cmpsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cmpeqsd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    cmpeqsd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cmpsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cmpsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cmpeqsd %xmm1, %xmm0 # sched: [2:1.00]
@@ -1017,6 +1107,34 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_comisd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    comisd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    comisd (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_comisd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcomisd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vcomisd (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_comisd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    comisd %xmm1, %xmm0 # sched: [3:1.00]
@@ -1174,6 +1292,20 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtdq2pd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtdq2pd %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2pd (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtdq2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtdq2pd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtdq2pd %xmm0, %xmm1 # sched: [3:1.00]
@@ -1303,6 +1435,20 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtdq2ps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtdq2ps %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtdq2ps (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtdq2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtdq2ps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtdq2ps %xmm0, %xmm1 # sched: [3:1.00]
@@ -1431,6 +1577,20 @@ define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtpd2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtpd2dq %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2dq (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtpd2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtpd2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -1560,6 +1720,20 @@ define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtpd2ps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtpd2ps %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtpd2ps (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtpd2ps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtpd2ps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtpd2ps %xmm0, %xmm1 # sched: [3:1.00]
@@ -1688,6 +1862,20 @@ define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtps2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtps2dq %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvtps2dq (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtps2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtps2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -1816,6 +2004,20 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtps2pd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtps2pd %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvtps2pd (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtps2pd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtps2pd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtps2pd %xmm0, %xmm1 # sched: [2:1.00]
@@ -1944,6 +2146,20 @@ define i32 @test_cvtsd2si(double %a0, double *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsd2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsd2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsd2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsd2si (%rdi), %eax # sched: [12:1.00]
@@ -2073,6 +2289,20 @@ define i64 @test_cvtsd2siq(double %a0, double *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsd2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvtsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsd2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsd2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsd2si (%rdi), %rax # sched: [12:1.00]
@@ -2216,6 +2446,22 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
 ; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsd2ss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addss %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsd2ss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsd2ss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsd2ss %xmm0, %xmm1 # sched: [7:2.00]
@@ -2346,6 +2592,20 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
 ; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2sd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2sdl (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2sdl %edi, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2sd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2sd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2sdl (%rsi), %xmm0 # sched: [14:1.00]
@@ -2472,6 +2732,20 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
 ; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtsi2sdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtsi2sdq %rdi, %xmm1 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    cvtsi2sdq (%rsi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtsi2sdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtsi2sdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtsi2sdq (%rsi), %xmm0 # sched: [14:1.00]
@@ -2614,6 +2888,22 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
 ; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvtss2sd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvtss2sd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvtss2sd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvtss2sd %xmm0, %xmm1 # sched: [7:2.00]
@@ -2746,6 +3036,20 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttpd2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttpd2dq %xmm0, %xmm1 # sched: [8:1.00]
+; BDVER2-SSE-NEXT:    cvttpd2dq (%rdi), %xmm0 # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvttpd2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [13:1.00]
+; BDVER2-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [8:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttpd2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttpd2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -2875,6 +3179,20 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttps2dq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttps2dq %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    cvttps2dq (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvttps2dq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttps2dq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttps2dq %xmm0, %xmm1 # sched: [3:1.00]
@@ -3001,6 +3319,20 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttsd2si:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvttsd2si:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttsd2si (%rdi), %eax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [13:1.00]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttsd2si:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttsd2si (%rdi), %eax # sched: [12:1.00]
@@ -3127,6 +3459,20 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_cvttsd2siq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    cvttsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-SSE-NEXT:    cvttsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_cvttsd2siq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vcvttsd2si (%rdi), %rax # sched: [18:1.00]
+; BDVER2-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [13:1.00]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_cvttsd2siq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    cvttsd2si (%rdi), %rax # sched: [12:1.00]
@@ -3240,6 +3586,18 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:4.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divpd %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divpd (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_divpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divpd %xmm1, %xmm0 # sched: [19:19.00]
@@ -3348,6 +3706,18 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:4.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_divsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    divsd %xmm1, %xmm0 # sched: [9:9.50]
+; BDVER2-SSE-NEXT:    divsd (%rdi), %xmm0 # sched: [14:9.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_divsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [9:9.50]
+; BDVER2-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [14:9.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_divsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    divsd %xmm1, %xmm0 # sched: [19:19.00]
@@ -3449,6 +3819,16 @@ define void @test_lfence() {
 ; SKX-NEXT:    lfence # sched: [2:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_lfence:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    lfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_lfence:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    lfence # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_lfence:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    lfence # sched: [1:1.00]
@@ -3545,6 +3925,16 @@ define void @test_mfence() {
 ; SKX-NEXT:    mfence # sched: [3:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mfence:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mfence # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mfence:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    mfence # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mfence:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mfence # sched: [1:1.00]
@@ -3639,6 +4029,16 @@ define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
 ; SKX-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maskmovdqu:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_maskmovdqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maskmovdqu:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
@@ -3742,6 +4142,18 @@ define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxpd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_maxpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxpd %xmm1, %xmm0 # sched: [2:1.00]
@@ -3851,6 +4263,18 @@ define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_maxsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    maxsd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    maxsd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_maxsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_maxsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    maxsd %xmm1, %xmm0 # sched: [2:1.00]
@@ -3960,6 +4384,18 @@ define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minpd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minpd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_minpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minpd %xmm1, %xmm0 # sched: [2:1.00]
@@ -4069,6 +4505,18 @@ define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_minsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    minsd %xmm1, %xmm0 # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    minsd (%rdi), %xmm0 # sched: [7:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_minsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BDVER2-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_minsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    minsd %xmm1, %xmm0 # sched: [2:1.00]
@@ -4191,6 +4639,20 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movapd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movapd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovapd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movapd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd (%rdi), %xmm0 # sched: [5:1.00]
@@ -4316,6 +4778,20 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movdqa:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movdqa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movdqa:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa (%rdi), %xmm0 # sched: [5:1.00]
@@ -4441,6 +4917,20 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movdqu:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movdqu %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movdqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movdqu:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqu (%rdi), %xmm0 # sched: [5:1.00]
@@ -4605,6 +5095,26 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    movd %edi, %xmm1 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movd %xmm2, %eax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movd %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovd %edi, %xmm1 # sched: [10:0.50]
+; BDVER2-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovd %xmm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    vmovd %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
@@ -4786,6 +5296,26 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
 ; SKX-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movd_64:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    movq %rdi, %xmm1 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm2, %rax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movd_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovq %rdi, %xmm1 # sched: [10:0.50]
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovq %xmm0, %rax # sched: [10:1.00]
+; BDVER2-NEXT:    vmovq %xmm1, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movd_64:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero sched: [5:1.00]
@@ -4942,6 +5472,22 @@ define <2 x double> @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movhpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movhpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movhpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movhpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
@@ -5088,6 +5634,22 @@ define <2 x double> @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a
 ; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movlpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movlpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movlpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movlpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
@@ -5196,6 +5758,16 @@ define i32 @test_movmskpd(<2 x double> %a0) {
 ; SKX-NEXT:    vmovmskpd %xmm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movmskpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movmskpd %xmm0, %eax # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movmskpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovmskpd %xmm0, %eax # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movmskpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movmskpd %xmm0, %eax # sched: [3:1.00]
@@ -5301,6 +5873,18 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
 ; SKX-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntdqa:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movntdq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movntdqa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntdqa:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddq %xmm0, %xmm0 # sched: [1:0.50]
@@ -5408,6 +5992,18 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movntpd %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movntpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [3:1.00]
@@ -5528,6 +6124,20 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
 ; SKX-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movq_mem:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movq_mem:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vmovq %xmm0, (%rdi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movq_mem:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00]
@@ -5644,6 +6254,18 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
 ; SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movq_reg:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movq_reg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movq_reg:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
@@ -5764,6 +6386,20 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
 ; SKX-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movsd_mem:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addsd %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movsd %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movsd_mem:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; BDVER2-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovsd %xmm0, (%rsi) # sched: [2:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movsd_mem:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero sched: [5:1.00]
@@ -5875,6 +6511,17 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
 ; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movsd_reg:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movsd_reg:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movsd_reg:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:0.50]
@@ -5992,6 +6639,20 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movupd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    movupd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movupd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovupd (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movupd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movupd (%rdi), %xmm0 # sched: [5:1.00]
@@ -6104,6 +6765,18 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mulpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulpd %xmm1, %xmm0 # sched: [4:2.00]
@@ -6212,6 +6885,18 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mulsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mulsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    mulsd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mulsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mulsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mulsd %xmm1, %xmm0 # sched: [4:2.00]
@@ -6333,6 +7018,20 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_orpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    orpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_orpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_orpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    orpd %xmm1, %xmm0 # sched: [1:0.50]
@@ -6454,6 +7153,18 @@ define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packssdw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packssdw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    packssdw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_packssdw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packssdw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packssdw %xmm1, %xmm0 # sched: [1:0.50]
@@ -6568,6 +7279,18 @@ define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packsswb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packsswb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    packsswb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_packsswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packsswb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packsswb %xmm1, %xmm0 # sched: [1:0.50]
@@ -6682,6 +7405,18 @@ define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packuswb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packuswb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    packuswb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_packuswb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packuswb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packuswb %xmm1, %xmm0 # sched: [1:0.50]
@@ -6796,6 +7531,18 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddb %xmm1, %xmm0 # sched: [1:0.50]
@@ -6908,6 +7655,18 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [1:0.50]
@@ -7016,6 +7775,18 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [1:0.50]
@@ -7128,6 +7899,18 @@ define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddsb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -7241,6 +8024,18 @@ define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddsw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -7354,6 +8149,18 @@ define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddusb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddusb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddusb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddusb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddusb %xmm1, %xmm0 # sched: [1:0.50]
@@ -7467,6 +8274,18 @@ define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddusw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddusw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddusw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddusw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddusw %xmm1, %xmm0 # sched: [1:0.50]
@@ -7580,6 +8399,18 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_paddw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_paddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_paddw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [1:0.50]
@@ -7701,6 +8532,20 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pand:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pand %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pand (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pand:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pand:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pand %xmm1, %xmm0 # sched: [1:0.50]
@@ -7843,6 +8688,22 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pandn:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pandn %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pandn (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pandn:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pandn:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pandn %xmm1, %xmm0 # sched: [1:0.50]
@@ -7966,6 +8827,18 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pavgb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pavgb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pavgb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pavgb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pavgb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pavgb %xmm1, %xmm0 # sched: [1:0.50]
@@ -8088,6 +8961,18 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pavgw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pavgw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pavgw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pavgw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pavgw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pavgw %xmm1, %xmm0 # sched: [1:0.50]
@@ -8221,6 +9106,20 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqb %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
@@ -8350,6 +9249,20 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqd %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
@@ -8479,6 +9392,20 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqw %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
@@ -8614,6 +9541,21 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtb %xmm1, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.50]
@@ -8751,6 +9693,21 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtd %xmm1, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.50]
@@ -8888,6 +9845,21 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtw %xmm1, %xmm2 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpor %xmm0, %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm2 # sched: [1:0.50]
@@ -9004,6 +9976,18 @@ define i16 @test_pextrw(<8 x i16> %a0) {
 ; SKX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrw $6, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pextrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrw $6, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrw $6, %xmm0, %eax # sched: [3:1.00]
@@ -9114,6 +10098,18 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
 ; SKX-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pinsrw $3, (%rsi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pinsrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrw $1, %edi, %xmm0 # sched: [7:0.50]
@@ -9222,6 +10218,18 @@ define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaddwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaddwd %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmaddwd (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaddwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaddwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaddwd %xmm1, %xmm0 # sched: [2:1.00]
@@ -9336,6 +10344,18 @@ define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxsw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmaxsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaxsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -9449,6 +10469,18 @@ define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxub:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxub %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmaxub (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaxub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxub:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxub %xmm1, %xmm0 # sched: [1:0.50]
@@ -9562,6 +10594,18 @@ define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminsw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pminsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pminsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -9675,6 +10719,18 @@ define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminub:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminub %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pminub (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pminub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminub:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminub %xmm1, %xmm0 # sched: [1:0.50]
@@ -9773,6 +10829,16 @@ define i32 @test_pmovmskb(<16 x i8> %a0) {
 ; SKX-NEXT:    vpmovmskb %xmm0, %eax # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovmskb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovmskb %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovmskb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovmskb %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovmskb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovmskb %xmm0, %eax # sched: [3:1.00]
@@ -9876,6 +10942,18 @@ define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulhuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulhuw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmulhuw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmulhuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulhuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulhuw %xmm1, %xmm0 # sched: [2:1.00]
@@ -9985,6 +11063,18 @@ define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulhw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulhw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmulhw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmulhw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulhw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulhw %xmm1, %xmm0 # sched: [2:1.00]
@@ -10094,6 +11184,18 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmullw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmullw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmullw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmullw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmullw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmullw %xmm1, %xmm0 # sched: [2:1.00]
@@ -10202,6 +11304,18 @@ define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmuludq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmuludq %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmuludq (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmuludq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmuludq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmuludq %xmm1, %xmm0 # sched: [2:1.00]
@@ -10325,6 +11439,20 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_por:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    por (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_por:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_por:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [1:0.50]
@@ -10438,6 +11566,18 @@ define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psadbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psadbw %xmm1, %xmm0 # sched: [4:0.50]
+; BDVER2-SSE-NEXT:    psadbw (%rdi), %xmm0 # sched: [9:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psadbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; BDVER2-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psadbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psadbw %xmm1, %xmm0 # sched: [2:0.50]
@@ -10564,6 +11704,20 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshufd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,2,1,0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pshufd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
+; BDVER2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshufd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [1:0.50]
@@ -10693,6 +11847,20 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshufhw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pshufhw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; BDVER2-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshufhw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
@@ -10822,6 +11990,20 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshuflw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pshuflw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; BDVER2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshuflw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
@@ -10948,6 +12130,20 @@ define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pslld:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pslld %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    pslld (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    pslld $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pslld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pslld:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pslld %xmm1, %xmm0 # sched: [1:0.50]
@@ -11056,6 +12252,16 @@ define <4 x i32> @test_pslldq(<4 x i32> %a0) {
 ; SKX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pslldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pslldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pslldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
@@ -11171,6 +12377,20 @@ define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psllq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psllq %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psllq (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psllq $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psllq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psllq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psllq %xmm1, %xmm0 # sched: [1:0.50]
@@ -11299,6 +12519,20 @@ define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psllw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psllw %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psllw (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psllw $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psllw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psllw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psllw %xmm1, %xmm0 # sched: [1:0.50]
@@ -11427,6 +12661,20 @@ define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrad:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrad %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrad (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrad $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psrad:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrad:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrad %xmm1, %xmm0 # sched: [1:0.50]
@@ -11555,6 +12803,20 @@ define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psraw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psraw %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psraw (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psraw $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psraw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psraw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psraw %xmm1, %xmm0 # sched: [1:0.50]
@@ -11683,6 +12945,20 @@ define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrld:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrld %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrld (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrld $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psrld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrld:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrld %xmm1, %xmm0 # sched: [1:0.50]
@@ -11791,6 +13067,16 @@ define <4 x i32> @test_psrldq(<4 x i32> %a0) {
 ; SKX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psrldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
@@ -11906,6 +13192,20 @@ define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrlq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrlq %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrlq (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrlq $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psrlq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrlq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrlq %xmm1, %xmm0 # sched: [1:0.50]
@@ -12034,6 +13334,20 @@ define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psrlw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psrlw %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-SSE-NEXT:    psrlw (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER2-SSE-NEXT:    psrlw $2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psrlw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER2-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psrlw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psrlw %xmm1, %xmm0 # sched: [1:0.50]
@@ -12153,6 +13467,18 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubb %xmm1, %xmm0 # sched: [1:0.50]
@@ -12265,6 +13591,18 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubd %xmm1, %xmm0 # sched: [1:0.50]
@@ -12373,6 +13711,18 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubq %xmm1, %xmm0 # sched: [1:0.50]
@@ -12485,6 +13835,18 @@ define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubsb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -12598,6 +13960,18 @@ define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubsw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -12711,6 +14085,18 @@ define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubusb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubusb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubusb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubusb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubusb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubusb %xmm1, %xmm0 # sched: [1:0.50]
@@ -12824,6 +14210,18 @@ define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubusw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubusw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubusw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubusw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubusw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubusw %xmm1, %xmm0 # sched: [1:0.50]
@@ -12937,6 +14335,18 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psubw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psubw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psubw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psubw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psubw %xmm1, %xmm0 # sched: [1:0.50]
@@ -13049,6 +14459,18 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpckhbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
@@ -13172,6 +14594,20 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpckhdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
@@ -13298,6 +14734,20 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhqdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpckhqdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhqdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
@@ -13415,6 +14865,18 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckhwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpckhwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckhwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
@@ -13527,6 +14989,18 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpcklbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpcklbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpcklbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
@@ -13650,6 +15124,20 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpckldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpckldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpckldq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
@@ -13776,6 +15264,20 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpcklqdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpcklqdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpcklqdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
@@ -13893,6 +15395,18 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_punpcklwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_punpcklwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [2:0.50]
+; BDVER2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_punpcklwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
@@ -14014,6 +15528,20 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pxor:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pxor %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pxor (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pxor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pxor:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pxor %xmm1, %xmm0 # sched: [1:0.50]
@@ -14140,6 +15668,20 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_shufpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_shufpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_shufpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50]
@@ -14267,6 +15809,20 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    sqrtpd %xmm0, %xmm1 # sched: [9:13.50]
+; BDVER2-SSE-NEXT:    sqrtpd (%rdi), %xmm0 # sched: [14:13.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_sqrtpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [14:13.50]
+; BDVER2-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [9:13.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    sqrtpd %xmm0, %xmm1 # sched: [27:27.00]
@@ -14409,6 +15965,22 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_sqrtsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    sqrtsd %xmm0, %xmm0 # sched: [9:13.50]
+; BDVER2-SSE-NEXT:    sqrtsd %xmm1, %xmm1 # sched: [9:13.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_sqrtsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovapd (%rdi), %xmm1 # sched: [5:0.50]
+; BDVER2-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [9:13.50]
+; BDVER2-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [9:13.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_sqrtsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd (%rdi), %xmm1 # sched: [5:1.00]
@@ -14527,6 +16099,18 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_subpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -14635,6 +16219,18 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
 ; SKX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_subsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    subsd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    subsd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_subsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_subsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    subsd %xmm1, %xmm0 # sched: [3:1.00]
@@ -14847,6 +16443,34 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ucomisd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    ucomisd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ucomisd (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_ucomisd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vucomisd %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    vucomisd (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    setnp %al # sched: [1:0.50]
+; BDVER2-NEXT:    sete %dl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    orb %cl, %dl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %dl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ucomisd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    ucomisd %xmm1, %xmm0 # sched: [3:1.00]
@@ -15002,6 +16626,20 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpckhpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_unpckhpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpckhpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
@@ -15136,6 +16774,21 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_unpcklpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_unpcklpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [2:0.50]
+; BDVER2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_unpcklpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
@@ -15264,6 +16917,20 @@ define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_xorpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    xorpd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    xorpd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_xorpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_xorpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    xorpd %xmm1, %xmm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index b2efb5b2933fa46b8abbf59575e2fb7dc59a1e21..be019aff5142fb419780d5ff1de4453013ae33c6 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -395,7 +395,7 @@ define void @test12() nounwind {
 ; SSE-LABEL: test12:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movapd 0, %xmm0
-; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1,1,1,1]
+; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT:    xorps %xmm2, %xmm2
 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
@@ -416,7 +416,7 @@ define void @test12() nounwind {
 ; AVX512-LABEL: test12:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovaps 0, %xmm0
-; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
index bb37f21e4f6bb50b9a46820954ea7780a1eb889d..1c3419a35ff3eccc31b567576a41144f0d3e0e20 100644
--- a/test/CodeGen/X86/sse3-schedule.ll
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2  | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -98,6 +100,18 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addsubpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addsubpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addsubpd (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_addsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addsubpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addsubpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -207,6 +221,18 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_addsubps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    addsubps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    addsubps (%rdi), %xmm0 # sched: [10:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_addsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_addsubps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    addsubps %xmm1, %xmm0 # sched: [3:1.00]
@@ -316,6 +342,18 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; SKX-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_haddpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    haddpd (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_haddpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_haddpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -425,6 +463,18 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; SKX-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_haddps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    haddps (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_haddps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_haddps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [3:1.00]
@@ -534,6 +584,18 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; SKX-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_hsubpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    hsubpd (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_hsubpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_hsubpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [3:1.00]
@@ -643,6 +705,18 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ; SKX-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_hsubps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    hsubps (%rdi), %xmm0 # sched: [16:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_hsubps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_hsubps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [3:1.00]
@@ -741,6 +815,16 @@ define <16 x i8> @test_lddqu(i8* %a0) {
 ; SKX-NEXT:    vlddqu (%rdi), %xmm0 # sched: [6:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_lddqu:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    lddqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_lddqu:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vlddqu (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_lddqu:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    lddqu (%rdi), %xmm0 # sched: [5:1.00]
@@ -857,6 +941,20 @@ define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
 ; SKX-NEXT:    monitor # sched: [100:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_monitor:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    monitor # sched: [100:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_monitor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
+; BDVER2-NEXT:    movl %esi, %ecx # sched: [1:0.50]
+; BDVER2-NEXT:    monitor # sched: [100:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_monitor:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %esi, %ecx # sched: [1:0.50]
@@ -982,6 +1080,20 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movddup:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    subpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movddup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [7:0.50]
+; BDVER2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [2:0.50]
+; BDVER2-NEXT:    vsubpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movddup:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:0.50]
@@ -1109,6 +1221,20 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movshdup:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movshdup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [7:0.50]
+; BDVER2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movshdup:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:0.50]
@@ -1236,6 +1362,20 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movsldup:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movsldup:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [7:0.50]
+; BDVER2-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [2:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movsldup:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:0.50]
@@ -1362,6 +1502,20 @@ define void @test_mwait(i32 %a0, i32 %a1) {
 ; SKX-NEXT:    mwait # sched: [20:2.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mwait:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl %edi, %ecx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    mwait # sched: [100:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mwait:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %esi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl %edi, %ecx # sched: [1:0.50]
+; BDVER2-NEXT:    mwait # sched: [100:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mwait:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %esi, %eax # sched: [1:0.50]
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index 3af917491c93f75d697aac5324bc3751afdeeb69..ea606463fc17b09d26a912d2e3b76c57bec09cfe 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2,-xop   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.2 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -103,6 +105,20 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_blendpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [2:0.50]
+; BDVER2-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1] sched: [1:0.50]
@@ -222,6 +238,20 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_blendps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [2:0.50]
+; BDVER2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],mem[1],xmm1[2,3] sched: [7:0.50]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
@@ -350,6 +380,21 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SKX-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendvpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm3 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3 # sched: [2:2.00]
+; BDVER2-SSE-NEXT:    blendvpd %xmm0, (%rdi), %xmm3 # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    movapd %xmm3, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_blendvpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendvpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd %xmm0, %xmm3 # sched: [1:0.50]
@@ -480,6 +525,21 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; SKX-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_blendvps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm3 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3 # sched: [2:2.00]
+; BDVER2-SSE-NEXT:    blendvps %xmm0, (%rdi), %xmm3 # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    movaps %xmm3, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_blendvps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_blendvps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps %xmm0, %xmm3 # sched: [1:0.50]
@@ -589,6 +649,18 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
 ; SKX-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_dppd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    dppd $7, %xmm1, %xmm0 # sched: [15:1.50]
+; BDVER2-SSE-NEXT:    dppd $7, (%rdi), %xmm0 # sched: [20:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_dppd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [15:1.50]
+; BDVER2-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [20:1.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_dppd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    dppd $7, %xmm1, %xmm0 # sched: [9:3.00]
@@ -692,6 +764,18 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
 ; SKX-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [19:1.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_dpps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    dpps $7, %xmm1, %xmm0 # sched: [25:1.50]
+; BDVER2-SSE-NEXT:    dpps $7, (%rdi), %xmm0 # sched: [30:1.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_dpps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [25:1.50]
+; BDVER2-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [30:1.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_dpps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    dpps $7, %xmm1, %xmm0 # sched: [11:3.00]
@@ -795,6 +879,18 @@ define i32 @test_extractps(<4 x float> %a0, i32 *%a1) {
 ; SKX-NEXT:    vextractps $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_extractps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    extractps $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    extractps $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_extractps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vextractps $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vextractps $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_extractps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    extractps $3, %xmm0, %eax # sched: [3:1.00]
@@ -899,6 +995,18 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
 ; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_insertps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_insertps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [2:0.50]
+; BDVER2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_insertps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
@@ -990,6 +1098,16 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
 ; SKX-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_movntdqa:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movntdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_movntdqa:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_movntdqa:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movntdqa (%rdi), %xmm0 # sched: [5:1.00]
@@ -1087,6 +1205,18 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_mpsadbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    mpsadbw $7, %xmm1, %xmm0 # sched: [9:2.00]
+; BDVER2-SSE-NEXT:    mpsadbw $7, (%rdi), %xmm0 # sched: [14:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_mpsadbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [9:2.00]
+; BDVER2-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [14:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_mpsadbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    mpsadbw $7, %xmm1, %xmm0 # sched: [3:2.00]
@@ -1191,6 +1321,18 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_packusdw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    packusdw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    packusdw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_packusdw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_packusdw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    packusdw %xmm1, %xmm0 # sched: [1:0.50]
@@ -1316,6 +1458,21 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
 ; SKX-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pblendvb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movdqa %xmm0, %xmm3 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3 # sched: [2:2.00]
+; BDVER2-SSE-NEXT:    pblendvb %xmm0, (%rdi), %xmm3 # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    movdqa %xmm3, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pblendvb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BDVER2-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pblendvb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movdqa %xmm0, %xmm3 # sched: [1:0.50]
@@ -1437,6 +1594,20 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pblendw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6],mem[7] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pblendw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [2:0.50]
+; BDVER2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6],mem[7] sched: [7:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pblendw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
@@ -1544,6 +1715,18 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpeqq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpeqq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpeqq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpeqq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpeqq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
@@ -1648,6 +1831,18 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
 ; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrb $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrb $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pextrb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrb $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrb $3, %xmm0, %eax # sched: [3:1.00]
@@ -1763,6 +1958,20 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
 ; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    paddd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pextrd $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrd $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pextrd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpextrd $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    paddd %xmm0, %xmm0 # sched: [1:0.50]
@@ -1870,6 +2079,18 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
 ; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, %rax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrq $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pextrq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrq $1, %xmm0, %rax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrq $1, %xmm0, %rax # sched: [3:1.00]
@@ -1972,6 +2193,18 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
 ; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pextrw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pextrw $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    pextrw $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pextrw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpextrw $3, %xmm0, %eax # sched: [13:1.00]
+; BDVER2-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [13:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pextrw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pextrw $3, %xmm0, %eax # sched: [3:1.00]
@@ -2075,6 +2308,18 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
 ; SKX-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phminposuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    phminposuw %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phminposuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phminposuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phminposuw (%rdi), %xmm0 # sched: [7:1.00]
@@ -2178,6 +2423,18 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
 ; SKX-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrb $1, %edi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pinsrb $3, (%rsi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pinsrb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrb $1, %edi, %xmm0 # sched: [7:0.50]
@@ -2280,6 +2537,18 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrd $1, %edi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pinsrd $3, (%rsi), %xmm0 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pinsrd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrd $1, %edi, %xmm0 # sched: [7:0.50]
@@ -2394,6 +2663,20 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pinsrq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pinsrq $1, (%rsi), %xmm1 # sched: [6:0.50]
+; BDVER2-SSE-NEXT:    pinsrq $1, %rdi, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pinsrq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:0.50]
+; BDVER2-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pinsrq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pinsrq $1, %rdi, %xmm0 # sched: [7:0.50]
@@ -2501,6 +2784,18 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxsb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmaxsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaxsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -2604,6 +2899,18 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxsd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmaxsd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaxsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
@@ -2707,6 +3014,18 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxud:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxud %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmaxud (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaxud:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxud:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxud %xmm1, %xmm0 # sched: [1:0.50]
@@ -2810,6 +3129,18 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaxuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaxuw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmaxuw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaxuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaxuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
@@ -2913,6 +3244,18 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminsb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pminsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pminsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminsb %xmm1, %xmm0 # sched: [1:0.50]
@@ -3016,6 +3359,18 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminsd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pminsd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pminsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminsd %xmm1, %xmm0 # sched: [1:0.50]
@@ -3119,6 +3474,18 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminud:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminud %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pminud (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pminud:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminud:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminud %xmm1, %xmm0 # sched: [1:0.50]
@@ -3222,6 +3589,18 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pminuw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pminuw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pminuw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pminuw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pminuw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pminuw %xmm1, %xmm0 # sched: [1:0.50]
@@ -3338,6 +3717,20 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxbw %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovsxbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpmovsxbw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxbw %xmm0, %xmm1 # sched: [1:0.50]
@@ -3459,6 +3852,20 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxbd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxbd %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovsxbd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpmovsxbd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxbd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxbd %xmm0, %xmm1 # sched: [1:0.50]
@@ -3580,6 +3987,20 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxbq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxbq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovsxbq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovsxbq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpmovsxbq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxbq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxbq %xmm0, %xmm1 # sched: [1:0.50]
@@ -3701,6 +4122,20 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxdq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovsxdq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovsxdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpmovsxdq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxdq %xmm0, %xmm1 # sched: [1:0.50]
@@ -3822,6 +4257,20 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxwd %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovsxwd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovsxwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpmovsxwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxwd %xmm0, %xmm1 # sched: [1:0.50]
@@ -3943,6 +4392,20 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovsxwq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovsxwq %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovsxwq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovsxwq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpmovsxwq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovsxwq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovsxwq %xmm0, %xmm1 # sched: [1:0.50]
@@ -4064,6 +4527,20 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
 ; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxbw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovzxbw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; BDVER2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxbw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
@@ -4185,6 +4662,20 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxbd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovzxbd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
+; BDVER2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxbd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
@@ -4306,6 +4797,20 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxbq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovzxbq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
+; BDVER2-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxbq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
@@ -4427,6 +4932,20 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovzxdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
+; BDVER2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
@@ -4548,6 +5067,20 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
 ; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxwd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovzxwd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
+; BDVER2-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxwd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
@@ -4669,6 +5202,20 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
 ; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmovzxwq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
+; BDVER2-SSE-NEXT:    paddq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmovzxwq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
+; BDVER2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [2:0.50]
+; BDVER2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmovzxwq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
@@ -4704,106 +5251,136 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
   ret <2 x i64> %5
 }
 
-define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> *%a3) {
 ; GENERIC-LABEL: test_pmuldq:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; GENERIC-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; GENERIC-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SLM-LABEL: test_pmuldq:
 ; SLM:       # %bb.0:
+; SLM-NEXT:    pmuldq (%rdi), %xmm2 # sched: [7:1.00]
 ; SLM-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:1.00]
-; SLM-NEXT:    pmuldq (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT:    por %xmm2, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: test_pmuldq:
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; SANDY-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; SANDY-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
 ; SANDY-LABEL: test_pmuldq:
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [11:1.00]
+; SANDY-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-SSE-LABEL: test_pmuldq:
 ; HASWELL-SSE:       # %bb.0:
 ; HASWELL-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; HASWELL-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; HASWELL-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
+; HASWELL-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; HASWELL-LABEL: test_pmuldq:
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [11:1.00]
+; HASWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-SSE-LABEL: test_pmuldq:
 ; BROADWELL-SSE:       # %bb.0:
 ; BROADWELL-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [5:1.00]
-; BROADWELL-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [10:1.00]
+; BROADWELL-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [10:1.00]
+; BROADWELL-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; BROADWELL-LABEL: test_pmuldq:
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; BROADWELL-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [10:1.00]
+; BROADWELL-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-SSE-LABEL: test_pmuldq:
 ; SKYLAKE-SSE:       # %bb.0:
 ; SKYLAKE-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:0.50]
-; SKYLAKE-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [10:0.50]
+; SKYLAKE-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [10:0.50]
+; SKYLAKE-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKYLAKE-LABEL: test_pmuldq:
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [10:0.50]
+; SKYLAKE-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-SSE-LABEL: test_pmuldq:
 ; SKX-SSE:       # %bb.0:
 ; SKX-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [10:0.50]
+; SKX-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [10:0.50]
+; SKX-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.33]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
 ; SKX-LABEL: test_pmuldq:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vpmuldq (%rdi), %xmm2, %xmm1 # sched: [10:0.50]
+; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmuldq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmuldq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmuldq (%rdi), %xmm2, %xmm2 # sched: [9:1.00]
+; BDVER2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpor %xmm2, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmuldq:
 ; BTVER2-SSE:       # %bb.0:
+; BTVER2-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [2:1.00]
-; BTVER2-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [7:1.00]
+; BTVER2-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_pmuldq:
 ; BTVER2:       # %bb.0:
+; BTVER2-NEXT:    vpmuldq (%rdi), %xmm2, %xmm2 # sched: [7:1.00]
 ; BTVER2-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT:    vpor %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_pmuldq:
 ; ZNVER1-SSE:       # %bb.0:
+; ZNVER1-SSE-NEXT:    pmuldq (%rdi), %xmm2 # sched: [11:1.00]
 ; ZNVER1-SSE-NEXT:    pmuldq %xmm1, %xmm0 # sched: [4:1.00]
-; ZNVER1-SSE-NEXT:    pmuldq (%rdi), %xmm0 # sched: [11:1.00]
+; ZNVER1-SSE-NEXT:    por %xmm2, %xmm0 # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
 ; ZNVER1-LABEL: test_pmuldq:
 ; ZNVER1:       # %bb.0:
+; ZNVER1-NEXT:    vpmuldq (%rdi), %xmm2, %xmm2 # sched: [11:1.00]
 ; ZNVER1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
-; ZNVER1-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT:    vpor %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
-  %2 = bitcast <2 x i64> %1 to <4 x i32>
-  %3 = load <4 x i32>, <4 x i32> *%a2, align 16
-  %4 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %2, <4 x i32> %3)
+  %2 = load <4 x i32>, <4 x i32> *%a3, align 16
+  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a2, <4 x i32> %2)
+  %4 = or <2 x i64> %1, %3
   ret <2 x i64> %4
 }
 declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
@@ -4881,6 +5458,18 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulld:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulld %xmm1, %xmm0 # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    pmulld (%rdi), %xmm0 # sched: [10:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmulld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER2-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulld:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulld %xmm1, %xmm0 # sched: [4:2.00]
@@ -5031,6 +5620,26 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    movzbl %cl, %eax # sched: [1:0.25]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_ptest:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    ptest %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-SSE-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    ptest (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    setb %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movzbl %cl, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_ptest:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vptest %xmm1, %xmm0 # sched: [1:1.00]
+; BDVER2-NEXT:    setb %al # sched: [1:0.50]
+; BDVER2-NEXT:    vptest (%rdi), %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    setb %cl # sched: [1:0.50]
+; BDVER2-NEXT:    andb %al, %cl # sched: [1:0.50]
+; BDVER2-NEXT:    movzbl %cl, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_ptest:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    ptest %xmm1, %xmm0 # sched: [3:1.00]
@@ -5165,6 +5774,20 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundpd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    roundpd $7, %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    roundpd $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_roundpd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundpd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    roundpd $7, %xmm0, %xmm1 # sched: [3:1.00]
@@ -5287,6 +5910,20 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundps:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    roundps $7, %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    roundps $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_roundps:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
+; BDVER2-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundps:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    roundps $7, %xmm0, %xmm1 # sched: [3:1.00]
@@ -5414,6 +6051,21 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    roundsd $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    roundsd $7, %xmm1, %xmm2 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addpd %xmm2, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_roundsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
@@ -5543,6 +6195,21 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_roundss:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movaps %xmm0, %xmm2 # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    roundss $7, (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    roundss $7, %xmm1, %xmm2 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    addps %xmm2, %xmm0 # sched: [5:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_roundss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [4:1.00]
+; BDVER2-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_roundss:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movaps %xmm0, %xmm2 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index 7bb4ac6a995783c9c8ef6142363d637f7289d792..97dffb4db0943460d665ae15cb1d68a944469ffd 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2,-xop | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx  | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -103,6 +105,20 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
 ; SKX-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_32_8:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: crc32_32_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_32_8:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -222,6 +238,20 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
 ; SKX-NEXT:    crc32w (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_32_16:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32w %si, %eax # sched: [5:2.00]
+; BDVER2-SSE-NEXT:    crc32w (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: crc32_32_16:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32w %si, %eax # sched: [5:2.00]
+; BDVER2-NEXT:    crc32w (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_32_16:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -341,6 +371,20 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
 ; SKX-NEXT:    crc32l (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_32_32:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32l %esi, %eax # sched: [6:2.00]
+; BDVER2-SSE-NEXT:    crc32l (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: crc32_32_32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32l %esi, %eax # sched: [6:2.00]
+; BDVER2-NEXT:    crc32l (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_32_32:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl %edi, %eax # sched: [1:0.50]
@@ -460,6 +504,20 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
 ; SKX-NEXT:    crc32b (%rdx), %eax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_64_8:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-SSE-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: crc32_64_8:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32b %sil, %eax # sched: [3:2.00]
+; BDVER2-NEXT:    crc32b (%rdx), %eax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_64_8:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
@@ -579,6 +637,20 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
 ; SKX-NEXT:    crc32q (%rdx), %rax # sched: [8:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: crc32_64_64:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    crc32q %rsi, %rax # sched: [10:2.00]
+; BDVER2-SSE-NEXT:    crc32q (%rdx), %rax # sched: [7:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: crc32_64_64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    crc32q %rsi, %rax # sched: [10:2.00]
+; BDVER2-NEXT:    crc32q (%rdx), %rax # sched: [7:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: crc32_64_64:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movq %rdi, %rax # sched: [1:0.50]
@@ -770,6 +842,32 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpestri:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestri $7, %xmm1, %xmm0 # sched: [15:4.00]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl %ecx, %esi # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestri $7, (%rdi), %xmm0 # sched: [20:4.50]
+; BDVER2-SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-SSE-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpestri:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestri $7, %xmm1, %xmm0 # sched: [15:4.00]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    movl %ecx, %esi # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [20:4.50]
+; BDVER2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpestri:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
@@ -950,6 +1048,26 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpestrm:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; BDVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpestrm $7, (%rdi), %xmm0 # sched: [15:4.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpestrm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; BDVER2-NEXT:    movl $7, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    movl $7, %edx # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [15:4.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpestrm:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movl $7, %eax # sched: [1:0.50]
@@ -1105,6 +1223,24 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpistri:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpistri $7, %xmm1, %xmm0 # sched: [14:1.00]
+; BDVER2-SSE-NEXT:    movl %ecx, %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    pcmpistri $7, (%rdi), %xmm0 # sched: [19:1.00]
+; BDVER2-SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-SSE-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpistri:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpistri $7, %xmm1, %xmm0 # sched: [14:1.00]
+; BDVER2-NEXT:    movl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [19:1.00]
+; BDVER2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; BDVER2-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpistri:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpistri $7, %xmm1, %xmm0 # sched: [7:2.00]
@@ -1221,6 +1357,18 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpistrm:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpistrm $7, %xmm1, %xmm0 # sched: [6:1.00]
+; BDVER2-SSE-NEXT:    pcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpistrm:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [6:1.00]
+; BDVER2-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpistrm:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpistrm $7, %xmm1, %xmm0 # sched: [8:2.00]
@@ -1324,6 +1472,18 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pcmpgtq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pcmpgtq %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pcmpgtq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pcmpgtq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pcmpgtq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pcmpgtq %xmm1, %xmm0 # sched: [1:0.50]
@@ -1428,6 +1588,18 @@ define <2 x i64> @test_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; SKX-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pclmulqdq:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pclmulqdq $0, %xmm1, %xmm0 # sched: [12:1.00]
+; BDVER2-SSE-NEXT:    pclmulqdq $0, (%rdi), %xmm0 # sched: [17:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pclmulqdq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; BDVER2-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [17:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pclmulqdq:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pclmulqdq $0, %xmm1, %xmm0 # sched: [2:1.00]
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
index 681953a8358654ce725a8ed25f5f4c8f20606519..ad76845a73d26c9adda4d3501d3d2b9b1913e8c5 100644
--- a/test/CodeGen/X86/sse4a-schedule.ll
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
 
@@ -9,6 +10,11 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
 ; GENERIC-NEXT:    extrq %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_extrq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    extrq %xmm1, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_extrq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    extrq %xmm1, %xmm0 # sched: [1:0.50]
@@ -29,6 +35,11 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) {
 ; GENERIC-NEXT:    extrq $2, $3, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_extrqi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    extrq $2, $3, %xmm0 # sched: [3:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_extrqi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    extrq $2, $3, %xmm0 # sched: [1:0.50]
@@ -49,6 +60,11 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
 ; GENERIC-NEXT:    insertq %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_insertq:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    insertq %xmm1, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_insertq:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    insertq %xmm1, %xmm0 # sched: [2:2.00]
@@ -69,6 +85,11 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
 ; GENERIC-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_insertqi:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_insertqi:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
@@ -89,6 +110,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) {
 ; GENERIC-NEXT:    movntsd %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_movntsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movntsd %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movntsd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movntsd %xmm0, (%rdi) # sched: [3:1.00]
@@ -109,6 +135,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) {
 ; GENERIC-NEXT:    movntss %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
+; BDVER2-LABEL: test_movntss:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movntss %xmm0, (%rdi) # sched: [3:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_movntss:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movntss %xmm0, (%rdi) # sched: [3:1.00]
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index b10b1bb5c74cd5eb5be84ed1f1d0486563cef24b..5c8bd2dc843ea91e89d06f16bdf980f8cd50c303 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -14,6 +14,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKYLAKE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2   | FileCheck %s --check-prefixes=CHECK,BTVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
@@ -113,6 +115,20 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pabsb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pabsb %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pabsb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pabsb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpabsb (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpabsb %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pabsb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pabsb %xmm0, %xmm1 # sched: [1:0.50]
@@ -242,6 +258,20 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pabsd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pabsd %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pabsd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pabsd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpabsd (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpabsd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pabsd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pabsd %xmm0, %xmm1 # sched: [1:0.50]
@@ -371,6 +401,20 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
 ; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pabsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pabsw %xmm0, %xmm1 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    pabsw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    por %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pabsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; BDVER2-NEXT:    vpabsw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pabsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pabsw %xmm0, %xmm1 # sched: [1:0.50]
@@ -495,6 +539,19 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_palignr:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [2:0.50]
+; BDVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; BDVER2-SSE-NEXT:    movdqa %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_palignr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [2:0.50]
+; BDVER2-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_palignr:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
@@ -605,6 +662,18 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phaddd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phaddd (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phaddd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phaddd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [1:0.50]
@@ -714,6 +783,18 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phaddsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phaddsw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phaddsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phaddsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -823,6 +904,18 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phaddw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phaddw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phaddw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phaddw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [1:0.50]
@@ -932,6 +1025,18 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phsubd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phsubd (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phsubd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phsubd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [1:0.50]
@@ -1041,6 +1146,18 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phsubsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phsubsw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phsubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phsubsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [1:0.50]
@@ -1150,6 +1267,18 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_phsubw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [5:0.50]
+; BDVER2-SSE-NEXT:    phsubw (%rdi), %xmm0 # sched: [10:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_phsubw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; BDVER2-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_phsubw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [1:0.50]
@@ -1259,6 +1388,18 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmaddubsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmaddubsw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmaddubsw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmaddubsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmaddubsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmaddubsw %xmm1, %xmm0 # sched: [2:1.00]
@@ -1369,6 +1510,18 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pmulhrsw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pmulhrsw %xmm1, %xmm0 # sched: [4:1.00]
+; BDVER2-SSE-NEXT:    pmulhrsw (%rdi), %xmm0 # sched: [9:1.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pmulhrsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER2-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pmulhrsw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pmulhrsw %xmm1, %xmm0 # sched: [2:1.00]
@@ -1478,6 +1631,18 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_pshufb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    pshufb %xmm1, %xmm0 # sched: [3:2.00]
+; BDVER2-SSE-NEXT:    pshufb (%rdi), %xmm0 # sched: [8:2.00]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_pshufb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER2-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_pshufb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    pshufb %xmm1, %xmm0 # sched: [2:2.00]
@@ -1591,6 +1756,18 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
 ; SKX-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psignb:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psignb %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psignb (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psignb:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psignb:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psignb %xmm1, %xmm0 # sched: [1:0.50]
@@ -1704,6 +1881,18 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ; SKX-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psignd:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psignd %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psignd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psignd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psignd:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psignd %xmm1, %xmm0 # sched: [1:0.50]
@@ -1817,6 +2006,18 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ; SKX-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
+; BDVER2-SSE-LABEL: test_psignw:
+; BDVER2-SSE:       # %bb.0:
+; BDVER2-SSE-NEXT:    psignw %xmm1, %xmm0 # sched: [2:0.50]
+; BDVER2-SSE-NEXT:    psignw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER2-SSE-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER2-LABEL: test_psignw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER2-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
 ; BTVER2-SSE-LABEL: test_psignw:
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    psignw %xmm1, %xmm0 # sched: [1:0.50]
diff --git a/test/CodeGen/X86/ssub_sat.ll b/test/CodeGen/X86/ssub_sat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6d9a534fad90f7229550c69cabb29bebaca46dcd
--- /dev/null
+++ b/test/CodeGen/X86/ssub_sat.ll
@@ -0,0 +1,267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.ssub.sat.i4   (i4,  i4)
+declare  i32 @llvm.ssub.sat.i32  (i32, i32)
+declare  i64 @llvm.ssub.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movl %edi, %ecx
+; CHECK-NEXT:    subl %esi, %ecx
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %esi, %edi
+; CHECK-NEXT:    cmovnol %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    .cfi_offset %esi, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    movl %eax, %esi
+; CHECK32-NEXT:    subl %edx, %esi
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    addl $2147483647, %ecx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %edx, %eax
+; CHECK32-NEXT:    cmovol %ecx, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.ssub.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    addq %rcx, %rax
+; CHECK-NEXT:    subq %rsi, %rdi
+; CHECK-NEXT:    cmovnoq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl %ebx, %ebp
+; CHECK32-NEXT:    sbbl %esi, %ebp
+; CHECK32-NEXT:    movl %ebp, %eax
+; CHECK32-NEXT:    sarl $31, %eax
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    testl %ebp, %ebp
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    movl %ecx, %edx
+; CHECK32-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    testl %ebx, %ebx
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    cmpb %cl, %bl
+; CHECK32-NEXT:    setne %cl
+; CHECK32-NEXT:    testl %esi, %esi
+; CHECK32-NEXT:    setns %ch
+; CHECK32-NEXT:    cmpb %ch, %bl
+; CHECK32-NEXT:    setne %ch
+; CHECK32-NEXT:    testb %cl, %ch
+; CHECK32-NEXT:    cmovel %ebp, %edx
+; CHECK32-NEXT:    cmovel %edi, %eax
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.ssub.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    subb %sil, %cl
+; CHECK-NEXT:    setns %cl
+; CHECK-NEXT:    subb %sil, %al
+; CHECK-NEXT:    jno .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addb $127, %cl
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    sarb $4, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %dl
+; CHECK32-NEXT:    shlb $4, %dl
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    movl %eax, %ecx
+; CHECK32-NEXT:    subb %dl, %cl
+; CHECK32-NEXT:    setns %cl
+; CHECK32-NEXT:    subb %dl, %al
+; CHECK32-NEXT:    jno .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    addb $127, %cl
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    sarb $4, %al
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.ssub.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %r8d
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    movl %r8d, %esi
+; CHECK-NEXT:    subl %ecx, %esi
+; CHECK-NEXT:    setns %dl
+; CHECK-NEXT:    addl $2147483647, %edx # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %ecx, %r8d
+; CHECK-NEXT:    cmovol %edx, %r8d
+; CHECK-NEXT:    movd %xmm1, %edx
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    xorl %esi, %esi
+; CHECK-NEXT:    movl %ecx, %edi
+; CHECK-NEXT:    subl %edx, %edi
+; CHECK-NEXT:    setns %sil
+; CHECK-NEXT:    addl $2147483647, %esi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %edx, %ecx
+; CHECK-NEXT:    cmovol %esi, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %edx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    subl %edx, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %edx, %eax
+; CHECK-NEXT:    cmovol %edi, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %r9d
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    subl %r9d, %esi
+; CHECK-NEXT:    setns %dil
+; CHECK-NEXT:    addl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    subl %r9d, %edx
+; CHECK-NEXT:    cmovol %edi, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    movd %eax, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    movd %r8d, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 20
+; CHECK32-NEXT:    .cfi_offset %esi, -20
+; CHECK32-NEXT:    .cfi_offset %edi, -16
+; CHECK32-NEXT:    .cfi_offset %ebx, -12
+; CHECK32-NEXT:    .cfi_offset %ebp, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %ecx, %esi
+; CHECK32-NEXT:    subl %edx, %esi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %edx, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovol %eax, %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %edx, %edi
+; CHECK32-NEXT:    subl %esi, %edi
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %esi, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovol %eax, %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    movl %esi, %ebx
+; CHECK32-NEXT:    subl %edi, %ebx
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    addl $2147483647, %eax # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %edi, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    cmovol %eax, %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    xorl %ebx, %ebx
+; CHECK32-NEXT:    movl %edi, %ebp
+; CHECK32-NEXT:    subl %eax, %ebp
+; CHECK32-NEXT:    setns %bl
+; CHECK32-NEXT:    addl $2147483647, %ebx # imm = 0x7FFFFFFF
+; CHECK32-NEXT:    subl %eax, %edi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    cmovol %ebx, %edi
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
index 061a8c971f9bec68879635483a157caa31f4bb05..9335acb90c0d6700becc86fa841d74f4fc386aa6 100644
--- a/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -38,14 +38,14 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
   ret <8 x float> %3
 }
 
-define <4 x i32> @stack_fold_extracti128(<8 x i32> %a0, <8 x i32> %a1) {
+define <4 x i32> @stack_fold_extracti128(<8 x i16> %a0, <8 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti128
   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  ret <4 x i32> %2
+  ; zext forces execution domain
+  %t1 = zext <8 x i16> %a0 to <8 x i32>
+  %t2 = shufflevector <8 x i32> %t1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %t3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  ret <4 x i32> %t2
 }
 
 define <8 x i32> @stack_fold_inserti128(<4 x i32> %a0, <4 x i32> %a1) {
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 9e6abf6cf5df387fcee38c2bba9cbce62a184621..01ae7ff6d43f0d659cfbda02784191d1258466ed 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -154,41 +154,41 @@ define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %m
   ret <32 x i16> %9
 }
 
-define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
+define <4 x i32> @stack_fold_extracti32x4(<16 x i16> %a0, <16 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x4
   ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; zext forces execution domain
+  %1 = zext <16 x i16> %a0 to <16 x i32>
   %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i32> %2
 }
 
-define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
+define <2 x i64> @stack_fold_extracti64x2(<8 x i32> %a0, <8 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x2
   ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ; zext forces execution domain
+  %1 = zext <8 x i32> %a0 to <8 x i64>
   %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <2 x i64> %2
 }
 
-define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
+define <8 x i32> @stack_fold_extracti32x8(<16 x i16> %a0, <16 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x8
   ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; zext forces execution domain
+  %1 = zext <16 x i16> %a0 to <16 x i32>
   %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <8 x i32> %2
 }
 
-define <4 x i64> @stack_fold_extracti64x4(<8 x i64> %a0, <8 x i64> %a1) {
+define <4 x i64> @stack_fold_extracti64x4(<8 x i32> %a0, <8 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x4
   ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ; zext forces execution domain
+  %1 = zext <8 x i32> %a0 to <8 x i64>
   %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i64> %2
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 76542f4761bfe2a343c66bcc7c42eb118b9331a1..8d8676f0f9e7f5273eca525abebfcba308fb41c1 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -133,21 +133,21 @@ define <4 x i64> @stack_fold_vpconflictq_ymm(<4 x i64> %a0) {
 }
 declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
 
-define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
+define <4 x i32> @stack_fold_extracti32x4(<8 x i16> %a0, <8 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x4
   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ; zext forces execution domain
+  %1 = zext <8 x i16> %a0 to <8 x i32>
   %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <4 x i32> %2
 }
 
-define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
+define <2 x i64> @stack_fold_extracti64x2(<4 x i32> %a0, <4 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x2
   ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1>
+  ; zext forces execution domain
+  %1 = zext <4 x i32> %a0 to <4 x i64>
   %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
   %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <2 x i64> %2
diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll
index 5e93944c7ac5fea114e47212949ab4a3f2996057..a05288ac0318c1a2108a09deb884d3a833115ced 100644
--- a/test/CodeGen/X86/subvector-broadcast.ll
+++ b/test/CodeGen/X86/subvector-broadcast.ll
@@ -949,7 +949,7 @@ entry:
 define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
 ; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X32-AVX:       # %bb.0: # %entry
-; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X32-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
 ; X32-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
@@ -963,7 +963,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ;
 ; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X32-AVX512:       # %bb.0: # %entry
-; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1,2,3,4]
+; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X32-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
 ; X32-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
@@ -975,7 +975,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ;
 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1,2,3,4]
+; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
 ; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
 ; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
@@ -989,7 +989,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ;
 ; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
 ; X64-AVX512:       # %bb.0: # %entry
-; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1,2,3,4]
+; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
 ; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/switch.ll b/test/CodeGen/X86/switch.ll
index 95b2ed0e618fd67044be8711486935e149c5d12e..a0ebc4eeba24b26cfd796a27d6fe0f6cdaa4ea3b 100644
--- a/test/CodeGen/X86/switch.ll
+++ b/test/CodeGen/X86/switch.ll
@@ -318,15 +318,15 @@ return: ret void
 ; NOOPT-LABEL: optimal_jump_table1
 ; NOOPT: testl %edi, %edi
 ; NOOPT: je
-; NOOPT: subl $5, %eax
+; NOOPT: subl $5, [[REG:%e[abcd][xi]]]
 ; NOOPT: je
-; NOOPT: subl $6, %eax
+; NOOPT: subl $6, [[REG]]
 ; NOOPT: je
-; NOOPT: subl $12, %eax
+; NOOPT: subl $12, [[REG]]
 ; NOOPT: je
-; NOOPT: subl $13, %eax
+; NOOPT: subl $13, [[REG]]
 ; NOOPT: je
-; NOOPT: subl $15, %eax
+; NOOPT: subl $15, [[REG]]
 ; NOOPT: je
 }
 
diff --git a/test/CodeGen/X86/tailcall-lifetime-end.ll b/test/CodeGen/X86/tailcall-lifetime-end.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3aedd007d44d3d26aac058aa9fdaaae5a711ab96
--- /dev/null
+++ b/test/CodeGen/X86/tailcall-lifetime-end.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s
+
+; A lifetime end intrinsic should not prevent a call from being tail call
+; optimized.
+
+define void @foobar() {
+; CHECK-LABEL: foobar
+; CHECK: pushq	%rax
+; CHECK: leaq	4(%rsp), %rdi
+; CHECK: callq	foo
+; CHECK: popq	%rax
+; CHECK: jmp	bar
+entry:
+  %i = alloca i32
+  %0 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0)
+  call void @foo(i32* nonnull %i)
+  tail call void @bar()
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0)
+  ret void
+}
+
+declare void @foo(i32* nocapture %p)
+declare void @bar()
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/CodeGen/X86/tbm-schedule.ll b/test/CodeGen/X86/tbm-schedule.ll
index 94bedaa04ae414f8c7b4ceeedb3ab16133a06b5c..b8f9bb08f3e3e7de5eedf2f1ea0e10c5b258d55c 100644
--- a/test/CodeGen/X86/tbm-schedule.ll
+++ b/test/CodeGen/X86/tbm-schedule.ll
@@ -14,12 +14,28 @@ define i32 @test_x86_tbm_bextri_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_bextri_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
-; BDVER-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_bextri_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER2-NEXT:    # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_bextri_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER3-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_bextri_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER4-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = lshr i32 %a0, 4
   %m0 = lshr i32 %a1, 4
@@ -39,12 +55,28 @@ define i64 @test_x86_tbm_bextri_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_bextri_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
-; BDVER-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_bextri_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER2-NEXT:    # sched: [6:0.50]
+; BDVER2-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER2-NEXT:    # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_bextri_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER3-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_bextri_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
+; BDVER4-NEXT:    bextrl $3076, (%rsi), %eax # imm = 0xC04
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = lshr i64 %a0, 4
   %m0 = lshr i64 %a1, 4
@@ -62,12 +94,26 @@ define i32 @test_x86_tbm_blcfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcfill_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcfilll %edi, %ecx
-; BDVER-NEXT:    blcfilll (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcfilll (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blcfilll %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcfilll %edi, %ecx
+; BDVER3-NEXT:    blcfilll (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcfilll %edi, %ecx
+; BDVER4-NEXT:    blcfilll (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, 1
   %m0 = add i32 %a1, 1
@@ -85,12 +131,26 @@ define i64 @test_x86_tbm_blcfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcfill_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcfillq %rdi, %rcx
-; BDVER-NEXT:    blcfillq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcfillq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blcfillq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcfillq %rdi, %rcx
+; BDVER3-NEXT:    blcfillq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcfillq %rdi, %rcx
+; BDVER4-NEXT:    blcfillq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, 1
   %m0 = add i64 %a1, 1
@@ -108,12 +168,26 @@ define i32 @test_x86_tbm_blci_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blci_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcil %edi, %ecx
-; BDVER-NEXT:    blcil (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blci_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcil (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blcil %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blci_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcil %edi, %ecx
+; BDVER3-NEXT:    blcil (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blci_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcil %edi, %ecx
+; BDVER4-NEXT:    blcil (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 1, %a0
   %m0 = add i32 1, %a1
@@ -133,12 +207,26 @@ define i64 @test_x86_tbm_blci_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blci_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blciq %rdi, %rcx
-; BDVER-NEXT:    blciq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blci_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blciq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blciq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blci_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blciq %rdi, %rcx
+; BDVER3-NEXT:    blciq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blci_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blciq %rdi, %rcx
+; BDVER4-NEXT:    blciq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 1, %a0
   %m0 = add i64 1, %a1
@@ -158,12 +246,26 @@ define i32 @test_x86_tbm_blcic_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcic_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcicl %edi, %ecx
-; BDVER-NEXT:    blcicl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcic_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcicl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blcicl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcic_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcicl %edi, %ecx
+; BDVER3-NEXT:    blcicl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcic_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcicl %edi, %ecx
+; BDVER4-NEXT:    blcicl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -183,12 +285,26 @@ define i64 @test_x86_tbm_blcic_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcic_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcicq %rdi, %rcx
-; BDVER-NEXT:    blcicq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcic_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcicq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blcicq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcic_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcicq %rdi, %rcx
+; BDVER3-NEXT:    blcicq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcic_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcicq %rdi, %rcx
+; BDVER4-NEXT:    blcicq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
@@ -208,12 +324,26 @@ define i32 @test_x86_tbm_blcmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcmsk_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcmskl %edi, %ecx
-; BDVER-NEXT:    blcmskl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcmskl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blcmskl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcmskl %edi, %ecx
+; BDVER3-NEXT:    blcmskl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcmskl %edi, %ecx
+; BDVER4-NEXT:    blcmskl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, 1
   %m0 = add i32 %a1, 1
@@ -231,12 +361,26 @@ define i64 @test_x86_tbm_blcmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcmsk_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcmskq %rdi, %rcx
-; BDVER-NEXT:    blcmskq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcmskq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blcmskq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcmskq %rdi, %rcx
+; BDVER3-NEXT:    blcmskq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcmskq %rdi, %rcx
+; BDVER4-NEXT:    blcmskq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, 1
   %m0 = add i64 %a1, 1
@@ -254,12 +398,26 @@ define i32 @test_x86_tbm_blcs_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcs_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcsl %edi, %ecx
-; BDVER-NEXT:    blcsl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcs_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcsl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blcsl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcs_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcsl %edi, %ecx
+; BDVER3-NEXT:    blcsl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcs_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcsl %edi, %ecx
+; BDVER4-NEXT:    blcsl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, 1
   %m0 = add i32 %a1, 1
@@ -277,12 +435,26 @@ define i64 @test_x86_tbm_blcs_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blcs_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blcsq %rdi, %rcx
-; BDVER-NEXT:    blcsq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blcs_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blcsq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blcsq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blcs_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blcsq %rdi, %rcx
+; BDVER3-NEXT:    blcsq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blcs_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blcsq %rdi, %rcx
+; BDVER4-NEXT:    blcsq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, 1
   %m0 = add i64 %a1, 1
@@ -300,12 +472,26 @@ define i32 @test_x86_tbm_blsfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsfill_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsfilll %edi, %ecx
-; BDVER-NEXT:    blsfilll (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsfilll (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blsfilll %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsfilll %edi, %ecx
+; BDVER3-NEXT:    blsfilll (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsfilll %edi, %ecx
+; BDVER4-NEXT:    blsfilll (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = add i32 %a0, -1
   %m0 = add i32 %a1, -1
@@ -323,12 +509,26 @@ define i64 @test_x86_tbm_blsfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsfill_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsfillq %rdi, %rcx
-; BDVER-NEXT:    blsfillq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsfillq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blsfillq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsfillq %rdi, %rcx
+; BDVER3-NEXT:    blsfillq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsfillq %rdi, %rcx
+; BDVER4-NEXT:    blsfillq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = add i64 %a0, -1
   %m0 = add i64 %a1, -1
@@ -346,12 +546,26 @@ define i32 @test_x86_tbm_blsic_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsic_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsicl %edi, %ecx
-; BDVER-NEXT:    blsicl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsic_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsicl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    blsicl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsic_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsicl %edi, %ecx
+; BDVER3-NEXT:    blsicl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsic_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsicl %edi, %ecx
+; BDVER4-NEXT:    blsicl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -371,12 +585,26 @@ define i64 @test_x86_tbm_blsic_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_blsic_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    blsicq %rdi, %rcx
-; BDVER-NEXT:    blsicq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_blsic_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    blsicq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    blsicq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_blsic_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    blsicq %rdi, %rcx
+; BDVER3-NEXT:    blsicq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_blsic_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    blsicq %rdi, %rcx
+; BDVER4-NEXT:    blsicq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
@@ -396,12 +624,26 @@ define i32 @test_x86_tbm_t1mskc_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_t1mskc_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    t1mskcl %edi, %ecx
-; BDVER-NEXT:    t1mskcl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    t1mskcl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    t1mskcl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    t1mskcl %edi, %ecx
+; BDVER3-NEXT:    t1mskcl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    t1mskcl %edi, %ecx
+; BDVER4-NEXT:    t1mskcl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -421,12 +663,26 @@ define i64 @test_x86_tbm_t1mskc_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_t1mskc_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    t1mskcq %rdi, %rcx
-; BDVER-NEXT:    t1mskcq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    t1mskcq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    t1mskcq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    t1mskcq %rdi, %rcx
+; BDVER3-NEXT:    t1mskcq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    t1mskcq %rdi, %rcx
+; BDVER4-NEXT:    t1mskcq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
@@ -446,12 +702,26 @@ define i32 @test_x86_tbm_tzmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_tzmsk_u32:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    tzmskl %edi, %ecx
-; BDVER-NEXT:    tzmskl (%rsi), %eax
-; BDVER-NEXT:    addl %ecx, %eax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzmskl (%rsi), %eax # sched: [6:0.50]
+; BDVER2-NEXT:    tzmskl %edi, %ecx # sched: [2:0.50]
+; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    tzmskl %edi, %ecx
+; BDVER3-NEXT:    tzmskl (%rsi), %eax
+; BDVER3-NEXT:    addl %ecx, %eax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    tzmskl %edi, %ecx
+; BDVER4-NEXT:    tzmskl (%rsi), %eax
+; BDVER4-NEXT:    addl %ecx, %eax
+; BDVER4-NEXT:    retq
   %a1 = load i32, i32* %p1
   %r0 = xor i32 %a0, -1
   %m0 = xor i32 %a1, -1
@@ -471,12 +741,26 @@ define i64 @test_x86_tbm_tzmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
 ; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_x86_tbm_tzmsk_u64:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    tzmskq %rdi, %rcx
-; BDVER-NEXT:    tzmskq (%rsi), %rax
-; BDVER-NEXT:    addq %rcx, %rax
-; BDVER-NEXT:    retq
+; BDVER2-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    tzmskq (%rsi), %rax # sched: [6:0.50]
+; BDVER2-NEXT:    tzmskq %rdi, %rcx # sched: [2:0.50]
+; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
+; BDVER2-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    tzmskq %rdi, %rcx
+; BDVER3-NEXT:    tzmskq (%rsi), %rax
+; BDVER3-NEXT:    addq %rcx, %rax
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    tzmskq %rdi, %rcx
+; BDVER4-NEXT:    tzmskq (%rsi), %rax
+; BDVER4-NEXT:    addq %rcx, %rax
+; BDVER4-NEXT:    retq
   %a1 = load i64, i64* %p1
   %r0 = xor i64 %a0, -1
   %m0 = xor i64 %a1, -1
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 6865cc5a0ef376c8883b88aa6a301f2e6755021f..2b335ea426898302b687c83a8309b1551a720fd2 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -53,8 +53,7 @@ define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %t0 = lshr i32 %a, 4
@@ -114,8 +113,7 @@ define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
 ; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rsi, %rax
-; CHECK-NEXT:    shrl $4, %edi
-; CHECK-NEXT:    testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT:    bextrl $3076, %edi, %ecx # imm = 0xC04
 ; CHECK-NEXT:    cmovneq %rdx, %rax
 ; CHECK-NEXT:    retq
   %t0 = lshr i64 %a, 4
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index ddfebcd0b66a713d611afb312d3bbfe8aa437d85..759f3d7c85500e2ff86f02a5e6820ccaf4f02f8f 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=X86_LINUX %s
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
+; RUN: llc < %s -mtriple=i386-linux-gnu -fast-isel | FileCheck -check-prefix=X86_ISEL_LINUX %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck -check-prefix=X64_ISEL_LINUX %s
 ; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck -check-prefix=X86_WIN %s
 ; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
 ; RUN: llc < %s -mtriple=i686-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
@@ -453,3 +455,59 @@ define i32* @f16() {
 
   ret i32* @i6
 }
+
+; NOTE: Similar to f1() but with direct TLS segment access disabled
+define i32 @f17() #0 {
+; X86_LINUX-LABEL: f17:
+; X86_LINUX:      movl %gs:0, %eax
+; X86_LINUX-NEXT: movl i1@NTPOFF(%eax), %eax
+; X86_LINUX-NEXT: ret
+; X64_LINUX-LABEL: f17:
+; X64_LINUX:      movq %fs:0, %rax
+; X64_LINUX-NEXT: movl i1@TPOFF(%rax), %eax
+; X64_LINUX-NEXT: ret
+; X86_ISEL_LINUX-LABEL: f17:
+; X86_ISEL_LINUX:      movl %gs:0, %eax
+; X86_ISEL_LINUX-NEXT: movl i1@NTPOFF(%eax), %eax
+; X86_ISEL_LINUX-NEXT: ret
+; X64_ISEL_LINUX-LABEL: f17:
+; X64_ISEL_LINUX:      movq %fs:0, %rax
+; X64_ISEL_LINUX-NEXT: movl i1@TPOFF(%rax), %eax
+; X64_ISEL_LINUX-NEXT: ret
+
+entry:
+	%tmp1 = load i32, i32* @i1
+	ret i32 %tmp1
+}
+
+; NOTE: Similar to f3() but with direct TLS segment access disabled
+define i32 @f18() #1 {
+; X86_LINUX-LABEL: f18:
+; X86_LINUX:      movl i2@INDNTPOFF, %eax
+; X86_LINUX-NEXT: movl %gs:0, %ecx
+; X86_LINUX-NEXT: movl (%ecx,%eax), %eax
+; X86_LINUX-NEXT: ret
+; X64_LINUX-LABEL: f18:
+; X64_LINUX:      movq i2@GOTTPOFF(%rip), %rax
+; X64_LINUX-NEXT: movq %fs:0, %rcx
+; X64_LINUX-NEXT: movl (%rcx,%rax), %eax
+; X64_LINUX-NEXT: ret
+; X86_ISEL_LINUX-LABEL: f18:
+; X86_ISEL_LINUX:      movl i2@INDNTPOFF, %eax
+; X86_ISEL_LINUX-NEXT: movl %gs:0, %ecx
+; X86_ISEL_LINUX-NEXT: movl (%ecx,%eax), %eax
+; X86_ISEL_LINUX-NEXT: ret
+; X64_ISEL_LINUX-LABEL: f18:
+; X64_ISEL_LINUX:      movq i2@GOTTPOFF(%rip), %rax
+; X64_ISEL_LINUX-NEXT: movq %fs:0, %rcx
+; X64_ISEL_LINUX-NEXT: movl (%rcx,%rax), %eax
+; X64_ISEL_LINUX-NEXT: ret
+
+
+entry:
+	%tmp1 = load i32, i32* @i2
+	ret i32 %tmp1
+}
+
+attributes #0 = { "indirect-tls-seg-refs" }
+attributes #1 = { nounwind "indirect-tls-seg-refs" }
diff --git a/test/CodeGen/X86/uadd_sat.ll b/test/CodeGen/X86/uadd_sat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f72d036288d045fb7a2309552efad1c6cfc8f232
--- /dev/null
+++ b/test/CodeGen/X86/uadd_sat.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.uadd.sat.i4   (i4,  i4)
+declare  i32 @llvm.uadd.sat.i32  (i32, i32)
+declare  i64 @llvm.uadd.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    cmovael %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl $-1, %eax
+; CHECK32-NEXT:    cmovael %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addq %rsi, %rdi
+; CHECK-NEXT:    movq $-1, %rax
+; CHECK-NEXT:    cmovaeq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl $-1, %ecx
+; CHECK32-NEXT:    cmovbl %ecx, %edx
+; CHECK32-NEXT:    cmovbl %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.uadd.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %dil
+; CHECK-NEXT:    addb %sil, %dil
+; CHECK-NEXT:    movb $-1, %al
+; CHECK-NEXT:    jb .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    shrb $4, %al
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    shlb $4, %cl
+; CHECK32-NEXT:    addb %al, %cl
+; CHECK32-NEXT:    movb $-1, %al
+; CHECK32-NEXT:    jb .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    shrb $4, %al
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.uadd.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    movl $-1, %eax
+; CHECK-NEXT:    cmovbl %eax, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    cmovbl %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm3
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT:    movd %xmm1, %ecx
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    cmovbl %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %ecx
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    cmovbl %eax, %edx
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    .cfi_offset %esi, -16
+; CHECK32-NEXT:    .cfi_offset %edi, -12
+; CHECK32-NEXT:    .cfi_offset %ebx, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl $-1, %ebx
+; CHECK32-NEXT:    cmovbl %ebx, %edi
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovbl %ebx, %esi
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovbl %ebx, %edx
+; CHECK32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    cmovbl %ebx, %ecx
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
diff --git a/test/CodeGen/X86/undef-eflags.mir b/test/CodeGen/X86/undef-eflags.mir
new file mode 100644
index 0000000000000000000000000000000000000000..e5cf58bac6826a2f42a9d6362b0060e6f2a7a9a7
--- /dev/null
+++ b/test/CodeGen/X86/undef-eflags.mir
@@ -0,0 +1,18 @@
+# RUN: llc -o - %s -mtriple=x86_64-- -verify-machineinstrs -run-pass branch-folder | FileCheck %s
+# Check that we do not generate invalid MIR when optimizing condjumps with undef
+# flags on the eflags input (currently we should just bail out).
+---
+# CHECK-LABEL: name: fallundef
+name: fallundef
+tracksRegLiveness: true
+body: |
+  bb.0:
+    JE_1 %bb.1, implicit undef $eflags
+    ; CHECK: JE_1 %bb.1, implicit undef $eflags
+    JMP_1 %bb.2
+  bb.1:
+    RET 2, undef $eax
+
+  bb.2:
+    RET 0, undef $eax
+...
diff --git a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 27541c44b9d52fea334449969563e6ab9420e826..82385386c88e18061079f0f583d398fc7a39db84 100644
--- a/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -629,7 +629,8 @@ define <4 x i32> @test_urem_both(<4 x i32> %X) nounwind readnone {
 ;
 ; CHECK-AVX1-LABEL: test_urem_both:
 ; CHECK-AVX1:       # %bb.0:
-; CHECK-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
+; CHECK-AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61]
+; CHECK-AVX1-NEXT:    # xmm1 = mem[0,0]
 ; CHECK-AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; CHECK-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
diff --git a/test/CodeGen/X86/usub_sat.ll b/test/CodeGen/X86/usub_sat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1c9a5c56c250f551a7170b981caf72fdc6aecda6
--- /dev/null
+++ b/test/CodeGen/X86/usub_sat.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=CHECK32
+
+declare  i4  @llvm.usub.sat.i4   (i4,  i4)
+declare  i32 @llvm.usub.sat.i32  (i32, i32)
+declare  i64 @llvm.usub.sat.i64  (i64, i64)
+declare  <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>)
+
+define i32 @func(i32 %x, i32 %y) {
+; CHECK-LABEL: func:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    subl %esi, %edi
+; CHECK-NEXT:    cmovael %edi, %eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    cmovbl %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i32 @llvm.usub.sat.i32(i32 %x, i32 %y);
+  ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) {
+; CHECK-LABEL: func2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    subq %rsi, %rdi
+; CHECK-NEXT:    cmovaeq %rdi, %rax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func2:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    xorl %ecx, %ecx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovbl %ecx, %edx
+; CHECK32-NEXT:    cmovbl %ecx, %eax
+; CHECK32-NEXT:    retl
+  %tmp = call i64 @llvm.usub.sat.i64(i64 %x, i64 %y);
+  ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) {
+; CHECK-LABEL: func3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    shlb $4, %sil
+; CHECK-NEXT:    shlb $4, %al
+; CHECK-NEXT:    subb %sil, %al
+; CHECK-NEXT:    jae .LBB2_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    shrb $4, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: func3:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    shlb $4, %cl
+; CHECK32-NEXT:    shlb $4, %al
+; CHECK32-NEXT:    subb %cl, %al
+; CHECK32-NEXT:    jae .LBB2_2
+; CHECK32-NEXT:  # %bb.1:
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:  .LBB2_2:
+; CHECK32-NEXT:    shrb $4, %al
+; CHECK32-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK32-NEXT:    retl
+  %tmp = call i4 @llvm.usub.sat.i4(i4 %x, i4 %y);
+  ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: vec:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; CHECK-NEXT:    movd %xmm2, %ecx
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; CHECK-NEXT:    movd %xmm3, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm3
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm2
+; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT:    movd %xmm1, %eax
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; CHECK-NEXT:    movd %xmm0, %ecx
+; CHECK-NEXT:    subl %eax, %ecx
+; CHECK-NEXT:    cmovbl %edx, %ecx
+; CHECK-NEXT:    movd %ecx, %xmm0
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    retq
+;
+; CHECK32-LABEL: vec:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK32-NEXT:    .cfi_offset %esi, -16
+; CHECK32-NEXT:    .cfi_offset %edi, -12
+; CHECK32-NEXT:    .cfi_offset %ebx, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    xorl %ebx, %ebx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    cmovbl %ebx, %edi
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    cmovbl %ebx, %esi
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    cmovbl %ebx, %edx
+; CHECK32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    cmovbl %ebx, %ecx
+; CHECK32-NEXT:    movl %ecx, 12(%eax)
+; CHECK32-NEXT:    movl %edx, 8(%eax)
+; CHECK32-NEXT:    movl %esi, 4(%eax)
+; CHECK32-NEXT:    movl %edi, (%eax)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %ebx
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
+; CHECK32-NEXT:    retl $4
+  %tmp = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> %y);
+  ret <4 x i32> %tmp;
+}
diff --git a/test/CodeGen/X86/v4f32-immediate.ll b/test/CodeGen/X86/v4f32-immediate.ll
index a0eb40925999e5535fca5f44fcb31abdcfb5ee21..690ef825f7ad74a1a109eae4bbee87cb843a689d 100644
--- a/test/CodeGen/X86/v4f32-immediate.ll
+++ b/test/CodeGen/X86/v4f32-immediate.ll
@@ -5,12 +5,12 @@
 define <4 x float> @foo() {
 ; X32-LABEL: foo:
 ; X32:       # %bb.0:
-; X32-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245,2.29999995,1.20000005,0.100000001]
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245E+0,2.29999995E+0,1.20000005E+0,1.00000001E-1]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0:
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245,2.29999995,1.20000005,0.100000001]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [3.22354245E+0,2.29999995E+0,1.20000005E+0,1.00000001E-1]
 ; X64-NEXT:    retq
   ret <4 x float> <float 0x4009C9D0A0000000, float 0x4002666660000000, float 0x3FF3333340000000, float 0x3FB99999A0000000>
 }
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index a799b0e6f12dce5ba2be8030ce7da691c441742f..de97281d60e229cf7aea0f0655547853702d1256 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -44,10 +44,9 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ; X32-AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vmovups (%eax), %ymm2
 ; X32-AVX2-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
-; X32-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; X32-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X32-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovaps %ymm0, (%eax)
+; X32-AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
 ; X32-AVX2-NEXT:    vzeroupper
 ; X32-AVX2-NEXT:    retl
 ;
@@ -58,10 +57,9 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
 ; X64-AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vmovups (%rdx), %ymm2
 ; X64-AVX2-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; X64-AVX2-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; X64-AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vmovaps %ymm0, (%rax)
+; X64-AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
   %v0 = load <8 x float>, <8 x float>* %a, align 16
@@ -133,12 +131,8 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
 ; X32-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; X32-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vpand LCPI2_0, %xmm0, %xmm0
-; X32-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT:    vandps LCPI2_0, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: and_mask_constant:
@@ -147,12 +141,8 @@ define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
 ; X64-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; X64-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-NEXT:    retq
 ;
 ; X32-AVX2-LABEL: and_mask_constant:
diff --git a/test/CodeGen/X86/vec-copysign-avx512.ll b/test/CodeGen/X86/vec-copysign-avx512.ll
index 6fb0033e7504c87c62035e493fa6e6e23d9d6ac4..b08b15ce004ebd006bce7309717eb2172937d3ea 100644
--- a/test/CodeGen/X86/vec-copysign-avx512.ll
+++ b/test/CodeGen/X86/vec-copysign-avx512.ll
@@ -43,7 +43,7 @@ define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind {
 ; AVX512VL:       ## %bb.0:
 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
 ; AVX512VL-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: v16f32:
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index 9a5feb83dbbd21f7458e010155a036845975badf..1bc4b690487f5f09bb9700d22673d7178f530e63 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -87,10 +87,10 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) {
 ; CHECK-LABEL: cvt_v8u8_v8f32:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vpand LCPI4_0, %xmm0, %xmm0
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 ;
@@ -109,19 +109,19 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) {
 define <8 x float> @cvt_v8u16_v8f32(<8 x i16> %src) {
 ; CHECK-LABEL: cvt_v8u16_v8f32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v8u16_v8f32:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-WIDE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i16> %src to <8 x float>
diff --git a/test/CodeGen/X86/vec_cast3.ll b/test/CodeGen/X86/vec_cast3.ll
index 9af324b76bcf6a40ee9c25867cd123e104e6cb35..e8662b8cc34d9394940fe9fcc3c2ff230c61ec0d 100644
--- a/test/CodeGen/X86/vec_cast3.ll
+++ b/test/CodeGen/X86/vec_cast3.ll
@@ -90,7 +90,7 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm0
@@ -99,7 +99,7 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
 ; CHECK-WIDE-LABEL: cvt_v2u32_v2f32:
 ; CHECK-WIDE:       ## %bb.0:
 ; CHECK-WIDE-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm1 = [4503599627370496,4503599627370496]
+; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
 ; CHECK-WIDE-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
 ; CHECK-WIDE-NEXT:    vcvtpd2ps %xmm0, %xmm0
@@ -111,19 +111,8 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) {
 define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2i8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
@@ -141,19 +130,8 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) {
 define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2i16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
@@ -186,37 +164,8 @@ define <2 x i32> @cvt_v2f32_v2i32(<2 x float> %src) {
 define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2u8:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm1, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm0, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    vucomiss %xmm2, %xmm1
-; CHECK-NEXT:    setae %al
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    vucomiss %xmm2, %xmm0
-; CHECK-NEXT:    setae %cl
-; CHECK-NEXT:    shll $31, %ecx
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
@@ -234,37 +183,8 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) {
 define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) {
 ; CHECK-LABEL: cvt_v2f32_v2u16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    subl $68, %esp
-; CHECK-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm1, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm1, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    vcmpltss %xmm2, %xmm0, %xmm3
-; CHECK-NEXT:    vsubss %xmm2, %xmm0, %xmm4
-; CHECK-NEXT:    vblendvps %xmm3, %xmm0, %xmm4, %xmm3
-; CHECK-NEXT:    vmovss %xmm3, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll (%esp)
-; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    vucomiss %xmm2, %xmm1
-; CHECK-NEXT:    setae %al
-; CHECK-NEXT:    shll $31, %eax
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    vucomiss %xmm2, %xmm0
-; CHECK-NEXT:    setae %cl
-; CHECK-NEXT:    shll $31, %ecx
-; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $2, (%esp), %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    addl $68, %esp
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; CHECK-NEXT:    retl
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
@@ -317,25 +237,13 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
 ; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    subl $68, %esp
-; CHECK-WIDE-NEXT:    .cfi_def_cfa_offset 72
-; CHECK-WIDE-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    vextractps $2, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-WIDE-NEXT:    fisttpll (%esp)
-; CHECK-WIDE-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-WIDE-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrd $3, (%esp), %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    addl $68, %esp
+; CHECK-WIDE-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; CHECK-WIDE-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; CHECK-WIDE-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vxorps LCPI11_1, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i32>
   ret <2 x i32> %res
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 781c61b5789ee556dd0bd5d1eb77f0674fabe80a..26330f940af8eccbb4e3c969030e1f74635cd6b7 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -8,27 +8,26 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
 define <2 x i64> @footz(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: footz:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; CHECK-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm3
-; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    pandn %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    psadbw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
   ret <2 x i64> %c
@@ -58,18 +57,18 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm1
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm1
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm1
+; CHECK-NEXT:    psrlw $2, %xmm1
 ; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    paddq %xmm2, %xmm1
+; CHECK-NEXT:    paddb %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    psrlq $4, %xmm2
-; CHECK-NEXT:    paddq %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $4, %xmm2
+; CHECK-NEXT:    paddb %xmm1, %xmm2
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    psadbw %xmm2, %xmm0
@@ -83,18 +82,18 @@ define <2 x i64> @foopop(<2 x i64> %a) nounwind {
 ; CHECK-LABEL: foopop:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
 ; CHECK-NEXT:    pand %xmm1, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddq %xmm2, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    psadbw %xmm0, %xmm1
@@ -112,27 +111,26 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: promtz:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    por {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    pcmpeqd %xmm3, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm3
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; CHECK-NEXT:    movdqa %xmm3, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    psrlq $2, %xmm3
-; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    paddq %xmm2, %xmm3
-; CHECK-NEXT:    movdqa %xmm3, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    pandn %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    psadbw %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false)
   ret <2 x i32> %c
@@ -164,18 +162,18 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
 ; CHECK-NEXT:    pxor %xmm0, %xmm2
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlq $1, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubq %xmm0, %xmm2
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm0, %xmm2
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm2, %xmm3
 ; CHECK-NEXT:    pand %xmm0, %xmm3
-; CHECK-NEXT:    psrlq $2, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm2
 ; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddq %xmm3, %xmm2
+; CHECK-NEXT:    paddb %xmm3, %xmm2
 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlq $4, %xmm0
-; CHECK-NEXT:    paddq %xmm2, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    psadbw %xmm1, %xmm0
 ; CHECK-NEXT:    psubq {{.*}}(%rip), %xmm0
@@ -191,18 +189,18 @@ define <2 x i32> @prompop(<2 x i32> %a) nounwind {
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    pxor %xmm2, %xmm2
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $1, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubq %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
 ; CHECK-NEXT:    pand %xmm1, %xmm3
-; CHECK-NEXT:    psrlq $2, %xmm0
+; CHECK-NEXT:    psrlw $2, %xmm0
 ; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddq %xmm3, %xmm0
+; CHECK-NEXT:    paddb %xmm3, %xmm0
 ; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlq $4, %xmm1
-; CHECK-NEXT:    paddq %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
 ; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
 ; CHECK-NEXT:    psadbw %xmm2, %xmm1
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index 9a12d69b46bedc66038524a9dace2483b8efacf4..a15424a763e193d1700740b52d2c2a789bd58da7 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -171,7 +171,9 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT:    vmovaps %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index 3bbc468d07d4379b5b4f1e28ecd07cf6a1a9ab02..ef499af754050d38e160ec88489ebbc04d0e47a8 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -703,17 +703,17 @@ declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
 define <2 x double> @const_floor_v2f64() {
 ; SSE41-LABEL: const_floor_v2f64:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_floor_v2f64:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_floor_v2f64:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
   ret <2 x double> %t
@@ -722,17 +722,17 @@ define <2 x double> @const_floor_v2f64() {
 define <4 x float> @const_floor_v4f32() {
 ; SSE41-LABEL: const_floor_v4f32:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4,6,-9,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_floor_v4f32:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,6,-9,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_floor_v4f32:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,6,-9,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
   ret <4 x float> %t
@@ -741,17 +741,17 @@ define <4 x float> @const_floor_v4f32() {
 define <2 x double> @const_ceil_v2f64() {
 ; SSE41-LABEL: const_ceil_v2f64:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1,3]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_ceil_v2f64:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,3]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_ceil_v2f64:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,3]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,3.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
   ret <2 x double> %t
@@ -760,17 +760,17 @@ define <2 x double> @const_ceil_v2f64() {
 define <4 x float> @const_ceil_v4f32() {
 ; SSE41-LABEL: const_ceil_v4f32:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3,6,-9,3]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_ceil_v4f32:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,3]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_ceil_v4f32:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,3]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,3.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
   ret <4 x float> %t
@@ -779,17 +779,17 @@ define <4 x float> @const_ceil_v4f32() {
 define <2 x double> @const_trunc_v2f64() {
 ; SSE41-LABEL: const_trunc_v2f64:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_trunc_v2f64:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_trunc_v2f64:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-1.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
   ret <2 x double> %t
@@ -798,17 +798,17 @@ define <2 x double> @const_trunc_v2f64() {
 define <4 x float> @const_trunc_v4f32() {
 ; SSE41-LABEL: const_trunc_v4f32:
 ; SSE41:       ## %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3,6,-9,2]
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: const_trunc_v4f32:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,2]
+; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: const_trunc_v4f32:
 ; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3,6,-9,2]
+; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [-3.0E+0,6.0E+0,-9.0E+0,2.0E+0]
 ; AVX512-NEXT:    retq
   %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
   ret <4 x float> %t
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index e09cd0a43d7c95c14aa5604537726cc72cd4a65b..651c0e65aa054e3065b05ba22138656c5a117483 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -627,16 +627,36 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptoui_4f64_to_2i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vmovd %eax, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vmovd %eax, %xmm0
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VEX-NEXT:    retq
+; AVX1-LABEL: fptoui_4f64_to_2i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd %xmm0, %xmm0
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_2i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovapd %xmm0, %xmm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f64_to_2i32:
 ; AVX512F:       # %bb.0:
@@ -930,21 +950,34 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptoui_4f64_to_4i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm1, %rax
-; VEX-NEXT:    vcvttsd2si %xmm0, %rcx
-; VEX-NEXT:    vmovd %ecx, %xmm1
-; VEX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
+; AVX1-LABEL: fptoui_4f64_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f64_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
+; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f64_to_4i32:
 ; AVX512F:       # %bb.0:
@@ -1570,39 +1603,41 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 ; SSE-LABEL: fptoui_4f32_to_4i32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    cmpltps %xmm2, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm2, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    andps %xmm1, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm1
+; SSE-NEXT:    orps %xmm3, %xmm1
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptoui_4f32_to_4i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT:    vcvttss2si %xmm1, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vmovd %ecx, %xmm1
-; VEX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; VEX-NEXT:    vcvttss2si %xmm2, %rax
-; VEX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; VEX-NEXT:    vcvttss2si %xmm0, %rax
-; VEX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: fptoui_4f32_to_4i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: fptoui_4f32_to_4i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
+; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_4f32_to_4i32:
 ; AVX512F:       # %bb.0:
@@ -1853,95 +1888,51 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
 ; SSE-LABEL: fptoui_8f32_to_8i32:
 ; SSE:       # %bb.0:
+; SSE-NEXT:    movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
 ; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movaps %xmm2, %xmm3
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE-NEXT:    movaps %xmm1, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
-; SSE-NEXT:    cvttss2si %xmm2, %rax
-; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    cmpltps %xmm4, %xmm2
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
+; SSE-NEXT:    subps %xmm4, %xmm0
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE-NEXT:    xorps %xmm5, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm3
+; SSE-NEXT:    andnps %xmm0, %xmm2
+; SSE-NEXT:    orps %xmm3, %xmm2
 ; SSE-NEXT:    movaps %xmm1, %xmm3
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    movd %eax, %xmm3
-; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm1
+; SSE-NEXT:    cmpltps %xmm4, %xmm3
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm0
+; SSE-NEXT:    subps %xmm4, %xmm1
+; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
+; SSE-NEXT:    xorps %xmm5, %xmm1
+; SSE-NEXT:    andps %xmm3, %xmm0
+; SSE-NEXT:    andnps %xmm1, %xmm3
+; SSE-NEXT:    orps %xmm0, %xmm3
+; SSE-NEXT:    movaps %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: fptoui_8f32_to_8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    vcvttss2si %xmm1, %rcx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX1-NEXT:    vcvttss2si %xmm1, %rax
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX1-NEXT:    vmovd %ecx, %xmm2
-; AVX1-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX1-NEXT:    vsubps %ymm1, %ymm0, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: fptoui_8f32_to_8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vcvttss2si %xmm2, %rax
-; AVX2-NEXT:    vcvttss2si %xmm1, %rcx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT:    vcvttss2si %xmm1, %rax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vcvttss2si %xmm2, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX2-NEXT:    vmovd %ecx, %xmm2
-; AVX2-NEXT:    vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vcvttss2si %xmm0, %rax
-; AVX2-NEXT:    vpinsrd $3, %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
+; AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vsubps %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
+; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vxorps %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
+; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: fptoui_8f32_to_8i32:
@@ -2875,3 +2866,445 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
   %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %ext
 }
+
+define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f32_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptosi_2f32_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
+; WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
+; WIDEN-NEXT:    vzeroupper
+; WIDEN-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
+; SSE-LABEL: fptosi_2f32_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f32_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptosi_2f32_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_SKX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptosi_2f32_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_KNL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptosi <2 x float> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f32_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptoui_2f32_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN-NEXT:    vcvttps2dq %zmm0, %zmm0
+; WIDEN-NEXT:    vpmovdb %zmm0, %xmm0
+; WIDEN-NEXT:    vzeroupper
+; WIDEN-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
+; SSE-LABEL: fptoui_2f32_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f32_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptoui_2f32_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_SKX-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptoui_2f32_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
+; WIDEN_KNL-NEXT:    vcvttps2dq %ymm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptoui <2 x float> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f64_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptosi_2f64_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
+; WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
+; WIDEN-NEXT:    vmovd %ecx, %xmm0
+; WIDEN-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; WIDEN-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
+; SSE-LABEL: fptosi_2f64_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    psrad $31, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptosi_2f64_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovsxdq %xmm0, %xmm0
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptosi_2f64_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptosi_2f64_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptosi <2 x double> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
+
+define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f64_to_2i8:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i8:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN-LABEL: fptoui_2f64_to_2i8:
+; WIDEN:       # %bb.0:
+; WIDEN-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; WIDEN-NEXT:    vcvttsd2si %xmm1, %eax
+; WIDEN-NEXT:    vcvttsd2si %xmm0, %ecx
+; WIDEN-NEXT:    vmovd %ecx, %xmm0
+; WIDEN-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; WIDEN-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i8>
+  ret <2 x i8> %cvt
+}
+
+define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
+; SSE-LABEL: fptoui_2f64_to_2i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; VEX-LABEL: fptoui_2f64_to_2i16:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; VEX-NEXT:    retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i16:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16:
+; AVX512VLDQ:       # %bb.0:
+; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    retq
+;
+; WIDEN_SKX-LABEL: fptoui_2f64_to_2i16:
+; WIDEN_SKX:       # %bb.0:
+; WIDEN_SKX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_SKX-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; WIDEN_SKX-NEXT:    vzeroupper
+; WIDEN_SKX-NEXT:    retq
+;
+; WIDEN_KNL-LABEL: fptoui_2f64_to_2i16:
+; WIDEN_KNL:       # %bb.0:
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; WIDEN_KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; WIDEN_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; WIDEN_KNL-NEXT:    vzeroupper
+; WIDEN_KNL-NEXT:    retq
+  %cvt = fptoui <2 x double> %a to <2 x i16>
+  ret <2 x i16> %cvt
+}
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 7bc05fb39f0d20b40af06655fc74489509e19fbf..b66d5d1bfffe0013e65af5b3c6c3498fd585da6c 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -253,42 +253,42 @@ entry:
 define <2 x double> @fpext_fromconst() {
 ; X32-SSE-LABEL: fpext_fromconst:
 ; X32-SSE:       # %bb.0: # %entry
-; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,-2]
+; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X32-SSE-NEXT:    # encoding: [0x0f,0x28,0x05,A,A,A,A]
 ; X32-SSE-NEXT:    # fixup A - offset: 3, value: {{\.LCPI.*}}, kind: FK_Data_4
 ; X32-SSE-NEXT:    retl # encoding: [0xc3]
 ;
 ; X32-AVX-LABEL: fpext_fromconst:
 ; X32-AVX:       # %bb.0: # %entry
-; X32-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,-2]
+; X32-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X32-AVX-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X32-AVX-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
 ; X32-AVX-NEXT:    retl # encoding: [0xc3]
 ;
 ; X32-AVX512VL-LABEL: fpext_fromconst:
 ; X32-AVX512VL:       # %bb.0: # %entry
-; X32-AVX512VL-NEXT:    vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1,-2]
+; X32-AVX512VL-NEXT:    vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0]
 ; X32-AVX512VL-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X32-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
 ; X32-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: fpext_fromconst:
 ; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,-2]
+; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X64-SSE-NEXT:    # encoding: [0x0f,0x28,0x05,A,A,A,A]
 ; X64-SSE-NEXT:    # fixup A - offset: 3, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: fpext_fromconst:
 ; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,-2]
+; X64-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.0E+0,-2.0E+0]
 ; X64-AVX-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X64-AVX-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: fpext_fromconst:
 ; X64-AVX512VL:       # %bb.0: # %entry
-; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1,-2]
+; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.0E+0,-2.0E+0]
 ; X64-AVX512VL-NEXT:    # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
 ; X64-AVX512VL-NEXT:    # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq # encoding: [0xc3]
diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll
index 79abeb0c59f7268ddf5342b232cd5b04caa696cc..bb6be6cd9e84b69a63c7f8f024270d0689538401 100644
--- a/test/CodeGen/X86/vec_fptrunc.ll
+++ b/test/CodeGen/X86/vec_fptrunc.ll
@@ -10,8 +10,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE-NEXT:    cvtpd2ps (%ecx), %xmm0
-; X32-SSE-NEXT:    extractps $1, %xmm0, 4(%eax)
-; X32-SSE-NEXT:    movss %xmm0, (%eax)
+; X32-SSE-NEXT:    movlpd %xmm0, (%eax)
 ; X32-SSE-NEXT:    retl
 ;
 ; X32-AVX-LABEL: fptrunc_frommem2:
@@ -19,8 +18,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-AVX-NEXT:    vcvtpd2psx (%ecx), %xmm0
-; X32-AVX-NEXT:    vextractps $1, %xmm0, 4(%eax)
-; X32-AVX-NEXT:    vmovss %xmm0, (%eax)
+; X32-AVX-NEXT:    vmovlpd %xmm0, (%eax)
 ; X32-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: fptrunc_frommem2:
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 2b8ceeba7f3c3d30152c2a92f74e2e57780110ec..9ea75e493517729b5b3366859332129fbb3fc6da 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -497,63 +497,67 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
 define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
 ; SSE2-LABEL: uitofp_2i64_to_2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm4, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm4, %xmm3
-; SSE2-NEXT:    movapd %xmm3, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE2-NEXT:    addpd %xmm3, %xmm0
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT:    movapd %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    addpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_2i64_to_2f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm3, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE41-NEXT:    subpd %xmm3, %xmm2
-; SSE41-NEXT:    haddpd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_2i64_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm2, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_2i64_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_2i64_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
@@ -659,32 +663,16 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE41-NEXT:    addpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX1-LABEL: uitofp_4i32_to_2f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_4i32_to_2f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
-; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; VEX-LABEL: uitofp_4i32_to_2f64:
+; VEX:       # %bb.0:
+; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
+; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
+; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_4i32_to_2f64:
 ; AVX512F:       # %bb.0:
@@ -837,104 +825,96 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
 define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
 ; SSE2-LABEL: uitofp_4i64_to_4f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm5, %xmm2
-; SSE2-NEXT:    movapd %xmm2, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT:    movapd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm0
+; SSE2-NEXT:    addpd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
+; SSE2-NEXT:    addpd %xmm2, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_4i64_to_4f64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm4, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm1
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT:    subpd %xmm6, %xmm0
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    subpd %xmm6, %xmm1
+; SSE41-NEXT:    addpd %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_4i64_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; VEX-NEXT:    vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
-; VEX-NEXT:    vhaddpd %xmm1, %xmm3, %xmm1
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm3, %xmm0
-; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
@@ -958,7 +938,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulpd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
@@ -978,7 +958,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm1
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE41-NEXT:    mulpd %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
@@ -1008,7 +988,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -1746,13 +1726,11 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
 define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
 ; SSE2-LABEL: sitofp_8i8_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -1796,13 +1774,11 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
 define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
 ; SSE2-LABEL: sitofp_16i8_to_8f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm0
 ; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
 ; SSE2-NEXT:    movaps %xmm2, %xmm0
@@ -2433,10 +2409,10 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i16_to_4f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; AVX1-NEXT:    vzeroupper
@@ -2972,10 +2948,10 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i16_to_8f32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -3446,67 +3422,73 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
 define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ; SSE2-LABEL: uitofp_load_2i64_to_2f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm4, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    subpd %xmm4, %xmm3
-; SSE2-NEXT:    movapd %xmm3, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_load_2i64_to_2f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm3, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE41-NEXT:    subpd %xmm3, %xmm2
-; SSE41-NEXT:    haddpd %xmm2, %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    addpd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_load_2i64_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovapd (%rdi), %xmm0
-; VEX-NEXT:    vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VEX-NEXT:    vsubpd %xmm3, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm2, %xmm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_load_2i64_to_2f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_2i64_to_2f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
@@ -3652,109 +3634,104 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
 define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ; SSE2-LABEL: uitofp_load_4i64_to_4f64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE2-NEXT:    subpd %xmm5, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm1
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm2
-; SSE2-NEXT:    movapd %xmm2, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pand %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psrlq $32, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE2-NEXT:    subpd %xmm6, %xmm0
+; SSE2-NEXT:    addpd %xmm3, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    por %xmm4, %xmm2
+; SSE2-NEXT:    psrlq $32, %xmm1
+; SSE2-NEXT:    por %xmm5, %xmm1
+; SSE2-NEXT:    subpd %xmm6, %xmm1
 ; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-NEXT:    subpd %xmm5, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm2
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: uitofp_load_4i64_to_4f64:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa (%rdi), %xmm0
 ; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE41-NEXT:    movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE41-NEXT:    subpd %xmm4, %xmm0
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm1
-; SSE41-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE41-NEXT:    subpd %xmm4, %xmm3
-; SSE41-NEXT:    haddpd %xmm3, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
+; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
+; SSE41-NEXT:    por %xmm5, %xmm0
+; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
+; SSE41-NEXT:    subpd %xmm6, %xmm0
+; SSE41-NEXT:    addpd %xmm3, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE41-NEXT:    por %xmm4, %xmm2
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    por %xmm5, %xmm1
+; SSE41-NEXT:    subpd %xmm6, %xmm1
+; SSE41-NEXT:    addpd %xmm2, %xmm1
 ; SSE41-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_load_4i64_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovapd (%rdi), %ymm0
-; VEX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; VEX-NEXT:    vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vmovapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm1, %xmm1
-; VEX-NEXT:    vhaddpd %xmm1, %xmm3, %xmm1
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm3, %xmm3
-; VEX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; VEX-NEXT:    vsubpd %xmm4, %xmm0, %xmm0
-; VEX-NEXT:    vhaddpd %xmm0, %xmm3, %xmm0
-; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT:    retq
+; AVX1-LABEL: uitofp_load_4i64_to_4f64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: uitofp_load_4i64_to_4f64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
+; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
@@ -3780,7 +3757,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
 ; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulpd %xmm2, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
@@ -3801,7 +3778,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    psrld $16, %xmm1
 ; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
 ; SSE41-NEXT:    mulpd %xmm2, %xmm1
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
@@ -3833,7 +3810,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -5748,10 +5725,8 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movq 24(%rdi), %rax
-; AVX1-NEXT:    vmovdqu 8(%rdi), %xmm0
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
+; AVX1-NEXT:    vpmovsxwd 16(%rdi), %xmm0
+; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    vmovaps %ymm0, (%rax)
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 6b3f7acac900706a49772ce478bba43b60273da2..1f5503067c619fe642cfb589c105616e68a5ca0c 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -247,22 +247,22 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
 define  <2 x double> @test5() nounwind uwtable readnone noinline {
 ; X32-LABEL: test5:
 ; X32:       ## %bb.0: ## %entry
-; X32-NEXT:    movaps {{.*#+}} xmm0 = [128,123.321]
+; X32-NEXT:    movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test5:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    movaps {{.*#+}} xmm0 = [128,123.321]
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X64-NEXT:    retq
 ;
 ; X32_AVX-LABEL: test5:
 ; X32_AVX:       ## %bb.0: ## %entry
-; X32_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [128,123.321]
+; X32_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X32_AVX-NEXT:    retl
 ;
 ; X64_AVX-LABEL: test5:
 ; X64_AVX:       ## %bb.0: ## %entry
-; X64_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [128,123.321]
+; X64_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1.28E+2,1.23321E+2]
 ; X64_AVX-NEXT:    retq
 entry:
   %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index 8abd26805e69a227b4f975ee1f52c1505ffb4f72..5f4489c5ed2595267fbfcd22449b3246d8ba28e1 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -109,7 +109,7 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    psrld $16, %xmm2
 ; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [65536,65536,65536,65536]
+; SSE2-NEXT:    movaps {{.*#+}} xmm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; SSE2-NEXT:    mulps %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
 ; SSE2-NEXT:    pand %xmm4, %xmm0
@@ -129,7 +129,7 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    psrld $16, %xmm2
 ; SSE41-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm3 = [65536,65536,65536,65536]
+; SSE41-NEXT:    movaps {{.*#+}} xmm3 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
 ; SSE41-NEXT:    mulps %xmm3, %xmm2
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index fa4c8abe6d2d12225189548fbeebf19ce832ab1e..b249eed2fc76d1baa99694ab1fc8cae2e6e3f746 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -2046,27 +2046,27 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
 ; AVX512F-NEXT:    vpsrld $24, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vpord %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT:    vpslld $24, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpslld $4, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $4, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpslld $2, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $2, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpslld $1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_bitreverse_v16i32:
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index 934d1027e9b3f2c50e0a1de1f7631397be40b8e6..5008a1e865da6e76a92cd20a61b01dfa6e8f2c14 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -629,7 +629,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ;
 ; AVX1-LABEL: constant_pblendvb_avx2:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303]
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll
index 1974ad5facd84902d35cb0c5b6be7e2c92cb8f88..fe74d07512b023f0e8ba665fca8855242b650b88 100644
--- a/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/test/CodeGen/X86/vector-compare-all_of.ll
@@ -64,7 +64,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vandpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vandpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -200,7 +200,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -344,7 +344,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -510,7 +510,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -668,7 +668,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -695,7 +695,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -870,7 +870,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -899,7 +899,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll
index 92c2d0b5841c9e8ba5b7020e8e85a6f51c9b08e9..b7fa5cb64dfd756109d94355a232bf498a6ac8d3 100644
--- a/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/test/CodeGen/X86/vector-compare-any_of.ll
@@ -62,7 +62,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vorpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vorpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vorpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -188,7 +188,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -324,7 +324,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -477,7 +477,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -623,7 +623,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -649,7 +649,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -812,7 +812,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -840,7 +840,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e35e76d2f38a0616cd421ee9c768e5756d33dc68
--- /dev/null
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s
+
+define <1 x float> @constrained_vector_fma_v1f32() {
+; CHECK-LABEL: constrained_vector_fma_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <1 x float> @llvm.experimental.constrained.fma.v1f32(
+           <1 x float> <float 0.5>,
+           <1 x float> <float 2.5>,
+           <1 x float> <float 4.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <1 x float> %fma
+}
+
+define <2 x double> @constrained_vector_fma_v2f64() {
+; CHECK-LABEL: constrained_vector_fma_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm1 = [1.5E+0,5.0E-1]
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [3.5E+0,2.5E+0]
+; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
+           <2 x double> <double 1.5, double 0.5>,
+           <2 x double> <double 3.5, double 2.5>,
+           <2 x double> <double 5.5, double 4.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <2 x double> %fma
+}
+
+define <3 x float> @constrained_vector_fma_v3f32() {
+; CHECK-LABEL: constrained_vector_fma_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + mem
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[2,3]
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <3 x float> @llvm.experimental.constrained.fma.v3f32(
+           <3 x float> <float 2.5, float 1.5, float 0.5>,
+           <3 x float> <float 5.5, float 4.5, float 3.5>,
+           <3 x float> <float 8.5, float 7.5, float 6.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <3 x float> %fma
+}
+
+define <3 x double> @constrained_vector_fma_v3f64() {
+; CHECK-LABEL: constrained_vector_fma_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm0 = [2.5E+0,1.5E+0]
+; CHECK-NEXT:    vmovapd {{.*#+}} xmm2 = [5.5E+0,4.5E+0]
+; CHECK-NEXT:    vfmadd213pd {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <3 x double> @llvm.experimental.constrained.fma.v3f64(
+           <3 x double> <double 2.5, double 1.5, double 0.5>,
+           <3 x double> <double 5.5, double 4.5, double 3.5>,
+           <3 x double> <double 8.5, double 7.5, double 6.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <3 x double> %fma
+}
+
+define <4 x double> @constrained_vector_fma_v4f64() {
+; CHECK-LABEL: constrained_vector_fma_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; CHECK-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <4 x double> @llvm.experimental.constrained.fma.v4f64(
+           <4 x double> <double 3.5, double 2.5, double 1.5, double 0.5>,
+           <4 x double> <double 7.5, double 6.5, double 5.5, double 4.5>,
+           <4 x double> <double 11.5, double 10.5, double 9.5, double 8.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <4 x double> %fma
+}
+
+define <4 x float> @constrained_vector_fma_v4f32() {
+; CHECK-LABEL: constrained_vector_fma_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
+           <4 x float> <float 3.5, float 2.5, float 1.5, float 0.5>,
+           <4 x float> <float 7.5, float 6.5, float 5.5, float 4.5>,
+           <4 x float> <float 11.5, float 10.5, float 9.5, float 8.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <4 x float> %fma
+}
+
+define <8 x float> @constrained_vector_fma_v8f32() {
+; CHECK-LABEL: constrained_vector_fma_v8f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3.5E+0,2.5E+0,1.5E+0,5.0E-1,7.5E+0,6.5E+0,5.5E+0,4.5E+0]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [7.5E+0,6.5E+0,5.5E+0,4.5E+0,1.15E+1,1.05E+1,9.5E+0,8.5E+0]
+; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
+; CHECK-NEXT:    retq
+entry:
+  %fma = call <8 x float> @llvm.experimental.constrained.fma.v8f32(
+           <8 x float> <float 3.5, float 2.5, float 1.5, float 0.5,
+                        float 7.5, float 6.5, float 5.5, float 4.5>,
+           <8 x float> <float 7.5, float 6.5, float 5.5, float 4.5,
+                        float 11.5, float 10.5, float 9.5, float 8.5>,
+           <8 x float> <float 11.5, float 10.5, float 9.5, float 8.5,
+                        float 15.5, float 14.5, float 13.5, float 12.5>,
+           metadata !"round.dynamic",
+           metadata !"fpexcept.strict")
+  ret <8 x float> %fma
+}
+
+; Single width declarations
+declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
+declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
+
+; Scalar width declarations
+declare <1 x float> @llvm.experimental.constrained.fma.v1f32(<1 x float>, <1 x float>, <1 x float>, metadata, metadata)
+
+; Illegal width declarations
+declare <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float>, <3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double>, <3 x double>, <3 x double>, metadata, metadata)
+
+; Double width declarations
+declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata)
+declare <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, metadata, metadata)
diff --git a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index f13178ed5ce0a405fb3577ed079a121bb9f4af9b..55f5bc6bf3664960518a97fc1c9279fe6a10970a 100644
--- a/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -1,19 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s
-; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck -check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
 
 define <1 x float> @constrained_vector_fdiv_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %div = call <1 x float> @llvm.experimental.constrained.fdiv.v1f32(
            <1 x float> <float 1.000000e+00>,
@@ -24,17 +17,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fdiv_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,2]
-; NO-FMA-NEXT:    divpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1,2]
-; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
+; CHECK-NEXT:    divpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %div = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(
            <2 x double> <double 1.000000e+00, double 2.000000e+00>,
@@ -45,31 +32,18 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fdiv_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss %xmm1, %xmm2
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss %xmm1, %xmm0
-; NO-FMA-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    divss %xmm1, %xmm3
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss %xmm0, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vdivss %xmm0, %xmm3, %xmm0
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm1, %xmm3
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retq
 entry:
   %div = call <3 x float> @llvm.experimental.constrained.fdiv.v3f32(
            <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
@@ -80,26 +54,17 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fdiv_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,2]
-; NO-FMA-NEXT:    divpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    divsd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vdivsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1,2]
-; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
+; CHECK-NEXT:    divpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    divsd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %div = call <3 x double> @llvm.experimental.constrained.fdiv.v3f64(
            <3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>,
@@ -110,20 +75,14 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fdiv_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fdiv_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm2 = [10,10]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,2]
-; NO-FMA-NEXT:    divpd %xmm2, %xmm0
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [3,4]
-; NO-FMA-NEXT:    divpd %xmm2, %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fdiv_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1,2,3,4]
-; HAS-FMA-NEXT:    vdivpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fdiv_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm2 = [1.0E+1,1.0E+1]
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
+; CHECK-NEXT:    divpd %xmm2, %xmm0
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [3.0E+0,4.0E+0]
+; CHECK-NEXT:    divpd %xmm2, %xmm1
+; CHECK-NEXT:    retq
 entry:
   %div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64(
            <4 x double> <double 1.000000e+00, double 2.000000e+00,
@@ -136,27 +95,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_frem_v1f32() {
-; NO-FMA-LABEL: constrained_vector_frem_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <1 x float> @llvm.experimental.constrained.frem.v1f32(
            <1 x float> <float 1.000000e+00>,
@@ -167,39 +115,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_frem_v2f64() {
-; NO-FMA-LABEL: constrained_vector_frem_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <2 x double> @llvm.experimental.constrained.frem.v2f64(
            <2 x double> <double 1.000000e+00, double 2.000000e+00>,
@@ -210,52 +141,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_frem_v3f32() {
-; NO-FMA-LABEL: constrained_vector_frem_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmodf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq fmodf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmodf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <3 x float> @llvm.experimental.constrained.frem.v3f32(
            <3 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>,
@@ -266,54 +174,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_frem_v3f64() {
-; NO-FMA-LABEL: constrained_vector_frem_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rem = call <3 x double> @llvm.experimental.constrained.frem.v3f64(
            <3 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>,
@@ -324,62 +208,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_frem_v4f64() {
-; NO-FMA-LABEL: constrained_vector_frem_v4f64:
-; NO-FMA:       # %bb.0:
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq fmod
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_frem_v4f64:
-; HAS-FMA:       # %bb.0:
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq fmod
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_frem_v4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmod
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
   %rem = call <4 x double> @llvm.experimental.constrained.frem.v4f64(
            <4 x double> <double 1.000000e+00, double 2.000000e+00,
                          double 3.000000e+00, double 4.000000e+00>,
@@ -391,17 +247,11 @@ define <4 x double> @constrained_vector_frem_v4f64() {
 }
 
 define <1 x float> @constrained_vector_fmul_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fmul_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    mulss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %mul = call <1 x float> @llvm.experimental.constrained.fmul.v1f32(
            <1 x float> <float 0x7FF0000000000000>,
@@ -412,17 +262,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fmul_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fmul_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %mul = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(
            <2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
@@ -433,27 +277,17 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fmul_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fmul_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    mulss %xmm1, %xmm2
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    mulss %xmm1, %xmm0
-; NO-FMA-NEXT:    mulss {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm2
-; HAS-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss %xmm1, %xmm2
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    mulss %xmm1, %xmm0
+; CHECK-NEXT:    mulss {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retq
 entry:
   %mul = call <3 x float> @llvm.experimental.constrained.fmul.v3f32(
            <3 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000,
@@ -465,26 +299,17 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fmul_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fmul_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    mulsd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vmulpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    mulsd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %mul = call <3 x double> @llvm.experimental.constrained.fmul.v3f64(
            <3 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -496,19 +321,13 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fmul_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fmul_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [2,3]
-; NO-FMA-NEXT:    mulpd %xmm1, %xmm0
-; NO-FMA-NEXT:    mulpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fmul_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fmul_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [2.0E+0,3.0E+0]
+; CHECK-NEXT:    mulpd %xmm1, %xmm0
+; CHECK-NEXT:    mulpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
 entry:
   %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64(
            <4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -521,17 +340,11 @@ entry:
 }
 
 define <1 x float> @constrained_vector_fadd_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fadd_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    addss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %add = call <1 x float> @llvm.experimental.constrained.fadd.v1f32(
            <1 x float> <float 0x7FF0000000000000>,
@@ -542,17 +355,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fadd_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fadd_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vaddpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %add = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(
            <2 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF>,
@@ -563,28 +370,17 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fadd_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fadd_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    xorps %xmm1, %xmm1
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    addss %xmm2, %xmm1
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    addss %xmm2, %xmm0
-; NO-FMA-NEXT:    addss {{.*}}(%rip), %xmm2
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; HAS-FMA-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm2
-; HAS-FMA-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm2, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    addss %xmm2, %xmm0
+; CHECK-NEXT:    addss {{.*}}(%rip), %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
 entry:
   %add = call <3 x float> @llvm.experimental.constrained.fadd.v3f32(
            <3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
@@ -596,26 +392,17 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fadd_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fadd_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    xorpd %xmm1, %xmm1
-; NO-FMA-NEXT:    addsd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vaddpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorpd %xmm1, %xmm1
+; CHECK-NEXT:    addsd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %add = call <3 x double> @llvm.experimental.constrained.fadd.v3f64(
            <3 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -627,19 +414,13 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fadd_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fadd_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [1,0.10000000000000001]
-; NO-FMA-NEXT:    addpd %xmm1, %xmm0
-; NO-FMA-NEXT:    addpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fadd_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fadd_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308]
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [1.0E+0,1.0000000000000001E-1]
+; CHECK-NEXT:    addpd %xmm1, %xmm0
+; CHECK-NEXT:    addpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
 entry:
   %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64(
            <4 x double> <double 0x7FEFFFFFFFFFFFFF, double 0x7FEFFFFFFFFFFFFF,
@@ -652,17 +433,11 @@ entry:
 }
 
 define <1 x float> @constrained_vector_fsub_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fsub_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    subss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsubss {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    subss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sub = call <1 x float> @llvm.experimental.constrained.fsub.v1f32(
            <1 x float> <float 0x7FF0000000000000>,
@@ -673,17 +448,11 @@ entry:
 }
 
 define <2 x double> @constrained_vector_fsub_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fsub_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sub = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(
            <2 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF>,
@@ -694,29 +463,18 @@ entry:
 }
 
 define <3 x float> @constrained_vector_fsub_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fsub_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    xorps %xmm0, %xmm0
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movaps %xmm1, %xmm2
-; NO-FMA-NEXT:    subss %xmm0, %xmm2
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    subss {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    subss {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsubss %xmm0, %xmm1, %xmm0
-; HAS-FMA-NEXT:    vsubss {{.*}}(%rip), %xmm1, %xmm2
-; HAS-FMA-NEXT:    vsubss {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    subss %xmm0, %xmm2
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    subss {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    subss {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    retq
 entry:
   %sub = call <3 x float> @llvm.experimental.constrained.fsub.v3f32(
            <3 x float> <float 0xFFFFFFFFE0000000, float 0xFFFFFFFFE0000000,
@@ -728,28 +486,18 @@ entry:
 }
 
 define <3 x double> @constrained_vector_fsub_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fsub_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    xorpd %xmm0, %xmm0
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    subsd %xmm0, %xmm1
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vsubpd {{.*}}(%rip), %xmm1, %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xorpd %xmm0, %xmm0
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    subsd %xmm0, %xmm1
+; CHECK-NEXT:    movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %sub = call <3 x double> @llvm.experimental.constrained.fsub.v3f64(
            <3 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
@@ -761,19 +509,13 @@ entry:
 }
 
 define <4 x double> @constrained_vector_fsub_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fsub_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
-; NO-FMA-NEXT:    movapd %xmm1, %xmm0
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    subpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fsub_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308,-1.7976931348623157E+308]
-; HAS-FMA-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_fsub_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movapd {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308]
+; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    subpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
 entry:
   %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64(
            <4 x double> <double 0xFFEFFFFFFFFFFFFF, double 0xFFEFFFFFFFFFFFFF,
@@ -785,365 +527,12 @@ entry:
   ret <4 x double> %sub
 }
 
-define <1 x float> @constrained_vector_fma_v1f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <1 x float> @llvm.experimental.constrained.fma.v1f32(
-           <1 x float> <float 0.5>,
-           <1 x float> <float 2.5>,
-           <1 x float> <float 4.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <1 x float> %fma
-}
-
-define <2 x double> @constrained_vector_fma_v2f64() {
-; NO-FMA-LABEL: constrained_vector_fma_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm1 = [1.5,0.5]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [3.5,2.5]
-; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <2 x double> @llvm.experimental.constrained.fma.v2f64(
-           <2 x double> <double 1.5, double 0.5>,
-           <2 x double> <double 3.5, double 2.5>,
-           <2 x double> <double 5.5, double 4.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <2 x double> %fma
-}
-
-define <3 x float> @constrained_vector_fma_v3f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + mem
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <3 x float> @llvm.experimental.constrained.fma.v3f32(
-           <3 x float> <float 2.5, float 1.5, float 0.5>,
-           <3 x float> <float 5.5, float 4.5, float 3.5>,
-           <3 x float> <float 8.5, float 7.5, float 6.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <3 x float> %fma
-}
-
-define <3 x double> @constrained_vector_fma_v3f64() {
-; NO-FMA-LABEL: constrained_vector_fma_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm0 = [2.5,1.5]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} xmm2 = [5.5,4.5]
-; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm0
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <3 x double> @llvm.experimental.constrained.fma.v3f64(
-           <3 x double> <double 2.5, double 1.5, double 0.5>,
-           <3 x double> <double 5.5, double 4.5, double 3.5>,
-           <3 x double> <double 8.5, double 7.5, double 6.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <3 x double> %fma
-}
-
-define <4 x double> @constrained_vector_fma_v4f64() {
-; NO-FMA-LABEL: constrained_vector_fma_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; NO-FMA-NEXT:    callq fma
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm1 = [3.5,2.5,1.5,0.5]
-; HAS-FMA-NEXT:    vmovapd {{.*#+}} ymm0 = [7.5,6.5,5.5,4.5]
-; HAS-FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <4 x double> @llvm.experimental.constrained.fma.v4f64(
-           <4 x double> <double 3.5, double 2.5, double 1.5, double 0.5>,
-           <4 x double> <double 7.5, double 6.5, double 5.5, double 4.5>,
-           <4 x double> <double 11.5, double 10.5, double 9.5, double 8.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <4 x double> %fma
-}
-
-define <4 x float> @constrained_vector_fma_v4f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v4f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v4f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm1 = [3.5,2.5,1.5,0.5]
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} xmm0 = [7.5,6.5,5.5,4.5]
-; HAS-FMA-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <4 x float> @llvm.experimental.constrained.fma.v4f32(
-           <4 x float> <float 3.5, float 2.5, float 1.5, float 0.5>,
-           <4 x float> <float 7.5, float 6.5, float 5.5, float 4.5>,
-           <4 x float> <float 11.5, float 10.5, float 9.5, float 8.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <4 x float> %fma
-}
-
-define <8 x float> @constrained_vector_fma_v8f32() {
-; NO-FMA-LABEL: constrained_vector_fma_v8f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $56, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 64
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq fmaf
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $56, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_fma_v8f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm1 = [3.5,2.5,1.5,0.5,7.5,6.5,5.5,4.5]
-; HAS-FMA-NEXT:    vmovaps {{.*#+}} ymm0 = [7.5,6.5,5.5,4.5,11.5,10.5,9.5,8.5]
-; HAS-FMA-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem
-; HAS-FMA-NEXT:    retq
-entry:
-  %fma = call <8 x float> @llvm.experimental.constrained.fma.v8f32(
-           <8 x float> <float 3.5, float 2.5, float 1.5, float 0.5,
-                        float 7.5, float 6.5, float 5.5, float 4.5>,
-           <8 x float> <float 7.5, float 6.5, float 5.5, float 4.5,
-                        float 11.5, float 10.5, float 9.5, float 8.5>,
-           <8 x float> <float 11.5, float 10.5, float 9.5, float 8.5,
-                        float 15.5, float 14.5, float 13.5, float 12.5>,
-           metadata !"round.dynamic",
-           metadata !"fpexcept.strict")
-  ret <8 x float> %fma
-}
-
 define <1 x float> @constrained_vector_sqrt_v1f32() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm0, %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <1 x float> @llvm.experimental.constrained.sqrt.v1f32(
                               <1 x float> <float 42.0>,
@@ -1153,15 +542,10 @@ entry:
 }
 
 define <2 x double> @constrained_vector_sqrt_v2f64() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vsqrtpd {{.*}}(%rip), %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(
                               <2 x double> <double 42.0, double 42.1>,
@@ -1171,29 +555,17 @@ entry:
 }
 
 define <3 x float> @constrained_vector_sqrt_v3f32() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm0, %xmm1
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm0, %xmm0
-; NO-FMA-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    sqrtss %xmm2, %xmm2
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; NO-FMA-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vsqrtss %xmm2, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm0, %xmm1
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    sqrtss %xmm2, %xmm2
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <3 x float> @llvm.experimental.constrained.sqrt.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1203,24 +575,16 @@ entry:
 }
 
 define <3 x double> @constrained_vector_sqrt_v3f64() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    sqrtsd %xmm0, %xmm1
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movapd %xmm0, %xmm1
-; NO-FMA-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vsqrtpd {{.*}}(%rip), %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sqrt_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm1
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movsd %xmm1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movapd %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    retq
 entry:
   %sqrt = call <3 x double> @llvm.experimental.constrained.sqrt.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1230,17 +594,12 @@ entry:
 }
 
 define <4 x double> @constrained_vector_sqrt_v4f64() {
-; NO-FMA-LABEL: constrained_vector_sqrt_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
-; NO-FMA-NEXT:    sqrtpd {{.*}}(%rip), %xmm1
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sqrt_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vsqrtpd {{.*}}(%rip), %ymm0
-; HAS-FMA-NEXT:    retq
-entry:
+; CHECK-LABEL: constrained_vector_sqrt_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    sqrtpd {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    retq
+ entry:
   %sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64(
                               <4 x double> <double 42.0, double 42.1,
                                             double 42.2, double 42.3>,
@@ -1250,27 +609,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_pow_v1f32() {
-; NO-FMA-LABEL: constrained_vector_pow_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <1 x float> @llvm.experimental.constrained.pow.v1f32(
                              <1 x float> <float 42.0>,
@@ -1281,39 +629,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_pow_v2f64() {
-; NO-FMA-LABEL: constrained_vector_pow_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <2 x double> @llvm.experimental.constrained.pow.v2f64(
                              <2 x double> <double 42.1, double 42.2>,
@@ -1324,52 +655,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_pow_v3f32() {
-; NO-FMA-LABEL: constrained_vector_pow_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq powf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq powf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq powf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <3 x float> @llvm.experimental.constrained.pow.v3f32(
                              <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1380,54 +688,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_pow_v3f64() {
-; NO-FMA-LABEL: constrained_vector_pow_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <3 x double> @llvm.experimental.constrained.pow.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1438,62 +722,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_pow_v4f64() {
-; NO-FMA-LABEL: constrained_vector_pow_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; NO-FMA-NEXT:    callq pow
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_pow_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; HAS-FMA-NEXT:    callq pow
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_pow_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq pow
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %pow = call <4 x double> @llvm.experimental.constrained.pow.v4f64(
                              <4 x double> <double 42.1, double 42.2,
@@ -1506,27 +762,16 @@ entry:
 }
 
 define <1 x float> @constrained_vector_powi_v1f32() {
-; NO-FMA-LABEL: constrained_vector_powi_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <1 x float> @llvm.experimental.constrained.powi.v1f32(
                               <1 x float> <float 42.0>,
@@ -1537,39 +782,22 @@ entry:
 }
 
 define <2 x double> @constrained_vector_powi_v2f64() {
-; NO-FMA-LABEL: constrained_vector_powi_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <2 x double> @llvm.experimental.constrained.powi.v2f64(
                               <2 x double> <double 42.1, double 42.2>,
@@ -1580,52 +808,29 @@ entry:
 }
 
 define <3 x float> @constrained_vector_powi_v3f32() {
-; NO-FMA-LABEL: constrained_vector_powi_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powisf2
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powisf2
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powisf2
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <3 x float> @llvm.experimental.constrained.powi.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1636,54 +841,30 @@ entry:
 }
 
 define <3 x double> @constrained_vector_powi_v3f64() {
-; NO-FMA-LABEL: constrained_vector_powi_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <3 x double> @llvm.experimental.constrained.powi.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1694,62 +875,34 @@ entry:
 }
 
 define <4 x double> @constrained_vector_powi_v4f64() {
-; NO-FMA-LABEL: constrained_vector_powi_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movl $3, %edi
-; NO-FMA-NEXT:    callq __powidf2
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_powi_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    movl $3, %edi
-; HAS-FMA-NEXT:    callq __powidf2
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_powi_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movl $3, %edi
+; CHECK-NEXT:    callq __powidf2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %powi = call <4 x double> @llvm.experimental.constrained.powi.v4f64(
                               <4 x double> <double 42.1, double 42.2,
@@ -1761,25 +914,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_sin_v1f32() {
-; NO-FMA-LABEL: constrained_vector_sin_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <1 x float> @llvm.experimental.constrained.sin.v1f32(
                              <1 x float> <float 42.0>,
@@ -1789,35 +932,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_sin_v2f64() {
-; NO-FMA-LABEL: constrained_vector_sin_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -1827,46 +955,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_sin_v3f32() {
-; NO-FMA-LABEL: constrained_vector_sin_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq sinf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq sinf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq sinf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <3 x float> @llvm.experimental.constrained.sin.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -1876,48 +984,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_sin_v3f64() {
-; NO-FMA-LABEL: constrained_vector_sin_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <3 x double> @llvm.experimental.constrained.sin.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -1927,54 +1014,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_sin_v4f64() {
-; NO-FMA-LABEL: constrained_vector_sin_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq sin
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_sin_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq sin
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_sin_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq sin
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %sin = call <4 x double> @llvm.experimental.constrained.sin.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -1985,25 +1048,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_cos_v1f32() {
-; NO-FMA-LABEL: constrained_vector_cos_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <1 x float> @llvm.experimental.constrained.cos.v1f32(
                              <1 x float> <float 42.0>,
@@ -2013,35 +1066,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_cos_v2f64() {
-; NO-FMA-LABEL: constrained_vector_cos_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -2051,46 +1089,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_cos_v3f32() {
-; NO-FMA-LABEL: constrained_vector_cos_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq cosf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq cosf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq cosf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <3 x float> @llvm.experimental.constrained.cos.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2100,48 +1118,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_cos_v3f64() {
-; NO-FMA-LABEL: constrained_vector_cos_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <3 x double> @llvm.experimental.constrained.cos.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2151,54 +1148,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_cos_v4f64() {
-; NO-FMA-LABEL: constrained_vector_cos_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq cos
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_cos_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq cos
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_cos_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq cos
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %cos = call <4 x double> @llvm.experimental.constrained.cos.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -2209,25 +1182,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_exp_v1f32() {
-; NO-FMA-LABEL: constrained_vector_exp_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <1 x float> @llvm.experimental.constrained.exp.v1f32(
                              <1 x float> <float 42.0>,
@@ -2237,35 +1200,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_exp_v2f64() {
-; NO-FMA-LABEL: constrained_vector_exp_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <2 x double> @llvm.experimental.constrained.exp.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -2275,46 +1223,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_exp_v3f32() {
-; NO-FMA-LABEL: constrained_vector_exp_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq expf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq expf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq expf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <3 x float> @llvm.experimental.constrained.exp.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2324,48 +1252,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_exp_v3f64() {
-; NO-FMA-LABEL: constrained_vector_exp_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <3 x double> @llvm.experimental.constrained.exp.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2375,54 +1282,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_exp_v4f64() {
-; NO-FMA-LABEL: constrained_vector_exp_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp = call <4 x double> @llvm.experimental.constrained.exp.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -2433,25 +1316,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_exp2_v1f32() {
-; NO-FMA-LABEL: constrained_vector_exp2_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <1 x float> @llvm.experimental.constrained.exp2.v1f32(
                              <1 x float> <float 42.0>,
@@ -2461,35 +1334,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_exp2_v2f64() {
-; NO-FMA-LABEL: constrained_vector_exp2_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <2 x double> @llvm.experimental.constrained.exp2.v2f64(
                               <2 x double> <double 42.1, double 42.0>,
@@ -2499,46 +1357,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_exp2_v3f32() {
-; NO-FMA-LABEL: constrained_vector_exp2_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq exp2f
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq exp2f
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq exp2f
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <3 x float> @llvm.experimental.constrained.exp2.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2548,48 +1386,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_exp2_v3f64() {
-; NO-FMA-LABEL: constrained_vector_exp2_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <3 x double> @llvm.experimental.constrained.exp2.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2599,54 +1416,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_exp2_v4f64() {
-; NO-FMA-LABEL: constrained_vector_exp2_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq exp2
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_exp2_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq exp2
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_exp2_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq exp2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %exp2 = call <4 x double> @llvm.experimental.constrained.exp2.v4f64(
                               <4 x double> <double 42.1, double 42.2,
@@ -2657,25 +1450,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_log_v1f32() {
-; NO-FMA-LABEL: constrained_vector_log_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <1 x float> @llvm.experimental.constrained.log.v1f32(
                              <1 x float> <float 42.0>,
@@ -2685,35 +1468,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_log_v2f64() {
-; NO-FMA-LABEL: constrained_vector_log_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <2 x double> @llvm.experimental.constrained.log.v2f64(
                              <2 x double> <double 42.0, double 42.1>,
@@ -2723,46 +1491,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_log_v3f32() {
-; NO-FMA-LABEL: constrained_vector_log_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq logf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq logf
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq logf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <3 x float> @llvm.experimental.constrained.log.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2772,48 +1520,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_log_v3f64() {
-; NO-FMA-LABEL: constrained_vector_log_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <3 x double> @llvm.experimental.constrained.log.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -2823,54 +1550,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_log_v4f64() {
-; NO-FMA-LABEL: constrained_vector_log_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log = call <4 x double> @llvm.experimental.constrained.log.v4f64(
                              <4 x double> <double 42.0, double 42.1,
@@ -2881,25 +1584,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_log10_v1f32() {
-; NO-FMA-LABEL: constrained_vector_log10_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <1 x float> @llvm.experimental.constrained.log10.v1f32(
                              <1 x float> <float 42.0>,
@@ -2909,35 +1602,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_log10_v2f64() {
-; NO-FMA-LABEL: constrained_vector_log10_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <2 x double> @llvm.experimental.constrained.log10.v2f64(
                                <2 x double> <double 42.0, double 42.1>,
@@ -2947,46 +1625,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_log10_v3f32() {
-; NO-FMA-LABEL: constrained_vector_log10_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log10f
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log10f
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log10f
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <3 x float> @llvm.experimental.constrained.log10.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -2996,48 +1654,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_log10_v3f64() {
-; NO-FMA-LABEL: constrained_vector_log10_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <3 x double> @llvm.experimental.constrained.log10.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3047,54 +1684,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_log10_v4f64() {
-; NO-FMA-LABEL: constrained_vector_log10_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log10
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log10_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log10
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log10_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log10
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log10 = call <4 x double> @llvm.experimental.constrained.log10.v4f64(
                                <4 x double> <double 42.0, double 42.1,
@@ -3105,25 +1718,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_log2_v1f32() {
-; NO-FMA-LABEL: constrained_vector_log2_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    pushq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 16
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    popq %rax
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <1 x float> @llvm.experimental.constrained.log2.v1f32(
                              <1 x float> <float 42.0>,
@@ -3133,35 +1736,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_log2_v2f64() {
-; NO-FMA-LABEL: constrained_vector_log2_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 32
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    addq $24, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <2 x double> @llvm.experimental.constrained.log2.v2f64(
                               <2 x double> <double 42.0, double 42.1>,
@@ -3171,46 +1759,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_log2_v3f32() {
-; NO-FMA-LABEL: constrained_vector_log2_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq log2f
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    callq log2f
-; HAS-FMA-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq log2f
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <3 x float> @llvm.experimental.constrained.log2.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -3220,48 +1788,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_log2_v3f64() {
-; NO-FMA-LABEL: constrained_vector_log2_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 64
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vzeroupper
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    addq $56, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <3 x double> @llvm.experimental.constrained.log2.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3271,54 +1818,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_log2_v4f64() {
-; NO-FMA-LABEL: constrained_vector_log2_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq log2
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_log2_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    subq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 48
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    callq log2
-; HAS-FMA-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; HAS-FMA-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; HAS-FMA-NEXT:    addq $40, %rsp
-; HAS-FMA-NEXT:    .cfi_def_cfa_offset 8
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_log2_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq log2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %log2 = call <4 x double> @llvm.experimental.constrained.log2.v4f64(
                               <4 x double> <double 42.0, double 42.1,
@@ -3329,21 +1852,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_rint_v1f32() {
-; NO-FMA-LABEL: constrained_vector_rint_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <1 x float> @llvm.experimental.constrained.rint.v1f32(
                              <1 x float> <float 42.0>,
@@ -3353,25 +1870,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_rint_v2f64() {
-; NO-FMA-LABEL: constrained_vector_rint_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $4, {{.*}}(%rip), %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <2 x double> @llvm.experimental.constrained.rint.v2f64(
                         <2 x double> <double 42.1, double 42.0>,
@@ -3381,39 +1893,27 @@ entry:
 }
 
 define <3 x float> @constrained_vector_rint_v3f32() {
-; NO-FMA-LABEL: constrained_vector_rint_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq rintf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm1, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $4, %xmm2, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
-entry:
+; CHECK-LABEL: constrained_vector_rint_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq rintf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+ entry:
   %rint = call <3 x float> @llvm.experimental.constrained.rint.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
                               metadata !"round.dynamic",
@@ -3422,35 +1922,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_rint_v3f64() {
-; NO-FMA-LABEL: constrained_vector_rint_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vroundsd $4, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vroundpd $4, {{.*}}(%rip), %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <3 x double> @llvm.experimental.constrained.rint.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3460,35 +1952,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_rint_v4f64() {
-; NO-FMA-LABEL: constrained_vector_rint_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq rint
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_rint_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $4, {{.*}}(%rip), %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_rint_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq rint
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64(
                         <4 x double> <double 42.1, double 42.2,
@@ -3499,21 +1986,15 @@ entry:
 }
 
 define <1 x float> @constrained_vector_nearbyint_v1f32() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v1f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    pushq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 16
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    popq %rax
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v1f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(
                                <1 x float> <float 42.0>,
@@ -3523,25 +2004,20 @@ entry:
 }
 
 define <2 x double> @constrained_vector_nearbyint_v2f64() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v2f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v2f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $12, {{.*}}(%rip), %xmm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(
                                 <2 x double> <double 42.1, double 42.0>,
@@ -3551,38 +2027,26 @@ entry:
 }
 
 define <3 x float> @constrained_vector_nearbyint_v3f32() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v3f32:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; NO-FMA-NEXT:    callq nearbyintf
-; NO-FMA-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm1, %xmm0
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v3f32:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm1, %xmm1, %xmm1
-; HAS-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; HAS-FMA-NEXT:    vroundss $12, %xmm2, %xmm2, %xmm2
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; HAS-FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq nearbyintf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(
                               <3 x float> <float 42.0, float 43.0, float 44.0>,
@@ -3592,35 +2056,27 @@ entry:
 }
 
 define <3 x double> @constrained_vector_nearby_v3f64() {
-; NO-FMA-LABEL: constrained_vector_nearby_v3f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 32
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm0 = mem[0],zero
-; NO-FMA-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
-; NO-FMA-NEXT:    # xmm1 = mem[0],zero
-; NO-FMA-NEXT:    addq $24, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearby_v3f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; HAS-FMA-NEXT:    vroundsd $12, %xmm0, %xmm0, %xmm0
-; HAS-FMA-NEXT:    vroundpd $12, {{.*}}(%rip), %xmm1
-; HAS-FMA-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearby_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(
                           <3 x double> <double 42.0, double 42.1, double 42.2>,
@@ -3630,35 +2086,30 @@ entry:
 }
 
 define <4 x double> @constrained_vector_nearbyint_v4f64() {
-; NO-FMA-LABEL: constrained_vector_nearbyint_v4f64:
-; NO-FMA:       # %bb.0: # %entry
-; NO-FMA-NEXT:    subq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 48
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; NO-FMA-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; NO-FMA-NEXT:    callq nearbyint
-; NO-FMA-NEXT:    movaps %xmm0, %xmm1
-; NO-FMA-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; NO-FMA-NEXT:    # xmm1 = xmm1[0],mem[0]
-; NO-FMA-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT:    addq $40, %rsp
-; NO-FMA-NEXT:    .cfi_def_cfa_offset 8
-; NO-FMA-NEXT:    retq
-;
-; HAS-FMA-LABEL: constrained_vector_nearbyint_v4f64:
-; HAS-FMA:       # %bb.0: # %entry
-; HAS-FMA-NEXT:    vroundpd $12, {{.*}}(%rip), %ymm0
-; HAS-FMA-NEXT:    retq
+; CHECK-LABEL: constrained_vector_nearbyint_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq nearbyint
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
 entry:
   %nearby = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(
                                 <4 x double> <double 42.1, double 42.2,
@@ -3668,14 +2119,719 @@ entry:
   ret <4 x double> %nearby
 }
 
+define <1 x float> @constrained_vector_maxnum_v1f32() {
+; CHECK-LABEL: constrained_vector_maxnum_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %max = call <1 x float> @llvm.experimental.constrained.maxnum.v1f32(
+                               <1 x float> <float 42.0>, <1 x float> <float 41.0>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %max
+}
+
+define <2 x double> @constrained_vector_maxnum_v2f64() {
+; CHECK-LABEL: constrained_vector_maxnum_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %max = call <2 x double> @llvm.experimental.constrained.maxnum.v2f64(
+                                <2 x double> <double 43.0, double 42.0>,
+                                <2 x double> <double 41.0, double 40.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %max
+}
+
+define <3 x float> @constrained_vector_maxnum_v3f32() {
+; CHECK-LABEL: constrained_vector_maxnum_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fmaxf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %max = call <3 x float> @llvm.experimental.constrained.maxnum.v3f32(
+                              <3 x float> <float 43.0, float 44.0, float 45.0>,
+                              <3 x float> <float 41.0, float 42.0, float 43.0>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %max
+}
+
+define <3 x double> @constrained_vector_max_v3f64() {
+; CHECK-LABEL: constrained_vector_max_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %max = call <3 x double> @llvm.experimental.constrained.maxnum.v3f64(
+                          <3 x double> <double 43.0, double 44.0, double 45.0>,
+                          <3 x double> <double 40.0, double 41.0, double 42.0>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %max
+}
+
+define <4 x double> @constrained_vector_maxnum_v4f64() {
+; CHECK-LABEL: constrained_vector_maxnum_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmax
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %max = call <4 x double> @llvm.experimental.constrained.maxnum.v4f64(
+                                <4 x double> <double 44.0, double 45.0,
+                                              double 46.0, double 47.0>,
+                                <4 x double> <double 40.0, double 41.0,
+                                              double 42.0, double 43.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <4 x double> %max
+}
+
+define <1 x float> @constrained_vector_minnum_v1f32() {
+; CHECK-LABEL: constrained_vector_minnum_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+ entry:
+  %min = call <1 x float> @llvm.experimental.constrained.minnum.v1f32(
+                               <1 x float> <float 42.0>, <1 x float> <float 41.0>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %min
+}
+
+define <2 x double> @constrained_vector_minnum_v2f64() {
+; CHECK-LABEL: constrained_vector_minnum_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %min = call <2 x double> @llvm.experimental.constrained.minnum.v2f64(
+                                <2 x double> <double 43.0, double 42.0>,
+                                <2 x double> <double 41.0, double 40.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %min
+}
+
+define <3 x float> @constrained_vector_minnum_v3f32() {
+; CHECK-LABEL: constrained_vector_minnum_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq fminf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %min = call <3 x float> @llvm.experimental.constrained.minnum.v3f32(
+                              <3 x float> <float 43.0, float 44.0, float 45.0>,
+                              <3 x float> <float 41.0, float 42.0, float 43.0>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %min
+}
+
+define <3 x double> @constrained_vector_min_v3f64() {
+; CHECK-LABEL: constrained_vector_min_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+ %min = call <3 x double> @llvm.experimental.constrained.minnum.v3f64(
+                          <3 x double> <double 43.0, double 44.0, double 45.0>,
+                          <3 x double> <double 40.0, double 41.0, double 42.0>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %min
+}
+
+define <4 x double> @constrained_vector_minnum_v4f64() {
+; CHECK-LABEL: constrained_vector_minnum_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    callq fmin
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %min = call <4 x double> @llvm.experimental.constrained.minnum.v4f64(
+                                <4 x double> <double 44.0, double 45.0,
+                                              double 46.0, double 47.0>,
+                                <4 x double> <double 40.0, double 41.0,
+                                              double 42.0, double 43.0>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <4 x double> %min
+}
+
+define <1 x float> @constrained_vector_ceil_v1f32() {
+; CHECK-LABEL: constrained_vector_ceil_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %ceil
+}
+
+define <2 x double> @constrained_vector_ceil_v2f64() {
+; CHECK-LABEL: constrained_vector_ceil_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %ceil
+}
+
+define <3 x float> @constrained_vector_ceil_v3f32() {
+; CHECK-LABEL: constrained_vector_ceil_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq ceilf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %ceil
+}
+
+define <3 x double> @constrained_vector_ceil_v3f64() {
+; CHECK-LABEL: constrained_vector_ceil_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq ceil
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %ceil
+}
+
+define <1 x float> @constrained_vector_floor_v1f32() {
+; CHECK-LABEL: constrained_vector_floor_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %floor
+}
+
+
+define <2 x double> @constrained_vector_floor_v2f64() {
+; CHECK-LABEL: constrained_vector_floor_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %floor
+}
+
+define <3 x float> @constrained_vector_floor_v3f32() {
+; CHECK-LABEL: constrained_vector_floor_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq floorf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %floor
+}
+
+define <3 x double> @constrained_vector_floor_v3f64() {
+; CHECK-LABEL: constrained_vector_floor_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq floor
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %floor
+}
+
+define <1 x float> @constrained_vector_round_v1f32() {
+; CHECK-LABEL: constrained_vector_round_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <1 x float> @llvm.experimental.constrained.round.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %round
+}
+
+define <2 x double> @constrained_vector_round_v2f64() {
+; CHECK-LABEL: constrained_vector_round_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <2 x double> @llvm.experimental.constrained.round.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %round
+}
+
+define <3 x float> @constrained_vector_round_v3f32() {
+; CHECK-LABEL: constrained_vector_round_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq roundf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <3 x float> @llvm.experimental.constrained.round.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %round
+}
+
+
+define <3 x double> @constrained_vector_round_v3f64() {
+; CHECK-LABEL: constrained_vector_round_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq round
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %round = call <3 x double> @llvm.experimental.constrained.round.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %round
+}
+
+define <1 x float> @constrained_vector_trunc_v1f32() {
+; CHECK-LABEL: constrained_vector_trunc_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <1 x float> @llvm.experimental.constrained.trunc.v1f32(
+                               <1 x float> <float 1.5>,
+                               metadata !"round.dynamic",
+                               metadata !"fpexcept.strict")
+  ret <1 x float> %trunc
+}
+
+define <2 x double> @constrained_vector_trunc_v2f64() {
+; CHECK-LABEL: constrained_vector_trunc_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <2 x double> @llvm.experimental.constrained.trunc.v2f64(
+                                <2 x double> <double 1.1, double 1.9>,
+                                metadata !"round.dynamic",
+                                metadata !"fpexcept.strict")
+  ret <2 x double> %trunc
+}
+
+define <3 x float> @constrained_vector_trunc_v3f32() {
+; CHECK-LABEL: constrained_vector_trunc_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    callq truncf
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <3 x float> @llvm.experimental.constrained.trunc.v3f32(
+                              <3 x float> <float 1.5, float 2.5, float 3.5>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict")
+  ret <3 x float> %trunc
+}
+
+define <3 x double> @constrained_vector_trunc_v3f64() {
+; CHECK-LABEL: constrained_vector_trunc_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    callq trunc
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd (%rsp), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %trunc = call <3 x double> @llvm.experimental.constrained.trunc.v3f64(
+                          <3 x double> <double 1.1, double 1.9, double 1.5>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict")
+  ret <3 x double> %trunc
+}
+
+
 ; Single width declarations
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double>, <2 x double>, metadata, metadata)
-declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)
-declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata)
@@ -3688,6 +2844,12 @@ declare <2 x double> @llvm.experimental.constrained.log10.v2f64(<2 x double>, me
 declare <2 x double> @llvm.experimental.constrained.log2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.maxnum.v2f64(<2 x double>, <2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.minnum.v2f64(<2 x double>, <2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata, metadata)
 
 ; Scalar width declarations
 declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata)
@@ -3695,7 +2857,6 @@ declare <1 x float> @llvm.experimental.constrained.fsub.v1f32(<1 x float>, <1 x
 declare <1 x float> @llvm.experimental.constrained.fmul.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.fdiv.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.frem.v1f32(<1 x float>, <1 x float>, metadata, metadata)
-declare <1 x float> @llvm.experimental.constrained.fma.v1f32(<1 x float>, <1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.sqrt.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata)
@@ -3708,6 +2869,12 @@ declare <1 x float> @llvm.experimental.constrained.log10.v1f32(<1 x float>, meta
 declare <1 x float> @llvm.experimental.constrained.log2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.rint.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.maxnum.v1f32(<1 x float>, <1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.minnum.v1f32(<1 x float>, <1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.round.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.trunc.v1f32(<1 x float>, metadata, metadata)
 
 ; Illegal width declarations
 declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata)
@@ -3720,8 +2887,6 @@ declare <3 x float> @llvm.experimental.constrained.fdiv.v3f32(<3 x float>, <3 x
 declare <3 x double> @llvm.experimental.constrained.fdiv.v3f64(<3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.frem.v3f32(<3 x float>, <3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.frem.v3f64(<3 x double>, <3 x double>, metadata, metadata)
-declare <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float>, <3 x float>, <3 x float>, metadata, metadata)
-declare <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double>, <3 x double>, <3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.sqrt.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.sqrt.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.pow.v3f32(<3 x float>, <3 x float>, metadata, metadata)
@@ -3746,6 +2911,18 @@ declare <3 x float> @llvm.experimental.constrained.rint.v3f32(<3 x float>, metad
 declare <3 x double> @llvm.experimental.constrained.rint.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.maxnum.v3f32(<3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.maxnum.v3f64(<3 x double>, <3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.minnum.v3f32(<3 x float>, <3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.minnum.v3f64(<3 x double>, <3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.ceil.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.ceil.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.floor.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.floor.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.round.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.round.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.trunc.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.trunc.v3f64(<3 x double>, metadata, metadata)
 
 ; Double width declarations
 declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata)
@@ -3753,8 +2930,6 @@ declare <4 x double> @llvm.experimental.constrained.fsub.v4f64(<4 x double>, <4
 declare <4 x double> @llvm.experimental.constrained.fmul.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.fdiv.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.frem.v4f64(<4 x double>, <4 x double>, metadata, metadata)
-declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata)
-declare <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.sqrt.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata)
@@ -3767,3 +2942,10 @@ declare <4 x double> @llvm.experimental.constrained.log10.v4f64(<4 x double>, me
 declare <4 x double> @llvm.experimental.constrained.log2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.rint.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.maxnum.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.minnum.v4f64(<4 x double>, <4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata, metadata)
+
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index 86bb13f57ebf285047454e1004c1283111027d87..d790cb54b61f123483357a7dd23c9964f1674263 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -13,6 +13,7 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    subl $384, %esp # imm = 0x180
 ; X32-SSE-NEXT:    movl 88(%ebp), %ecx
 ; X32-SSE-NEXT:    movdqa 72(%ebp), %xmm0
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
@@ -21,7 +22,6 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
 ; X32-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/vector-idiv-v2i32.ll b/test/CodeGen/X86/vector-idiv-v2i32.ll
index 49e29ac17a59902255c547a3652459f1481384b3..00126d675322aa2001e5a9a7a95de1826f2dbcf4 100644
--- a/test/CodeGen/X86/vector-idiv-v2i32.ll
+++ b/test/CodeGen/X86/vector-idiv-v2i32.ll
@@ -693,20 +693,20 @@ define void @test_sdiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X86-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-NEXT:    movdqa {{.*#+}} xmm3 = [31,0,31,0]
-; X86-NEXT:    movdqa %xmm2, %xmm4
-; X86-NEXT:    psrlq %xmm3, %xmm4
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [31,0,31,0]
+; X86-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-NEXT:    movdqa %xmm3, %xmm4
+; X86-NEXT:    psrlq %xmm2, %xmm4
 ; X86-NEXT:    movl $31, %ecx
 ; X86-NEXT:    movd %ecx, %xmm5
-; X86-NEXT:    psrlq %xmm5, %xmm2
-; X86-NEXT:    movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
+; X86-NEXT:    psrlq %xmm5, %xmm3
+; X86-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
 ; X86-NEXT:    movdqa %xmm1, %xmm4
-; X86-NEXT:    psrlq %xmm3, %xmm4
+; X86-NEXT:    psrlq %xmm2, %xmm4
 ; X86-NEXT:    psrlq %xmm5, %xmm1
 ; X86-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
-; X86-NEXT:    xorpd %xmm2, %xmm1
-; X86-NEXT:    psubq %xmm2, %xmm1
+; X86-NEXT:    xorpd %xmm3, %xmm1
+; X86-NEXT:    psubq %xmm3, %xmm1
 ; X86-NEXT:    pand {{\.LCPI.*}}, %xmm1
 ; X86-NEXT:    psrlq $29, %xmm1
 ; X86-NEXT:    paddq %xmm0, %xmm1
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index dc945c84b19b5556fd58e4162c1f1710d2bc56aa..34ea33d576c68ff54e3ab2543273cbb2e863f4ed 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -37,18 +37,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlq $4, %xmm2
-; SSE2-NEXT:    paddq %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $4, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    psadbw %xmm2, %xmm0
@@ -77,18 +77,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddq %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlq $4, %xmm2
-; SSE3-NEXT:    paddq %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $4, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    psadbw %xmm2, %xmm0
@@ -303,18 +303,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm1
+; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrlq $4, %xmm2
-; SSE2-NEXT:    paddq %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $4, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    psadbw %xmm2, %xmm0
@@ -343,18 +343,18 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm1
+; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddq %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE3-NEXT:    psrlq $4, %xmm2
-; SSE3-NEXT:    paddq %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $4, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    psadbw %xmm2, %xmm0
@@ -566,18 +566,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrld $2, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -608,18 +608,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE3-NEXT:    pxor %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrld $2, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddd %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm2, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
@@ -808,18 +808,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm0, %xmm2
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrld $2, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
@@ -850,18 +850,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
 ; SSE3-NEXT:    pxor %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
+; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm0, %xmm2
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrld $2, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddd %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm2, %xmm0
+; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
@@ -1049,16 +1049,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddw %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1085,16 +1085,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddw %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddw %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
@@ -1255,16 +1255,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm0, %xmm1
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
+; SSE2-NEXT:    paddb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $4, %xmm2
-; SSE2-NEXT:    paddw %xmm1, %xmm2
+; SSE2-NEXT:    paddb %xmm1, %xmm2
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1291,16 +1291,16 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psrlw $1, %xmm0
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm0, %xmm1
+; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    pand %xmm0, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm1
 ; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    paddw %xmm2, %xmm1
+; SSE3-NEXT:    paddb %xmm2, %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $4, %xmm2
-; SSE3-NEXT:    paddw %xmm1, %xmm2
+; SSE3-NEXT:    paddb %xmm1, %xmm2
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index 10db0aeb25eb14b91cf1e977d1495eb70252c7c8..71a9ba19396864dc50c93e5e17067bf12ed85c32 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -172,15 +172,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-LABEL: testv16i32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -201,15 +201,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512DQ-LABEL: testv16i32:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -257,15 +257,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW-LABEL: testv16i32u:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -286,15 +286,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512DQ-LABEL: testv16i32u:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $2, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $4, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $8, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpsrld $16, %zmm0, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
index 9b05ce4485edb8128218deb0d3039e7a07d26517..c20dc09a6b2999d6508139cca3f99ae6bc3d23e4 100644
--- a/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -80,3 +80,21 @@ define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c,
   ret <4 x i32> %sub
 }
 
+; When extracting from a vector binop, the source width should be a multiple of the destination width.
+; https://bugs.llvm.org/show_bug.cgi?id=39511
+
+define <3 x float> @PR39511(<4 x float> %t0, <3 x float>* %b) {
+; SSE-LABEL: PR39511:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addps {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR39511:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %add = fadd <4 x float> %t0, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %ext = shufflevector <4 x float> %add, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %ext
+}
+
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index c963922adddb1ff3ec7ce01391d1bc9a80e48dc4..b75f61f3e6c07ac1b5e430122446b1e470ac658a 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -438,3 +438,107 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
   %zext = zext <2 x i1> %cmp to <2 x i64>
   ret <2 x i64> %zext
 }
+
+; Test that we optimize a zext of a vector setcc ne zero where all bits but the
+; lsb are known to be zero.
+define <8 x i32> @cmpne_knownzeros_zext_v8i16_v8i32(<8 x i16> %x) {
+; SSE2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $15, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    psrlw $15, %xmm0
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT:    movdqa %xmm2, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cmpne_knownzeros_zext_v8i16_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    retq
+  %a = lshr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %b = icmp ne <8 x i16> %a, zeroinitializer
+  %c = zext <8 x i1> %b to <8 x i32>
+  ret <8 x i32> %c
+}
+
+define <8 x i32> @cmpne_knownzeros_zext_v8i32_v8i32(<8 x i32> %x) {
+; SSE-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrld $31, %xmm0
+; SSE-NEXT:    psrld $31, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cmpne_knownzeros_zext_v8i32_v8i32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %a = lshr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %b = icmp ne <8 x i32> %a, zeroinitializer
+  %c = zext <8 x i1> %b to <8 x i32>
+  ret <8 x i32> %c
+}
+
+define <8 x i16> @cmpne_knownzeros_zext_v8i32_v8i16(<8 x i32> %x) {
+; SSE2-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrld $31, %xmm1
+; SSE2-NEXT:    psrld $31, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    psrld $31, %xmm1
+; SSE42-NEXT:    psrld $31, %xmm0
+; SSE42-NEXT:    packusdw %xmm1, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: cmpne_knownzeros_zext_v8i32_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %a = lshr <8 x i32> %x, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+  %b = icmp ne <8 x i32> %a, zeroinitializer
+  %c = zext <8 x i1> %b to <8 x i16>
+  ret <8 x i16> %c
+}
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index df42ebf272839b8633faf0a31d279ae892974474..16539f1b2d464f5bddd0e1484030ee013e3df15a 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -14,18 +14,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlq $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm0
+; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlq $4, %xmm1
-; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    psadbw %xmm0, %xmm1
@@ -35,18 +35,18 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE3-LABEL: testv2i64:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrlq $1, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubq %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm0
+; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddq %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrlq $4, %xmm1
-; SSE3-NEXT:    paddq %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    psadbw %xmm0, %xmm1
@@ -128,28 +128,16 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv2i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
@@ -161,18 +149,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-LABEL: testv4i32:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    psrld $2, %xmm0
+; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $4, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -187,18 +175,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE3-LABEL: testv4i32:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrld $1, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubd %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
-; SSE3-NEXT:    psrld $2, %xmm0
+; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddd %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSE3-NEXT:    psrld $4, %xmm1
-; SSE3-NEXT:    paddd %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    pxor %xmm0, %xmm0
 ; SSE3-NEXT:    movdqa %xmm1, %xmm2
@@ -303,32 +291,20 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv4i32:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; BITALG-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
@@ -346,16 +322,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -368,16 +344,16 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubw %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddw %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index b2cc2f1ebed6c13030e84a3db80ef55a310b66c1..570f59673d1ce95c4668c84df89a9f2b2baf9483 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -58,28 +58,15 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv4i64:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
@@ -151,14 +138,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv8i32:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
@@ -169,14 +150,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm2
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; BITALG-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index df5edc13c3ea42c9bb05f338ab509e9d206d641d..eae9e6c79bd4769e5cabeb64c49da7bd98022b4d 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -50,14 +50,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i64:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
@@ -122,14 +115,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv16i32:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm2
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
-; BITALG-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
diff --git a/test/CodeGen/X86/vector-reduce-add.ll b/test/CodeGen/X86/vector-reduce-add.ll
index 7a5e5f34ad389d491eb5d718a2942e7ef457138a..e0f6f194f50220b013a06c8d21164fabe1cf67af 100644
--- a/test/CodeGen/X86/vector-reduce-add.ll
+++ b/test/CodeGen/X86/vector-reduce-add.ll
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -107,7 +107,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -119,7 +119,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -169,7 +169,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -182,7 +182,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -195,28 +195,21 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ;
 
 define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    phaddd %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
@@ -224,7 +217,8 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    retq
   %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0)
@@ -232,24 +226,15 @@ define i32 @test_v4i32(<4 x i32> %a0) {
 }
 
 define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    phaddd %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v8i32:
 ; AVX1:       # %bb.0:
@@ -257,7 +242,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -268,7 +254,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -279,7 +266,8 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,28 +276,17 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 }
 
 define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm3, %xmm1
-; SSE41-NEXT:    paddd %xmm2, %xmm1
-; SSE41-NEXT:    paddd %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm1, %xmm0
-; SSE41-NEXT:    phaddd %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm3, %xmm1
+; SSE-NEXT:    paddd %xmm2, %xmm1
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i32:
 ; AVX1:       # %bb.0:
@@ -320,7 +297,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -332,7 +310,8 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -346,7 +325,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -355,36 +334,21 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 }
 
 define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm6, %xmm2
-; SSE2-NEXT:    paddd %xmm7, %xmm3
-; SSE2-NEXT:    paddd %xmm5, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    paddd %xmm4, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddd %xmm6, %xmm2
-; SSE41-NEXT:    paddd %xmm7, %xmm3
-; SSE41-NEXT:    paddd %xmm5, %xmm3
-; SSE41-NEXT:    paddd %xmm1, %xmm3
-; SSE41-NEXT:    paddd %xmm4, %xmm2
-; SSE41-NEXT:    paddd %xmm3, %xmm2
-; SSE41-NEXT:    paddd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    phaddd %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v32i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddd %xmm6, %xmm2
+; SSE-NEXT:    paddd %xmm7, %xmm3
+; SSE-NEXT:    paddd %xmm5, %xmm3
+; SSE-NEXT:    paddd %xmm1, %xmm3
+; SSE-NEXT:    paddd %xmm4, %xmm2
+; SSE-NEXT:    paddd %xmm3, %xmm2
+; SSE-NEXT:    paddd %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddd %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddd %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v32i32:
 ; AVX1:       # %bb.0:
@@ -401,7 +365,8 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -415,7 +380,8 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -430,7 +396,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -443,29 +409,18 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ;
 
 define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    phaddw %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8i16:
 ; AVX:       # %bb.0:
@@ -473,7 +428,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
@@ -484,7 +440,8 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    retq
@@ -493,31 +450,19 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 }
 
 define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    phaddw %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrld $16, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v16i16:
 ; AVX1:       # %bb.0:
@@ -527,7 +472,8 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -541,7 +487,8 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -555,7 +502,8 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -565,35 +513,21 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 }
 
 define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddw %xmm3, %xmm1
-; SSE2-NEXT:    paddw %xmm2, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm3, %xmm1
-; SSE41-NEXT:    paddw %xmm2, %xmm1
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    phaddw %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v32i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm3, %xmm1
+; SSE-NEXT:    paddw %xmm2, %xmm1
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v32i16:
 ; AVX1:       # %bb.0:
@@ -606,7 +540,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -621,7 +556,8 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -638,7 +574,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -648,43 +584,25 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 }
 
 define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddw %xmm6, %xmm2
-; SSE2-NEXT:    paddw %xmm7, %xmm3
-; SSE2-NEXT:    paddw %xmm5, %xmm3
-; SSE2-NEXT:    paddw %xmm1, %xmm3
-; SSE2-NEXT:    paddw %xmm4, %xmm2
-; SSE2-NEXT:    paddw %xmm3, %xmm2
-; SSE2-NEXT:    paddw %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    paddw %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddw %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddw %xmm6, %xmm2
-; SSE41-NEXT:    paddw %xmm7, %xmm3
-; SSE41-NEXT:    paddw %xmm5, %xmm3
-; SSE41-NEXT:    paddw %xmm1, %xmm3
-; SSE41-NEXT:    paddw %xmm4, %xmm2
-; SSE41-NEXT:    paddw %xmm3, %xmm2
-; SSE41-NEXT:    paddw %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT:    paddw %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    paddw %xmm0, %xmm1
-; SSE41-NEXT:    phaddw %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, %eax
-; SSE41-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v64i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    paddw %xmm6, %xmm2
+; SSE-NEXT:    paddw %xmm7, %xmm3
+; SSE-NEXT:    paddw %xmm5, %xmm3
+; SSE-NEXT:    paddw %xmm1, %xmm3
+; SSE-NEXT:    paddw %xmm4, %xmm2
+; SSE-NEXT:    paddw %xmm3, %xmm2
+; SSE-NEXT:    paddw %xmm0, %xmm2
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE-NEXT:    paddw %xmm2, %xmm0
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE-NEXT:    paddw %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrld $16, %xmm0
+; SSE-NEXT:    paddw %xmm1, %xmm0
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v64i16:
 ; AVX1:       # %bb.0:
@@ -703,7 +621,8 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -720,7 +639,8 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -738,7 +658,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -878,7 +798,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -895,7 +815,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -975,7 +895,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -994,7 +914,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -1090,7 +1010,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1110,7 +1030,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-and.ll b/test/CodeGen/X86/vector-reduce-and.ll
index 89ae9510cdcd419f4615b7d73f6f5f7577c012e3..305464e3707141ba67775ff9af33bfe1e6e18a69 100644
--- a/test/CodeGen/X86/vector-reduce-and.ll
+++ b/test/CodeGen/X86/vector-reduce-and.ll
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -94,7 +94,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -105,7 +105,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -117,7 +117,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -148,7 +148,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -161,7 +161,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -174,7 +174,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -235,7 +235,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -247,7 +247,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -259,7 +259,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,7 +288,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -301,7 +301,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-LABEL: test_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -350,7 +350,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -365,22 +365,22 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v32i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -457,7 +457,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -472,7 +472,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -487,7 +487,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -523,7 +523,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -539,7 +539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -598,7 +598,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -616,7 +616,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -634,7 +634,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -757,7 +757,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -774,7 +774,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -791,7 +791,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -851,7 +851,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -869,7 +869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -888,7 +888,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -958,7 +958,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -978,7 +978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -998,7 +998,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/test/CodeGen/X86/vector-reduce-fadd-fast.ll
index b17734b83e77651bade0b5c62b2efd928d461c08..9dadf969a0edf284f14d94633d9ea89aaa895ca2 100644
--- a/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -20,18 +20,20 @@ define float @test_v2f32(float %a0, <2 x float> %a1) {
 ;
 ; SSE41-LABEL: test_v2f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm1, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddps %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
   ret float %1
@@ -50,24 +52,27 @@ define float @test_v4f32(float %a0, <4 x float> %a1) {
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    addps %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    addps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
 ; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
   ret float %1
@@ -88,10 +93,11 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; SSE41-LABEL: test_v8f32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    addps %xmm2, %xmm1
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    addps %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    addps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f32:
@@ -100,8 +106,8 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -111,8 +117,8 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
@@ -138,10 +144,11 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; SSE41-NEXT:    addps %xmm4, %xmm2
 ; SSE41-NEXT:    addps %xmm3, %xmm1
 ; SSE41-NEXT:    addps %xmm2, %xmm1
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movaps %xmm1, %xmm2
+; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT:    addps %xmm1, %xmm2
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE41-NEXT:    addps %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f32:
@@ -151,8 +158,8 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -165,8 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
@@ -188,17 +194,20 @@ define float @test_v2f32_zero(<2 x float> %a0) {
 ;
 ; SSE41-LABEL: test_v2f32_zero:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f32_zero:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
   ret float %1
@@ -220,7 +229,8 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -228,14 +238,16 @@ define float @test_v4f32_zero(<4 x float> %a0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f32_zero:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
   ret float %1
@@ -259,7 +271,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -269,8 +282,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -280,8 +293,8 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
@@ -310,7 +323,8 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -321,8 +335,8 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -335,8 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
@@ -358,17 +371,20 @@ define float @test_v2f32_undef(<2 x float> %a0) {
 ;
 ; SSE41-LABEL: test_v2f32_undef:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddps %xmm0, %xmm0
+; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT:    addps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f32_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f32_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
   ret float %1
@@ -390,7 +406,8 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -398,14 +415,16 @@ define float @test_v4f32_undef(<4 x float> %a0) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4f32_undef:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
   ret float %1
@@ -429,7 +448,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -439,8 +459,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -450,8 +470,8 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
@@ -480,7 +500,8 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; SSE41-NEXT:    movaps %xmm0, %xmm1
 ; SSE41-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
 ; SSE41-NEXT:    addps %xmm0, %xmm1
-; SSE41-NEXT:    haddps %xmm1, %xmm1
+; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT:    addps %xmm0, %xmm1
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -491,8 +512,8 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -505,8 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
@@ -518,54 +538,43 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ;
 
 define double @test_v2f64(double %a0, <2 x double> %a1) {
-; SSE2-LABEL: test_v2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm1, %xmm1, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddpd %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
   ret double %1
 }
 
 define double @test_v4f64(double %a0, <4 x double> %a1) {
-; SSE2-LABEL: test_v4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -573,8 +582,8 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX512-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
@@ -582,32 +591,23 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 }
 
 define double @test_v8f64(double %a0, <8 x double> %a1) {
-; SSE2-LABEL: test_v8f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm4, %xmm2
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    addpd %xmm4, %xmm2
-; SSE41-NEXT:    addpd %xmm3, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm4, %xmm2
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddpd %ymm2, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -618,8 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
@@ -627,32 +626,19 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 }
 
 define double @test_v16f64(double %a0, <16 x double> %a1) {
-; SSE2-LABEL: test_v16f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm6, %xmm2
-; SSE2-NEXT:    addpd %xmm7, %xmm3
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT:    addpd %xmm2, %xmm4
-; SSE2-NEXT:    addpd %xmm1, %xmm4
-; SSE2-NEXT:    movapd %xmm4, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT:    addpd %xmm4, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movapd %xmm4, %xmm0
-; SSE41-NEXT:    addpd %xmm6, %xmm2
-; SSE41-NEXT:    addpd %xmm7, %xmm3
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm6, %xmm2
+; SSE-NEXT:    addpd %xmm7, %xmm3
+; SSE-NEXT:    addpd %xmm5, %xmm1
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT:    addpd %xmm2, %xmm4
+; SSE-NEXT:    addpd %xmm1, %xmm4
+; SSE-NEXT:    movapd %xmm4, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE-NEXT:    addpd %xmm4, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64:
 ; AVX:       # %bb.0:
@@ -661,8 +647,8 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -674,8 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
@@ -687,54 +672,45 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ;
 
 define double @test_v2f64_zero(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v2f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_zero:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
   ret double %1
 }
 
 define double @test_v4f64_zero(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -742,8 +718,8 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
@@ -751,32 +727,24 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 }
 
 define double @test_v8f64_zero(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -787,8 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
@@ -796,32 +763,19 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 }
 
 define double @test_v16f64_zero(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm6, %xmm2
-; SSE2-NEXT:    addpd %xmm4, %xmm0
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm7, %xmm3
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16f64_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm6, %xmm2
-; SSE41-NEXT:    addpd %xmm4, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm7, %xmm3
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm0, %xmm1
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16f64_zero:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm6, %xmm2
+; SSE-NEXT:    addpd %xmm4, %xmm0
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm7, %xmm3
+; SSE-NEXT:    addpd %xmm5, %xmm1
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64_zero:
 ; AVX:       # %bb.0:
@@ -830,8 +784,8 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -843,8 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
@@ -856,54 +809,45 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ;
 
 define double @test_v2f64_undef(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v2f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v2f64_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
   ret double %1
 }
 
 define double @test_v4f64_undef(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v4f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f64_undef:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -911,8 +855,8 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
@@ -920,32 +864,24 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 }
 
 define double @test_v8f64_undef(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movapd %xmm0, %xmm1
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    haddpd %xmm0, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v8f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    movapd %xmm0, %xmm1
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v8f64_undef:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -956,8 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
@@ -965,32 +900,19 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 }
 
 define double @test_v16f64_undef(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    addpd %xmm6, %xmm2
-; SSE2-NEXT:    addpd %xmm4, %xmm0
-; SSE2-NEXT:    addpd %xmm2, %xmm0
-; SSE2-NEXT:    addpd %xmm7, %xmm3
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    addpd %xmm3, %xmm1
-; SSE2-NEXT:    addpd %xmm0, %xmm1
-; SSE2-NEXT:    movapd %xmm1, %xmm0
-; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16f64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    addpd %xmm6, %xmm2
-; SSE41-NEXT:    addpd %xmm4, %xmm0
-; SSE41-NEXT:    addpd %xmm2, %xmm0
-; SSE41-NEXT:    addpd %xmm7, %xmm3
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    addpd %xmm3, %xmm1
-; SSE41-NEXT:    addpd %xmm0, %xmm1
-; SSE41-NEXT:    haddpd %xmm1, %xmm1
-; SSE41-NEXT:    movapd %xmm1, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: test_v16f64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    addpd %xmm6, %xmm2
+; SSE-NEXT:    addpd %xmm4, %xmm0
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    addpd %xmm7, %xmm3
+; SSE-NEXT:    addpd %xmm5, %xmm1
+; SSE-NEXT:    addpd %xmm3, %xmm1
+; SSE-NEXT:    addpd %xmm0, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT:    addpd %xmm1, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_v16f64_undef:
 ; AVX:       # %bb.0:
@@ -999,8 +921,8 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -1012,8 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
diff --git a/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/test/CodeGen/X86/vector-reduce-fmul-fast.ll
index 4c093562cb5833f2a72bc8618f4171f8a78e609b..efacbf1e3b4b710907e08b2511026fa5d4b77eb0 100644
--- a/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ b/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -107,8 +107,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -119,8 +118,7 @@ define float @test_v8f32(float %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
@@ -161,8 +159,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -175,8 +172,7 @@ define float @test_v16f32(float %a0, <16 x float> %a1) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
@@ -287,8 +283,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -299,8 +294,7 @@ define float @test_v8f32_zero(<8 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
@@ -342,8 +336,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -356,8 +349,7 @@ define float @test_v16f32_zero(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
@@ -468,8 +460,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -480,8 +471,7 @@ define float @test_v8f32_undef(<8 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
@@ -523,8 +513,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -537,8 +526,7 @@ define float @test_v16f32_undef(<16 x float> %a0) {
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
@@ -586,8 +574,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -596,8 +583,7 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
 ; AVX512-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
@@ -621,8 +607,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -633,8 +618,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
@@ -664,8 +648,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -677,8 +660,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
@@ -728,8 +710,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -738,8 +719,7 @@ define double @test_v4f64_zero(<4 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
@@ -764,8 +744,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -776,8 +755,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
@@ -807,8 +785,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -820,8 +797,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
@@ -871,8 +847,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -881,8 +856,7 @@ define double @test_v4f64_undef(<4 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
@@ -907,8 +881,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -919,8 +892,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
@@ -950,8 +922,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
@@ -963,8 +934,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll
index 210c076d2a679df886e6fc3cd98c0485ed3ca387..58d712c35aacecaa16c3fede79828c30694339f5 100644
--- a/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/test/CodeGen/X86/vector-reduce-mul.ll
@@ -160,7 +160,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -184,7 +184,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BW-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -208,7 +208,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
 ; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -229,7 +229,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
@@ -352,7 +352,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -385,7 +385,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -418,7 +418,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -442,7 +442,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
@@ -655,7 +655,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -696,7 +696,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -737,7 +737,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -763,7 +763,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
 ; AVX512DQVL-NEXT:    vzeroupper
 ; AVX512DQVL-NEXT:    retq
@@ -872,7 +872,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -884,7 +884,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -955,7 +955,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -969,7 +969,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1064,7 +1064,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1079,7 +1079,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -1171,7 +1171,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1186,7 +1186,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -1240,7 +1240,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1257,7 +1257,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -1274,7 +1274,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -1290,7 +1290,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1306,7 +1306,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
@@ -1372,7 +1372,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -1390,7 +1390,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BW-NEXT:    vzeroupper
@@ -1408,7 +1408,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512BWVL-NEXT:    vzeroupper
@@ -1426,7 +1426,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1444,7 +1444,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512DQVL-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-or.ll b/test/CodeGen/X86/vector-reduce-or.ll
index 04ec6cfc970ee1f71d6d51d90382d92dfb685305..1b67c94e4ec5b07073b4a50b5f9ab6e81281edc9 100644
--- a/test/CodeGen/X86/vector-reduce-or.ll
+++ b/test/CodeGen/X86/vector-reduce-or.ll
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -94,7 +94,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -105,7 +105,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -117,7 +117,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -148,7 +148,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -161,7 +161,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -174,7 +174,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -235,7 +235,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -247,7 +247,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -259,7 +259,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,7 +288,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -301,7 +301,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-LABEL: test_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -350,7 +350,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -365,22 +365,22 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v32i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -457,7 +457,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -472,7 +472,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -487,7 +487,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -523,7 +523,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -539,7 +539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -598,7 +598,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -616,7 +616,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -634,7 +634,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -757,7 +757,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -774,7 +774,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -791,7 +791,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -851,7 +851,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -869,7 +869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -888,7 +888,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -958,7 +958,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -978,7 +978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -998,7 +998,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-reduce-umax.ll b/test/CodeGen/X86/vector-reduce-umax.ll
index 680a5c52e6307c3690a2d0c841cc9ced17887af9..52b42ce9bcbf9dd6e6c4ce7bc164b93df9f3ac60 100644
--- a/test/CodeGen/X86/vector-reduce-umax.ll
+++ b/test/CodeGen/X86/vector-reduce-umax.ll
@@ -1141,15 +1141,14 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1207,20 +1206,19 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1296,35 +1294,30 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1406,47 +1399,38 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
 ; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    pmaxsw %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm4, %xmm0
 ; SSE2-NEXT:    pmaxsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pmaxsw %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm5, %xmm1
 ; SSE2-NEXT:    pmaxsw %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pmaxsw %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pmaxsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-reduce-umin.ll b/test/CodeGen/X86/vector-reduce-umin.ll
index 52adee5ab26ef7d8afefee79893aec9115e037ad..32a1cdf0f171ccfef29229db76a269acc3b69f43 100644
--- a/test/CodeGen/X86/vector-reduce-umin.ll
+++ b/test/CodeGen/X86/vector-reduce-umin.ll
@@ -1140,15 +1140,14 @@ define i16 @test_v8i16(<8 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1187,20 +1186,19 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pminsw %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
 ; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm1
@@ -1253,35 +1251,30 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; SSE2-LABEL: test_v32i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pminsw %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm4, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pminsw %xmm3, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    pminsw %xmm1, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm2
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pminsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1338,47 +1331,38 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; SSE2-LABEL: test_v64i16:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pminsw %xmm5, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    pminsw %xmm7, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pminsw %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm6
 ; SSE2-NEXT:    pxor %xmm8, %xmm2
 ; SSE2-NEXT:    pminsw %xmm6, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
 ; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm4, %xmm0
 ; SSE2-NEXT:    pminsw %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm1
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    pminsw %xmm7, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pminsw %xmm5, %xmm1
 ; SSE2-NEXT:    pminsw %xmm3, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
 ; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    pminsw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pminsw %xmm1, %xmm0
+; SSE2-NEXT:    psrld $16, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
+; SSE2-NEXT:    pminsw %xmm1, %xmm0
 ; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    pminsw %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-reduce-xor.ll b/test/CodeGen/X86/vector-reduce-xor.ll
index cb69ee80ee46f40b8cac7bf39b9e6e32970b6c9d..0192ff3c923a5d0b243ea83df5fa49cc9ee4eec3 100644
--- a/test/CodeGen/X86/vector-reduce-xor.ll
+++ b/test/CodeGen/X86/vector-reduce-xor.ll
@@ -49,7 +49,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -59,7 +59,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -69,7 +69,7 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -94,7 +94,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -105,7 +105,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -117,7 +117,7 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -148,7 +148,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -161,7 +161,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -174,7 +174,7 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -235,7 +235,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -247,7 +247,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -259,7 +259,7 @@ define i32 @test_v8i32(<8 x i32> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -288,7 +288,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -301,7 +301,7 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -309,13 +309,13 @@ define i32 @test_v16i32(<16 x i32> %a0) {
 ; AVX512-LABEL: test_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -350,7 +350,7 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -365,22 +365,22 @@ define i32 @test_v32i32(<32 x i32> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v32i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -457,7 +457,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -472,7 +472,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -487,7 +487,7 @@ define i16 @test_v16i16(<16 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -523,7 +523,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -539,7 +539,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define i16 @test_v32i16(<32 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -598,7 +598,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovd %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -616,7 +616,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -634,7 +634,7 @@ define i16 @test_v64i16(<64 x i16> %a0) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -757,7 +757,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -774,7 +774,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -791,7 +791,7 @@ define i8 @test_v32i8(<32 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -851,7 +851,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -869,7 +869,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -888,7 +888,7 @@ define i8 @test_v64i8(<64 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -958,7 +958,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1-NEXT:    vzeroupper
@@ -978,7 +978,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
@@ -998,7 +998,7 @@ define i8 @test_v128i8(<128 x i8> %a0) {
 ; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index dd5a5b013954d3c16756d24395680295d29136ef..75fe0b322ca12e3b626e1a7bba0ed541545cd81b 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -690,8 +690,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX2-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -702,8 +702,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX512F-NEXT:    vpbroadcastw %xmm1, %ymm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
@@ -714,8 +714,8 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
 ; AVX512VL-NEXT:    vpbroadcastw %xmm1, %ymm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll
index eb51c1029dee1f7f17fa623111932d00b47bc928..f838f1b54dbf8424ecb59d2210a11bb13c1b7899 100644
--- a/test/CodeGen/X86/vector-rotate-512.ll
+++ b/test/CodeGen/X86/vector-rotate-512.ll
@@ -316,8 +316,8 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
 ; AVX512F-NEXT:    vpbroadcastw %xmm2, %ymm3
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512F-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
 ; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 ; AVX512F-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpor %ymm0, %ymm4, %ymm0
@@ -331,8 +331,8 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
 ; AVX512VL-NEXT:    vpbroadcastw %xmm2, %ymm3
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm4
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT:    vpsubw %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm3, %xmm5, %xmm3
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpor %ymm0, %ymm4, %ymm0
@@ -468,14 +468,14 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512BW-LABEL: splatvar_rotate_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm2
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpsubb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm4, %zmm1
 ; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsrlw %xmm2, %zmm4, %zmm2
@@ -488,14 +488,14 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm2
-; AVX512VLBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT:    vpsubb %zmm2, %zmm3, %zmm2
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm3
 ; AVX512VLBW-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4
 ; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm4, %zmm1
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpandq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpsrlw %xmm2, %zmm4, %zmm2
@@ -876,7 +876,7 @@ define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
 ; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vprold $4, %zmm0, %zmm0
-; AVX512-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpandd {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
   %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
@@ -980,10 +980,8 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
@@ -991,10 +989,8 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    vpandq {{.*}}(%rip), %zmm1, %zmm1
 ; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
 ; AVX512VLBW-NEXT:    retq
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index b5f3b76c00cac481b0ddcc3f0a5dfcd2011de4b1..9576f6482fd7a71dfba192109ab274261f592abc 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -46,20 +46,20 @@ entry:
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_16i16:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i16:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT:    psraw $8, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSSE3-NEXT:    psraw $8, %xmm1
-; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i16:
@@ -103,30 +103,32 @@ entry:
 define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_32i8_to_32i16:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    psraw $8, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    psraw $8, %xmm0
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    psraw $8, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; SSE2-NEXT:    psraw $8, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm3
-; SSE2-NEXT:    movdqa %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_32i8_to_32i16:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSSE3-NEXT:    psraw $8, %xmm4
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSSE3-NEXT:    psraw $8, %xmm5
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    psraw $8, %xmm0
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    psraw $8, %xmm1
+; SSSE3-NEXT:    movdqa %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
 ; SSSE3-NEXT:    psraw $8, %xmm2
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSSE3-NEXT:    psraw $8, %xmm3
-; SSSE3-NEXT:    movdqa %xmm4, %xmm0
-; SSSE3-NEXT:    movdqa %xmm5, %xmm1
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_32i8_to_32i16:
@@ -230,24 +232,22 @@ entry:
 define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_8i32:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    psrad $24, %xmm1
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_8i32:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSSE3-NEXT:    psrad $24, %xmm1
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_8i32:
@@ -292,37 +292,34 @@ entry:
 define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_16i32:
 ; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
 ; SSE2-NEXT:    psrad $24, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-NEXT:    psrad $24, %xmm3
 ; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i32:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    psrad $24, %xmm2
-; SSSE3-NEXT:    movdqa %xmm3, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSSE3-NEXT:    psrad $24, %xmm4
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT:    psrad $24, %xmm1
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,12,u,u,u,13,u,u,u,14,u,u,u,15]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSSE3-NEXT:    psrad $24, %xmm3
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i32:
@@ -424,14 +421,13 @@ entry:
 define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_4i64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    psrad $24, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psrad $31, %xmm0
@@ -442,18 +438,19 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
 ;
 ; SSSE3-LABEL: sext_16i8_to_4i64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,u,u,u,3,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    psrad $24, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_4i64:
@@ -498,63 +495,62 @@ entry:
 define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: sext_16i8_to_8i64:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    psrad $24, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT:    psrld $16, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    psrad $24, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_8i64:
 ; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
-; SSSE3-NEXT:    pshufb %xmm2, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm1
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    psrad $24, %xmm4
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
-; SSSE3-NEXT:    psrad $24, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSSE3-NEXT:    pshufb %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    psrad $24, %xmm3
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm4
-; SSSE3-NEXT:    psrad $31, %xmm4
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    psrad $24, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm0
+; SSSE3-NEXT:    psrad $31, %xmm0
+; SSSE3-NEXT:    psrad $24, %xmm3
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSSE3-NEXT:    movdqa %xmm4, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: sext_16i8_to_8i64:
@@ -1291,7 +1287,8 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
 ; SSSE3:       # %bb.0: # %entry
 ; SSSE3-NEXT:    movzwl (%rdi), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $31, %xmm1
 ; SSSE3-NEXT:    psrad $24, %xmm0
@@ -5064,7 +5061,8 @@ define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movzwl (%rdi), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,u,u,u,1,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSSE3-NEXT:    psrad $24, %xmm0
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; SSSE3-NEXT:    paddq %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index c944902d0a38bcb123879b3e2aa4c70bca50aabb..584a54e68e8673cc9179f35a00fffa69a0054ddc 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -990,15 +990,11 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
 ;
 ; X32-SSE-LABEL: constant_shift_v2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X32-SSE-NEXT:    psrlq $1, %xmm2
-; X32-SSE-NEXT:    psrlq $7, %xmm1
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X32-SSE-NEXT:    psrlq $1, %xmm2
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X32-SSE-NEXT:    psrlq $1, %xmm1
 ; X32-SSE-NEXT:    psrlq $7, %xmm0
-; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT:    movapd {{.*#+}} xmm1 = [2.0E+0,7.2911220195563975E-304]
 ; X32-SSE-NEXT:    xorpd %xmm1, %xmm0
 ; X32-SSE-NEXT:    psubq %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 7f120166a5d4b2e5708bf234be471193f9136e84..6d79996164f2a7425fe0058a2b24014928df4311 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -1066,25 +1066,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
 ;
 ; X32-AVX1-LABEL: constant_shift_v4i64:
 ; X32-AVX1:       # %bb.0:
-; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
-; X32-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm3
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; X32-AVX1-NEXT:    vpsrlq $62, %xmm3, %xmm4
-; X32-AVX1-NEXT:    vpsrlq $31, %xmm3, %xmm3
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
-; X32-AVX1-NEXT:    vpxor %xmm2, %xmm3, %xmm3
-; X32-AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; X32-AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm3
-; X32-AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
-; X32-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm3
+; X32-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,1,0,2,0,0,0]
+; X32-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
 ; X32-AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; X32-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; X32-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,0,16384,0,0,0,256]
+; X32-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX1-NEXT:    retl
 ;
 ; X32-AVX2-LABEL: constant_shift_v4i64:
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index bd77311479bc145900812b7a2536eaba40ae25fd..7ce33dcfe242e219a520199f0d78d06ab4742847 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -373,8 +373,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $4, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
@@ -382,16 +382,16 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psrlw $2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pandn %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -488,8 +488,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
@@ -497,16 +497,16 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psrlw $2, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
 ; X32-SSE-NEXT:    movdqa %xmm2, %xmm1
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm1
 ; X32-SSE-NEXT:    psrlw $1, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %shift = lshr <16 x i8> %a, %b
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 67963b1f992fd25628cc931e80f69d8028ac8c4d..a26fccd44c8e144ee429ac41ae40e3e5588cb80b 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -295,8 +295,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psllw $4, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
@@ -304,8 +304,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
 ; SSE2-NEXT:    pandn %xmm0, %xmm4
 ; SSE2-NEXT:    psllw $2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    por %xmm4, %xmm0
 ; SSE2-NEXT:    paddb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
@@ -405,8 +405,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psllw $4, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pxor %xmm3, %xmm3
@@ -414,8 +414,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
 ; X32-SSE-NEXT:    pandn %xmm0, %xmm4
 ; X32-SSE-NEXT:    psllw $2, %xmm0
-; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    pand %xmm3, %xmm0
+; X32-SSE-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    por %xmm4, %xmm0
 ; X32-SSE-NEXT:    paddb %xmm1, %xmm1
 ; X32-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 65335572229848bf853d02e6f0d449790d0c76f6..bf34c0332dd5fd7a16ef3a60c8309eb36edbeb43 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -579,6 +579,96 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
   ret <16 x i8> %shuffle
 }
 
+; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
+
+define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
+; SSE2-LABEL: load_fold_pblendvb:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andnps (%rdi), %xmm1
+; SSE2-NEXT:    orps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_fold_pblendvb:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3],zero,zero,zero,xmm0[7,8,9],zero,xmm0[11],zero,zero,zero,xmm0[15]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,xmm1[4,5,6],zero,zero,zero,xmm1[10],zero,xmm1[12,13,14],zero
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_fold_pblendvb:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_fold_pblendvb:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movw $29812, %ax # imm = 0x7474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 (%rdi), %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %x = load <16 x i8>, <16 x i8>* %px, align 16
+  %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i8> %select
+}
+
+define <16 x i8> @load_fold_pblendvb_commute(<16 x i8>* %px, <16 x i8> %y) {
+; SSE2-LABEL: load_fold_pblendvb_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE2-NEXT:    movaps %xmm1, %xmm2
+; SSE2-NEXT:    andnps %xmm0, %xmm2
+; SSE2-NEXT:    andps (%rdi), %xmm1
+; SSE2-NEXT:    orps %xmm2, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_fold_pblendvb_commute:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rdi), %xmm1
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,xmm0[4,5,6],zero,zero,zero,xmm0[10],zero,xmm0[12,13,14],zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3],zero,zero,zero,xmm1[7,8,9],zero,xmm1[11],zero,zero,zero,xmm1[15]
+; SSSE3-NEXT:    por %xmm1, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_fold_pblendvb_commute:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1OR2-LABEL: load_fold_pblendvb_commute:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb_commute:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512VL-NEXT:    movw $29812, %ax # imm = 0x7474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+  %x = load <16 x i8>, <16 x i8>* %px, align 16
+  %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
+  ret <16 x i8> %select
+}
+
 define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
 ; SSE2-LABEL: trunc_v4i32_shuffle:
 ; SSE2:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index e35f664e121aa7369d525e888dafccfa85058db6..0e4d5dcd3865e3e04143ad519083403e33d4f18c 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1998,8 +1998,8 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
 define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index de587beadc147b2c9f59273d731422d87c7c111e..461246d80a824d9b2db2d9886564e27b7292fe9f 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -846,7 +846,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
 define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41]
 ; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
@@ -863,7 +863,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3
 define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41]
 ; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
@@ -1956,14 +1956,8 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
 ;
 ; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[28,29],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
@@ -4052,6 +4046,30 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_02_04_06_01_03_05_07_31_30_29_28_27_26_25_24:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,1,3,5,7,31,30,29,28,27,26,25,24]
+; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
 ; AVX1:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 3e36b4a3b6a09e35d451feae85a8d296e705025a..c4759ab54f531dc463fa0629fa0174fd3f1d0d61 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1643,6 +1643,62 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
   ret <32 x i8> %shuffle
 }
 
+; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
+
+define <32 x i8> @load_fold_pblendvb(<32 x i8>* %px, <32 x i8> %y) {
+; AVX1-LABEL: load_fold_pblendvb:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303]
+; AVX1-NEXT:    vandnps (%rdi), %ymm1, %ymm2
+; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_fold_pblendvb:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movl $1953789044, %eax # imm = 0x74747474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 (%rdi), %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %x = load <32 x i8>, <32 x i8>* %px, align 32
+  %select = shufflevector <32 x i8> %x, <32 x i8> %y, <32 x i32> <i32 32, i32 33, i32 2, i32 35, i32 4, i32 5, i32 6, i32 39, i32 40, i32 41, i32 10, i32 43, i32 12, i32 13, i32 14, i32 47, i32 48, i32 49, i32 18, i32 51, i32 20, i32 21, i32 22, i32 55, i32 56, i32 57, i32 26, i32 59, i32 28, i32 29, i32 30, i32 63>
+  ret <32 x i8> %select
+}
+
+define <32 x i8> @load_fold_pblendvb_commute(<32 x i8>* %px, <32 x i8> %y) {
+; AVX1-LABEL: load_fold_pblendvb_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303]
+; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandps (%rdi), %ymm1, %ymm1
+; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_fold_pblendvb_commute:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
+; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: load_fold_pblendvb_commute:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512VL-NEXT:    movl $1953789044, %eax # imm = 0x74747474
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+  %x = load <32 x i8>, <32 x i8>* %px, align 32
+  %select = shufflevector <32 x i8> %y, <32 x i8> %x, <32 x i32> <i32 32, i32 33, i32 2, i32 35, i32 4, i32 5, i32 6, i32 39, i32 40, i32 41, i32 10, i32 43, i32 12, i32 13, i32 14, i32 47, i32 48, i32 49, i32 18, i32 51, i32 20, i32 21, i32 22, i32 55, i32 56, i32 57, i32 26, i32 59, i32 28, i32 29, i32 30, i32 63>
+  ret <32 x i8> %select
+}
+
 define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
 ; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
 ; AVX1OR2:       # %bb.0:
@@ -2495,6 +2551,35 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,16,18,20,22,24,26,28,30,17,19,21,23,25,27,29,31]
+; AVX2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX512VLBW:       # %bb.0:
+; AVX512VLBW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,16,18,20,22,24,26,28,30,17,19,21,23,25,27,29,31]
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00_32_34_36_38_40_42_44_46_33_35_37_39_41_43_45_47:
+; AVX512VLVBMI:       # %bb.0:
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,32,34,36,38,40,42,44,46,33,35,37,39,41,43,45,47]
+; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
+; AVX512VLVBMI-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
 ; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
 ; AVX1:       # %bb.0:
@@ -2699,26 +2784,16 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
 ;
 ; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8],zero,zero,zero,ymm0[9],zero,zero,zero,ymm0[10],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[28],zero,zero,zero,ymm0[29],zero,zero,zero,ymm0[30],zero,zero,zero,ymm0[31],zero,zero,zero
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
 ; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VLBW-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VLBW-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
 ; AVX512VLBW-NEXT:    movl $286331153, %eax # imm = 0x11111111
 ; AVX512VLBW-NEXT:    kmovd %eax, %k1
-; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,u,u,u,9,u,u,u,10,u,u,u,11,u,u,u,28,u,u,u,29,u,u,u,30,u,u,u,31,u,u,u]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index ed281c31d46b1da572f695805ed13211ff9c987a..b3750b74ad34019cf5828f605735bc89cd96d097 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -91,9 +91,8 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_1000:
@@ -174,10 +173,8 @@ define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
 define <4 x double> @shuffle_v4f64_2233(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_2233:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4f64_2233:
@@ -766,9 +763,8 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_1000:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_1000:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 74c477300d6055acdbddd18dee47ca7d651513c3..addf2d2563fc59bd593e7b771de3d642840dda88 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2848,3 +2848,32 @@ entry:
   %add = add <8 x i32> %shuffle, %shuffle1
   ret <8 x i32> %add
 }
+
+; This test used to crash due to bad handling of concat_vectors after a bitcast
+; in lowerVectorShuffleAsBroadcast.
+define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) {
+; AVX1-LABEL: broadcast_concat_crash:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[3,3,1,1]
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: broadcast_concat_crash:
+; AVX2OR512VL:       # %bb.0: # %entry
+; AVX2OR512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2OR512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2OR512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT:    retq
+entry:
+  %tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %bc = bitcast <8 x float> %tmp to <4 x i64>
+  %tmp1 = extractelement <4 x i64> %bc, i32 3
+  %tmp2 = bitcast i64 %tmp1 to <2 x float>
+  %tmp4 = extractelement <2 x float> %tmp2, i32 1
+  %tmp5 = insertelement <8 x float> undef, float %tmp4, i32 4
+  %tmp6 = insertelement <8 x float> %tmp5, float %z, i32 5
+  ret <8 x float> %tmp6
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index af3ef0894acfb06dfb0e2b1d0bbe2b26103cc371..678feb8b3307e93b889f0490d9b5745f35ae51b2 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -383,12 +383,12 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
 define <2 x double> @constant_fold_vpermilvar_pd() {
 ; X32-LABEL: constant_fold_vpermilvar_pd:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2,1]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2,1]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [2.0E+0,1.0E+0]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
   ret <2 x double> %1
@@ -397,12 +397,12 @@ define <2 x double> @constant_fold_vpermilvar_pd() {
 define <4 x double> @constant_fold_vpermilvar_pd_256() {
 ; X32-LABEL: constant_fold_vpermilvar_pd_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2,1,3,4]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_pd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2,1,3,4]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [2.0E+0,1.0E+0,3.0E+0,4.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
   ret <4 x double> %1
@@ -411,12 +411,12 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() {
 define <4 x float> @constant_fold_vpermilvar_ps() {
 ; X32-LABEL: constant_fold_vpermilvar_ps:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1,3,2]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1,3,2]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [4.0E+0,1.0E+0,3.0E+0,2.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
   ret <4 x float> %1
@@ -425,13 +425,110 @@ define <4 x float> @constant_fold_vpermilvar_ps() {
 define <8 x float> @constant_fold_vpermilvar_ps_256() {
 ; X32-LABEL: constant_fold_vpermilvar_ps_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,1,3,2,5,6,6,6]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermilvar_ps_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,1,3,2,5,6,6,6]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1.0E+0,1.0E+0,3.0E+0,2.0E+0,5.0E+0,6.0E+0,6.0E+0,6.0E+0]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
   ret <8 x float> %1
 }
+
+define void @PR39483() {
+; X32-AVX1-LABEL: PR39483:
+; X32-AVX1:       # %bb.0: # %entry
+; X32-AVX1-NEXT:    vmovups 32, %ymm0
+; X32-AVX1-NEXT:    vmovups 64, %ymm1
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X32-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X32-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X32-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X32-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X32-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X32-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X32-AVX1-NEXT:    vmovups %ymm0, (%eax)
+;
+; X32-AVX2-LABEL: PR39483:
+; X32-AVX2:       # %bb.0: # %entry
+; X32-AVX2-NEXT:    vmovups 32, %ymm0
+; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X32-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X32-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X32-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X32-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X32-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X32-AVX2-NEXT:    vmovups %ymm0, (%eax)
+;
+; X32-AVX512-LABEL: PR39483:
+; X32-AVX512:       # %bb.0: # %entry
+; X32-AVX512-NEXT:    vmovups 0, %zmm0
+; X32-AVX512-NEXT:    vmovups 64, %ymm1
+; X32-AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X32-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
+; X32-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
+; X32-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; X32-AVX512-NEXT:    vmovups %ymm0, (%eax)
+;
+; X64-AVX1-LABEL: PR39483:
+; X64-AVX1:       # %bb.0: # %entry
+; X64-AVX1-NEXT:    vmovups 32, %ymm0
+; X64-AVX1-NEXT:    vmovups 64, %ymm1
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,3]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; X64-AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X64-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovups %ymm0, (%rax)
+;
+; X64-AVX2-LABEL: PR39483:
+; X64-AVX2:       # %bb.0: # %entry
+; X64-AVX2-NEXT:    vmovups 32, %ymm0
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; X64-AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; X64-AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7]
+; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; X64-AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovups %ymm0, (%rax)
+;
+; X64-AVX512-LABEL: PR39483:
+; X64-AVX512:       # %bb.0: # %entry
+; X64-AVX512-NEXT:    vmovups 0, %zmm0
+; X64-AVX512-NEXT:    vmovups 64, %ymm1
+; X64-AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X64-AVX512-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2
+; X64-AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmulps %ymm0, %ymm2, %ymm1
+; X64-AVX512-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; X64-AVX512-NEXT:    vmovups %ymm0, (%rax)
+entry:
+  %wide.vec = load <24 x float>, <24 x float>* null, align 4
+  %strided.vec18 = shufflevector <24 x float> %wide.vec, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  %0 = fmul <8 x float> %strided.vec18, zeroinitializer
+  %1 = fadd <8 x float> zeroinitializer, %0
+  store <8 x float> %1, <8 x float>* undef, align 16
+  unreachable
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index ae2a5513bfda190765f816a9cb009307b9b1ee06..95d53ace5d3222712f684035dea3ee4d0037296d 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -691,7 +691,7 @@ define <8 x i32> @constant_fold_permd() {
 define <8 x float> @constant_fold_permps() {
 ; CHECK-LABEL: constant_fold_permps:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5.0E+0,7.0E+0,3.0E+0,2.0E+0,8.0E+0,2.0E+0,6.0E+0,1.0E+0]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
   ret <8 x float> %1
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 52b5edd7abede08b76934e35e685a3ca991e77ae..5fe0a2b460b9d1948f6b82f10ef85c4112cbdad8 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -384,12 +384,12 @@ define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr)
 define <2 x double> @constant_fold_vpermil2pd() {
 ; X32-LABEL: constant_fold_vpermil2pd:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2,2]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-2.0E+0,2.0E+0]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
   ret <2 x double> %1
@@ -398,12 +398,12 @@ define <2 x double> @constant_fold_vpermil2pd() {
 define <4 x double> @constant_fold_vpermil2pd_256() {
 ; X32-LABEL: constant_fold_vpermil2pd_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4,0,4,3]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2pd_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4,0,4,3]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-4.0E+0,0.0E+0,4.0E+0,3.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
   ret <4 x double> %1
@@ -412,12 +412,12 @@ define <4 x double> @constant_fold_vpermil2pd_256() {
 define <4 x float> @constant_fold_vpermil2ps() {
 ; X32-LABEL: constant_fold_vpermil2ps:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,1,3,0]
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4,1,3,0]
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-4.0E+0,1.0E+0,3.0E+0,0.0E+0]
 ; X64-NEXT:    retq
   %1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
   ret <4 x float> %1
@@ -426,12 +426,12 @@ define <4 x float> @constant_fold_vpermil2ps() {
 define <8 x float> @constant_fold_vpermil2ps_256() {
 ; X32-LABEL: constant_fold_vpermil2ps_256:
 ; X32:       # %bb.0:
-; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8,1,3,0,5,0,5,7]
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: constant_fold_vpermil2ps_256:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8,1,3,0,5,0,5,7]
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [-8.0E+0,1.0E+0,3.0E+0,0.0E+0,5.0E+0,0.0E+0,5.0E+0,7.0E+0]
 ; X64-NEXT:    retq
   %1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
   ret <8 x float> %1
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index 2eb9362947e258ef6152a5717daaa79d0f37b3e6..9c7163f39da60f44d524bc6f319495e15f5a79e5 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2642,14 +2642,14 @@ define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>*
 define <4 x float> @combine_constant_insertion_v4f32(float %f) {
 ; SSE2-LABEL: combine_constant_insertion_v4f32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_constant_insertion_v4f32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
+; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
@@ -2703,7 +2703,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
 ; SSE-LABEL: PR22377:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
 ; SSE-NEXT:    addps %xmm0, %xmm1
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -2711,7 +2711,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: PR22377:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,3]
 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
 ; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -2809,14 +2809,14 @@ define <4 x float> @PR30264(<4 x float> %x) {
 ;
 ; SSE41-LABEL: PR30264:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4,1>
+; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
 ; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: PR30264:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4,1>
+; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
 ; AVX-NEXT:    retq
   %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index d9f186e64f1d96a94a70ceb3ad93ef6e9b519895..7dc850391bca4f95b704fc866b26d4883a83d4a0 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -233,7 +233,8 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm7 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
@@ -347,7 +348,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
@@ -680,22 +681,23 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -781,13 +783,13 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -1106,7 +1108,8 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm7 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
@@ -1220,7 +1223,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
@@ -1575,7 +1578,8 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vpsubq {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm7, %xmm3
@@ -1687,7 +1691,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpsubd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
@@ -2275,7 +2279,8 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm4 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
@@ -2451,7 +2456,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
@@ -2909,7 +2914,8 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm6 = mem[0,0]
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
@@ -3049,7 +3055,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
@@ -3351,27 +3357,28 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX1-LABEL: trunc_and_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vandps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vandpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vandpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vandpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vandpd %ymm7, %ymm3, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -3468,7 +3475,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
@@ -3498,7 +3505,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_and_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -3751,22 +3758,23 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -3852,13 +3860,13 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -4153,27 +4161,28 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX1-LABEL: trunc_xor_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vxorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vxorpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vxorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vxorpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vxorpd %ymm7, %ymm3, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -4270,7 +4279,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vxorps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
@@ -4300,7 +4309,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_xor_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -4553,22 +4562,23 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -4654,13 +4664,13 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4955,27 +4965,28 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ;
 ; AVX1-LABEL: trunc_or_v16i64_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT:    vorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT:    vorps %ymm7, %ymm3, %ymm3
+; AVX1-NEXT:    vorpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vorpd %ymm6, %ymm2, %ymm2
+; AVX1-NEXT:    vorpd %ymm7, %ymm3, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -5072,7 +5083,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
 ; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
@@ -5102,7 +5113,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ;
 ; AVX512-LABEL: trunc_or_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -5355,22 +5366,23 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm5 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm5 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vandpd %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vandpd %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -5456,13 +5468,13 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll
index 91ede6cb06231043f08fe429123dc9b6701bc8fb..61935dce8f820097b46376ec1400776c23d81bcd 100644
--- a/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/test/CodeGen/X86/vector-trunc-packus.ll
@@ -2070,24 +2070,26 @@ define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm4, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm7, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm1, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm7, %xmm7
+; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll
index 3e5dcc5c3c2fc2731202aa106922a50a0c6e7e2f..500d8ba1511d8c96a3e1e25d0fc790c67477135b 100644
--- a/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -2001,7 +2001,8 @@ define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-trunc-usat.ll b/test/CodeGen/X86/vector-trunc-usat.ll
index 1bde6c3a141581d5016e6a675f0e36fd8c50edb0..0c3766ac9783849c317e56ac7a608a91831a26b9 100644
--- a/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/test/CodeGen/X86/vector-trunc-usat.ll
@@ -716,26 +716,26 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
 define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
 ; SSE2-LABEL: trunc_usat_v8i32_v8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pxor %xmm3, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pandn %xmm2, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT:    movdqa %xmm4, %xmm5
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm2, %xmm5
-; SSE2-NEXT:    por %xmm1, %xmm5
-; SSE2-NEXT:    pslld $16, %xmm5
-; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    por %xmm5, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT:    pxor %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm1, %xmm4
+; SSE2-NEXT:    por %xmm3, %xmm4
+; SSE2-NEXT:    pslld $16, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm4
 ; SSE2-NEXT:    pslld $16, %xmm0
 ; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    packssdw %xmm5, %xmm0
+; SSE2-NEXT:    packssdw %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
@@ -826,36 +826,36 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
 define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
 ; SSE2-LABEL: trunc_usat_v16i32_v16i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm6
+; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm6, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
 ; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pandn %xmm8, %xmm1
+; SSE2-NEXT:    pxor %xmm7, %xmm1
 ; SSE2-NEXT:    por %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm6
+; SSE2-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm6, %xmm4
 ; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
 ; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pandn %xmm8, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm2
 ; SSE2-NEXT:    por %xmm3, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT:    pand %xmm6, %xmm0
-; SSE2-NEXT:    pandn %xmm8, %xmm6
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSE2-NEXT:    pand %xmm5, %xmm4
-; SSE2-NEXT:    pandn %xmm8, %xmm5
-; SSE2-NEXT:    por %xmm4, %xmm5
+; SSE2-NEXT:    pxor %xmm6, %xmm3
+; SSE2-NEXT:    movdqa %xmm5, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm7, %xmm4
+; SSE2-NEXT:    por %xmm4, %xmm0
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT:    pxor %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    por %xmm7, %xmm5
 ; SSE2-NEXT:    pslld $16, %xmm5
 ; SSE2-NEXT:    psrad $16, %xmm5
 ; SSE2-NEXT:    pslld $16, %xmm0
@@ -870,36 +870,36 @@ define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
 ;
 ; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movdqa %xmm1, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm6
+; SSSE3-NEXT:    movdqa %xmm1, %xmm8
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT:    movdqa %xmm2, %xmm7
+; SSSE3-NEXT:    pxor %xmm6, %xmm7
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm1
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm1
+; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
 ; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pandn %xmm8, %xmm1
+; SSSE3-NEXT:    pxor %xmm7, %xmm1
 ; SSSE3-NEXT:    por %xmm2, %xmm1
-; SSSE3-NEXT:    movdqa %xmm3, %xmm6
-; SSSE3-NEXT:    pxor %xmm7, %xmm6
+; SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSSE3-NEXT:    pxor %xmm6, %xmm4
 ; SSSE3-NEXT:    movdqa %xmm5, %xmm2
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm2
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm2
 ; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pandn %xmm8, %xmm2
+; SSSE3-NEXT:    pxor %xmm7, %xmm2
 ; SSSE3-NEXT:    por %xmm3, %xmm2
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    pxor %xmm7, %xmm3
-; SSSE3-NEXT:    movdqa %xmm5, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT:    pand %xmm6, %xmm0
-; SSSE3-NEXT:    pandn %xmm8, %xmm6
-; SSSE3-NEXT:    por %xmm6, %xmm0
-; SSSE3-NEXT:    pxor %xmm4, %xmm7
-; SSSE3-NEXT:    pcmpgtd %xmm7, %xmm5
-; SSSE3-NEXT:    pand %xmm5, %xmm4
-; SSSE3-NEXT:    pandn %xmm8, %xmm5
-; SSSE3-NEXT:    por %xmm4, %xmm5
+; SSSE3-NEXT:    pxor %xmm6, %xmm3
+; SSSE3-NEXT:    movdqa %xmm5, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSSE3-NEXT:    pand %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm7, %xmm4
+; SSSE3-NEXT:    por %xmm4, %xmm0
+; SSSE3-NEXT:    pxor %xmm8, %xmm6
+; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT:    pxor %xmm5, %xmm7
+; SSSE3-NEXT:    pand %xmm8, %xmm5
+; SSSE3-NEXT:    por %xmm7, %xmm5
 ; SSSE3-NEXT:    pslld $16, %xmm5
 ; SSSE3-NEXT:    psrad $16, %xmm5
 ; SSSE3-NEXT:    pslld $16, %xmm0
@@ -1417,7 +1417,8 @@ define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
 ; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 0d00f8af5a8ea2138cf5b6de7f9bb41a45318bc3..79cbb8cc924fdefba8bc92ea82de8524af58e439 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -286,13 +286,14 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
 ; AVX1-LABEL: trunc8i64_8i8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; AVX1-NEXT:    # xmm3 = mem[0,0]
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandpd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandpd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
@@ -907,13 +908,13 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ; AVX1-LABEL: trunc16i32_16i8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
@@ -1922,16 +1923,14 @@ define <8 x i16> @PR32160(<8 x i32> %x) {
 ;
 ; AVX2-SLOW-LABEL: PR32160:
 ; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: PR32160:
 ; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index d19c10d68bcf2fba9a1ca16b812a0cc36e439df9..21142ff3970c412e00bb12e26ae5c550d4b0ce2e 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -18,121 +18,150 @@
 define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubq %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $4, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv2i64:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubq %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $4, %xmm0
-; SSE3-NEXT:    paddq %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddq %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv2i64:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubq %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddq %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddq %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
+; SSSE3-NEXT:    paddb %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv2i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubq %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddq %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pand %xmm2, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pshufb %xmm4, %xmm5
-; SSE41-NEXT:    psrlw $4, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    pshufb %xmm3, %xmm0
-; SSE41-NEXT:    paddb %xmm5, %xmm0
-; SSE41-NEXT:    psadbw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512CDVL-LABEL: testv2i64:
+; AVX512CDVL:       # %bb.0:
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CDVL-NEXT:    retq
+;
+; AVX512CD-LABEL: testv2i64:
+; AVX512CD:       # %bb.0:
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
+; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vzeroupper
+; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv2i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -140,68 +169,50 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv2i64:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv2i64:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubq %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    pand %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm5, %xmm0
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddq %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0)
   ret <2 x i64> %out
@@ -210,160 +221,150 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubq %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubq %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrlq $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddq %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrlq $4, %xmm0
-; SSE2-NEXT:    paddq %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddq %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv2i64u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubq %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubq %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [3689348814741910323,3689348814741910323]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrlq $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddq %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrlq $4, %xmm0
-; SSE3-NEXT:    paddq %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddq %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv2i64u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubq %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddq %xmm2, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddq %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
-; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm2, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm3
+; SSSE3-NEXT:    paddb %xmm4, %xmm3
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    psadbw %xmm3, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv2i64u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubq %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddq %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
-; SSE41-NEXT:    pand %xmm2, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm5
-; SSE41-NEXT:    pshufb %xmm4, %xmm5
-; SSE41-NEXT:    psrlw $4, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    pshufb %xmm3, %xmm0
-; SSE41-NEXT:    paddb %xmm5, %xmm0
-; SSE41-NEXT:    psadbw %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: testv2i64u:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv2i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512CDVL-LABEL: testv2i64u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CDVL-NEXT:    vplzcntq %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
 ; AVX512CDVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv2i64u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64]
 ; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv2i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -371,68 +372,50 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv2i64u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv2i64u:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv2i64u:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubq %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    pand %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm5, %xmm0
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddq %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm0, %xmm0
+; X32-SSE-NEXT:    psadbw %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1)
   ret <2 x i64> %out
@@ -441,130 +424,124 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE2-LABEL: testv4i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrld $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psadbw %xmm1, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psadbw %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrld $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddd %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubd %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv4i32:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubd %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pshufb %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm4
-; SSE41-NEXT:    paddb %xmm5, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE41-NEXT:    psadbw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE41-NEXT:    psadbw %xmm1, %xmm3
 ; SSE41-NEXT:    psadbw %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm4, %xmm0
+; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: testv4i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -574,19 +551,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -596,55 +572,30 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv4i32:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX512CDVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512CDVL-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX512CDVL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512CDVL-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512CDVL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i32:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; AVX512CD-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX512CD-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512CD-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512CD-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i32:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -652,51 +603,34 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i32:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i32:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i32:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -706,27 +640,25 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubd %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddd %xmm2, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddd %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
-; X32-SSE-NEXT:    paddb %xmm5, %xmm4
-; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
 ; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm4, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0)
   ret <4 x i32> %out
@@ -735,130 +667,124 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; SSE2-LABEL: testv4i32u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    psrld $2, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrld $4, %xmm0
-; SSE2-NEXT:    paddd %xmm3, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    psadbw %xmm1, %xmm2
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    psadbw %xmm0, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv4i32u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pxor %xmm2, %xmm2
-; SSE3-NEXT:    psubd %xmm0, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubd %xmm0, %xmm3
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [858993459,858993459,858993459,858993459]
-; SSE3-NEXT:    movdqa %xmm3, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    psrld $2, %xmm3
-; SSE3-NEXT:    pand %xmm0, %xmm3
-; SSE3-NEXT:    paddd %xmm2, %xmm3
-; SSE3-NEXT:    movdqa %xmm3, %xmm0
-; SSE3-NEXT:    psrld $4, %xmm0
-; SSE3-NEXT:    paddd %xmm3, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddd %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    pxor %xmm0, %xmm0
+; SSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv4i32u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    pxor %xmm2, %xmm2
-; SSSE3-NEXT:    psubd %xmm0, %xmm2
-; SSSE3-NEXT:    pand %xmm0, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
-; SSSE3-NEXT:    paddd %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddd %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm3, %xmm4
-; SSSE3-NEXT:    pand %xmm2, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    psrlw $4, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pand %xmm2, %xmm3
-; SSSE3-NEXT:    pshufb %xmm3, %xmm0
-; SSSE3-NEXT:    paddb %xmm5, %xmm0
-; SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm2
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    psadbw %xmm1, %xmm0
-; SSSE3-NEXT:    packuswb %xmm2, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    psadbw %xmm0, %xmm1
+; SSSE3-NEXT:    packuswb %xmm2, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv4i32u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    psubd %xmm0, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pand %xmm2, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pshufb %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pand %xmm1, %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pshufb %xmm2, %xmm4
 ; SSE41-NEXT:    psrlw $4, %xmm0
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pshufb %xmm0, %xmm4
-; SSE41-NEXT:    paddb %xmm5, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE41-NEXT:    psadbw %xmm1, %xmm4
+; SSE41-NEXT:    pand %xmm1, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm3
+; SSE41-NEXT:    paddb %xmm4, %xmm3
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE41-NEXT:    psadbw %xmm1, %xmm3
 ; SSE41-NEXT:    psadbw %xmm1, %xmm0
-; SSE41-NEXT:    packuswb %xmm4, %xmm0
+; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: testv4i32u:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -868,19 +794,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX2-LABEL: testv4i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
 ; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; AVX2-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -890,32 +815,30 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv4i32u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CDVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CDVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CDVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CDVL-NEXT:    vplzcntd %xmm0, %xmm0
-; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
 ; AVX512CDVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i32u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512CD-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512CD-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i32u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vzeroupper
@@ -923,51 +846,34 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i32u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i32u:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG_NOVLX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG_NOVLX-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG_NOVLX-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG_NOVLX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; BITALG_NOVLX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vzeroupper
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i32u:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %xmm0, %xmm1, %xmm2
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; BITALG-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
-; BITALG-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; BITALG-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; BITALG-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
-; BITALG-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; BITALG-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -977,27 +883,25 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ;
 ; X32-SSE-LABEL: testv4i32u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubd %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddd %xmm2, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X32-SSE-NEXT:    pand %xmm2, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm4, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm3, %xmm5
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddd %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
+; X32-SSE-NEXT:    pand %xmm1, %xmm2
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
+; X32-SSE-NEXT:    pshufb %xmm2, %xmm4
 ; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
-; X32-SSE-NEXT:    paddb %xmm5, %xmm4
-; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero
-; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm4
+; X32-SSE-NEXT:    pand %xmm1, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
+; X32-SSE-NEXT:    paddb %xmm4, %xmm3
+; X32-SSE-NEXT:    pxor %xmm1, %xmm1
+; X32-SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero
+; X32-SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X32-SSE-NEXT:    psadbw %xmm1, %xmm3
 ; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X32-SSE-NEXT:    packuswb %xmm4, %xmm0
+; X32-SSE-NEXT:    packuswb %xmm3, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1)
   ret <4 x i32> %out
@@ -1006,24 +910,22 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; SSE2-LABEL: testv8i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1033,24 +935,22 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSE3-LABEL: testv8i16:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE3-NEXT:    paddw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubw %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddw %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
@@ -1060,11 +960,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv8i16:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubw %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm1, %xmm2
@@ -1083,11 +981,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; SSE41-LABEL: testv8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubw %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pand %xmm1, %xmm2
@@ -1106,11 +1002,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX-LABEL: testv8i16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1126,11 +1020,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -1140,11 +1032,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i16:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -1153,11 +1043,9 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv8i16:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1165,21 +1053,17 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i16:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv8i16:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubw %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddw %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT:    pand %xmm1, %xmm2
@@ -1202,24 +1086,22 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; SSE2-LABEL: testv8i16u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubw %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    paddw %xmm1, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psubw %xmm1, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    paddw %xmm2, %xmm0
+; SSE2-NEXT:    paddb %xmm2, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrlw $4, %xmm1
-; SSE2-NEXT:    paddw %xmm0, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    psllw $8, %xmm0
@@ -1229,24 +1111,22 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSE3-LABEL: testv8i16u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubw %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE3-NEXT:    paddw %xmm1, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $1, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE3-NEXT:    psubw %xmm1, %xmm0
-; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107]
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
 ; SSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSE3-NEXT:    pand %xmm1, %xmm2
 ; SSE3-NEXT:    psrlw $2, %xmm0
 ; SSE3-NEXT:    pand %xmm1, %xmm0
-; SSE3-NEXT:    paddw %xmm2, %xmm0
+; SSE3-NEXT:    paddb %xmm2, %xmm0
 ; SSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSE3-NEXT:    psrlw $4, %xmm1
-; SSE3-NEXT:    paddw %xmm0, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
 ; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    psllw $8, %xmm0
@@ -1256,11 +1136,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSSE3-LABEL: testv8i16u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubw %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSSE3-NEXT:    paddw %xmm1, %xmm0
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddw %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm2
 ; SSSE3-NEXT:    pand %xmm1, %xmm2
@@ -1279,11 +1157,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; SSE41-LABEL: testv8i16u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubw %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT:    paddw %xmm1, %xmm0
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddw %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pand %xmm1, %xmm2
@@ -1302,11 +1178,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX-LABEL: testv8i16u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1322,11 +1196,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -1336,11 +1208,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i16u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %ymm0, %xmm0
@@ -1349,11 +1219,9 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv8i16u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1361,21 +1229,17 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i16u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntw %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv8i16u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubw %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm0, %xmm0
-; X32-SSE-NEXT:    paddw %xmm1, %xmm0
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddw %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
 ; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-SSE-NEXT:    movdqa %xmm0, %xmm2
 ; X32-SSE-NEXT:    pand %xmm1, %xmm2
@@ -1398,95 +1262,89 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; SSE2-LABEL: testv16i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv16i8:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv16i8:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubb %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    paddb %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddb %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
-; SSSE3-NEXT:    psrlw $4, %xmm2
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    paddb %xmm4, %xmm0
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubb %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    pshufb %xmm3, %xmm4
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    paddb %xmm4, %xmm0
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1499,11 +1357,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1512,11 +1368,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1525,11 +1379,9 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i8:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1537,31 +1389,28 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ;
 ; BITALG-LABEL: testv16i8:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv16i8:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubb %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT:    paddb %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
 ; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm4, %xmm0
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm4, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0)
   ret <16 x i8> %out
@@ -1570,95 +1419,89 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ; SSE2-LABEL: testv16i8u:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psubb %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $1, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    psubb %xmm0, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $2, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    paddb %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrlw $4, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pandn %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $1, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    psubb %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pand %xmm1, %xmm2
+; SSE2-NEXT:    psrlw $2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    paddb %xmm2, %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrlw $4, %xmm1
+; SSE2-NEXT:    paddb %xmm0, %xmm1
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: testv16i8u:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    psubb %xmm0, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $1, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE3-NEXT:    psubb %xmm0, %xmm2
-; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; SSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE3-NEXT:    pand %xmm0, %xmm1
-; SSE3-NEXT:    psrlw $2, %xmm2
-; SSE3-NEXT:    pand %xmm0, %xmm2
-; SSE3-NEXT:    paddb %xmm1, %xmm2
-; SSE3-NEXT:    movdqa %xmm2, %xmm0
-; SSE3-NEXT:    psrlw $4, %xmm0
+; SSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pandn %xmm1, %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $1, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    psubb %xmm1, %xmm0
+; SSE3-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; SSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSE3-NEXT:    pand %xmm1, %xmm2
+; SSE3-NEXT:    psrlw $2, %xmm0
+; SSE3-NEXT:    pand %xmm1, %xmm0
 ; SSE3-NEXT:    paddb %xmm2, %xmm0
-; SSE3-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSE3-NEXT:    psrlw $4, %xmm1
+; SSE3-NEXT:    paddb %xmm0, %xmm1
+; SSE3-NEXT:    pand {{.*}}(%rip), %xmm1
+; SSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: testv16i8u:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    psubb %xmm0, %xmm1
-; SSSE3-NEXT:    pand %xmm0, %xmm1
-; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSSE3-NEXT:    paddb %xmm1, %xmm2
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    pand %xmm1, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSSE3-NEXT:    paddb %xmm0, %xmm1
+; SSSE3-NEXT:    pandn %xmm1, %xmm0
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    pand %xmm2, %xmm3
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
 ; SSSE3-NEXT:    pshufb %xmm3, %xmm4
-; SSSE3-NEXT:    psrlw $4, %xmm2
-; SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSSE3-NEXT:    pshufb %xmm2, %xmm0
-; SSSE3-NEXT:    paddb %xmm4, %xmm0
+; SSSE3-NEXT:    psrlw $4, %xmm0
+; SSSE3-NEXT:    pand %xmm2, %xmm0
+; SSSE3-NEXT:    pshufb %xmm0, %xmm1
+; SSSE3-NEXT:    paddb %xmm4, %xmm1
+; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: testv16i8u:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psubb %xmm0, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    paddb %xmm1, %xmm2
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE41-NEXT:    movdqa %xmm2, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
-; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; SSE41-NEXT:    movdqa %xmm0, %xmm4
+; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT:    paddb %xmm0, %xmm1
+; SSE41-NEXT:    pandn %xmm1, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE41-NEXT:    movdqa %xmm0, %xmm3
+; SSE41-NEXT:    pand %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    pshufb %xmm3, %xmm4
-; SSE41-NEXT:    psrlw $4, %xmm2
-; SSE41-NEXT:    pand %xmm1, %xmm2
-; SSE41-NEXT:    pshufb %xmm2, %xmm0
-; SSE41-NEXT:    paddb %xmm4, %xmm0
+; SSE41-NEXT:    psrlw $4, %xmm0
+; SSE41-NEXT:    pand %xmm2, %xmm0
+; SSE41-NEXT:    pshufb %xmm0, %xmm1
+; SSE41-NEXT:    paddb %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: testv16i8u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
 ; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1671,11 +1514,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1684,11 +1525,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i8u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdb %zmm0, %xmm0
@@ -1697,11 +1536,9 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i8u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG_NOVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    vzeroupper
@@ -1709,31 +1546,28 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ;
 ; BITALG-LABEL: testv16i8u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
-; BITALG-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
+; BITALG-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
+; BITALG-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; BITALG-NEXT:    vpopcntb %xmm0, %xmm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv16i8u:
 ; X32-SSE:       # %bb.0:
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    psubb %xmm0, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT:    paddb %xmm1, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm3
-; X32-SSE-NEXT:    pand %xmm1, %xmm3
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm4
+; X32-SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-SSE-NEXT:    paddb %xmm0, %xmm1
+; X32-SSE-NEXT:    pandn %xmm1, %xmm0
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-SSE-NEXT:    movdqa %xmm0, %xmm3
+; X32-SSE-NEXT:    pand %xmm2, %xmm3
+; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm4
 ; X32-SSE-NEXT:    pshufb %xmm3, %xmm4
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm1, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm4, %xmm0
+; X32-SSE-NEXT:    psrlw $4, %xmm0
+; X32-SSE-NEXT:    pand %xmm2, %xmm0
+; X32-SSE-NEXT:    pshufb %xmm0, %xmm1
+; X32-SSE-NEXT:    paddb %xmm4, %xmm1
+; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1)
   ret <16 x i8> %out
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 775a7a359abf9fb97f76e74aa113101e9f3c0e90..c7087037e0104e8329db6e76091c9bc3a51710bb 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -15,158 +15,119 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512CDVL-LABEL: testv4i64:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CDVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
+; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i64:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
+; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i64:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i64:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i64:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0)
@@ -177,142 +138,119 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv4i64u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512CDVL-LABEL: testv4i64u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vplzcntq %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
+; AVX512CDVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
 ; AVX512CDVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv4i64u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [63,63,63,63]
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [64,64,64,64]
 ; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv4i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv4i64u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntq %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv4i64u:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv4i64u:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1)
@@ -323,56 +261,53 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -382,84 +317,48 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv8i32:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CDVL-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDVL-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CDVL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CDVL-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
-; AVX512CDVL-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512CDVL-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
+; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv8i32:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
+; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i32:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i32:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv8i32:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -469,19 +368,11 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i32:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -491,19 +382,18 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; X32-AVX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -518,56 +408,53 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm6, %xmm6
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
-; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv8i32u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX2-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -577,60 +464,48 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv8i32u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CDVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vplzcntd %ymm0, %ymm0
-; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
+; AVX512CDVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
 ; AVX512CDVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CDVL-NEXT:    retq
 ;
 ; AVX512CD-LABEL: testv8i32u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [31,31,31,31,31,31,31,31]
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i32u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv8i32u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    retq
 ;
 ; BITALG_NOVLX-LABEL: testv8i32u:
 ; BITALG_NOVLX:       # %bb.0:
+; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG_NOVLX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG_NOVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG_NOVLX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG_NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG_NOVLX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG_NOVLX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG_NOVLX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG_NOVLX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -640,19 +515,11 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; BITALG-LABEL: testv8i32u:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; BITALG-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; BITALG-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; BITALG-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; BITALG-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; BITALG-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; BITALG-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; BITALG-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -662,19 +529,18 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ;
 ; X32-AVX-LABEL: testv8i32u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
 ; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm2, %ymm2
 ; X32-AVX-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -688,31 +554,28 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -722,11 +585,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -742,11 +603,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv16i16:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -762,11 +621,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv16i16:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -782,11 +639,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -794,11 +649,9 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i16:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -806,32 +659,26 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i16:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i16:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv16i16:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -851,31 +698,28 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
+; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -885,11 +729,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX2-LABEL: testv16i16u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -905,11 +747,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv16i16u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -925,11 +765,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv16i16u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -945,11 +783,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
@@ -957,11 +793,9 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv16i16u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQVL-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpmovdw %zmm0, %ymm0
@@ -969,32 +803,26 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv16i16u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i16u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntw %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv16i16u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1015,38 +843,33 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1059,11 +882,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv32i8:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1076,11 +897,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv32i8:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1093,11 +912,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1110,11 +927,9 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv32i8:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1127,32 +942,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv32i8:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv32i8:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv32i8:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1170,38 +979,33 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8u:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpandn %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv32i8u:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX2-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1214,11 +1018,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512CDVL-LABEL: testv32i8u:
 ; AVX512CDVL:       # %bb.0:
-; AVX512CDVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CDVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CDVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CDVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1231,11 +1033,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512CD-LABEL: testv32i8u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512CD-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CD-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1248,11 +1048,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1265,11 +1063,9 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQVL-LABEL: testv32i8u:
 ; AVX512VPOPCNTDQVL:       # %bb.0:
-; AVX512VPOPCNTDQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQVL-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; AVX512VPOPCNTDQVL-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512VPOPCNTDQVL-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; AVX512VPOPCNTDQVL-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1282,32 +1078,26 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ;
 ; BITALG_NOVLX-LABEL: testv32i8u:
 ; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG_NOVLX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG_NOVLX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG_NOVLX-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG_NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; BITALG_NOVLX-NEXT:    retq
 ;
 ; BITALG-LABEL: testv32i8u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; BITALG-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; BITALG-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; BITALG-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; BITALG-NEXT:    vpopcntb %ymm0, %ymm0
 ; BITALG-NEXT:    retq
 ;
 ; X32-AVX-LABEL: testv32i8u:
 ; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vpaddb %ymm1, %ymm0, %ymm1
+; X32-AVX-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm2
 ; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -1370,145 +1160,55 @@ define <4 x i64> @foldv4i64u() nounwind {
 }
 
 define <8 x i32> @foldv8i32() nounwind {
-; AVX-LABEL: foldv8i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv8i32:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv8i32:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv8i32:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv8i32:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
   ret <8 x i32> %out
 }
 
 define <8 x i32> @foldv8i32u() nounwind {
-; AVX-LABEL: foldv8i32u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv8i32u:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv8i32u:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv8i32u:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv8i32u:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
   ret <8 x i32> %out
 }
 
 define <16 x i16> @foldv16i16() nounwind {
-; AVX-LABEL: foldv16i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv16i16:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv16i16:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv16i16:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv16i16:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
   ret <16 x i16> %out
 }
 
 define <16 x i16> @foldv16i16u() nounwind {
-; AVX-LABEL: foldv16i16u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv16i16u:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv16i16u:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv16i16u:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv16i16u:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
   ret <16 x i16> %out
 }
 
 define <32 x i8> @foldv32i8() nounwind {
-; AVX-LABEL: foldv32i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv32i8:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv32i8:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv32i8:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv32i8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
   ret <32 x i8> %out
 }
 
 define <32 x i8> @foldv32i8u() nounwind {
-; AVX-LABEL: foldv32i8u:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; AVX-NEXT:    retq
-;
-; BITALG_NOVLX-LABEL: foldv32i8u:
-; BITALG_NOVLX:       # %bb.0:
-; BITALG_NOVLX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG_NOVLX-NEXT:    retq
-;
-; BITALG-LABEL: foldv32i8u:
-; BITALG:       # %bb.0:
-; BITALG-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; BITALG-NEXT:    retq
-;
-; X32-AVX-LABEL: foldv32i8u:
-; X32-AVX:       # %bb.0:
-; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
-; X32-AVX-NEXT:    retl
+; ALL-LABEL: foldv32i8u:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; ALL-NEXT:    ret{{[l|q]}}
   %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
   ret <32 x i8> %out
 }
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 37c86f7f81a23229971cffbfa566f26aa4636228..501d7e96835d219db09afb8dc57660d39903ac94 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -8,93 +8,56 @@
 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512CD-LABEL: testv8i64:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
+; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv8i64:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512CDBW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
+; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv8i64:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i64:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv8i64:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
@@ -104,67 +67,56 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; AVX512CD-LABEL: testv8i64u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
+; AVX512CD-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
 ; AVX512CD-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv8i64u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vplzcntq %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [63,63,63,63,63,63,63,63]
+; AVX512CDBW-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64]
 ; AVX512CDBW-NEXT:    vpsubq %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv8i64u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv8i64u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv8i64u:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubq %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
@@ -174,77 +126,38 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512CD-LABEL: testv16i32:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512CD-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm3
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm5, %ymm5
-; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm4, %ymm5
-; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[4],ymm3[4],ymm0[5],ymm3[5]
-; AVX512CD-NEXT:    vpsadbw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512CD-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv16i32:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512CDBW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CDBW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; AVX512CDBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512CDBW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
-; AVX512CDBW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; AVX512CDBW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv16i32:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -254,29 +167,19 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i32:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i32:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -290,39 +193,38 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512CD-LABEL: testv16i32u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CD-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CD-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512CD-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CD-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv16i32u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512CDBW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vplzcntd %zmm0, %zmm0
-; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
+; AVX512CDBW-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
 ; AVX512CDBW-NEXT:    vpsubd %zmm0, %zmm1, %zmm0
 ; AVX512CDBW-NEXT:    retq
 ;
 ; AVX512BW-LABEL: testv16i32u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -332,29 +234,19 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv16i32u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv16i32u:
 ; BITALG:       # %bb.0:
+; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnd %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubd %zmm0, %zmm1, %zmm2
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; BITALG-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; BITALG-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; BITALG-NEXT:    vpshufb %zmm3, %zmm4, %zmm3
-; BITALG-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; BITALG-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; BITALG-NEXT:    vpshufb %zmm0, %zmm4, %zmm0
-; BITALG-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; BITALG-NEXT:    vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; BITALG-NEXT:    vpsadbw %zmm1, %zmm2, %zmm2
 ; BITALG-NEXT:    vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -368,30 +260,27 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512CD-LABEL: testv32i16:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm4
+; AVX512CD-NEXT:    vpaddb %ymm0, %ymm4, %ymm0
 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
@@ -400,11 +289,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512CDBW-LABEL: testv32i16:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -420,11 +307,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -440,17 +325,14 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i16:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
@@ -458,11 +340,9 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv32i16:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
@@ -472,30 +352,27 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512CD-LABEL: testv32i16u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm5
-; AVX512CD-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpsllw $8, %ymm0, %ymm4
+; AVX512CD-NEXT:    vpaddb %ymm0, %ymm4, %ymm0
 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    vpsllw $8, %ymm1, %ymm2
 ; AVX512CD-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
@@ -504,11 +381,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512CDBW-LABEL: testv32i16u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -524,11 +399,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -544,17 +417,14 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv32i16u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
 ; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm1, %zmm1
 ; AVX512VPOPCNTDQ-NEXT:    vpmovdw %zmm1, %ymm1
@@ -562,11 +432,9 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; BITALG-LABEL: testv32i16u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubw %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntw %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
@@ -576,37 +444,32 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512CD-LABEL: testv64i8:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv64i8:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -619,11 +482,9 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -636,37 +497,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv64i8:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv64i8:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
@@ -676,37 +532,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512CD-LABEL: testv64i8u:
 ; AVX512CD:       # %bb.0:
-; AVX512CD-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512CD-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512CD-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512CD-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512CD-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CD-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512CD-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512CD-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512CD-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512CD-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512CD-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512CD-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512CD-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512CD-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512CD-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512CD-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512CD-NEXT:    retq
 ;
 ; AVX512CDBW-LABEL: testv64i8u:
 ; AVX512CDBW:       # %bb.0:
-; AVX512CDBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512CDBW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512CDBW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512CDBW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512CDBW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -719,11 +570,9 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -736,37 +585,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ;
 ; AVX512VPOPCNTDQ-LABEL: testv64i8u:
 ; AVX512VPOPCNTDQ:       # %bb.0:
-; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
-; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm4
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpandn %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
 ; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
 ; AVX512VPOPCNTDQ-NEXT:    retq
 ;
 ; BITALG-LABEL: testv64i8u:
 ; BITALG:       # %bb.0:
-; BITALG-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; BITALG-NEXT:    vpsubb %zmm0, %zmm1, %zmm1
-; BITALG-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
-; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
+; BITALG-NEXT:    vpaddb %zmm1, %zmm0, %zmm1
+; BITALG-NEXT:    vpandnq %zmm1, %zmm0, %zmm0
 ; BITALG-NEXT:    vpopcntb %zmm0, %zmm0
 ; BITALG-NEXT:    retq
   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index ac7c81a8fb6eddf09f9ec0812cf3c943d880268b..6de913079a52788a8466ba96e43ec5a3452a2fcc 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -64,10 +64,10 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
 ;
 ; AVX1-LABEL: zext_16i8_to_16i16:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i8_to_16i16:
@@ -526,10 +526,10 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_8i16_to_8i32:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i16_to_8i32:
@@ -825,10 +825,10 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ;
 ; AVX1-LABEL: zext_4i32_to_4i64:
 ; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_4i32_to_4i64:
@@ -1540,10 +1540,10 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
index e27493e9758d60f749ce8e9a3563dfbd82e63d14..145da66558fd44af63dfe4b75ebfb5e5a545cfd8 100644
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -46,7 +46,7 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
 ; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    movq (%rdi,%rsi,8), %rax
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [0.5,0.5,0.5,0.5]
+; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
 ; AVX1-NEXT:    vblendvpd %ymm0, {{.*}}(%rip), %ymm1, %ymm0
 ; AVX1-NEXT:    vmovupd %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
@@ -57,8 +57,8 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
 ; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
 ; AVX2-NEXT:    movq (%rdi,%rsi,8), %rax
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-0.5,-0.5,-0.5,-0.5]
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [0.5,0.5,0.5,0.5]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
+; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1]
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovupd %ymm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll
index 5cfa38ab833e83329e50d4e53451fcc422f0dc47..36e29abf8d7368bfb99b1555355326bce74af73f 100644
--- a/test/CodeGen/X86/vshift-6.ll
+++ b/test/CodeGen/X86/vshift-6.ll
@@ -50,8 +50,8 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X32-NEXT:    movdqa %xmm2, %xmm4
 ; X32-NEXT:    pandn %xmm0, %xmm4
 ; X32-NEXT:    psllw $2, %xmm0
-; X32-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-NEXT:    pand %xmm2, %xmm0
+; X32-NEXT:    pand {{\.LCPI.*}}, %xmm0
 ; X32-NEXT:    por %xmm4, %xmm0
 ; X32-NEXT:    paddb %xmm1, %xmm1
 ; X32-NEXT:    pcmpgtb %xmm1, %xmm3
@@ -85,8 +85,8 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
 ; X64-NEXT:    movdqa %xmm2, %xmm4
 ; X64-NEXT:    pandn %xmm0, %xmm4
 ; X64-NEXT:    psllw $2, %xmm0
-; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    pand %xmm2, %xmm0
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
 ; X64-NEXT:    por %xmm4, %xmm0
 ; X64-NEXT:    paddb %xmm1, %xmm1
 ; X64-NEXT:    pcmpgtb %xmm1, %xmm3
diff --git a/test/CodeGen/X86/wide-fma-contraction.ll b/test/CodeGen/X86/wide-fma-contraction.ll
index 3ee09dd8f80efc8c45f1521b3e323513d3d17ccf..d15ced21e95edbb89db36c2d796c2f3b17023d91 100644
--- a/test/CodeGen/X86/wide-fma-contraction.ll
+++ b/test/CodeGen/X86/wide-fma-contraction.ll
@@ -30,8 +30,8 @@ define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c)
 ; CHECK-NOFMA-NEXT:    andl $-32, %esp
 ; CHECK-NOFMA-NEXT:    subl $32, %esp
 ; CHECK-NOFMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; CHECK-NOFMA-NEXT:    vaddps 8(%ebp), %ymm0, %ymm0
 ; CHECK-NOFMA-NEXT:    vmulps %ymm3, %ymm1, %ymm1
+; CHECK-NOFMA-NEXT:    vaddps 8(%ebp), %ymm0, %ymm0
 ; CHECK-NOFMA-NEXT:    vaddps 40(%ebp), %ymm1, %ymm1
 ; CHECK-NOFMA-NEXT:    movl %ebp, %esp
 ; CHECK-NOFMA-NEXT:    popl %ebp
diff --git a/test/CodeGen/X86/widen_arith-6.ll b/test/CodeGen/X86/widen_arith-6.ll
index 73b8f4ea276bdc8c52c405f5382e7cdb71a2640b..c039096604e4354a4cd68360eb790ab7668aab7f 100644
--- a/test/CodeGen/X86/widen_arith-6.ll
+++ b/test/CodeGen/X86/widen_arith-6.ll
@@ -14,7 +14,7 @@ define void @update(<3 x float>* %dst, <3 x float>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    movl $1073741824, {{[0-9]+}}(%esp) # imm = 0x40000000
 ; CHECK-NEXT:    movl $1065353216, {{[0-9]+}}(%esp) # imm = 0x3F800000
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <1976.04004,1976.04004,1976.04004,u>
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <1.97604004E+3,1.97604004E+3,1.97604004E+3,u>
 ; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %forbody
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 1eb76b283c0b0f228926bb94ed8c53e1e2326ff0..038c6cb33b6389279cef9d5f9ca6bc3be50ea7c4 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -7,28 +7,15 @@
 ; sign to float v2i16 to v2f32
 
 define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
-; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE2:       # %bb.0: # %entry
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    psllq $48, %xmm0
-; X86-SSE2-NEXT:    psrad $16, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE2-NEXT:    movss %xmm0, (%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE42:       # %bb.0: # %entry
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    psllq $48, %xmm0
-; X86-SSE42-NEXT:    psrad $16, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE42-NEXT:    extractps $1, %xmm0, 4(%eax)
-; X86-SSE42-NEXT:    movss %xmm0, (%eax)
-; X86-SSE42-NEXT:    retl
+; X86-LABEL: convert_v2i16_to_v2f32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    psllq $48, %xmm0
+; X86-NEXT:    psrad $16, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-NEXT:    movlps %xmm0, (%eax)
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: convert_v2i16_to_v2f32:
 ; X64:       # %bb.0: # %entry
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 2e4acb57ee493daaa9fee5cb67c4822306c9b3ca..8cbf8c4e3468edf0e372279eafbc86907e088265 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -5,11 +5,11 @@
 
 ; This load should be before the call, not after.
 
-; SSE: movaps    compl+128(%rip), %xmm0
+; SSE: movsd     compl+128(%rip), %xmm0
 ; SSE: movaps  %xmm0, (%rsp)
 ; SSE: callq   killcommon
 
-; AVX: vmovaps    compl+128(%rip), %xmm0
+; AVX: vmovsd     compl+128(%rip), %xmm0
 ; AVX: vmovaps  %xmm0, (%rsp)
 ; AVX: callq   killcommon
 
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index ce99d22dbbd8fab35e6b7194718928532a623abe..167128ae002b55825dca76c6de59f75c55e10fa0 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -121,10 +121,21 @@ define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_8i32_4i32_01010101:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_8i32_4i32_01010101:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_8i32_4i32_01010101:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_8i32_4i32_01010101:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
   %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -138,21 +149,10 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8i32_8i32_01010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8i32_8i32_01010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8i32_8i32_01010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8i32_8i32_01010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
   %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -246,10 +246,21 @@ define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounw
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
   %ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -263,21 +274,10 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX1:       # %bb.0: # %entry
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i16_16i16_0101010101010101:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -446,10 +446,21 @@ define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
   %ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index 70fa22bb8bad8ea230223e83219ffe2d498ad51c..ea164ae28f87675efedca82c6a029b652068f7a3 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -16,7 +16,7 @@
 define void @sret1(i8* sret %x) nounwind {
 entry:
 ; WIN32-LABEL:      _sret1:
-; WIN32:      movb $42, (%eax)
+; WIN32:      movb $42, ({{%e[abcd]x}})
 ; WIN32-NOT:  popl %eax
 ; WIN32:    {{retl$}}
 
@@ -36,7 +36,7 @@ entry:
 define void @sret2(i8* sret %x, i8 %y) nounwind {
 entry:
 ; WIN32-LABEL:      _sret2:
-; WIN32:      movb {{.*}}, (%eax)
+; WIN32:      movb {{.*}}, ({{%e[abcd]x}})
 ; WIN32-NOT:  popl %eax
 ; WIN32:    {{retl$}}
 
@@ -56,8 +56,8 @@ entry:
 define void @sret3(i8* sret %x, i8* %y) nounwind {
 entry:
 ; WIN32-LABEL:      _sret3:
-; WIN32:      movb $42, (%eax)
-; WIN32-NOT:  movb $13, (%eax)
+; WIN32:      movb $42, ([[REG1:%e[abcd]x]])
+; WIN32-NOT:  movb $13, ([[REG1]])
 ; WIN32-NOT:  popl %eax
 ; WIN32:    {{retl$}}
 
@@ -81,7 +81,7 @@ entry:
 define void @sret4(%struct.S4* noalias sret %agg.result) {
 entry:
 ; WIN32-LABEL:     _sret4:
-; WIN32:     movl $42, (%eax)
+; WIN32:     movl $42, ({{%e[abcd]x}})
 ; WIN32-NOT: popl %eax
 ; WIN32:   {{retl$}}
 
@@ -118,8 +118,8 @@ entry:
 ; The address of the return structure is passed as an implicit parameter.
 ; In the -O0 build, %eax is spilled at the beginning of the function, hence we
 ; should match both 4(%esp) and 8(%esp).
-; WIN32:     {{[48]}}(%esp), %eax
-; WIN32:     movl $42, (%eax)
+; WIN32:     {{[48]}}(%esp), [[REG:%e[abcd]x]]
+; WIN32:     movl $42, ([[REG]])
 ; WIN32:     retl $4
 }
 
@@ -230,8 +230,8 @@ define void @test8_f(i64 inreg %a, i64* sret %out) {
 
 ; WIN32-LABEL: _test8_f:
 ; WIN32: movl {{[0-9]+}}(%esp), %[[out:[a-z]+]]
-; WIN32-DAG: movl %edx, 4(%[[out]])
-; WIN32-DAG: movl %eax, (%[[out]])
+; WIN32-DAG: movl {{%e[abcd]x}}, 4(%[[out]])
+; WIN32-DAG: movl {{%e[abcd]x}}, (%[[out]])
 ; WIN32: calll _clobber_eax
 ; WIN32: movl {{.*}}, %eax
 ; WIN32: retl
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index fc9a10ed5867130349426299e784380599ea82fb..f0aff6f89bc2a8b6eab8c99c035c6f61d860256f 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -124,7 +124,8 @@ entry:
 ; CHECK: movq %rcx, %rax
 ; CHECK-DAG: movq %r9, 40(%rsp)
 ; CHECK-DAG: movq %r8, 32(%rsp)
-; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
-; CHECK: movl %[[tmp]], (%rax)
+; CHECK-DAG: leaq 36(%rsp), %[[sret:[^ ]*]]
+; CHECK-DAG: movl %r8d, (%rax)
+; CHECK-DAG: movq %[[sret]], (%rsp)
 ; CHECK: popq
 ; CHECK: retq
diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll
index 24f2b2be430852bb706fcfcb548ef9d115521597..54789dc32d25e20e29bd08caba521f84e8b53228 100644
--- a/test/CodeGen/X86/win_coreclr_chkstk.ll
+++ b/test/CodeGen/X86/win_coreclr_chkstk.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr | FileCheck %s -check-prefix=WIN_X64
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR38376.
+; RUN: llc < %s -mtriple=x86_64-pc-win32-coreclr -verify-machineinstrs=0 | FileCheck %s -check-prefix=WIN_X64
 ; RUN: llc < %s -mtriple=x86_64-pc-linux         | FileCheck %s -check-prefix=LINUX
 
 ; By default, windows CoreCLR requires an inline prologue stack expansion check
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index bf087e128337defc29336c0b6d9172a46297d9e4..41d69e544aa48eeee58cae41ec9440ce8a855f0b 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1029,7 +1029,8 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm2
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandnps %ymm2, %ymm5, %ymm2
 ; AVX1-NEXT:    vandps %ymm5, %ymm8, %ymm5
 ; AVX1-NEXT:    vorps %ymm2, %ymm5, %ymm2
@@ -1585,13 +1586,14 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm12
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT:    # ymm13 = mem[0,1,0,1]
 ; AVX1-NEXT:    vandnps %ymm12, %ymm13, %ymm12
 ; AVX1-NEXT:    vandps %ymm13, %ymm14, %ymm14
 ; AVX1-NEXT:    vorps %ymm12, %ymm14, %ymm12
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm14
 ; AVX1-NEXT:    vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT:    vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vandnps %ymm14, %ymm13, %ymm14
 ; AVX1-NEXT:    vandps %ymm13, %ymm7, %ymm7
 ; AVX1-NEXT:    vorps %ymm14, %ymm7, %ymm13
@@ -1616,7 +1618,7 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm4, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm12, %xmm0
-; AVX1-NEXT:    vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm3, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm11, %xmm12, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
@@ -1732,22 +1734,22 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm14
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
@@ -1756,7 +1758,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm1
-; AVX1-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
 ; AVX1-NEXT:    vmovdqa %xmm8, %xmm2
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
@@ -1765,16 +1767,16 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm9, %ymm14
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm9, %ymm9
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
@@ -1788,7 +1790,7 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm1[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm2
-; AVX1-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
 ; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm3, %ymm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm3[2,3],ymm2[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm7, %ymm3
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index e9f291d239b8274da5db10e210124d2531572720..70a72e7ee1a9d367d3978ada9abf413638b05e3d 100644
--- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -52,7 +52,7 @@ define void @foo2(<4 x float>* noalias %result) nounwind {
 ; CHECK-NEXT: .long 1088421888              ## float 7
 ; CHECK-LABEL: foo2:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [4,5,6,7]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0]
 ; CHECK-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
   %val = uitofp <4 x i32> <i32 4, i32 5, i32 6, i32 7> to <4 x float>
@@ -89,7 +89,7 @@ define void @foo4(<4 x float>* noalias %result) nounwind {
 ; CHECK-NEXT: .long 1132396544              ## float 255
 ; CHECK-LABEL: foo4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,127,128,255]
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.27E+2,1.28E+2,2.55E+2]
 ; CHECK-NEXT:    movaps %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
   %val = uitofp <4 x i8> <i8 1, i8 127, i8 -128, i8 -1> to <4 x float>
diff --git a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
index f1f81da926dee3e0e07adb2d4cb59fd2b42af852..75dcbfd8471d9ad314b0d92df9e1510f3c1f148a 100644
--- a/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ b/test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -160,14 +160,7 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
 ;
 ; CHECK-LABEL: segmentedStack:
 ; CHECK: cmpq
-; CHECK-NEXT: ja [[ENTRY_LABEL:LBB[0-9_]+]]
-;
-; CHECK: callq ___morestack
-; CHECK-NEXT: retq
-;
-; CHECK: [[ENTRY_LABEL]]:
-; Prologue
-; CHECK: push
+; CHECK-NEXT: jbe [[ENTRY_LABEL:LBB[0-9_]+]]
 ;
 ; In PR26107, we use to drop these two basic blocks, because
 ; the segmentedStack entry block was jumping directly to
@@ -186,6 +179,12 @@ attributes #2 = { "no-frame-pointer-elim"="false" nounwind }
 ;
 ; CHECK: [[STRINGS_EQUAL]]
 ; CHECK: popq
+;
+; CHECK: [[ENTRY_LABEL]]:
+; CHECK: callq ___morestack
+; CHECK-NEXT: retq
+;
+
 define zeroext i1 @segmentedStack(i8* readonly %vk1, i8* readonly %vk2, i64 %key_size) #5 {
 entry:
   %cmp.i = icmp eq i8* %vk1, null
diff --git a/test/CodeGen/X86/x87-schedule.ll b/test/CodeGen/X86/x87-schedule.ll
index f4f91d82c5290680f929ae9885b193463406c775..937a2c4561b64719a1543bc7e851908e4c1f5822 100644
--- a/test/CodeGen/X86/x87-schedule.ll
+++ b/test/CodeGen/X86/x87-schedule.ll
@@ -8,6 +8,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
 
@@ -68,6 +69,13 @@ define void @test_f2xm1() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_f2xm1:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    f2xm1 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_f2xm1:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -142,6 +150,13 @@ define void @test_fabs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fabs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fabs # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fabs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -256,6 +271,18 @@ define void @test_fadd(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fadd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fadd %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fadd %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fadds (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    faddl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fadd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -380,6 +407,18 @@ define void @test_faddp_fiadd(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_faddp_fiadd:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    faddp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    faddp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fiadds (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fiaddl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_faddp_fiadd:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -480,6 +519,15 @@ define void @test_fbld_fbstp(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fbld_fbstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fbld (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    fbstp (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fbld_fbstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -558,6 +606,13 @@ define void @test_fchs() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fchs:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fchs # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fchs:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -640,6 +695,14 @@ define void @test_fclex() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fclex:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnclex # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fclex:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -716,6 +779,13 @@ define void @test_fnclex() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnclex:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnclex # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fnclex:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -846,6 +916,20 @@ define void @test_fcmov() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcmov:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcmovb %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovbe %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmove %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovnb %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovnbe %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovne %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovnu %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    fcmovu %st(1), %st(0) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fcmov:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -974,6 +1058,18 @@ define void @test_fcom(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcom:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcom %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fcom %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fcoms (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    fcoml (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fcom:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1106,6 +1202,19 @@ define void @test_fcomp_fcompp(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcomp_fcompp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcomp %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fcomp %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fcomps (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    fcompl (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    fcompp # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fcomp_fcompp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1200,6 +1309,14 @@ define void @test_fcomi_fcomip() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcomi_fcomip:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcomi %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fcompi %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fcomi_fcomip:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1276,6 +1393,13 @@ define void @test_fcos() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fcos:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fcos # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fcos:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1350,6 +1474,13 @@ define void @test_fdecstp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdecstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdecstp # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fdecstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -1464,6 +1595,18 @@ define void @test_fdiv(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdiv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdiv %st(0), %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdiv %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fdivl (%eax) # sched: [14:9.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fdiv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1588,6 +1731,18 @@ define void @test_fdivp_fidiv(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdivp_fidiv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdivp %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivp %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fidivs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fidivl (%eax) # sched: [14:9.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fdivp_fidiv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1712,6 +1867,18 @@ define void @test_fdivr(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdivr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdivr %st(0), %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivr %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivrs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fdivrl (%eax) # sched: [14:9.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fdivr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1836,6 +2003,18 @@ define void @test_fdivrp_fidivr(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fdivrp_fidivr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fdivrp %st(1) # sched: [9:9.50]
+; BDVER2-NEXT:    fdivrp %st(2) # sched: [9:9.50]
+; BDVER2-NEXT:    fidivrs (%ecx) # sched: [14:9.50]
+; BDVER2-NEXT:    fidivrl (%eax) # sched: [14:9.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fdivrp_fidivr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -1920,6 +2099,13 @@ define void @test_ffree() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ffree:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ffree %st(0) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ffree:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2034,6 +2220,18 @@ define void @test_ficom(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ficom:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ficoms (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    ficoml (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    ficomps (%ecx) # sched: [6:1.00]
+; BDVER2-NEXT:    ficompl (%eax) # sched: [6:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ficom:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2158,6 +2356,18 @@ define void @test_fild(i16 *%a0, i32 *%a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fild:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    filds (%edx) # sched: [5:0.50]
+; BDVER2-NEXT:    fildl (%ecx) # sched: [5:0.50]
+; BDVER2-NEXT:    fildll (%eax) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fild:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2242,6 +2452,13 @@ define void @test_fincstp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fincstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fincstp # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fincstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2324,6 +2541,14 @@ define void @test_finit() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_finit:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fninit # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_finit:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2400,6 +2625,13 @@ define void @test_fninit() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fninit:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fninit # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fninit:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -2554,6 +2786,23 @@ define void @test_fist_fistp_fisttp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fist_fistp_fisttp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fists (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistps (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistpl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fistpll (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    fisttps (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fisttpl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fisttpll (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fist_fistp_fisttp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2696,6 +2945,19 @@ define void @test_fld(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fld:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fld %st(0) # sched: [1:0.50]
+; BDVER2-NEXT:    flds (%edx) # sched: [5:0.50]
+; BDVER2-NEXT:    fldl (%ecx) # sched: [5:0.50]
+; BDVER2-NEXT:    fldt (%eax) # sched: [5:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fld:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2798,6 +3060,15 @@ define void @test_fldcw_fldenv(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fldcw_fldenv:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fldcw (%eax) # sched: [5:0.50]
+; BDVER2-NEXT:    fldenv (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fldcw_fldenv:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -2924,6 +3195,19 @@ define void @test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fld1 # sched: [3:1.00]
+; BDVER2-NEXT:    fldl2e # sched: [3:1.00]
+; BDVER2-NEXT:    fldl2t # sched: [3:1.00]
+; BDVER2-NEXT:    fldlg2 # sched: [3:1.00]
+; BDVER2-NEXT:    fldln2 # sched: [3:1.00]
+; BDVER2-NEXT:    fldpi # sched: [3:1.00]
+; BDVER2-NEXT:    fldz # sched: [3:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3050,6 +3334,18 @@ define void @test_fmul(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fmul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fmul %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fmul %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fmuls (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fmull (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fmul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3174,6 +3470,18 @@ define void @test_fmulp_fimul(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fmulp_fimul:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fmulp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fmulp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fimuls (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fimull (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fmulp_fimul:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3258,6 +3566,13 @@ define void @test_fnop() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnop:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnop # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fnop:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3332,6 +3647,13 @@ define void @test_fpatan() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fpatan:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fpatan # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fpatan:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3414,6 +3736,14 @@ define void @test_fprem_fprem1() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fprem_fprem1:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fprem # sched: [100:0.50]
+; BDVER2-NEXT:    fprem1 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fprem_fprem1:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3490,6 +3820,13 @@ define void @test_fptan() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fptan:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fptan # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fptan:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3564,6 +3901,13 @@ define void @test_frndint() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_frndint:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    frndint # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_frndint:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3646,6 +3990,14 @@ define void @test_frstor(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_frstor:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    frstor (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_frstor:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3738,6 +4090,15 @@ define void @test_fsave(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3824,6 +4185,14 @@ define void @test_fnsave(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnsave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnsave (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fnsave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -3900,6 +4269,13 @@ define void @test_fscale() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fscale:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fscale # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fscale:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -3974,6 +4350,13 @@ define void @test_fsin() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsin:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsin # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsin:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4048,6 +4431,13 @@ define void @test_fsincos() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsincos:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsincos # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsincos:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4122,6 +4512,13 @@ define void @test_fsqrt() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsqrt:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsqrt # sched: [1:17.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsqrt:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -4268,6 +4665,22 @@ define void @test_fst_fstp(i16* %a0, i32* %a1, i64 *%a2) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fst_fstp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fst %st(0) # sched: [1:0.50]
+; BDVER2-NEXT:    fsts (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstp %st(0) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpl (%edx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpl (%ecx) # sched: [1:0.50]
+; BDVER2-NEXT:    fstpt (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fst_fstp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4408,6 +4821,19 @@ define void @test_fstcw_fstenv_fstsw(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fstcw_fstenv_fstsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnstcw (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    fnstsw (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fstcw_fstenv_fstsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4518,6 +4944,16 @@ define void @test_fnstcw_fnstenv_fnstsw(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fnstcw_fnstenv_fnstsw:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fnstcw (%eax) # sched: [1:0.50]
+; BDVER2-NEXT:    fnstenv (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    fnstsw (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fnstcw_fnstenv_fnstsw:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4638,6 +5074,18 @@ define void @test_fsub(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsub %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsub %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fsubl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4762,6 +5210,18 @@ define void @test_fsubp_fisub(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsubp_fisub:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsubp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fisubs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fisubl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsubp_fisub:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -4886,6 +5346,18 @@ define void @test_fsubr(float *%a0, double *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsubr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsubr %st(0), %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubr %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubrs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fsubrl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsubr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -5010,6 +5482,18 @@ define void @test_fsubrp_fisubr(i16 *%a0, i32 *%a1) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fsubrp_fisubr:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fsubrp %st(1) # sched: [5:1.00]
+; BDVER2-NEXT:    fsubrp %st(2) # sched: [5:1.00]
+; BDVER2-NEXT:    fisubrs (%ecx) # sched: [10:1.00]
+; BDVER2-NEXT:    fisubrl (%eax) # sched: [10:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fsubrp_fisubr:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -5094,6 +5578,13 @@ define void @test_ftst() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_ftst:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    ftst # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_ftst:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5200,6 +5691,17 @@ define void @test_fucom_fucomp_fucompp() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fucom_fucomp_fucompp:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fucom %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fucom %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fucomp %st(1) # sched: [1:1.00]
+; BDVER2-NEXT:    fucomp %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fucompp # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fucom_fucomp_fucompp:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5290,6 +5792,14 @@ define void @test_fucomi_fucomip() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fucomi_fucomip:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fucomi %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    fucompi %st(3) # sched: [1:1.00]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fucomi_fucomip:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5366,6 +5876,13 @@ define void @test_fwait() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fwait:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    wait # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fwait:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5440,6 +5957,13 @@ define void @test_fxam() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxam:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxam # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fxam:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5522,6 +6046,14 @@ define void @test_fxch() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxch:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxch %st(1) # sched: [1:0.50]
+; BDVER2-NEXT:    fxch %st(3) # sched: [1:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fxch:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5614,6 +6146,15 @@ define void @test_fxrstor_fxsave(i8* %a0) optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxrstor_fxsave:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxrstor (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    fxsave (%eax) # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fxrstor_fxsave:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
@@ -5692,6 +6233,13 @@ define void @test_fxtract() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fxtract:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fxtract # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fxtract:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5766,6 +6314,13 @@ define void @test_fyl2x() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fyl2x:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fyl2x # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fyl2x:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
@@ -5840,6 +6395,13 @@ define void @test_fyl2xp1() optsize {
 ; SKX-NEXT:    #NO_APP
 ; SKX-NEXT:    retl # sched: [6:0.50]
 ;
+; BDVER2-LABEL: test_fyl2xp1:
+; BDVER2:       # %bb.0:
+; BDVER2-NEXT:    #APP
+; BDVER2-NEXT:    fyl2xp1 # sched: [100:0.50]
+; BDVER2-NEXT:    #NO_APP
+; BDVER2-NEXT:    retl # sched: [5:1.00]
+;
 ; BTVER2-LABEL: test_fyl2xp1:
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    #APP
diff --git a/test/CodeGen/X86/xop-schedule.ll b/test/CodeGen/X86/xop-schedule.ll
index 9a314e2327bd74d9ed6befd0e56c4aa27f67b9c4..ba0073bc63db9cbfbea82930677e960bc9fa6b92 100644
--- a/test/CodeGen/X86/xop-schedule.ll
+++ b/test/CodeGen/X86/xop-schedule.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER12 --check-prefix=BDVER2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
 
@@ -17,16 +17,38 @@ define void @test_vfrczpd(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2,
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczpd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczpd %xmm0, %xmm0
-; BDVER-NEXT:    vfrczpd %ymm1, %ymm1
-; BDVER-NEXT:    vfrczpd (%rdi), %xmm0
-; BDVER-NEXT:    vfrczpd (%rsi), %ymm1
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczpd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczpd %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczpd %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER12-NEXT:    vfrczpd (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER12-NEXT:    vfrczpd (%rsi), %ymm1 # sched: [15:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vfrczpd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczpd %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczpd %ymm1, %ymm1
+; BDVER3-NEXT:    vfrczpd (%rdi), %xmm0
+; BDVER3-NEXT:    vfrczpd (%rsi), %ymm1
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczpd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczpd %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczpd %ymm1, %ymm1
+; BDVER4-NEXT:    vfrczpd (%rdi), %xmm0
+; BDVER4-NEXT:    vfrczpd (%rsi), %ymm1
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczpd $0, $0 \0a\09 vfrczpd $1, $1 \0a\09 vfrczpd $2, $0 \0a\09 vfrczpd $3, $1", "x,x,*m,*m"(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2, <4 x double> *%a3)
   ret void
 }
@@ -43,16 +65,38 @@ define void @test_vfrczps(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczps:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczps %xmm0, %xmm0
-; BDVER-NEXT:    vfrczps %ymm1, %ymm1
-; BDVER-NEXT:    vfrczps (%rdi), %xmm0
-; BDVER-NEXT:    vfrczps (%rsi), %ymm1
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczps:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczps %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczps %ymm1, %ymm1 # sched: [10:2.00]
+; BDVER12-NEXT:    vfrczps (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER12-NEXT:    vfrczps (%rsi), %ymm1 # sched: [15:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vfrczps:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczps %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczps %ymm1, %ymm1
+; BDVER3-NEXT:    vfrczps (%rdi), %xmm0
+; BDVER3-NEXT:    vfrczps (%rsi), %ymm1
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczps:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczps %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczps %ymm1, %ymm1
+; BDVER4-NEXT:    vfrczps (%rdi), %xmm0
+; BDVER4-NEXT:    vfrczps (%rsi), %ymm1
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczps $0, $0 \0a\09 vfrczps $1, $1 \0a\09 vfrczps $2, $0 \0a\09 vfrczps $3, $1", "x,x,*m,*m"(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <4 x double> *%a3)
   ret void
 }
@@ -66,13 +110,29 @@ define void @test_vfrczsd(<2 x double> %a0, <2 x double> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczsd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczsd %xmm0, %xmm0
-; BDVER-NEXT:    vfrczsd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczsd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczsd %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczsd (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vfrczsd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczsd %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczsd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczsd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczsd %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczsd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczsd $0, $0 \0a\09 vfrczsd $1, $0", "x,*m"(<2 x double> %a0, <2 x double> *%a1)
   ret void
 }
@@ -86,13 +146,29 @@ define void @test_vfrczss(<4 x float> %a0, <4 x double> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vfrczss:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vfrczss %xmm0, %xmm0
-; BDVER-NEXT:    vfrczss (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vfrczss:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vfrczss %xmm0, %xmm0 # sched: [10:1.00]
+; BDVER12-NEXT:    vfrczss (%rdi), %xmm0 # sched: [15:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vfrczss:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vfrczss %xmm0, %xmm0
+; BDVER3-NEXT:    vfrczss (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vfrczss:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vfrczss %xmm0, %xmm0
+; BDVER4-NEXT:    vfrczss (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vfrczss $0, $0 \0a\09 vfrczss $1, $0", "x,*m"(<4 x float> %a0, <4 x double> *%a1)
   ret void
 }
@@ -107,14 +183,32 @@ define void @test_vpcmov_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcmov_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcmov_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpcmov_128:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcmov_128:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcmov (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcmov %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcmov $2, $1, $0, $0 \0a\09 vpcmov $3, $1, $0, $0 \0a\09 vpcmov $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -130,15 +224,35 @@ define void @test_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcmov_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcmov_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER12-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpcmov_256:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcmov_256:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcmov $2, $1, $0, $0 \0a\09 vpcmov $3, $1, $0, $0 \0a\09 vpcmov $2, $3, $0, $0", "x,x,x,*m"(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i64> *%a3)
   ret void
 }
@@ -158,19 +272,47 @@ define void @test_vpcom(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcom:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcom:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpcom:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcom:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcomb $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomd $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomq $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomw $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomb $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomd $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomq $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomw $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcomb $3, $1, $0, $0 \0a\09 vpcomd $3, $1, $0, $0 \0a\09 vpcomq $3, $1, $0, $0 \0a\09 vpcomw $3, $1, $0, $0 \0a\09 vpcomb $3, $2, $0, $0 \0a\09 vpcomd $3, $2, $0, $0 \0a\09 vpcomq $3, $2, $0, $0 \0a\09 vpcomw $3, $2, $0, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 3)
   ret void
 }
@@ -190,19 +332,47 @@ define void @test_vpcomu(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpcomu:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpcomu:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpcomu:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpcomu:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpcomub $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomud $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuq $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuw $3, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomub $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomud $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuq $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpcomuw $3, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpcomub $3, $1, $0, $0 \0a\09 vpcomud $3, $1, $0, $0 \0a\09 vpcomuq $3, $1, $0, $0 \0a\09 vpcomuw $3, $1, $0, $0 \0a\09 vpcomub $3, $2, $0, $0 \0a\09 vpcomud $3, $2, $0, $0 \0a\09 vpcomuq $3, $2, $0, $0 \0a\09 vpcomuw $3, $2, $0, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 3)
   ret void
 }
@@ -217,14 +387,32 @@ define void @test_vpermil2pd_128(<2 x double> %a0, <2 x double> %a1, <2 x double
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2pd_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2pd_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER12-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpermil2pd_128:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2pd_128:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2pd $4, $2, $1, $0, $0 \0a\09 vpermil2pd $4, $2, $3, $0, $0 \0a\09 vpermil2pd $4, $3, $1, $0, $0", "x,x,x,*m,i"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3, i8 3)
   ret void
 }
@@ -240,15 +428,35 @@ define void @test_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2pd_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2pd_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER12-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER12-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpermil2pd_256:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2pd_256:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2pd $4, $2, $1, $0, $0 \0a\09 vpermil2pd $4, $2, $3, $0, $0 \0a\09 vpermil2pd $4, $3, $1, $0, $0", "x,x,x,*m,i"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3, i8 3)
   ret void
 }
@@ -263,14 +471,32 @@ define void @test_vpermil2ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2ps_128:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2ps_128:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER12-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpermil2ps_128:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2ps_128:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2ps $4, $2, $1, $0, $0 \0a\09 vpermil2ps $4, $2, $3, $0, $0 \0a\09 vpermil2ps $4, $3, $1, $0, $0", "x,x,x,*m,i"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3, i8 3)
   ret void
 }
@@ -286,15 +512,35 @@ define void @test_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %
 ; GENERIC-NEXT:    vzeroupper # sched: [100:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpermil2ps_256:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
-; BDVER-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    vzeroupper
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpermil2ps_256:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BDVER12-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER12-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [8:3.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    vzeroupper # sched: [46:4.00]
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpermil2ps_256:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER3-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    vzeroupper
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpermil2ps_256:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER4-NEXT:    vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    vzeroupper
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpermil2ps $4, $2, $1, $0, $0 \0a\09 vpermil2ps $4, $2, $3, $0, $0 \0a\09 vpermil2ps $4, $3, $1, $0, $0", "x,x,x,*m,i"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3, i8 3)
   ret void
 }
@@ -308,13 +554,29 @@ define void @test_vphaddbd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddbd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddbd %xmm0, %xmm0
-; BDVER-NEXT:    vphaddbd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddbd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddbd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddbd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddbd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddbd %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddbd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddbd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddbd %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddbd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddbd $0, $0 \0a\09 vphaddbd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -328,13 +590,29 @@ define void @test_vphaddbq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddbq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddbq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddbq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddbq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddbq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddbq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddbq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddbq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddbq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddbq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddbq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddbq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddbq $0, $0 \0a\09 vphaddbq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -348,13 +626,29 @@ define void @test_vphaddbw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddbw:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddbw %xmm0, %xmm0
-; BDVER-NEXT:    vphaddbw (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddbw:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddbw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddbw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddbw:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddbw %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddbw (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddbw:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddbw %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddbw (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddbw $0, $0 \0a\09 vphaddbw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -368,13 +662,29 @@ define void @test_vphadddq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphadddq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphadddq %xmm0, %xmm0
-; BDVER-NEXT:    vphadddq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphadddq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphadddq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphadddq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphadddq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphadddq %xmm0, %xmm0
+; BDVER3-NEXT:    vphadddq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphadddq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphadddq %xmm0, %xmm0
+; BDVER4-NEXT:    vphadddq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphadddq $0, $0 \0a\09 vphadddq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -388,13 +698,29 @@ define void @test_vphaddubd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddubd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddubd %xmm0, %xmm0
-; BDVER-NEXT:    vphaddubd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddubd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddubd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddubd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddubd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddubd %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddubd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddubd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddubd %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddubd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddubd $0, $0 \0a\09 vphaddubd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -408,13 +734,29 @@ define void @test_vphaddubq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddubq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddubq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddubq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddubq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddubq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddubq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddubq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddubq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddubq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddubq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddubq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddubq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddubq $0, $0 \0a\09 vphaddubq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -428,13 +770,29 @@ define void @test_vphaddubw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddubw:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddubw %xmm0, %xmm0
-; BDVER-NEXT:    vphaddubw (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddubw:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddubw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddubw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddubw:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddubw %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddubw (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddubw:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddubw %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddubw (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddubw $0, $0 \0a\09 vphaddubw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -448,13 +806,29 @@ define void @test_vphaddudq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddudq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddudq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddudq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddudq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddudq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddudq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddudq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddudq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddudq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddudq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddudq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddudq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddudq $0, $0 \0a\09 vphaddudq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -468,13 +842,29 @@ define void @test_vphadduwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphadduwd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphadduwd %xmm0, %xmm0
-; BDVER-NEXT:    vphadduwd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphadduwd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphadduwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphadduwd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphadduwd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphadduwd %xmm0, %xmm0
+; BDVER3-NEXT:    vphadduwd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphadduwd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphadduwd %xmm0, %xmm0
+; BDVER4-NEXT:    vphadduwd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphadduwd $0, $0 \0a\09 vphadduwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -488,13 +878,29 @@ define void @test_vphadduwq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphadduwq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphadduwq %xmm0, %xmm0
-; BDVER-NEXT:    vphadduwq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphadduwq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphadduwq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphadduwq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphadduwq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphadduwq %xmm0, %xmm0
+; BDVER3-NEXT:    vphadduwq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphadduwq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphadduwq %xmm0, %xmm0
+; BDVER4-NEXT:    vphadduwq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphadduwq $0, $0 \0a\09 vphadduwq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -508,13 +914,29 @@ define void @test_vphaddwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddwd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddwd %xmm0, %xmm0
-; BDVER-NEXT:    vphaddwd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddwd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddwd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddwd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddwd %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddwd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddwd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddwd %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddwd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddwd $0, $0 \0a\09 vphaddwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -528,13 +950,29 @@ define void @test_vphaddwq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphaddwq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphaddwq %xmm0, %xmm0
-; BDVER-NEXT:    vphaddwq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphaddwq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphaddwq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphaddwq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphaddwq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphaddwq %xmm0, %xmm0
+; BDVER3-NEXT:    vphaddwq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphaddwq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphaddwq %xmm0, %xmm0
+; BDVER4-NEXT:    vphaddwq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphaddwq $0, $0 \0a\09 vphaddwq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -548,13 +986,29 @@ define void @test_vphsubbw(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphsubbw:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphsubbw %xmm0, %xmm0
-; BDVER-NEXT:    vphsubbw (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphsubbw:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphsubbw %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphsubbw (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphsubbw:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphsubbw %xmm0, %xmm0
+; BDVER3-NEXT:    vphsubbw (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphsubbw:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphsubbw %xmm0, %xmm0
+; BDVER4-NEXT:    vphsubbw (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphsubbw $0, $0 \0a\09 vphsubbw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -568,13 +1022,29 @@ define void @test_vphsubdq(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphsubdq:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphsubdq %xmm0, %xmm0
-; BDVER-NEXT:    vphsubdq (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphsubdq:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphsubdq %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphsubdq (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphsubdq:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphsubdq %xmm0, %xmm0
+; BDVER3-NEXT:    vphsubdq (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphsubdq:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphsubdq %xmm0, %xmm0
+; BDVER4-NEXT:    vphsubdq (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphsubdq $0, $0 \0a\09 vphsubdq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -588,13 +1058,29 @@ define void @test_vphsubwd(<2 x i64> %a0, <2 x i64> *%a1) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vphsubwd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vphsubwd %xmm0, %xmm0
-; BDVER-NEXT:    vphsubwd (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vphsubwd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vphsubwd %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vphsubwd (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vphsubwd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vphsubwd %xmm0, %xmm0
+; BDVER3-NEXT:    vphsubwd (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vphsubwd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vphsubwd %xmm0, %xmm0
+; BDVER4-NEXT:    vphsubwd (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vphsubwd $0, $0 \0a\09 vphsubwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
   ret void
 }
@@ -608,13 +1094,29 @@ define void @test_vpmacsdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsdd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsdd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER12-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacsdd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsdd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsdd $2, $1, $0, $0 \0a\09 vpmacsdd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -628,13 +1130,29 @@ define void @test_vpmacsdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsdqh:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsdqh:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacsdqh:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsdqh:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsdqh $2, $1, $0, $0 \0a\09 vpmacsdqh $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -648,13 +1166,29 @@ define void @test_vpmacsdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsdql:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsdql:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacsdql:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsdql:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsdql $2, $1, $0, $0 \0a\09 vpmacsdql $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -668,13 +1202,29 @@ define void @test_vpmacssdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssdd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssdd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BDVER12-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacssdd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssdd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssdd $2, $1, $0, $0 \0a\09 vpmacssdd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -688,13 +1238,29 @@ define void @test_vpmacssdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssdqh:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssdqh:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacssdqh:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssdqh:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssdqh $2, $1, $0, $0 \0a\09 vpmacssdqh $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -708,13 +1274,29 @@ define void @test_vpmacssdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssdql:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssdql:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; BDVER12-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacssdql:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssdql:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssdql $2, $1, $0, $0 \0a\09 vpmacssdql $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -728,13 +1310,29 @@ define void @test_vpmacsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacsswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsswd $2, $1, $0, $0 \0a\09 vpmacsswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -748,13 +1346,29 @@ define void @test_vpmacssww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacssww:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacssww:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacssww:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacssww:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacssww $2, $1, $0, $0 \0a\09 vpmacssww $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -768,13 +1382,29 @@ define void @test_vpmacswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacswd $2, $1, $0, $0 \0a\09 vpmacswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -788,13 +1418,29 @@ define void @test_vpmacsww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmacsww:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmacsww:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmacsww:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmacsww:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmacsww $2, $1, $0, $0 \0a\09 vpmacsww $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -808,13 +1454,29 @@ define void @test_vpmadcsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmadcsswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmadcsswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmadcsswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmadcsswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmadcsswd $2, $1, $0, $0 \0a\09 vpmadcsswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -828,13 +1490,29 @@ define void @test_vpmadcswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i6
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpmadcswd:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpmadcswd:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BDVER12-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpmadcswd:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpmadcswd:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpmadcswd $2, $1, $0, $0 \0a\09 vpmadcswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -849,14 +1527,32 @@ define void @test_vpperm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64>
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpperm:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpperm:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BDVER12-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpperm:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpperm:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpperm %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpperm $2, $1, $0, $0 \0A\09 vpperm $3, $1, $0, $0 \0A\09 vpperm $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
   ret void
 }
@@ -888,31 +1584,83 @@ define void @test_vprot(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vprot:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vprotb %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotd %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotq %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotw %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vprotb (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotd (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotq (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotw (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vprotb %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotd %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotq %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotw %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vprotb $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotd $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotq $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotw $7, %xmm0, %xmm0
-; BDVER-NEXT:    vprotb $7, (%rdi), %xmm0
-; BDVER-NEXT:    vprotd $7, (%rdi), %xmm0
-; BDVER-NEXT:    vprotq $7, (%rdi), %xmm0
-; BDVER-NEXT:    vprotw $7, (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vprot:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vprotb %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotd %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vprotb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotb %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotd %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotq %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotw %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vprotb $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotd $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotq $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotw $7, %xmm0, %xmm0 # sched: [2:0.50]
+; BDVER12-NEXT:    vprotb $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vprotd $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vprotq $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    vprotw $7, (%rdi), %xmm0 # sched: [7:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vprot:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vprotb %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotd %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotq %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotw %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotb (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotd (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotq (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotw (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vprotb %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotd %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotq %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotw %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotb $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotd $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotq $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotw $7, %xmm0, %xmm0
+; BDVER3-NEXT:    vprotb $7, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotd $7, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotq $7, (%rdi), %xmm0
+; BDVER3-NEXT:    vprotw $7, (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vprot:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vprotb %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotd %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotq %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotw %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotb (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotd (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotq (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotw (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vprotb %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotd %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotq %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotw %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotb $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotd $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotq $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotw $7, %xmm0, %xmm0
+; BDVER4-NEXT:    vprotb $7, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotd $7, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotq $7, (%rdi), %xmm0
+; BDVER4-NEXT:    vprotw $7, (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vprotb $1, $0, $0 \0A\09 vprotd $1, $0, $0 \0A\09 vprotq $1, $0, $0 \0A\09 vprotw $1, $0, $0 \0A\09 vprotb $2, $0, $0 \0A\09 vprotd $2, $0, $0 \0A\09 vprotq $2, $0, $0 \0A\09 vprotw $2, $0, $0 \0A\09 vprotb $0, $2, $0 \0A\09 vprotd $0, $2, $0 \0A\09 vprotq $0, $2, $0 \0A\09 vprotw $0, $2, $0 \0A\09 vprotb $3, $0, $0 \0A\09 vprotd $3, $0, $0 \0A\09 vprotq $3, $0, $0 \0A\09 vprotw $3, $0, $0 \0A\09 vprotb $3, $2, $0 \0A\09 vprotd $3, $2, $0 \0A\09 vprotq $3, $2, $0 \0A\09 vprotw $3, $2, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 7)
   ret void
 }
@@ -936,23 +1684,59 @@ define void @test_vpsha(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpsha:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpshab %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshad %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshab (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshad (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshaq (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshaw (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshab %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshad %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshaq %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshaw %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpsha:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpshab %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshad %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshaq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshaw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshab (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshad (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshab %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshad %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaq %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshaw %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpsha:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshab (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshad (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaq (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshaw (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshab %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshad %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshaq %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshaw %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpsha:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshad %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshab (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshad (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaq (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshaw (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshab %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshad %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshaq %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshaw %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpshab $1, $0, $0 \0A\09 vpshad $1, $0, $0 \0A\09 vpshaq $1, $0, $0 \0A\09 vpshaw $1, $0, $0 \0A\09 vpshab $2, $0, $0 \0A\09 vpshad $2, $0, $0 \0A\09 vpshaq $2, $0, $0 \0A\09 vpshaw $2, $0, $0 \0A\09 vpshab $0, $2, $0 \0A\09 vpshad $0, $2, $0 \0A\09 vpshaq $0, $2, $0 \0A\09 vpshaw $0, $2, $0", "x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
   ret void
 }
@@ -976,23 +1760,59 @@ define void @test_vpshl(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
 ; GENERIC-NEXT:    #NO_APP
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
-; BDVER-LABEL: test_vpshl:
-; BDVER:       # %bb.0:
-; BDVER-NEXT:    #APP
-; BDVER-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshld %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
-; BDVER-NEXT:    vpshlb (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshld (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshlq (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshlw (%rdi), %xmm0, %xmm0
-; BDVER-NEXT:    vpshlb %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshld %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshlq %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    vpshlw %xmm0, (%rdi), %xmm0
-; BDVER-NEXT:    #NO_APP
-; BDVER-NEXT:    retq
+; BDVER12-LABEL: test_vpshl:
+; BDVER12:       # %bb.0:
+; BDVER12-NEXT:    #APP
+; BDVER12-NEXT:    vpshlb %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshld %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshlq %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshlw %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BDVER12-NEXT:    vpshlb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlb %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshld %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlq %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    vpshlw %xmm0, (%rdi), %xmm0 # sched: [8:0.50]
+; BDVER12-NEXT:    #NO_APP
+; BDVER12-NEXT:    retq # sched: [5:1.00]
+;
+; BDVER3-LABEL: test_vpshl:
+; BDVER3:       # %bb.0:
+; BDVER3-NEXT:    #APP
+; BDVER3-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlb (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshld (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlq (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlw (%rdi), %xmm0, %xmm0
+; BDVER3-NEXT:    vpshlb %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshld %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshlq %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    vpshlw %xmm0, (%rdi), %xmm0
+; BDVER3-NEXT:    #NO_APP
+; BDVER3-NEXT:    retq
+;
+; BDVER4-LABEL: test_vpshl:
+; BDVER4:       # %bb.0:
+; BDVER4-NEXT:    #APP
+; BDVER4-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlb (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshld (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlq (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlw (%rdi), %xmm0, %xmm0
+; BDVER4-NEXT:    vpshlb %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshld %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshlq %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    vpshlw %xmm0, (%rdi), %xmm0
+; BDVER4-NEXT:    #NO_APP
+; BDVER4-NEXT:    retq
   call void asm sideeffect "vpshlb $1, $0, $0 \0A\09 vpshld $1, $0, $0 \0A\09 vpshlq $1, $0, $0 \0A\09 vpshlw $1, $0, $0 \0A\09 vpshlb $2, $0, $0 \0A\09 vpshld $2, $0, $0 \0A\09 vpshlq $2, $0, $0 \0A\09 vpshlw $2, $0, $0 \0A\09 vpshlb $0, $2, $0 \0A\09 vpshld $0, $2, $0 \0A\09 vpshlq $0, $2, $0 \0A\09 vpshlw $0, $2, $0", "x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
   ret void
 }
diff --git a/test/CodeGen/XCore/dwarf_debug.ll b/test/CodeGen/XCore/dwarf_debug.ll
index 4efd73e40a7d81eb896aab25d5aad96da18252a1..e0a75d25373d5f1b6cab5f2795f2b26941a93c36 100644
--- a/test/CodeGen/XCore/dwarf_debug.ll
+++ b/test/CodeGen/XCore/dwarf_debug.ll
@@ -4,11 +4,11 @@
 ; target triple = "xcore"
 
 ; CHECK-LABEL: f
-; CHECK: entsp 2
+; CHECK: entsp [[S:[0-9]+]]
 ; ...the prologue...
 ; CHECK: .loc 1 2 0 prologue_end      # test.c:2:0
 ; CHECK: add r0, r0, 1
-; CHECK: retsp 2
+; CHECK: retsp [[S]]
 define i32 @f(i32 %a) !dbg !4 {
 entry:
   %a.addr = alloca i32, align 4
diff --git a/test/DebugInfo/AArch64/asan-stack-vars.ll b/test/DebugInfo/AArch64/asan-stack-vars.ll
deleted file mode 100644
index 5db46065476fb3771c6ea5d4387dfc4ee18ff35b..0000000000000000000000000000000000000000
--- a/test/DebugInfo/AArch64/asan-stack-vars.ll
+++ /dev/null
@@ -1,324 +0,0 @@
-; RUN: llc -O0 -fast-isel -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s
-;
-; Derived from (clang -O0 -g -fsanitize=address -fobjc-arc)
-;   @protocol NSObject
-;   @end
-;   @interface NSObject<NSObject>{}
-;   + (instancetype)alloc;
-;   @end
-;   struct CGSize {
-;     double width;
-;     double height;
-;   };
-;   typedef struct CGSize CGSize;
-;   @interface Object : NSObject
-;   - (instancetype)initWithSize:(CGSize)size;
-;   - (id)aMessage;
-;   @end
-;   @implementation MyObject
-;   + (id)doWithSize:(CGSize)imageSize andObject:(id)object {
-;     return [object aMessage];
-;   }
-;   @end
-;
-; CHECK: .debug_info contents:
-; CHECK: DW_TAG_subprogram
-; CHECK-NEXT:   DW_AT_low_pc [DW_FORM_addr]     (0x0000000000000000)
-; CHECK-NEXT:   DW_AT_high_pc [DW_FORM_addr]    ([[FN_END:.*]])
-; CHECK: "_cmd"
-; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location
-; CHECK-NEXT:   [0x{{0*}}, 0x{{.*}}):
-; CHECK-NOT:    DW_AT_
-; CHECK:        [0x{{.*}}, [[FN_END]]):
-; CHECK-NEXT: DW_AT_name {{.*}}"imageSize"
-
-; ModuleID = 'm.m'
-source_filename = "m.m"
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios"
-
-%0 = type opaque
-%struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
-%struct._objc_cache = type opaque
-%struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
-%struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
-%struct._objc_method = type { i8*, i8*, i8* }
-%struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
-%struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
-%struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
-%struct._ivar_t = type { i32*, i8*, i8*, i32, i32 }
-%struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] }
-%struct._prop_t = type { i8*, i8* }
-%struct.CGSize = type { double, double }
-
-@"OBJC_CLASS_$_Object" = external global %struct._class_t
-@"OBJC_CLASSLIST_REFERENCES_$_" = private global %struct._class_t* @"OBJC_CLASS_$_Object", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
-@OBJC_METH_VAR_NAME_ = private unnamed_addr constant [6 x i8] c"alloc\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
-@OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [14 x i8] c"initWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_SELECTOR_REFERENCES_.2 = private externally_initialized global i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
-@OBJC_METH_VAR_NAME_.3 = private unnamed_addr constant [9 x i8] c"aMessage\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_SELECTOR_REFERENCES_.4 = private externally_initialized global i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
-@_objc_empty_cache = external global %struct._objc_cache
-@"OBJC_CLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* null, %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_CLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
-@"OBJC_METACLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* @"OBJC_CLASS_$_MyObject", %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_METACLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
-@OBJC_CLASS_NAME_ = private unnamed_addr constant [9 x i8] c"MyObject\00", section "__TEXT,__objc_classname,cstring_literals", align 1
-@OBJC_METH_VAR_NAME_.5 = private unnamed_addr constant [12 x i8] c"doWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
-@OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [21 x i8] c"@32@0:8{CGSize=dd}16\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
-@"\01l_OBJC_$_CLASS_METHODS_MyObject" = private global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i8*) }] }, section "__DATA, __objc_const", align 8
-@"\01l_OBJC_METACLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 131, i32 40, i32 40, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to %struct.__method_list_t*), %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
-@"\01l_OBJC_CLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 130, i32 0, i32 0, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
-@"OBJC_LABEL_CLASS_$" = private global [1 x i8*] [i8* bitcast (%struct._class_t* @"OBJC_CLASS_$_MyObject" to i8*)], section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
-@llvm.compiler.used = appending global [12 x i8*] [i8* bitcast (%struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_" to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_ to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.2 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.4 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to i8*), i8* bitcast ([1 x i8*]* @"OBJC_LABEL_CLASS_$" to i8*)], section "llvm.metadata"
-@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @asan.module_ctor, i8* null }]
-@__asan_shadow_memory_dynamic_address = external global i64
-@___asan_gen_ = private unnamed_addr constant [34 x i8] c"2 32 16 9 imageSize 64 8 6 object\00", align 1
-
-; Function Attrs: noinline sanitize_address ssp uwtable
-define internal i8* @"\01+[MyObject doWithSize:]"(i8* %self, i8* %_cmd, [2 x double] %imageSize.coerce) #0 !dbg !14 {
-entry:
-  %0 = load i64, i64* @__asan_shadow_memory_dynamic_address
-  %self.addr = alloca i8*, align 8
-  %_cmd.addr = alloca i8*, align 8
-  %MyAlloca = alloca [96 x i8], align 32, !dbg !35
-  %1 = ptrtoint [96 x i8]* %MyAlloca to i64, !dbg !35
-  %2 = add i64 %1, 32, !dbg !35
-  %3 = inttoptr i64 %2 to %struct.CGSize*, !dbg !35
-  %4 = add i64 %1, 64, !dbg !35
-  %5 = inttoptr i64 %4 to %0**, !dbg !35
-  %6 = inttoptr i64 %1 to i64*, !dbg !35
-  store i64 1102416563, i64* %6, !dbg !35
-  %7 = add i64 %1, 8, !dbg !35
-  %8 = inttoptr i64 %7 to i64*, !dbg !35
-  store i64 ptrtoint ([34 x i8]* @___asan_gen_ to i64), i64* %8, !dbg !35
-  %9 = add i64 %1, 16, !dbg !35
-  %10 = inttoptr i64 %9 to i64*, !dbg !35
-  store i64 ptrtoint (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i64), i64* %10, !dbg !35
-  %11 = lshr i64 %1, 3, !dbg !35
-  %12 = add i64 %11, %0, !dbg !35
-  %13 = add i64 %12, 0, !dbg !35
-  %14 = inttoptr i64 %13 to i64*, !dbg !35
-  store i64 -940689368107847183, i64* %14, align 1, !dbg !35
-  %15 = add i64 %12, 9, !dbg !35
-  %16 = inttoptr i64 %15 to i16*, !dbg !35
-  store i16 -3085, i16* %16, align 1, !dbg !35
-  %17 = add i64 %12, 11, !dbg !35
-  %18 = inttoptr i64 %17 to i8*, !dbg !35
-  store i8 -13, i8* %18, align 1, !dbg !35
-  call void @llvm.dbg.declare(metadata %struct.CGSize* %3, metadata !36, metadata !37), !dbg !38
-  call void @llvm.dbg.declare(metadata %0** %5, metadata !39, metadata !37), !dbg !45
-  %19 = bitcast %struct.CGSize* %3 to [2 x double]*
-  %20 = ptrtoint [2 x double]* %19 to i64
-  %21 = lshr i64 %20, 3
-  %22 = add i64 %21, %0
-  %23 = inttoptr i64 %22 to i16*
-  %24 = load i16, i16* %23
-  %25 = icmp ne i16 %24, 0
-  br i1 %25, label %26, label %27
-
-; <label>:26:                                     ; preds = %entry
-  call void @__asan_report_store16(i64 %20)
-  call void asm sideeffect "", ""()
-  unreachable
-
-; <label>:27:                                     ; preds = %entry
-  store [2 x double] %imageSize.coerce, [2 x double]* %19, align 8
-  store i8* %self, i8** %self.addr, align 8
-  call void @llvm.dbg.declare(metadata i8** %self.addr, metadata !46, metadata !48), !dbg !49
-  store i8* %_cmd, i8** %_cmd.addr, align 8
-  call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !50, metadata !48), !dbg !49
-  %28 = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !52
-  %29 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64), i64 3), %0, !dbg !52
-  %30 = inttoptr i64 %29 to i8*, !dbg !52
-  %31 = load i8, i8* %30, !dbg !52
-  %32 = icmp ne i8 %31, 0, !dbg !52
-  br i1 %32, label %33, label %34, !dbg !52
-
-; <label>:33:                                     ; preds = %27
-  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64)), !dbg !52
-  call void asm sideeffect "", ""(), !dbg !52
-  unreachable, !dbg !52
-
-; <label>:34:                                     ; preds = %27
-  %35 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !dbg !52, !invariant.load !2
-  %36 = bitcast %struct._class_t* %28 to i8*, !dbg !52
-  %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %36, i8* %35), !dbg !52
-  %37 = bitcast i8* %call to %0*, !dbg !52
-  %38 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64), i64 3), %0, !dbg !53
-  %39 = inttoptr i64 %38 to i8*, !dbg !53
-  %40 = load i8, i8* %39, !dbg !53
-  %41 = icmp ne i8 %40, 0, !dbg !53
-  br i1 %41, label %42, label %43, !dbg !53
-
-; <label>:42:                                     ; preds = %34
-  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64)), !dbg !53
-  call void asm sideeffect "", ""(), !dbg !53
-  unreachable, !dbg !53
-
-; <label>:43:                                     ; preds = %34
-  %44 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.2, align 8, !dbg !53, !invariant.load !2
-  %45 = bitcast %0* %37 to i8*, !dbg !53
-  %46 = bitcast %struct.CGSize* %3 to [2 x double]*, !dbg !53
-  %47 = ptrtoint [2 x double]* %46 to i64, !dbg !53
-  %48 = lshr i64 %47, 3, !dbg !53
-  %49 = add i64 %48, %0, !dbg !53
-  %50 = inttoptr i64 %49 to i16*, !dbg !53
-  %51 = load i16, i16* %50, !dbg !53
-  %52 = icmp ne i16 %51, 0, !dbg !53
-  br i1 %52, label %53, label %54, !dbg !53
-
-; <label>:53:                                     ; preds = %43
-  call void @__asan_report_load16(i64 %47), !dbg !53
-  call void asm sideeffect "", ""(), !dbg !53
-  unreachable, !dbg !53
-
-; <label>:54:                                     ; preds = %43
-  %55 = load [2 x double], [2 x double]* %46, align 8, !dbg !53
-  %call1 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, [2 x double])*)(i8* %45, i8* %44, [2 x double] %55), !dbg !53
-  %56 = bitcast i8* %call1 to %0*, !dbg !53
-  %57 = ptrtoint %0** %5 to i64, !dbg !45
-  %58 = lshr i64 %57, 3, !dbg !45
-  %59 = add i64 %58, %0, !dbg !45
-  %60 = inttoptr i64 %59 to i8*, !dbg !45
-  %61 = load i8, i8* %60, !dbg !45
-  %62 = icmp ne i8 %61, 0, !dbg !45
-  br i1 %62, label %63, label %64, !dbg !45
-
-; <label>:63:                                     ; preds = %54
-  call void @__asan_report_store8(i64 %57), !dbg !45
-  call void asm sideeffect "", ""(), !dbg !45
-  unreachable, !dbg !45
-
-; <label>:64:                                     ; preds = %54
-  store %0* %56, %0** %5, align 8, !dbg !45
-  %65 = load %0*, %0** %5, align 8, !dbg !54
-  %66 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64), i64 3), %0, !dbg !55
-  %67 = inttoptr i64 %66 to i8*, !dbg !55
-  %68 = load i8, i8* %67, !dbg !55
-  %69 = icmp ne i8 %68, 0, !dbg !55
-  br i1 %69, label %70, label %71, !dbg !55
-
-; <label>:70:                                     ; preds = %64
-  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64)), !dbg !55
-  call void asm sideeffect "", ""(), !dbg !55
-  unreachable, !dbg !55
-
-; <label>:71:                                     ; preds = %64
-  %72 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.4, align 8, !dbg !55, !invariant.load !2
-  %73 = bitcast %0* %65 to i8*, !dbg !55
-  %call2 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %73, i8* %72), !dbg !55
-  call void asm sideeffect "mov\09fp, fp\09\09; marker for objc_retainAutoreleaseReturnValue", ""(), !dbg !55
-  %74 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call2) #3, !dbg !55
-  %75 = bitcast %0** %5 to i8**, !dbg !56
-  call void @objc_storeStrong(i8** %75, i8* null) #3, !dbg !56
-  %76 = tail call i8* @objc_autoreleaseReturnValue(i8* %74) #3, !dbg !56
-  store i64 1172321806, i64* %6, !dbg !56
-  %77 = add i64 %12, 0, !dbg !56
-  %78 = inttoptr i64 %77 to i64*, !dbg !56
-  store i64 0, i64* %78, align 1, !dbg !56
-  %79 = add i64 %12, 9, !dbg !56
-  %80 = inttoptr i64 %79 to i16*, !dbg !56
-  store i16 0, i16* %80, align 1, !dbg !56
-  %81 = add i64 %12, 11, !dbg !56
-  %82 = inttoptr i64 %81 to i8*, !dbg !56
-  store i8 0, i8* %82, align 1, !dbg !56
-  ret i8* %76, !dbg !56
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-; Function Attrs: nonlazybind
-declare i8* @objc_msgSend(i8*, i8*, ...) #2
-
-declare i8* @objc_retainAutoreleasedReturnValue(i8* returned)
-
-declare void @objc_storeStrong(i8**, i8*)
-
-declare i8* @objc_autoreleaseReturnValue(i8* returned)
-
-define internal void @asan.module_ctor() {
-  call void @__asan_init()
-  call void @__asan_version_mismatch_check_v8()
-  ret void
-}
-
-declare void @__asan_init()
-
-declare void @__asan_version_mismatch_check_v8()
-
-declare void @__asan_report_load8(i64)
-
-declare void @__asan_report_load16(i64)
-
-declare void @__asan_report_store8(i64)
-
-declare void @__asan_report_store16(i64)
-
-attributes #0 = { noinline sanitize_address ssp uwtable }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nonlazybind }
-attributes #3 = { nounwind }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!5, !6, !7, !8, !9, !10, !11, !12}
-!llvm.ident = !{!13}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !1, producer: "clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
-!1 = !DIFile(filename: "m.m", directory: "/")
-!2 = !{}
-!3 = !{!4}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyObject", scope: !1, file: !1, line: 15, flags: DIFlagObjcClassComplete, elements: !2, runtimeLang: DW_LANG_ObjC)
-!5 = !{i32 1, !"Objective-C Version", i32 2}
-!6 = !{i32 1, !"Objective-C Image Info Version", i32 0}
-!7 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
-!8 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
-!9 = !{i32 1, !"Objective-C Class Properties", i32 64}
-!10 = !{i32 2, !"Dwarf Version", i32 2}
-!11 = !{i32 2, !"Debug Info Version", i32 3}
-!12 = !{i32 1, !"PIC Level", i32 2}
-!13 = !{!"clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)"}
-!14 = distinct !DISubprogram(name: "+[MyObject doWithSize:]", scope: !1, file: !1, line: 16, type: !15, isLocal: true, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
-!15 = !DISubroutineType(types: !16)
-!16 = !{!17, !24, !26, !29}
-!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "id", file: !1, baseType: !18)
-!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
-!19 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_object", file: !1, elements: !20)
-!20 = !{!21}
-!21 = !DIDerivedType(tag: DW_TAG_member, name: "isa", scope: !19, file: !1, baseType: !22, size: 64)
-!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
-!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_class", file: !1, flags: DIFlagFwdDecl)
-!24 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25, flags: DIFlagArtificial | DIFlagObjectPointer)
-!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Class", file: !1, baseType: !22)
-!26 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27, flags: DIFlagArtificial)
-!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
-!28 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !1, flags: DIFlagFwdDecl)
-!29 = !DIDerivedType(tag: DW_TAG_typedef, name: "CGSize", file: !1, line: 10, baseType: !30)
-!30 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CGSize", file: !1, line: 6, size: 128, elements: !31)
-!31 = !{!32, !34}
-!32 = !DIDerivedType(tag: DW_TAG_member, name: "width", scope: !30, file: !1, line: 7, baseType: !33, size: 64)
-!33 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
-!34 = !DIDerivedType(tag: DW_TAG_member, name: "height", scope: !30, file: !1, line: 8, baseType: !33, size: 64, offset: 64)
-!35 = !DILocation(line: 16, scope: !14)
-!36 = !DILocalVariable(name: "imageSize", arg: 3, scope: !14, file: !1, line: 16, type: !29)
-!37 = !DIExpression(DW_OP_deref)
-!38 = !DILocation(line: 16, column: 26, scope: !14)
-!39 = !DILocalVariable(name: "object", scope: !14, file: !1, line: 17, type: !40)
-!40 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !41, size: 64)
-!41 = !DICompositeType(tag: DW_TAG_structure_type, name: "Object", scope: !1, file: !1, line: 11, elements: !42, runtimeLang: DW_LANG_ObjC)
-!42 = !{!43}
-!43 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !41, baseType: !44)
-!44 = !DICompositeType(tag: DW_TAG_structure_type, name: "NSObject", scope: !1, file: !1, line: 3, elements: !2, runtimeLang: DW_LANG_ObjC)
-!45 = !DILocation(line: 17, column: 11, scope: !14)
-!46 = !DILocalVariable(name: "self", arg: 1, scope: !14, type: !47, flags: DIFlagArtificial | DIFlagObjectPointer)
-!47 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25)
-!48 = !DIExpression()
-!49 = !DILocation(line: 0, scope: !14)
-!50 = !DILocalVariable(name: "_cmd", arg: 2, scope: !14, type: !51, flags: DIFlagArtificial)
-!51 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27)
-!52 = !DILocation(line: 17, column: 21, scope: !14)
-!53 = !DILocation(line: 17, column: 20, scope: !14)
-!54 = !DILocation(line: 18, column: 11, scope: !14)
-!55 = !DILocation(line: 18, column: 10, scope: !14)
-!56 = !DILocation(line: 19, column: 1, scope: !14)
diff --git a/test/DebugInfo/AArch64/asan-stack-vars.mir b/test/DebugInfo/AArch64/asan-stack-vars.mir
new file mode 100644
index 0000000000000000000000000000000000000000..428cef6272266b288a962c986d21ee212a72a3e7
--- /dev/null
+++ b/test/DebugInfo/AArch64/asan-stack-vars.mir
@@ -0,0 +1,682 @@
+# RUN: llc -O0 -start-before=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s
+#
+# Derived from (clang -O0 -g -fsanitize=address -fobjc-arc)
+#   @protocol NSObject
+#   @end
+#   @interface NSObject<NSObject>{}
+#   + (instancetype)alloc;
+#   @end
+#   struct CGSize {
+#     double width;
+#     double height;
+#   };
+#   typedef struct CGSize CGSize;
+#   @interface Object : NSObject
+#   - (instancetype)initWithSize:(CGSize)size;
+#   - (id)aMessage;
+#   @end
+#   @implementation MyObject
+#   + (id)doWithSize:(CGSize)imageSize andObject:(id)object {
+#     return [object aMessage];
+#   }
+#   @end
+#
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_subprogram
+# CHECK-NEXT:   DW_AT_low_pc [DW_FORM_addr]     (0x0000000000000000)
+# CHECK-NEXT:   DW_AT_high_pc [DW_FORM_addr]    ([[FN_END:.*]])
+# CHECK: "_cmd"
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NEXT: DW_AT_location
+# CHECK-NEXT:   [0x{{0*}}, 0x{{.*}}):
+# CHECK-NOT:    DW_AT_
+# CHECK:        [0x{{.*}}, [[FN_END]]):
+# CHECK-NEXT: DW_AT_name {{.*}}"imageSize"
+--- |
+  ; ModuleID = 'test/DebugInfo/AArch64/asan-stack-vars.ll'
+  source_filename = "m.m"
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "arm64-apple-ios"
+  
+  %0 = type opaque
+  %struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
+  %struct._objc_cache = type opaque
+  %struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
+  %struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
+  %struct._objc_method = type { i8*, i8*, i8* }
+  %struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
+  %struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
+  %struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
+  %struct._ivar_t = type { i32*, i8*, i8*, i32, i32 }
+  %struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] }
+  %struct._prop_t = type { i8*, i8* }
+  %struct.CGSize = type { double, double }
+  
+  @"OBJC_CLASS_$_Object" = external global %struct._class_t
+  @"OBJC_CLASSLIST_REFERENCES_$_" = private global %struct._class_t* @"OBJC_CLASS_$_Object", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+  @OBJC_METH_VAR_NAME_ = private unnamed_addr constant [6 x i8] c"alloc\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+  @OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [14 x i8] c"initWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_SELECTOR_REFERENCES_.2 = private externally_initialized global i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+  @OBJC_METH_VAR_NAME_.3 = private unnamed_addr constant [9 x i8] c"aMessage\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_SELECTOR_REFERENCES_.4 = private externally_initialized global i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+  @_objc_empty_cache = external global %struct._objc_cache
+  @"OBJC_CLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* null, %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_CLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
+  @"OBJC_METACLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* @"OBJC_CLASS_$_MyObject", %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_METACLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
+  @OBJC_CLASS_NAME_ = private unnamed_addr constant [9 x i8] c"MyObject\00", section "__TEXT,__objc_classname,cstring_literals", align 1
+  @OBJC_METH_VAR_NAME_.5 = private unnamed_addr constant [12 x i8] c"doWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+  @OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [21 x i8] c"@32@0:8{CGSize=dd}16\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
+  @"\01l_OBJC_$_CLASS_METHODS_MyObject" = private global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i8*) }] }, section "__DATA, __objc_const", align 8
+  @"\01l_OBJC_METACLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 131, i32 40, i32 40, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to %struct.__method_list_t*), %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+  @"\01l_OBJC_CLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 130, i32 0, i32 0, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+  @"OBJC_LABEL_CLASS_$" = private global [1 x i8*] [i8* bitcast (%struct._class_t* @"OBJC_CLASS_$_MyObject" to i8*)], section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
+  @llvm.compiler.used = appending global [12 x i8*] [i8* bitcast (%struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_" to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_ to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.2 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.4 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to i8*), i8* bitcast ([1 x i8*]* @"OBJC_LABEL_CLASS_$" to i8*)], section "llvm.metadata"
+  @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @asan.module_ctor, i8* null }]
+  @__asan_shadow_memory_dynamic_address = external global i64
+  @___asan_gen_ = private unnamed_addr constant [34 x i8] c"2 32 16 9 imageSize 64 8 6 object\00", align 1
+  @__stack_chk_guard = external global i8*
+  
+  ; Function Attrs: noinline sanitize_address ssp uwtable
+  define internal i8* @"\01+[MyObject doWithSize:]"(i8* %self, i8* %_cmd, [2 x double] %imageSize.coerce) #0 !dbg !14 {
+  entry:
+    %StackGuardSlot = alloca i8*
+    %0 = call i8* @llvm.stackguard()
+    call void @llvm.stackprotector(i8* %0, i8** %StackGuardSlot)
+    %1 = load i64, i64* @__asan_shadow_memory_dynamic_address
+    %self.addr = alloca i8*, align 8
+    %_cmd.addr = alloca i8*, align 8
+    %MyAlloca = alloca [96 x i8], align 32, !dbg !35
+    %2 = ptrtoint [96 x i8]* %MyAlloca to i64, !dbg !35
+    %3 = add i64 %2, 32, !dbg !35
+    %4 = inttoptr i64 %3 to %struct.CGSize*, !dbg !35
+    %5 = add i64 %2, 64, !dbg !35
+    %6 = inttoptr i64 %5 to %0**, !dbg !35
+    %7 = inttoptr i64 %2 to i64*, !dbg !35
+    store i64 1102416563, i64* %7, !dbg !35
+    %8 = add i64 %2, 8, !dbg !35
+    %9 = inttoptr i64 %8 to i64*, !dbg !35
+    store i64 ptrtoint ([34 x i8]* @___asan_gen_ to i64), i64* %9, !dbg !35
+    %10 = add i64 %2, 16, !dbg !35
+    %11 = inttoptr i64 %10 to i64*, !dbg !35
+    store i64 ptrtoint (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i64), i64* %11, !dbg !35
+    %12 = lshr i64 %2, 3, !dbg !35
+    %13 = add i64 %12, %1, !dbg !35
+    %14 = add i64 %13, 0, !dbg !35
+    %15 = inttoptr i64 %14 to i64*, !dbg !35
+    store i64 -940689368107847183, i64* %15, align 1, !dbg !35
+    %16 = add i64 %13, 9, !dbg !35
+    %17 = inttoptr i64 %16 to i16*, !dbg !35
+    store i16 -3085, i16* %17, align 1, !dbg !35
+    %18 = add i64 %13, 11, !dbg !35
+    %19 = inttoptr i64 %18 to i8*, !dbg !35
+    store i8 -13, i8* %19, align 1, !dbg !35
+    call void @llvm.dbg.declare(metadata %struct.CGSize* %4, metadata !36, metadata !DIExpression(DW_OP_deref)), !dbg !37
+    call void @llvm.dbg.declare(metadata %0** %6, metadata !38, metadata !DIExpression(DW_OP_deref)), !dbg !44
+    %20 = bitcast %struct.CGSize* %4 to [2 x double]*
+    %21 = ptrtoint [2 x double]* %20 to i64
+    %22 = lshr i64 %21, 3
+    %23 = add i64 %22, %1
+    %24 = inttoptr i64 %23 to i16*
+    %25 = load i16, i16* %24
+    %26 = icmp ne i16 %25, 0
+    br i1 %26, label %27, label %28
+  
+  ; <label>:27:                                     ; preds = %entry
+    call void @__asan_report_store16(i64 %21)
+    call void asm sideeffect "", ""()
+    unreachable
+  
+  ; <label>:28:                                     ; preds = %entry
+    store [2 x double] %imageSize.coerce, [2 x double]* %20, align 8
+    store i8* %self, i8** %self.addr, align 8
+    call void @llvm.dbg.declare(metadata i8** %self.addr, metadata !45, metadata !DIExpression()), !dbg !47
+    store i8* %_cmd, i8** %_cmd.addr, align 8
+    call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !48, metadata !DIExpression()), !dbg !47
+    %29 = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !50
+    %30 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64), i64 3), %1, !dbg !50
+    %31 = inttoptr i64 %30 to i8*, !dbg !50
+    %32 = load i8, i8* %31, !dbg !50
+    %33 = icmp ne i8 %32, 0, !dbg !50
+    br i1 %33, label %34, label %35, !dbg !50
+  
+  ; <label>:34:                                     ; preds = %28
+    call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64)), !dbg !50
+    call void asm sideeffect "", ""(), !dbg !50
+    unreachable, !dbg !50
+  
+  ; <label>:35:                                     ; preds = %28
+    %36 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !dbg !50, !invariant.load !2
+    %37 = bitcast %struct._class_t* %29 to i8*, !dbg !50
+    %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %37, i8* %36), !dbg !50
+    %38 = bitcast i8* %call to %0*, !dbg !50
+    %39 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64), i64 3), %1, !dbg !51
+    %40 = inttoptr i64 %39 to i8*, !dbg !51
+    %41 = load i8, i8* %40, !dbg !51
+    %42 = icmp ne i8 %41, 0, !dbg !51
+    br i1 %42, label %43, label %44, !dbg !51
+  
+  ; <label>:43:                                     ; preds = %35
+    call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64)), !dbg !51
+    call void asm sideeffect "", ""(), !dbg !51
+    unreachable, !dbg !51
+  
+  ; <label>:44:                                     ; preds = %35
+    %45 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.2, align 8, !dbg !51, !invariant.load !2
+    %46 = bitcast %0* %38 to i8*, !dbg !51
+    %47 = bitcast %struct.CGSize* %4 to [2 x double]*, !dbg !51
+    %48 = ptrtoint [2 x double]* %47 to i64, !dbg !51
+    %49 = lshr i64 %48, 3, !dbg !51
+    %50 = add i64 %49, %1, !dbg !51
+    %51 = inttoptr i64 %50 to i16*, !dbg !51
+    %52 = load i16, i16* %51, !dbg !51
+    %53 = icmp ne i16 %52, 0, !dbg !51
+    br i1 %53, label %54, label %55, !dbg !51
+  
+  ; <label>:54:                                     ; preds = %44
+    call void @__asan_report_load16(i64 %48), !dbg !51
+    call void asm sideeffect "", ""(), !dbg !51
+    unreachable, !dbg !51
+  
+  ; <label>:55:                                     ; preds = %44
+    %56 = load [2 x double], [2 x double]* %47, align 8, !dbg !51
+    %call1 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, [2 x double])*)(i8* %46, i8* %45, [2 x double] %56), !dbg !51
+    %57 = bitcast i8* %call1 to %0*, !dbg !51
+    %58 = ptrtoint %0** %6 to i64, !dbg !44
+    %59 = lshr i64 %58, 3, !dbg !44
+    %60 = add i64 %59, %1, !dbg !44
+    %61 = inttoptr i64 %60 to i8*, !dbg !44
+    %62 = load i8, i8* %61, !dbg !44
+    %63 = icmp ne i8 %62, 0, !dbg !44
+    br i1 %63, label %64, label %65, !dbg !44
+  
+  ; <label>:64:                                     ; preds = %55
+    call void @__asan_report_store8(i64 %58), !dbg !44
+    call void asm sideeffect "", ""(), !dbg !44
+    unreachable, !dbg !44
+  
+  ; <label>:65:                                     ; preds = %55
+    store %0* %57, %0** %6, align 8, !dbg !44
+    %66 = load %0*, %0** %6, align 8, !dbg !52
+    %67 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64), i64 3), %1, !dbg !53
+    %68 = inttoptr i64 %67 to i8*, !dbg !53
+    %69 = load i8, i8* %68, !dbg !53
+    %70 = icmp ne i8 %69, 0, !dbg !53
+    br i1 %70, label %71, label %72, !dbg !53
+  
+  ; <label>:71:                                     ; preds = %65
+    call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64)), !dbg !53
+    call void asm sideeffect "", ""(), !dbg !53
+    unreachable, !dbg !53
+  
+  ; <label>:72:                                     ; preds = %65
+    %73 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.4, align 8, !dbg !53, !invariant.load !2
+    %74 = bitcast %0* %66 to i8*, !dbg !53
+    %call2 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %74, i8* %73), !dbg !53
+    call void asm sideeffect "mov\09fp, fp\09\09; marker for objc_retainAutoreleaseReturnValue", ""(), !dbg !53
+    %75 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call2) #3, !dbg !53
+    %76 = bitcast %0** %6 to i8**, !dbg !54
+    call void @objc_storeStrong(i8** %76, i8* null) #3, !dbg !54
+    %77 = tail call i8* @objc_autoreleaseReturnValue(i8* %75) #3, !dbg !54
+    store i64 1172321806, i64* %7, !dbg !54
+    %78 = add i64 %13, 0, !dbg !54
+    %79 = inttoptr i64 %78 to i64*, !dbg !54
+    store i64 0, i64* %79, align 1, !dbg !54
+    %80 = add i64 %13, 9, !dbg !54
+    %81 = inttoptr i64 %80 to i16*, !dbg !54
+    store i16 0, i16* %81, align 1, !dbg !54
+    %82 = add i64 %13, 11, !dbg !54
+    %83 = inttoptr i64 %82 to i8*, !dbg !54
+    store i8 0, i8* %83, align 1, !dbg !54
+    %84 = call i8* @llvm.stackguard()
+    %85 = load volatile i8*, i8** %StackGuardSlot
+    %86 = icmp eq i8* %84, %85
+    br i1 %86, label %SP_return, label %CallStackCheckFailBlk, !prof !55
+  
+  SP_return:                                        ; preds = %72
+    ret i8* %77, !dbg !54
+  
+  CallStackCheckFailBlk:                            ; preds = %72
+    call void @__stack_chk_fail(), !dbg !47
+    unreachable, !dbg !47
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  ; Function Attrs: nonlazybind
+  declare i8* @objc_msgSend(i8*, i8*, ...) #2
+  
+  declare i8* @objc_retainAutoreleasedReturnValue(i8* returned)
+  
+  declare void @objc_storeStrong(i8**, i8*)
+  
+  declare i8* @objc_autoreleaseReturnValue(i8* returned)
+  
+  define internal void @asan.module_ctor() {
+    call void @__asan_init()
+    call void @__asan_version_mismatch_check_v8()
+    ret void
+  }
+  
+  declare void @__asan_init()
+  
+  declare void @__asan_version_mismatch_check_v8()
+  
+  declare void @__asan_report_load8(i64)
+  
+  declare void @__asan_report_load16(i64)
+  
+  declare void @__asan_report_store8(i64)
+  
+  declare void @__asan_report_store16(i64)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+  
+  ; Function Attrs: nounwind
+  declare i8* @llvm.stackguard() #3
+  
+  declare void @__stack_chk_fail()
+  
+  attributes #0 = { noinline sanitize_address ssp uwtable }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { nonlazybind }
+  attributes #3 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!5, !6, !7, !8, !9, !10, !11, !12}
+  !llvm.ident = !{!13}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !1, producer: "clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+  !1 = !DIFile(filename: "m.m", directory: "/")
+  !2 = !{}
+  !3 = !{!4}
+  !4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyObject", scope: !1, file: !1, line: 15, flags: DIFlagObjcClassComplete, elements: !2, runtimeLang: DW_LANG_ObjC)
+  !5 = !{i32 1, !"Objective-C Version", i32 2}
+  !6 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+  !7 = !{i32 1, !"Objective-C Image Info Section", !"__DATA,__objc_imageinfo,regular,no_dead_strip"}
+  !8 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+  !9 = !{i32 1, !"Objective-C Class Properties", i32 64}
+  !10 = !{i32 2, !"Dwarf Version", i32 2}
+  !11 = !{i32 2, !"Debug Info Version", i32 3}
+  !12 = !{i32 7, !"PIC Level", i32 2}
+  !13 = !{!"clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)"}
+  !14 = distinct !DISubprogram(name: "+[MyObject doWithSize:]", scope: !1, file: !1, line: 16, type: !15, isLocal: true, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !15 = !DISubroutineType(types: !16)
+  !16 = !{!17, !24, !26, !29}
+  !17 = !DIDerivedType(tag: DW_TAG_typedef, name: "id", file: !1, baseType: !18)
+  !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+  !19 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_object", file: !1, elements: !20)
+  !20 = !{!21}
+  !21 = !DIDerivedType(tag: DW_TAG_member, name: "isa", scope: !19, file: !1, baseType: !22, size: 64)
+  !22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
+  !23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_class", file: !1, flags: DIFlagFwdDecl)
+  !24 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Class", file: !1, baseType: !22)
+  !26 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27, flags: DIFlagArtificial)
+  !27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
+  !28 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !1, flags: DIFlagFwdDecl)
+  !29 = !DIDerivedType(tag: DW_TAG_typedef, name: "CGSize", file: !1, line: 10, baseType: !30)
+  !30 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CGSize", file: !1, line: 6, size: 128, elements: !31)
+  !31 = !{!32, !34}
+  !32 = !DIDerivedType(tag: DW_TAG_member, name: "width", scope: !30, file: !1, line: 7, baseType: !33, size: 64)
+  !33 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+  !34 = !DIDerivedType(tag: DW_TAG_member, name: "height", scope: !30, file: !1, line: 8, baseType: !33, size: 64, offset: 64)
+  !35 = !DILocation(line: 16, scope: !14)
+  !36 = !DILocalVariable(name: "imageSize", arg: 3, scope: !14, file: !1, line: 16, type: !29)
+  !37 = !DILocation(line: 16, column: 26, scope: !14)
+  !38 = !DILocalVariable(name: "object", scope: !14, file: !1, line: 17, type: !39)
+  !39 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !40, size: 64)
+  !40 = !DICompositeType(tag: DW_TAG_structure_type, name: "Object", scope: !1, file: !1, line: 11, elements: !41, runtimeLang: DW_LANG_ObjC)
+  !41 = !{!42}
+  !42 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !40, baseType: !43)
+  !43 = !DICompositeType(tag: DW_TAG_structure_type, name: "NSObject", scope: !1, file: !1, line: 3, elements: !2, runtimeLang: DW_LANG_ObjC)
+  !44 = !DILocation(line: 17, column: 11, scope: !14)
+  !45 = !DILocalVariable(name: "self", arg: 1, scope: !14, type: !46, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !46 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25)
+  !47 = !DILocation(line: 0, scope: !14)
+  !48 = !DILocalVariable(name: "_cmd", arg: 2, scope: !14, type: !49, flags: DIFlagArtificial)
+  !49 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27)
+  !50 = !DILocation(line: 17, column: 21, scope: !14)
+  !51 = !DILocation(line: 17, column: 20, scope: !14)
+  !52 = !DILocation(line: 18, column: 11, scope: !14)
+  !53 = !DILocation(line: 18, column: 10, scope: !14)
+  !54 = !DILocation(line: 19, column: 1, scope: !14)
+  !55 = !{!"branch_weights", i32 2147481600, i32 2048}
+
+...
+---
+name:            "\x01+[MyObject doWithSize:]"
+alignment:       2
+tracksRegLiveness: true
+liveins:         
+  - { reg: '$x0' }
+  - { reg: '$x1' }
+  - { reg: '$d0' }
+  - { reg: '$d1' }
+frameInfo:       
+  stackSize:       352
+  maxAlignment:    32
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  '%stack.0.StackGuardSlot'
+  maxCallFrameSize: 0
+  localFrameSize:  144
+stack:           
+  - { id: 0, name: StackGuardSlot, offset: -40, size: 8, alignment: 8, 
+      stack-id: 0, local-offset: -8 }
+  - { id: 1, name: self.addr, offset: -168, size: 8, alignment: 8, stack-id: 0, 
+      local-offset: -136, debug-info-variable: '!45', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!47' }
+  - { id: 2, name: _cmd.addr, offset: -176, size: 8, alignment: 8, stack-id: 0, 
+      local-offset: -144, debug-info-variable: '!48', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!47' }
+  - { id: 3, name: MyAlloca, offset: -160, size: 96, alignment: 32, stack-id: 0, 
+      local-offset: -128 }
+  - { id: 4, type: spill-slot, offset: -184, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 5, type: spill-slot, offset: -192, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 6, type: spill-slot, offset: -200, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 7, type: spill-slot, offset: -208, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 8, type: spill-slot, offset: -216, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 9, type: spill-slot, offset: -224, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 10, type: spill-slot, offset: -232, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 11, type: spill-slot, offset: -240, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 12, type: spill-slot, offset: -248, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 13, type: spill-slot, offset: -256, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 14, type: spill-slot, offset: -264, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 15, type: spill-slot, offset: -272, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 16, type: spill-slot, offset: -280, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 17, type: spill-slot, offset: -288, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 18, type: spill-slot, offset: -296, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 19, type: spill-slot, offset: -304, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 20, type: spill-slot, offset: -312, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 21, type: spill-slot, offset: -320, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 22, type: spill-slot, offset: -328, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 23, type: spill-slot, offset: -336, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 24, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$lr' }
+  - { id: 25, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$fp' }
+  - { id: 26, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$x27' }
+  - { id: 27, type: spill-slot, offset: -32, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$x28' }
+body:             |
+  bb.0.entry:
+    liveins: $x0, $x1, $d0, $d1, $x27, $x28, $lr
+  
+    early-clobber $sp = frame-setup STPXpre killed $x28, killed $x27, $sp, -4 :: (store 8 into %stack.27), (store 8 into %stack.26)
+    frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.25), (store 8 into %stack.24)
+    $fp = frame-setup ADDXri $sp, 16, 0
+    $x9 = frame-setup SUBXri $sp, 320, 0
+    $sp = ANDXri killed $x9, 7930
+    frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8
+    frame-setup CFI_INSTRUCTION offset $w29, -16
+    frame-setup CFI_INSTRUCTION offset $w27, -24
+    frame-setup CFI_INSTRUCTION offset $w28, -32
+    renamable $x8 = ADRP target-flags(aarch64-page) @"\01+[MyObject doWithSize:]"
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @"\01+[MyObject doWithSize:]", 0
+    renamable $x9 = ADRP target-flags(aarch64-page) @___asan_gen_
+    renamable $x9 = ADDXri killed renamable $x9, target-flags(aarch64-pageoff, aarch64-nc) @___asan_gen_, 0
+    $x10 = ADDXri $sp, 192, 0
+    renamable $x11 = ADRP target-flags(aarch64-page, aarch64-got) @__asan_shadow_memory_dynamic_address
+    renamable $x11 = LDRXui killed renamable $x11, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__asan_shadow_memory_dynamic_address
+    $x12 = ADRP target-flags(aarch64-page, aarch64-got) @__stack_chk_guard
+    $x12 = LDRXui $x12, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__stack_chk_guard
+    $x12 = LDRXui killed $x12, 0 :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+    $x13 = ADRP target-flags(aarch64-page, aarch64-got) @__stack_chk_guard
+    $x13 = LDRXui $x13, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__stack_chk_guard
+    $x13 = LDRXui killed $x13, 0 :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+    STRXui killed renamable $x13, $sp, 39 :: (volatile store 8 into %stack.0.StackGuardSlot)
+    renamable $x11 = LDRXui killed renamable $x11, 0 :: (load 8 from @__asan_shadow_memory_dynamic_address)
+    renamable $x13 = ADDXri renamable $x10, 32, 0, debug-location !35
+    renamable $x14 = ADDXri renamable $x10, 64, 0, debug-location !35
+    $x15 = MOVZXi 35507, 0, debug-location !35
+    $x15 = MOVKXi $x15, 16821, 16, debug-location !35
+    STRXui killed renamable $x15, $sp, 24, debug-location !35 :: (store 96 into %stack.3.MyAlloca, align 32)
+    STRXui killed renamable $x9, $sp, 25, debug-location !35 :: (store 96 into %stack.3.MyAlloca + 1, align 32)
+    STRXui killed renamable $x8, $sp, 26, debug-location !35 :: (store 96 into %stack.3.MyAlloca + 2, align 32)
+    renamable $x8 = UBFMXri renamable $x10, 3, 63, debug-location !35
+    renamable $x9 = ADDXrs renamable $x11, renamable $x10, 67, debug-location !35
+    $x15 = MOVZXi 61937, 0, debug-location !35
+    $x15 = MOVKXi $x15, 61937, 16, debug-location !35
+    $x15 = MOVKXi $x15, 62194, 48, debug-location !35
+    STRXroX killed renamable $x15, renamable $x8, renamable $x11, 0, 0, debug-location !35 :: (store 8 into %ir.15, align 1)
+    renamable $x15 = ADDXrs renamable $x8, renamable $x11, 0, debug-location !35
+    $w16 = MOVZWi 62451, 0, debug-location !35
+    STURHHi killed renamable $w16, killed renamable $x15, 9, debug-location !35 :: (store 2 into %ir.17, align 1)
+    renamable $x8 = ADDXrs killed renamable $x8, renamable $x11, 0, debug-location !35
+    $w16 = MOVZWi 243, 0, debug-location !35
+    STRBBui killed renamable $w16, killed renamable $x8, 11, debug-location !35 :: (store 1 into %ir.19)
+    DBG_VALUE renamable $x13, 0, !36, !DIExpression(DW_OP_deref), debug-location !37
+    DBG_VALUE renamable $x14, 0, !38, !DIExpression(DW_OP_deref), debug-location !44
+    $x8 = ORRXrs $xzr, $x13, 0
+    renamable $x15 = UBFMXri renamable $x8, 3, 63
+    renamable $w16 = LDRHHroX killed renamable $x15, renamable $x11, 0, 0 :: (load 2 from %ir.24)
+    renamable $w16 = UBFMWri killed renamable $w16, 0, 15
+    STRXui killed $x1, $sp, 21 :: (store 8 into %stack.4)
+    STRDui killed $d1, $sp, 20 :: (store 8 into %stack.5)
+    STRDui killed $d0, $sp, 19 :: (store 8 into %stack.6)
+    STRXui killed $x0, $sp, 18 :: (store 8 into %stack.7)
+    STRXui killed $x13, $sp, 17 :: (store 8 into %stack.8)
+    DBG_VALUE $sp, 0, !36, !DIExpression(DW_OP_plus_uconst, 136, DW_OP_deref, DW_OP_deref), debug-location !37
+    STRXui killed $x14, $sp, 16 :: (store 8 into %stack.9)
+    DBG_VALUE $sp, 0, !38, !DIExpression(DW_OP_plus_uconst, 128, DW_OP_deref, DW_OP_deref), debug-location !44
+    STRXui killed $x10, $sp, 15 :: (store 8 into %stack.10)
+    STRXui killed $x12, $sp, 14 :: (store 8 into %stack.11)
+    STRXui killed $x11, $sp, 13 :: (store 8 into %stack.12)
+    STRXui killed $x9, $sp, 12 :: (store 8 into %stack.13)
+    STRXui killed $x8, $sp, 11 :: (store 8 into %stack.14)
+    CBZW killed renamable $w16, %bb.2
+  
+  bb.1 (%ir-block.27):
+    successors: 
+  
+    $x0 = LDRXui $sp, 11 :: (load 8 from %stack.14)
+    BL @__asan_report_store16, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp
+    INLINEASM &"", 1
+    BRK 1
+  
+  bb.2 (%ir-block.28):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_, 0
+    renamable $x9 = ORRXri $xzr, 4097
+    renamable $x8 = LSRVXr killed renamable $x8, killed renamable $x9
+    renamable $x9 = ADRP target-flags(aarch64-page) @"OBJC_CLASSLIST_REFERENCES_$_"
+    renamable $x9 = ADDXri killed renamable $x9, target-flags(aarch64-pageoff, aarch64-nc) @"OBJC_CLASSLIST_REFERENCES_$_", 0
+    $d0 = LDRDui $sp, 19 :: (load 8 from %stack.6)
+    $x10 = LDRXui $sp, 11 :: (load 8 from %stack.14)
+    STRDui killed renamable $d0, renamable $x10, 0 :: (store 8 into %ir.20)
+    $d1 = LDRDui $sp, 20 :: (load 8 from %stack.5)
+    STRDui killed renamable $d1, killed renamable $x10, 1 :: (store 8 into %ir.20 + 8)
+    $x11 = LDRXui $sp, 18 :: (load 8 from %stack.7)
+    STRXui killed renamable $x11, $sp, 23 :: (store 8 into %stack.1.self.addr)
+    $x12 = LDRXui $sp, 21 :: (load 8 from %stack.4)
+    STRXui killed renamable $x12, $sp, 22 :: (store 8 into %stack.2._cmd.addr)
+    renamable $x9 = LDRXui killed renamable $x9, 0, debug-location !50 :: (load 8 from @"OBJC_CLASSLIST_REFERENCES_$_")
+    $x13 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w14 = LDRBBroX killed renamable $x8, killed renamable $x13, 0, 0, debug-location !50 :: (load 1 from %ir.31)
+    renamable $w14 = UBFMWri killed renamable $w14, 0, 7, debug-location !50
+    STRXui killed $x9, $sp, 10 :: (store 8 into %stack.15)
+    CBZW killed renamable $w14, %bb.4, debug-location !50
+  
+  bb.3 (%ir-block.34):
+    successors: 
+  
+    $x0 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_, debug-location !50
+    renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_, 0, debug-location !50
+    BL @__asan_report_load8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !50
+    INLINEASM &"", 1, debug-location !50
+    BRK 1, debug-location !50
+  
+  bb.4 (%ir-block.35):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_, 0
+    renamable $x1 = LDRXui killed renamable $x8, 0, debug-location !50 :: (invariant load 8 from @OBJC_SELECTOR_REFERENCES_)
+    $x8 = LDRXui $sp, 10 :: (load 8 from %stack.15)
+    $x0 = ORRXrs $xzr, killed $x8, 0, debug-location !50
+    BL @objc_msgSend, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit-def $x0, debug-location !50
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.2
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.2, 0
+    renamable $x1 = ORRXri $xzr, 4097
+    renamable $x8 = LSRVXr killed renamable $x8, killed renamable $x1
+    $x1 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w9 = LDRBBroX killed renamable $x8, killed renamable $x1, 0, 0, debug-location !51 :: (load 1 from %ir.40)
+    renamable $w9 = UBFMWri killed renamable $w9, 0, 7, debug-location !51
+    STRXui killed $x0, $sp, 9 :: (store 8 into %stack.16)
+    CBZW killed renamable $w9, %bb.6, debug-location !51
+  
+  bb.5 (%ir-block.43):
+    successors: 
+  
+    $x0 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.2, debug-location !51
+    renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.2, 0, debug-location !51
+    BL @__asan_report_load8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !51
+    INLINEASM &"", 1, debug-location !51
+    BRK 1, debug-location !51
+  
+  bb.6 (%ir-block.44):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.2
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.2, 0
+    renamable $x1 = LDRXui killed renamable $x8, 0, debug-location !51 :: (invariant load 8 from @OBJC_SELECTOR_REFERENCES_.2)
+    $x8 = LDRXui $sp, 9 :: (load 8 from %stack.16)
+    $x9 = LDRXui $sp, 17 :: (load 8 from %stack.8)
+    renamable $x10 = UBFMXri renamable $x9, 3, 63, debug-location !51
+    $x11 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w12 = LDRHHroX killed renamable $x10, killed renamable $x11, 0, 0, debug-location !51 :: (load 2 from %ir.51)
+    renamable $w12 = UBFMWri killed renamable $w12, 0, 15, debug-location !51
+    STRXui killed $x1, $sp, 8 :: (store 8 into %stack.17)
+    STRXui killed $x8, $sp, 7 :: (store 8 into %stack.18)
+    STRXui killed $x9, $sp, 6 :: (store 8 into %stack.19)
+    CBZW killed renamable $w12, %bb.8, debug-location !51
+  
+  bb.7 (%ir-block.54):
+    successors: 
+  
+    $x0 = LDRXui $sp, 6 :: (load 8 from %stack.19)
+    BL @__asan_report_load16, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !51
+    INLINEASM &"", 1, debug-location !51
+    BRK 1, debug-location !51
+  
+  bb.8 (%ir-block.55):
+    $x8 = LDRXui $sp, 6 :: (load 8 from %stack.19)
+    renamable $d1 = LDRDui renamable $x8, 1, debug-location !51 :: (load 8 from %ir.47 + 8)
+    renamable $d0 = LDRDui killed renamable $x8, 0, debug-location !51 :: (load 8 from %ir.47)
+    $x0 = LDRXui $sp, 7 :: (load 8 from %stack.18)
+    $x1 = LDRXui $sp, 8 :: (load 8 from %stack.17)
+    BL @objc_msgSend, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit killed $d0, implicit killed $d1, implicit-def $sp, implicit-def $x0, debug-location !51
+    $x8 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    renamable $x1 = UBFMXri killed renamable $x8, 3, 63, debug-location !44
+    $lr = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w9 = LDRBBroX killed renamable $x1, killed renamable $lr, 0, 0, debug-location !44 :: (load 1 from %ir.61)
+    renamable $w9 = UBFMWri killed renamable $w9, 0, 7, debug-location !44
+    STRXui killed $x0, $sp, 5 :: (store 8 into %stack.20)
+    CBZW killed renamable $w9, %bb.10, debug-location !44
+  
+  bb.9 (%ir-block.64):
+    successors: 
+  
+    $x0 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    BL @__asan_report_store8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !44
+    INLINEASM &"", 1, debug-location !44
+    BRK 1, debug-location !44
+  
+  bb.10 (%ir-block.65):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.4
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.4, 0
+    renamable $x9 = ORRXri $xzr, 4097
+    renamable $x8 = LSRVXr killed renamable $x8, killed renamable $x9
+    $x9 = LDRXui $sp, 5 :: (load 8 from %stack.20)
+    $x10 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    STRXui killed renamable $x9, renamable $x10, 0, debug-location !44 :: (store 8 into %ir.6)
+    renamable $x11 = LDRXui killed renamable $x10, 0, debug-location !52 :: (load 8 from %ir.6)
+    $x12 = LDRXui $sp, 13 :: (load 8 from %stack.12)
+    renamable $w13 = LDRBBroX killed renamable $x8, killed renamable $x12, 0, 0, debug-location !53 :: (load 1 from %ir.68)
+    renamable $w13 = UBFMWri killed renamable $w13, 0, 7, debug-location !53
+    STRXui killed $x11, $sp, 4 :: (store 8 into %stack.21)
+    CBZW killed renamable $w13, %bb.12, debug-location !53
+  
+  bb.11 (%ir-block.71):
+    successors: 
+  
+    $x0 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.4, debug-location !53
+    renamable $x0 = ADDXri $x0, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.4, 0, debug-location !53
+    BL @__asan_report_load8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $sp, debug-location !53
+    INLINEASM &"", 1, debug-location !53
+    BRK 1, debug-location !53
+  
+  bb.12 (%ir-block.72):
+    renamable $x8 = ADRP target-flags(aarch64-page) @OBJC_SELECTOR_REFERENCES_.4
+    renamable $x8 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @OBJC_SELECTOR_REFERENCES_.4, 0
+    renamable $x1 = LDRXui killed renamable $x8, 0, debug-location !53 :: (invariant load 8 from @OBJC_SELECTOR_REFERENCES_.4)
+    $x8 = LDRXui $sp, 4 :: (load 8 from %stack.21)
+    $x0 = ORRXrs $xzr, killed $x8, 0, debug-location !53
+    BL @objc_msgSend, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, implicit-def $x0, debug-location !53
+    INLINEASM &"mov\09fp, fp\09\09; marker for objc_retainAutoreleaseReturnValue", 1, debug-location !53
+    BL @objc_retainAutoreleasedReturnValue, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $x0, debug-location !53
+    $x8 = LDRXui $sp, 16 :: (load 8 from %stack.9)
+    STRXui killed $x0, $sp, 3 :: (store 8 into %stack.22)
+    $x0 = ORRXrs $xzr, killed $x8, 0, debug-location !54
+    $x8 = ORRXrs $xzr, killed $xzr, 0, debug-location !54
+    $x1 = ORRXrs $xzr, killed $x8, 0, debug-location !54
+    BL @objc_storeStrong, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit killed $x1, debug-location !54
+    $x0 = LDRXui $sp, 3 :: (load 8 from %stack.22)
+    BL @objc_autoreleaseReturnValue, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, implicit-def $x0, debug-location !54
+    $x8 = MOVZXi 13838, 0, debug-location !54
+    $x8 = MOVKXi $x8, 17888, 16, debug-location !54
+    $x1 = LDRXui $sp, 15 :: (load 8 from %stack.10)
+    STRXui killed renamable $x8, killed renamable $x1, 0, debug-location !54 :: (store 8 into %ir.7)
+    $x8 = LDRXui $sp, 12 :: (load 8 from %stack.13)
+    STRXui $xzr, renamable $x8, 0, debug-location !54 :: (store 8 into %ir.79, align 1)
+    STURHHi $wzr, renamable $x8, 9, debug-location !54 :: (store 2 into %ir.81, align 1)
+    STRBBui $wzr, killed renamable $x8, 11, debug-location !54 :: (store 1 into %ir.83)
+    $lr = ADRP target-flags(aarch64-page, aarch64-got) @__stack_chk_guard
+    $lr = LDRXui $lr, target-flags(aarch64-pageoff, aarch64-got, aarch64-nc) @__stack_chk_guard
+    $lr = LDRXui killed $lr, 0 :: (dereferenceable invariant load 8 from @__stack_chk_guard)
+    renamable $x9 = LDRXui $sp, 39 :: (load 8 from %stack.0.StackGuardSlot)
+    $xzr = SUBSXrs killed renamable $lr, killed renamable $x9, 0, implicit-def $nzcv, implicit-def $nzcv
+    STRXui killed $x0, $sp, 2 :: (store 8 into %stack.23)
+    Bcc 1, %bb.14, implicit $nzcv
+  
+  bb.13.SP_return:
+    $x0 = LDRXui $sp, 2 :: (load 8 from %stack.23)
+    $sp = frame-destroy SUBXri $fp, 16, 0, debug-location !54
+    $fp, $lr = frame-destroy LDPXi $sp, 2, debug-location !54 :: (load 8 from %stack.25), (load 8 from %stack.24)
+    early-clobber $sp, $x28, $x27 = frame-destroy LDPXpost $sp, 4, debug-location !54 :: (load 8 from %stack.27), (load 8 from %stack.26)
+    RET undef $lr, implicit killed $x0, debug-location !54
+  
+  bb.14.CallStackCheckFailBlk:
+    BL @__stack_chk_fail, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, debug-location !47
+    BRK 1, debug-location !47
+
+...
+---
+name:            asan.module_ctor
+alignment:       2
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       16
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$lr' }
+  - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$fp' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $lr
+  
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store 8 into %stack.1), (store 8 into %stack.0)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8
+    frame-setup CFI_INSTRUCTION offset $w29, -16
+    BL @__asan_init, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp
+    BL @__asan_version_mismatch_check_v8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load 8 from %stack.1), (load 8 from %stack.0)
+    RET undef $lr
+
+...
diff --git a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
deleted file mode 100644
index 0606ddf60872f7406a179627428c2a9c980e86b2..0000000000000000000000000000000000000000
--- a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.ll
+++ /dev/null
@@ -1,66 +0,0 @@
-; RUN: llc -O0 -regalloc=fast -stop-after=livedebugvalues -o - < %s | \
-; RUN:   FileCheck %s -implicit-check-not=DBG_VALUE
-
-target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-target triple = "arm64-apple-ios12.1.0"
-
-declare void @use(i32 %x)
-
-define void @f1(i32 %x) !dbg !6 {
-; CHECK-LABEL: name: f1
-entry:
-; CHECK-LABEL: bb.0.entry:
-  %var = add i32 %x, 1, !dbg !12
-  call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !12
-; CHECK: DBG_VALUE debug-use renamable $w0, debug-use $noreg, !9, !DIExpression(), debug-location !12
-; CHECK-NEXT: STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
-; CHECK-NEXT: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  br label %artificial-bb-1, !dbg !13
-
-artificial-bb-1:                                  ; preds = %entry
-; CHECK-LABEL: bb.1.artificial-bb-1:
-; CHECK: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  br label %artificial-bb-2
-
-artificial-bb-2:                                  ; preds = %artificial-bb-1
-; CHECK-LABEL: bb.2.artificial-bb-2:
-; CHECK: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  %invisible = add i32 %var, 1
-  br label %return, !dbg !14
-
-return:                                           ; preds = %artificial-bb-2
-; CHECK-LABEL: bb.3.return:
-; CHECK: DBG_VALUE debug-use $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
-
-  call void @use(i32 %var)
-  ret void, !dbg !15
-}
-
-; Function Attrs: nounwind readnone speculatable
-declare void @llvm.dbg.value(metadata, metadata, metadata) #0
-
-attributes #0 = { nounwind readnone speculatable }
-
-!llvm.dbg.cu = !{!0}
-!llvm.debugify = !{!3, !4}
-!llvm.module.flags = !{!5}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "compiler-gen-bbs-livedebugvalues.ll", directory: "/")
-!2 = !{}
-!3 = !{i32 6}
-!4 = !{i32 2}
-!5 = !{i32 2, !"Debug Info Version", i32 3}
-!6 = distinct !DISubprogram(name: "f1", linkageName: "f1", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
-!7 = !DISubroutineType(types: !2)
-!8 = !{!9, !11}
-!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
-!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
-!11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 4, type: !10)
-!12 = !DILocation(line: 1, column: 1, scope: !6)
-!13 = !DILocation(line: 2, column: 1, scope: !6)
-!14 = !DILocation(line: 0, column: 1, scope: !6)
-!15 = !DILocation(line: 4, column: 1, scope: !6)
diff --git a/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
new file mode 100644
index 0000000000000000000000000000000000000000..79f2ac77c11a2afe8ca46d9afa8e8abf3bae2a92
--- /dev/null
+++ b/test/DebugInfo/AArch64/compiler-gen-bbs-livedebugvalues.mir
@@ -0,0 +1,110 @@
+# RUN: llc -o - %s -O0 -regalloc=fast -run-pass=livedebugvalues | \
+# RUN:   FileCheck %s -implicit-check-not=DBG_VALUE
+--- |
+  target triple = "arm64-apple-ios12.1.0"
+  
+  declare void @use(i32)
+  
+  define void @f1(i32 %x) !dbg !6 {
+  entry:
+    %var = add i32 %x, 1, !dbg !12
+    call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !12
+    br label %artificial-bb-1, !dbg !13
+  
+  artificial-bb-1:
+    br label %artificial-bb-2
+  
+  artificial-bb-2:
+    %invisible = add i32 %var, 1
+    br label %return, !dbg !14
+  
+  return:
+    call void @use(i32 %var)
+    ret void, !dbg !15
+  }
+  
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #0
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { nounwind readnone speculatable }
+  attributes #1 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.debugify = !{!3, !4}
+  !llvm.module.flags = !{!5}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "compiler-gen-bbs-livedebugvalues.ll", directory: "/")
+  !2 = !{}
+  !3 = !{i32 6}
+  !4 = !{i32 2}
+  !5 = !{i32 2, !"Debug Info Version", i32 3}
+  !6 = distinct !DISubprogram(name: "f1", linkageName: "f1", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+  !7 = !DISubroutineType(types: !2)
+  !8 = !{!9, !11}
+  !9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+  !10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+  !11 = !DILocalVariable(name: "2", scope: !6, file: !1, line: 4, type: !10)
+  !12 = !DILocation(line: 1, column: 1, scope: !6)
+  !13 = !DILocation(line: 2, column: 1, scope: !6)
+  !14 = !DILocation(line: 0, column: 1, scope: !6)
+  !15 = !DILocation(line: 4, column: 1, scope: !6)
+
+...
+---
+# CHECK-LABEL: name: f1
+name:            f1
+alignment:       2
+legalized:       true
+regBankSelected: true
+selected:        true
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       32
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -20, size: 4, alignment: 4, stack-id: 0 }
+  - { id: 1, type: spill-slot, offset: -8, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$lr' }
+  - { id: 2, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0, 
+      callee-saved-register: '$fp' }
+body:             |
+  ; CHECK-LABEL: bb.0.entry:
+  bb.0.entry:
+    liveins: $w0, $lr
+  
+    $sp = frame-setup SUBXri $sp, 32, 0
+    frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.2), (store 8 into %stack.1)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 32
+    frame-setup CFI_INSTRUCTION offset $w30, -8, debug-location !12
+    frame-setup CFI_INSTRUCTION offset $w29, -16, debug-location !12
+    renamable $w0 = ADDWri killed renamable $w0, 1, 0, debug-location !12
+    DBG_VALUE renamable $w0, $noreg, !9, !DIExpression(), debug-location !12
+    STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
+    DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12), debug-location !12
+
+    ; CHECK: DBG_VALUE renamable $w0, $noreg, !9, !DIExpression(), debug-location !12
+    ; CHECK-NEXT: STRWui killed $w0, $sp, 3 :: (store 4 into %stack.0)
+    ; CHECK-NEXT: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+  
+  bb.1.artificial-bb-1:
+    ; CHECK-LABEL: bb.1.artificial-bb-1:
+    ; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+      
+  bb.2.artificial-bb-2:
+    ; CHECK-LABEL: bb.2.artificial-bb-2:
+    ; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+  
+  bb.3.return:
+    ; CHECK-LABEL: bb.3.return:
+    ; CHECK: DBG_VALUE $sp, 0, !9, !DIExpression(DW_OP_plus_uconst, 12)
+
+    $w0 = LDRWui $sp, 3 :: (load 4 from %stack.0)
+    BL @use, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit killed $w0
+    $fp, $lr = frame-destroy LDPXi $sp, 2, debug-location !15 :: (load 8 from %stack.2), (load 8 from %stack.1)
+    $sp = frame-destroy ADDXri $sp, 32, 0, debug-location !15
+    RET undef $lr, debug-location !15
+...
diff --git a/test/DebugInfo/AArch64/line-header.ll b/test/DebugInfo/AArch64/line-header.ll
index 2ac94728b867d9669ccd64d17606fb6c0177f7fb..1d9156debf1c51785f0ef6d8be0ff841b52e3086 100644
--- a/test/DebugInfo/AArch64/line-header.ll
+++ b/test/DebugInfo/AArch64/line-header.ll
@@ -3,4 +3,4 @@
 
 ; check line table length is correctly calculated for both big and little endian
 CHECK-LABEL: .debug_line contents:
-CHECK: total_length: 0x0000003c
+CHECK: total_length: 0x0000003f
diff --git a/test/DebugInfo/ARM/cfi-eof-prologue.ll b/test/DebugInfo/ARM/cfi-eof-prologue.ll
deleted file mode 100644
index f7ee9a23beebd2facb33bd9134a2dea4cd526b8e..0000000000000000000000000000000000000000
--- a/test/DebugInfo/ARM/cfi-eof-prologue.ll
+++ /dev/null
@@ -1,114 +0,0 @@
-; struct A {
-;   A();
-;   virtual ~A();
-; };
-; struct B : A {
-;   B();
-;   virtual ~B();
-; };
-; B::B() {}
-; CHECK: __ZN1BC1Ev:
-; CHECK:     .loc	1 [[@LINE-2]] 0 prologue_end
-; CHECK-NOT: .loc	1 0 0 prologue_end
-
-; The location of the prologue_end marker should not be affected by the presence
-; of CFI instructions.
-
-; RUN: llc -O0 -filetype=asm -mtriple=thumbv7-apple-ios < %s | FileCheck %s
-; RUN: llc -O0 -filetype=asm -mtriple=thumbv6-apple-ios < %s | FileCheck %s
-
-; ModuleID = 'test1.cpp'
-target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
-target triple = "thumbv7-apple-ios"
-
-%struct.B = type { %struct.A }
-%struct.A = type { i32 (...)** }
-
-@_ZTV1B = external unnamed_addr constant [4 x i8*]
-
-; Function Attrs: nounwind
-define %struct.B* @_ZN1BC2Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !28 {
-entry:
-  tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !30, metadata !40), !dbg !41
-  %0 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, !dbg !42
-  %call = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !42
-  %1 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, i32 0, !dbg !42
-  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !42, !tbaa !43
-  ret %struct.B* %this, !dbg !42
-}
-
-declare %struct.A* @_ZN1AC2Ev(%struct.A*)
-
-; Function Attrs: nounwind
-define %struct.B* @_ZN1BC1Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !32 {
-entry:
-  tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !34, metadata !40), !dbg !46
-  tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !47, metadata !40) #3, !dbg !49
-  %0 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, !dbg !50
-  %call.i = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #3, !dbg !50
-  %1 = getelementptr inbounds %struct.B, %struct.B* %this, i32 0, i32 0, i32 0, !dbg !50
-  store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !50, !tbaa !43
-  ret %struct.B* %this, !dbg !48
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, metadata, metadata) #2
-
-attributes #0 = { nounwind }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!35, !36, !37, !38}
-!llvm.ident = !{!39}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !3, globals: !2, imports: !2)
-!1 = !DIFile(filename: "<stdin>", directory: "")
-!2 = !{}
-!3 = !{!4, !13}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "B", line: 5, size: 32, align: 32, file: !5, elements: !6, vtableHolder: !13, identifier: "_ZTS1B")
-!5 = !DIFile(filename: "test1.cpp", directory: "")
-!6 = !{!7, !8, !12}
-!7 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !4, baseType: !13)
-!8 = !DISubprogram(name: "B", line: 6, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 6, file: !5, scope: !4, type: !9)
-!9 = !DISubroutineType(types: !10)
-!10 = !{null, !11}
-!11 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !4)
-!12 = !DISubprogram(name: "~B", line: 7, isLocal: false, isDefinition: false, virtuality: DW_VIRTUALITY_virtual, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 7, file: !5, scope: !4, type: !9, containingType: !4)
-!13 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", line: 1, size: 32, align: 32, file: !5, elements: !14, vtableHolder: !13, identifier: "_ZTS1A")
-!14 = !{!15, !22, !26}
-!15 = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$A", size: 32, flags: DIFlagArtificial, file: !5, scope: !16, baseType: !17)
-!16 = !DIFile(filename: "test1.cpp", directory: "")
-!17 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, baseType: !18)
-!18 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "__vtbl_ptr_type", size: 32, baseType: !19)
-!19 = !DISubroutineType(types: !20)
-!20 = !{!21}
-!21 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!22 = !DISubprogram(name: "A", line: 2, isLocal: false, isDefinition: false, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 2, file: !5, scope: !13, type: !23)
-!23 = !DISubroutineType(types: !24)
-!24 = !{null, !25}
-!25 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer, baseType: !13)
-!26 = !DISubprogram(name: "~A", line: 3, isLocal: false, isDefinition: false, virtuality: DW_VIRTUALITY_virtual, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 3, file: !5, scope: !13, type: !23, containingType: !13)
-!28 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC2Ev", line: 9, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 9, file: !5, scope: !4, type: !9, declaration: !8, retainedNodes: !29)
-!29 = !{!30}
-!30 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !28, type: !31)
-!31 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 32, align: 32, baseType: !4)
-!32 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC1Ev", line: 9, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 9, file: !5, scope: !4, type: !9, declaration: !8, retainedNodes: !33)
-!33 = !{!34}
-!34 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !32, type: !31)
-!35 = !{i32 2, !"Dwarf Version", i32 4}
-!36 = !{i32 2, !"Debug Info Version", i32 3}
-!37 = !{i32 1, !"wchar_size", i32 4}
-!38 = !{i32 1, !"min_enum_size", i32 4}
-!39 = !{!"clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)"}
-!40 = !DIExpression()
-!41 = !DILocation(line: 0, scope: !28)
-!42 = !DILocation(line: 9, scope: !28)
-!43 = !{!44, !44, i64 0}
-!44 = !{!"vtable pointer", !45, i64 0}
-!45 = !{!"Simple C/C++ TBAA"}
-!46 = !DILocation(line: 0, scope: !32)
-!47 = !DILocalVariable(name: "this", arg: 1, flags: DIFlagArtificial | DIFlagObjectPointer, scope: !28, type: !31)
-!48 = !DILocation(line: 9, scope: !32)
-!49 = !DILocation(line: 0, scope: !28, inlinedAt: !48)
-!50 = !DILocation(line: 9, scope: !28, inlinedAt: !48)
diff --git a/test/DebugInfo/ARM/cfi-eof-prologue.mir b/test/DebugInfo/ARM/cfi-eof-prologue.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d0808887770da6971d2568e1b55f4180ba8371cb
--- /dev/null
+++ b/test/DebugInfo/ARM/cfi-eof-prologue.mir
@@ -0,0 +1,212 @@
+# RUN: llc -o - %s -mtriple=thumbv7-apple-ios -start-after=patchable-function | FileCheck %s
+# RUN: llc -o - %s -mtriple=thumbv6-apple-ios -start-after=patchable-function | FileCheck %s
+
+# struct A {
+#   A();
+#   virtual ~A();
+# };
+# struct B : A {
+#   B();
+#   virtual ~B();
+# };
+# B::B() {}
+# CHECK: __ZN1BC1Ev:
+# CHECK:     .loc       1 9 0 prologue_end
+# CHECK-NOT: .loc       1 0 0 prologue_end
+#
+# The location of the prologue_end marker should not be affected by the presence
+# of CFI instructions.
+
+--- |
+  %struct.B = type { %struct.A }
+  %struct.A = type { i32 (...)** }
+  
+  @_ZTV1B = external unnamed_addr constant [4 x i8*]
+  
+  ; Function Attrs: nounwind
+  define %struct.B* @_ZN1BC2Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !31 {
+  entry:
+    tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !33, metadata !DIExpression()), !dbg !35
+    %0 = bitcast %struct.B* %this to %struct.A*, !dbg !36
+    %call = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #0, !dbg !36
+    %1 = bitcast %struct.B* %this to i32 (...)***, !dbg !36
+    store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !36, !tbaa !37
+    ret %struct.B* %this, !dbg !36
+  }
+  
+  declare %struct.A* @_ZN1AC2Ev(%struct.A*)
+  
+  ; Function Attrs: nounwind
+  define %struct.B* @_ZN1BC1Ev(%struct.B* %this) unnamed_addr #0 align 2 !dbg !40 {
+  entry:
+    tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !42, metadata !DIExpression()), !dbg !43
+    tail call void @llvm.dbg.value(metadata %struct.B* %this, metadata !33, metadata !DIExpression()) #0, !dbg !44
+    %0 = bitcast %struct.B* %this to %struct.A*, !dbg !46
+    %call.i = tail call %struct.A* @_ZN1AC2Ev(%struct.A* %0) #0, !dbg !46
+    %1 = bitcast %struct.B* %this to i32 (...)***, !dbg !46
+    store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**), i32 (...)*** %1, align 4, !dbg !46, !tbaa !37
+    ret %struct.B* %this, !dbg !45
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+  
+  declare void @_Unwind_SjLj_Register({ i8*, i32, [4 x i32], i8*, i8*, [5 x i8*] }*)
+  
+  declare void @_Unwind_SjLj_Unregister({ i8*, i32, [4 x i32], i8*, i8*, [5 x i8*] }*)
+  
+  ; Function Attrs: nounwind readnone
+  declare i8* @llvm.frameaddress(i32) #2
+  
+  ; Function Attrs: nounwind
+  declare i8* @llvm.stacksave() #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackrestore(i8*) #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.eh.sjlj.setup.dispatch() #0
+  
+  ; Function Attrs: nounwind readnone
+  declare i8* @llvm.eh.sjlj.lsda() #2
+  
+  ; Function Attrs: nounwind readnone
+  declare void @llvm.eh.sjlj.callsite(i32) #2
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.eh.sjlj.functioncontext(i8*) #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #0
+  
+  attributes #0 = { nounwind }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { nounwind readnone }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!26, !27, !28, !29}
+  !llvm.ident = !{!30}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2, imports: !2)
+  !1 = !DIFile(filename: "<stdin>", directory: "")
+  !2 = !{}
+  !3 = !{!4, !8}
+  !4 = !DICompositeType(tag: DW_TAG_structure_type, name: "B", file: !5, line: 5, size: 32, align: 32, elements: !6, vtableHolder: !8, identifier: "_ZTS1B")
+  !5 = !DIFile(filename: "test1.cpp", directory: "")
+  !6 = !{!7, !21, !25}
+  !7 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !4, baseType: !8)
+  !8 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !5, line: 1, size: 32, align: 32, elements: !9, vtableHolder: !8, identifier: "_ZTS1A")
+  !9 = !{!10, !16, !20}
+  !10 = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$A", scope: !5, file: !5, baseType: !11, size: 32, flags: DIFlagArtificial)
+  !11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 32)
+  !12 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "__vtbl_ptr_type", baseType: !13, size: 32)
+  !13 = !DISubroutineType(types: !14)
+  !14 = !{!15}
+  !15 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !16 = !DISubprogram(name: "A", scope: !8, file: !5, line: 2, type: !17, isLocal: false, isDefinition: false, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true)
+  !17 = !DISubroutineType(types: !18)
+  !18 = !{null, !19}
+  !19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !20 = !DISubprogram(name: "~A", scope: !8, file: !5, line: 3, type: !17, isLocal: false, isDefinition: false, scopeLine: 3, containingType: !8, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: true)
+  !21 = !DISubprogram(name: "B", scope: !4, file: !5, line: 6, type: !22, isLocal: false, isDefinition: false, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true)
+  !22 = !DISubroutineType(types: !23)
+  !23 = !{null, !24}
+  !24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !25 = !DISubprogram(name: "~B", scope: !4, file: !5, line: 7, type: !22, isLocal: false, isDefinition: false, scopeLine: 7, containingType: !4, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: true)
+  !26 = !{i32 2, !"Dwarf Version", i32 4}
+  !27 = !{i32 2, !"Debug Info Version", i32 3}
+  !28 = !{i32 1, !"wchar_size", i32 4}
+  !29 = !{i32 1, !"min_enum_size", i32 4}
+  !30 = !{!"clang version 3.6.0 (trunk 224279) (llvm/trunk 224283)"}
+  !31 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC2Ev", scope: !4, file: !5, line: 9, type: !22, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !21, retainedNodes: !32)
+  !32 = !{!33}
+  !33 = !DILocalVariable(name: "this", arg: 1, scope: !31, type: !34, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !34 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 32, align: 32)
+  !35 = !DILocation(line: 0, scope: !31)
+  !36 = !DILocation(line: 9, scope: !31)
+  !37 = !{!38, !38, i64 0}
+  !38 = !{!"vtable pointer", !39, i64 0}
+  !39 = !{!"Simple C/C++ TBAA"}
+  !40 = distinct !DISubprogram(name: "B", linkageName: "_ZN1BC1Ev", scope: !4, file: !5, line: 9, type: !22, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !21, retainedNodes: !41)
+  !41 = !{!42}
+  !42 = !DILocalVariable(name: "this", arg: 1, scope: !40, type: !34, flags: DIFlagArtificial | DIFlagObjectPointer)
+  !43 = !DILocation(line: 0, scope: !40)
+  !44 = !DILocation(line: 0, scope: !31, inlinedAt: !45)
+  !45 = !DILocation(line: 9, scope: !40)
+  !46 = !DILocation(line: 9, scope: !31, inlinedAt: !45)
+
+...
+---
+name:            _ZN1BC2Ev
+alignment:       1
+liveins:         
+  - { reg: '$r0' }
+frameInfo:       
+  stackSize:       8
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$lr', callee-saved-restored: false }
+  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$r4' }
+body:             |
+  bb.0.entry:
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    DBG_VALUE debug-use $r0, debug-use $noreg, !33, !DIExpression(), debug-location !35
+    $r4 = tMOVr $r0, 14, $noreg
+    DBG_VALUE debug-use $r4, debug-use $noreg, !33, !DIExpression(), debug-location !35
+    tBL 14, $noreg, @_ZN1AC2Ev, csr_ios, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def dead $r0, debug-location !36
+    $r0 = t2MOVi16_ga_pcrel target-flags(arm-lo16, arm-nonlazy) @_ZTV1B, 0, debug-location !36
+    $r0 = t2MOVTi16_ga_pcrel killed $r0, target-flags(arm-hi16, arm-nonlazy) @_ZTV1B, 0, debug-location !36
+    $r0 = tPICADD killed $r0, 0, debug-location !36
+    renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg, debug-location !36 :: (load 4 from got)
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 8, 14, $noreg, debug-location !36
+    tSTRi killed renamable $r0, renamable $r4, 0, 14, $noreg, debug-location !36 :: (store 4 into %ir.1, !tbaa !37)
+    $r0 = tMOVr killed $r4, 14, $noreg, debug-location !36
+    tPOP_RET 14, $noreg, def $r4, def $pc, implicit killed $r0, debug-location !36
+
+...
+---
+name:            _ZN1BC1Ev
+alignment:       1
+liveins:         
+  - { reg: '$r0' }
+frameInfo:       
+  stackSize:       8
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$lr', callee-saved-restored: false }
+  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, stack-id: 0, 
+      callee-saved-register: '$r4' }
+body:             |
+  bb.0.entry:
+    frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r4, -8
+    DBG_VALUE debug-use $r0, debug-use $noreg, !42, !DIExpression(), debug-location !43
+    DBG_VALUE debug-use $r0, debug-use $noreg, !33, !DIExpression(), debug-location !44
+    $r4 = tMOVr $r0, 14, $noreg
+    DBG_VALUE debug-use $r4, debug-use $noreg, !33, !DIExpression(), debug-location !44
+    DBG_VALUE debug-use $r4, debug-use $noreg, !42, !DIExpression(), debug-location !43
+    tBL 14, $noreg, @_ZN1AC2Ev, csr_ios, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def dead $r0, debug-location !46
+    $r0 = t2MOVi16_ga_pcrel target-flags(arm-lo16, arm-nonlazy) @_ZTV1B, 0, debug-location !46
+    $r0 = t2MOVTi16_ga_pcrel killed $r0, target-flags(arm-hi16, arm-nonlazy) @_ZTV1B, 0, debug-location !46
+    $r0 = tPICADD killed $r0, 0, debug-location !46
+    renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg, debug-location !46 :: (load 4 from got)
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 8, 14, $noreg, debug-location !46
+    tSTRi killed renamable $r0, renamable $r4, 0, 14, $noreg, debug-location !46 :: (store 4 into %ir.1, !tbaa !37)
+    $r0 = tMOVr killed $r4, 14, $noreg, debug-location !45
+    tPOP_RET 14, $noreg, def $r4, def $pc, implicit killed $r0, debug-location !45
+
+...
diff --git a/test/DebugInfo/ARM/sdag-split-arg1.ll b/test/DebugInfo/ARM/sdag-split-arg1.ll
index 90834a44ba8b4772193750fb680f59c8fa16a592..78cdc4dd4bdaeace2b9c21dca3fbb05543452d80 100644
--- a/test/DebugInfo/ARM/sdag-split-arg1.ll
+++ b/test/DebugInfo/ARM/sdag-split-arg1.ll
@@ -7,7 +7,7 @@ entry:
   %0 = bitcast double %a to i64
   %extract.t84 = trunc i64 %0 to i32
   tail call void @llvm.dbg.value(metadata i32 %extract.t84, metadata !8, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32)), !dbg !12
-  ; CHECK: DBG_VALUE debug-use $r0, debug-use $noreg, !6, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+  ; CHECK: DBG_VALUE $r0, $noreg, !6, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
   %r.sroa.0.0.insert.ext35 = zext i32 %extract.t84 to i64
   ret i64 %r.sroa.0.0.insert.ext35
 }
diff --git a/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll b/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll
index fa1dbb531d32f8b0ee8db3671b5cd95fb3be86fd..af76c889353501d99891df7aecf34d1cc1350750 100644
--- a/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll
+++ b/test/DebugInfo/ARM/single-constant-use-preserves-dbgloc.ll
@@ -31,10 +31,11 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry
 ; Materialize the constant.
-; CHECK:      .loc    1 7 5
+; CHECK:      .loc    1 0
 ; CHECK-NEXT: mvn     r0, #0
 
 ; The backend performs the store to %retval first, for some reason.
+; CHECK-NEXT: .loc    1 7 5
 ; CHECK-NEXT: str     r0, [sp, #4]
   store i32 -1, i32* %x, align 4, !dbg !19
 
diff --git a/test/DebugInfo/COFF/build-info.ll b/test/DebugInfo/COFF/build-info.ll
new file mode 100644
index 0000000000000000000000000000000000000000..94f006c3b093560e0daa3f291e9eb062db1700b9
--- /dev/null
+++ b/test/DebugInfo/COFF/build-info.ll
@@ -0,0 +1,39 @@
+; RUN: llc -filetype=obj -mtriple i686-pc-windows-msvc %s -o %t.o
+; RUN: llvm-pdbutil dump %t.o -symbols -types | FileCheck %s
+
+; CHECK: [[INFO_IDX:0x[^ ]*]] | LF_BUILDINFO
+; CHECK-NEXT:          0x{{.*}}: `D:\src\scopes\clang`
+; CHECK-NEXT:          <no type>: ``
+; CHECK-NEXT:          0x{{.*}}: `D:\src\scopes\foo.cpp`
+; CHECK-NEXT:          <no type>: ``
+; CHECK-NEXT:          <no type>: ``
+
+; CHECK: {{.*}} | S_BUILDINFO [size = 8] BuildId = `[[INFO_IDX]]`
+
+; ModuleID = 'D:\src\scopes\foo.cpp'
+source_filename = "D:\5Csrc\5Cscopes\5Cfoo.cpp"
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc19.0.23918"
+
+define i32 @"?foo@@YAHXZ"() !dbg !10 {
+entry:
+  ret i32 42, !dbg !14
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+; One .debug$S section should contain an S_COMPILE3 record that identifies the
+; source language and the version of the compiler based on the DICompileUnit.
+!1 = !DIFile(filename: "D:\5Csrc\5Cscopes\5Cfoo.cpp", directory: "D:\5Csrc\5Cscopes\5Cclang")
+!2 = !{}
+!7 = !{i32 2, !"CodeView", i32 1}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 4.0.0 "}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "\01?foo@@YAHXZ", scope: !1, file: !1, line: 1, type: !11, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!11 = !DISubroutineType(types: !12)
+!12 = !{!13}
+!13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!14 = !DILocation(line: 2, scope: !10)
diff --git a/test/DebugInfo/COFF/fpo-realign-vframe.ll b/test/DebugInfo/COFF/fpo-realign-vframe.ll
index fded804a531ef9da315fc17c393338f04b2e9ce8..e5f8d5d34dc1afd8dd7901a6c9fe69e0e4baca8c 100644
--- a/test/DebugInfo/COFF/fpo-realign-vframe.ll
+++ b/test/DebugInfo/COFF/fpo-realign-vframe.ll
@@ -83,12 +83,12 @@
 ; OBJ:   }
 ; OBJ:   FrameData {
 ; OBJ:     FrameFunc [
-; OBJ:       $T1 $ebp 4 + =
-; OBJ:       $T0 $T1 4 - 8 @ =
-; OBJ:       $eip $T1 ^ =
-; OBJ:       $esp $T1 4 + =
-; OBJ:       $ebp $T1 4 - ^ =
-; OBJ:     ]
+; OBJ-NEXT:   $T1 $ebp 4 + =
+; OBJ-NEXT:   $T0 $T1 4 - 8 @ =
+; OBJ-NEXT:   $eip $T1 ^ =
+; OBJ-NEXT:   $esp $T1 4 + =
+; OBJ-NEXT:   $ebp $T1 4 - ^ =
+; OBJ-NEXT: ]
 ; OBJ:   }
 ; OBJ: ]
 ; OBJ: Subsection [
diff --git a/test/DebugInfo/COFF/types-std-nullptr-t.ll b/test/DebugInfo/COFF/types-std-nullptr-t.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4d64a67860d6167306faf1b5f1e0eecfb8014021
--- /dev/null
+++ b/test/DebugInfo/COFF/types-std-nullptr-t.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -filetype=obj | llvm-readobj - -codeview | FileCheck %s
+
+; C++ source to regenerate:
+; $ cat foo.cpp
+; decltype(nullptr) NullPtr = nullptr;
+; $ clang hello.cpp -S -emit-llvm -g -gcodeview -o t.ll
+
+; CHECK: CodeViewDebugInfo [
+; CHECK:   Subsection [
+; CHECK:     SubSectionType: Symbols (0xF1)
+; CHECK:     GlobalData {
+; CHECK:       Kind: S_GDATA32 (0x110D)
+; CHECK:       DataOffset: ?NullPtr@@3$$TA+0x0
+; CHECK:       Type: std::nullptr_t (0x103)
+; CHECK:       DisplayName: NullPtr
+; CHECK:       LinkageName: ?NullPtr@@3$$TA
+; CHECK:     }
+
+
+; ModuleID = 'foo.cpp'
+source_filename = "foo.cpp"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.15.26730"
+
+@"?NullPtr@@3$$TA" = dso_local global i8* null, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10}
+!llvm.ident = !{!11}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "NullPtr", linkageName: "?NullPtr@@3$$TA", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: None)
+!3 = !DIFile(filename: "foo.cpp", directory: "D:\5Csrc\5Cllvmbuild\5Ccl\5CDebug\5Cx64", checksumkind: CSK_MD5, checksum: "0d5c7c9860a17e584808c03a24a135e6")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(tag: DW_TAG_unspecified_type, name: "decltype(nullptr)")
+!7 = !{i32 2, !"CodeView", i32 1}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 2}
+!10 = !{i32 7, !"PIC Level", i32 2}
+!11 = !{!"clang version 8.0.0 "}
diff --git a/test/DebugInfo/COFF/vframe-csr.ll b/test/DebugInfo/COFF/vframe-csr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1c1c0cec50e3b3b24e5453cb14988be06f44869c
--- /dev/null
+++ b/test/DebugInfo/COFF/vframe-csr.ll
@@ -0,0 +1,179 @@
+; RUN: llc < %s | FileCheck %s --check-prefix=ASM
+; RUN: llc -filetype=obj < %s | llvm-readobj -codeview | FileCheck %s --check-prefix=OBJ
+
+; PR38857
+
+; This test case is identical to the fpo-realign-vframe.ll test, except it uses
+; two callee-saved registers.
+
+; Match the prologue for the .cv_fpo* directives.
+; ASM-LABEL: _realign_with_csrs:
+; ASM:         .cv_fpo_proc    _realign_with_csrs 0
+; ASM: # %bb.0:                                # %entry
+; ASM:         pushl   %ebp
+; ASM:         .cv_fpo_pushreg %ebp
+; ASM:         movl    %esp, %ebp
+; ASM:         .cv_fpo_setframe        %ebp
+; ASM:         andl    $-8, %esp
+; ASM:         .cv_fpo_stackalign      8
+; FIXME: Why 24 bytes? We only need 12 bytes of data.
+; ASM:         subl    $24, %esp
+; ASM:         .cv_fpo_stackalloc      24
+; ASM:         .cv_fpo_endprologue
+
+; 'x' should be EBP-relative, 'a' and 'force_alignment' ESP relative.
+; ASM:         calll   _getval
+; ASM-DAG:     leal    8(%esp), %[[LEA_DBL:[^ ]*]]
+; ASM-DAG:     leal    4(%esp), %[[LEA_A:[^ ]*]]
+; ASM:         pushl   %[[LEA_DBL]]
+; ASM:         pushl   %[[LEA_A]]
+; ASM:         pushl   %[[LEA_A]]
+; ASM:         calll   _usevals
+; ASM:         addl    $12, %esp
+
+; OBJ: Subsection [
+; OBJ:   SubSectionType: Symbols (0xF1)
+; OBJ: ]
+; OBJ: Subsection [
+; OBJ:   SubSectionType: FrameData (0xF5)
+;   	Really, the only important FrameFunc is the last one.
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:   }
+; OBJ:   FrameData {
+; OBJ:     FrameFunc [
+; OBJ-NEXT:   $T1 $ebp 4 + =
+; OBJ-NEXT:   $T0 $T1 8 - 8 @ =
+; OBJ-NEXT:   $eip $T1 ^ =
+; OBJ-NEXT:   $esp $T1 4 + =
+; OBJ-NEXT:   $ebp $T1 4 - ^ =
+; OBJ-NEXT:   $esi $T1 8 - ^ =
+; OBJ-NEXT: ]
+; OBJ:   }
+; OBJ: ]
+; OBJ: Subsection [
+; OBJ:   SubSectionType: Symbols (0xF1)
+; OBJ:   GlobalProcIdSym {
+; OBJ:     Kind: S_GPROC32_ID (0x1147)
+; OBJ:     DisplayName: realign_with_csrs
+; OBJ:     LinkageName: _realign_with_csrs
+; OBJ:   }
+; 	The frame register for locals should be VFRAME, and EBP for parameters.
+; OBJ:   FrameProcSym {
+; OBJ:     Kind: S_FRAMEPROC (0x1012)
+; OBJ:     TotalFrameBytes: 0x18
+; OBJ:     LocalFramePtrReg: VFRAME (0x7536)
+; OBJ:     ParamFramePtrReg: EBP (0x16)
+; OBJ:   }
+; 	ESP is VFRAME - 24, ESP offset of 'a' is 4, so -20.
+; OBJ:   LocalSym {
+; OBJ:     Kind: S_LOCAL (0x113E)
+; OBJ:     Type: int (0x74)
+; OBJ:     Flags [ (0x0)
+; OBJ:     ]
+; OBJ:     VarName: a
+; OBJ:   }
+; OBJ:   DefRangeFramePointerRelSym {
+; OBJ:     Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
+; OBJ:     Offset: -20
+; OBJ:   }
+; 	ESP is VFRAME - 16, ESP offset of 'force_alignment' is 8, so -8.
+; OBJ:   LocalSym {
+; OBJ:     Kind: S_LOCAL (0x113E)
+; OBJ:     Type: double (0x41)
+; OBJ:     Flags [ (0x0)
+; OBJ:     ]
+; OBJ:     VarName: force_alignment
+; OBJ:   }
+; OBJ:   DefRangeFramePointerRelSym {
+; OBJ:     Kind: S_DEFRANGE_FRAMEPOINTER_REL (0x1142)
+; OBJ:     Offset: -16
+; OBJ:   }
+; OBJ:   ProcEnd {
+; OBJ:     Kind: S_PROC_ID_END (0x114F)
+; OBJ:   }
+; OBJ: ]
+
+; ModuleID = 't.c'
+source_filename = "t.c"
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-windows-msvc19.14.26433"
+
+; Function Attrs: nounwind
+define dso_local i32 @realign_with_csrs() local_unnamed_addr #0 !dbg !8 {
+entry:
+  %a = alloca i32, align 4
+  %force_alignment = alloca double, align 8
+  %0 = bitcast i32* %a to i8*, !dbg !22
+  call void @llvm.dbg.declare(metadata i32* %a, metadata !14, metadata !DIExpression()), !dbg !22
+  %csr1 = tail call i32 @getval() #4
+  %call = tail call i32 @getval() #4, !dbg !22
+  store i32 %call, i32* %a, align 4, !dbg !22, !tbaa !17
+  %1 = bitcast double* %force_alignment to i8*, !dbg !23
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %1) #4, !dbg !23
+  call void @llvm.dbg.declare(metadata double* %force_alignment, metadata !15, metadata !DIExpression()), !dbg !23
+  store double 4.200000e-01, double* %force_alignment, align 8, !dbg !23, !tbaa !24
+  call void @usevals(i32* nonnull %a, i32* nonnull %a, double* nonnull %force_alignment) #4, !dbg !26
+  call void @usecsrs(i32 %csr1, i32 %csr1)
+  ret i32 0
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+declare dso_local i32 @getval() local_unnamed_addr #3
+
+declare dso_local void @usevals(i32*, i32*, double*) local_unnamed_addr #3
+
+declare dso_local void @usecsrs(i32, i32) local_unnamed_addr #3
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "t.c", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "a646950309d5d01d8087fc10fea33941")
+!2 = !{}
+!3 = !{i32 1, !"NumRegisterParameters", i32 0}
+!4 = !{i32 2, !"CodeView", i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{i32 1, !"wchar_size", i32 2}
+!7 = !{!"clang version 8.0.0 "}
+!8 = distinct !DISubprogram(name: "realign_with_csrs", scope: !1, file: !1, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
+!9 = !DISubroutineType(types: !10)
+!10 = !{!11, !11}
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{!13, !14, !15}
+!13 = !DILocalVariable(name: "x", arg: 1, scope: !8, file: !1, line: 3, type: !11)
+!14 = !DILocalVariable(name: "a", scope: !8, file: !1, line: 4, type: !11)
+!15 = !DILocalVariable(name: "force_alignment", scope: !8, file: !1, line: 5, type: !16, align: 64)
+!16 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!17 = !{!18, !18, i64 0}
+!18 = !{!"int", !19, i64 0}
+!19 = !{!"omnipotent char", !20, i64 0}
+!20 = !{!"Simple C/C++ TBAA"}
+!21 = !DILocation(line: 3, scope: !8)
+!22 = !DILocation(line: 4, scope: !8)
+!23 = !DILocation(line: 5, scope: !8)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"double", !19, i64 0}
+!26 = !DILocation(line: 6, scope: !8)
+!27 = !DILocation(line: 7, scope: !8)
+!28 = !DILocation(line: 8, scope: !8)
diff --git a/test/DebugInfo/Generic/linear-dbg-value.ll b/test/DebugInfo/Generic/linear-dbg-value.ll
index 62cbc4442aa2ec6038f2026dc7d6763ca603d141..2ea78eb3dae29a600f8328486caff69e217e3c11 100644
--- a/test/DebugInfo/Generic/linear-dbg-value.ll
+++ b/test/DebugInfo/Generic/linear-dbg-value.ll
@@ -1,4 +1,5 @@
-; RUN: llc -stop-before=expand-isel-pseudos -pre-RA-sched=linearize < %s | FileCheck %s
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39452.
+; RUN: llc -stop-before=expand-isel-pseudos -pre-RA-sched=linearize -verify-machineinstrs=0 < %s | FileCheck %s
 source_filename = "linear-dbg-value.ll"
 
 ; Function Attrs: nounwind readonly uwtable
diff --git a/test/DebugInfo/Inputs/loclists-dwp-b.ll b/test/DebugInfo/Inputs/loclists-dwp-b.ll
new file mode 100644
index 0000000000000000000000000000000000000000..77081bd7c2834a015e5c172a541fd9d2c0b32cf9
--- /dev/null
+++ b/test/DebugInfo/Inputs/loclists-dwp-b.ll
@@ -0,0 +1,32 @@
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1bi(i32 %i) local_unnamed_addr !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %i, metadata !12, metadata !DIExpression()), !dbg !13
+  tail call void asm sideeffect "", "~{rdi},~{dirflag},~{fpsr},~{flags}"() , !dbg !14, !srcloc !15
+  ret void, !dbg !16
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "b.cpp", directory: "/home/test/PRs/PR38990")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)"}
+!7 = distinct !DISubprogram(name: "b", linkageName: "_Z1bi", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!13 = !DILocation(line: 1, column: 12, scope: !7)
+!14 = !DILocation(line: 1, column: 17, scope: !7)
+!15 = !{i32 22}
+!16 = !DILocation(line: 1, column: 38, scope: !7)
diff --git a/test/DebugInfo/MIR/AArch64/clobber-sp.mir b/test/DebugInfo/MIR/AArch64/clobber-sp.mir
index 222bbd798ba32c3aa814a101d000441f3deb13ee..4594065cc2969c8cd081b7d88376876214c90891 100644
--- a/test/DebugInfo/MIR/AArch64/clobber-sp.mir
+++ b/test/DebugInfo/MIR/AArch64/clobber-sp.mir
@@ -145,11 +145,11 @@ body:             |
     $sp = frame-setup SUBXri $sp, 32, 0
     frame-setup STPXi killed $fp, killed $lr, $sp, 2 :: (store 8 into %stack.3), (store 8 into %stack.2)
     $fp = frame-setup ADDXri $sp, 16, 0
-    DBG_VALUE debug-use $w0, debug-use _, !19, !22, debug-location !23
+    DBG_VALUE $w0, _, !19, !22, debug-location !23
     STURWi killed $w0, $fp, -4 :: (store 4 into %stack.0.x.addr)
-    DBG_VALUE debug-use $w1, debug-use _, !20, !22, debug-location !28
+    DBG_VALUE $w1, _, !20, !22, debug-location !28
     STRWui killed $w1, $sp, 2, debug-location !30 :: (store 4 into %stack.1)
-    DBG_VALUE debug-use $sp, 0, !20, !36, debug-location !28
+    DBG_VALUE $sp, 0, !20, !36, debug-location !28
     BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $d0, implicit killed $d1, implicit killed $d2, implicit killed $d3, implicit-def $sp, debug-location !30
     $w0 = LDRWui $sp, 2, debug-location !33 :: (load 4 from %stack.1)
     CBZW killed $w0, %bb.2.if.end, debug-location !33
@@ -157,13 +157,13 @@ body:             |
   bb.1.if.then:
     successors: %bb.2.if.end(0x80000000)
 
-    DBG_VALUE debug-use $sp, 0, !20, !36, debug-location !28
+    DBG_VALUE $sp, 0, !20, !36, debug-location !28
     $x0 = SUBXri $fp, 4, 0
-    DBG_VALUE debug-use $x0, debug-use _, !19, !22, debug-location !23
+    DBG_VALUE $x0, _, !19, !22, debug-location !23
     BL @h, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $x0, debug-location !34
 
   bb.2.if.end:
-    DBG_VALUE debug-use $sp, 0, !20, !36, debug-location !28
+    DBG_VALUE $sp, 0, !20, !36, debug-location !28
     $w8 = MOVZWi 0, 0
     $x0 = ORRXrs $xzr, undef $x8, 0, implicit killed $w8, debug-location !35
     $fp, $lr = LDPXi $sp, 2, debug-location !35 :: (load 8 from %stack.3), (load 8 from %stack.2)
diff --git a/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir b/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir
index e14c0a470518a9a9667c8b3d776c5661b01dacc0..e29420e27d5b7beea6eac9b802fdca786746772f 100644
--- a/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir
+++ b/test/DebugInfo/MIR/ARM/live-debug-values-reg-copy.mir
@@ -5,9 +5,9 @@
 # to another. The altered instructions are labeled below.
 #
 # CHECK: ![[ARG1:.*]] = !DILocalVariable(name: "arg1"
-# CHECK: DBG_VALUE debug-use $r4, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $r4, $noreg, ![[ARG1]], !DIExpression(), debug-location
 # CHECK: $r5 = MOVr killed $r4, 14, $noreg, $noreg, debug-location
-# CHECK-NEXT: DBG_VALUE debug-use $r5, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $r5, $noreg, ![[ARG1]], !DIExpression(), debug-location
 --- |
   ; ModuleID = 'live-debug-values-reg-copy.ll'
   source_filename = "live-debug-values-reg-copy.c"
@@ -119,8 +119,8 @@ body:             |
     frame-setup CFI_INSTRUCTION offset $r11, -8
     frame-setup CFI_INSTRUCTION offset $r5, -12
     frame-setup CFI_INSTRUCTION offset $r4, -16
-    DBG_VALUE debug-use $r0, debug-use $noreg, !13, !DIExpression(), debug-location !16
-    DBG_VALUE debug-use $r0, debug-use $noreg, !13, !DIExpression(), debug-location !16
+    DBG_VALUE $r0, $noreg, !13, !DIExpression(), debug-location !16
+    DBG_VALUE $r0, $noreg, !13, !DIExpression(), debug-location !16
     CMPri renamable $r0, 10, 14, $noreg, implicit-def $cpsr, debug-location !16
     Bcc %bb.2, 13, killed $cpsr, debug-location !16
   
@@ -132,7 +132,7 @@ body:             |
   
   bb.2.if.else:
     renamable $r4 = ADDri killed renamable $r0, 10, 14, $noreg, $noreg, debug-location !16
-    DBG_VALUE debug-use $r4, debug-use $noreg, !13, !DIExpression(), debug-location !16
+    DBG_VALUE $r4, $noreg, !13, !DIExpression(), debug-location !16
     $r0 = MOVr $r4, 14, $noreg, $noreg, debug-location !16
     BL @externFunc2, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $r0, implicit-def $sp, implicit-def $r0, debug-location !16
     $r5 = MOVr killed $r0, 14, $noreg, $noreg, debug-location !16
diff --git a/test/DebugInfo/MIR/ARM/split-superreg-complex.mir b/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
index 89472ec1da0ede138e6f90cf88a315192e796024..868321bab2ac51b41c65d188b5868b6a4a59b16b 100644
--- a/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
+++ b/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
@@ -113,7 +113,7 @@ body:             |
     tBL 14, _, @v, csr_ios, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0, implicit-def $r1, implicit-def $r2, implicit-def $r3, debug-location !19
     $d1 = VMOVDRR killed $r2, killed $r3, 14, _, implicit-def $q0, debug-location !19
     $d0 = VMOVDRR killed $r0, killed $r1, 14, _, implicit killed $q0, implicit-def $q0, debug-location !19
-    DBG_VALUE debug-use $q0, debug-use _, !14, !20, debug-location !21
+    DBG_VALUE $q0, _, !14, !20, debug-location !21
     $s4 = VMOVS $s1, 14, _, implicit-def $d2, debug-location !24
     $d0 = VADDfd $d0, killed $d2, 14, _, implicit killed $q0, debug-location !24
     $r0 = VMOVRS $s0, 14, _, implicit killed $d0, debug-location !25
diff --git a/test/DebugInfo/MIR/ARM/split-superreg-piece.mir b/test/DebugInfo/MIR/ARM/split-superreg-piece.mir
index 945fc09d6e477ce4e471f26b748b4b48c7c03afb..69b4f7a07a4cba4439d0bc8f61327dc4536ab238 100644
--- a/test/DebugInfo/MIR/ARM/split-superreg-piece.mir
+++ b/test/DebugInfo/MIR/ARM/split-superreg-piece.mir
@@ -113,7 +113,7 @@ body:             |
     tBL 14, _, @v, csr_ios, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0, implicit-def $r1, implicit-def $r2, implicit-def $r3, debug-location !19
     $d1 = VMOVDRR killed $r2, killed $r3, 14, _, implicit-def $q0, debug-location !19
     $d0 = VMOVDRR killed $r0, killed $r1, 14, _, implicit killed $q0, implicit-def $q0, debug-location !19
-    DBG_VALUE debug-use $q0, debug-use _, !14, !20, debug-location !21
+    DBG_VALUE $q0, _, !14, !20, debug-location !21
     $s4 = VMOVS $s1, 14, _, implicit-def $d2, debug-location !24
     $d0 = VADDfd $d0, killed $d2, 14, _, implicit killed $q0, debug-location !24
     $r0 = VMOVRS $s0, 14, _, implicit killed $d0, debug-location !25
diff --git a/test/DebugInfo/MIR/ARM/split-superreg.mir b/test/DebugInfo/MIR/ARM/split-superreg.mir
index a87c33485bc6c0701808def1a06e852533a49583..39b8b4341faf1c45c327e560ad5e29892edd160a 100644
--- a/test/DebugInfo/MIR/ARM/split-superreg.mir
+++ b/test/DebugInfo/MIR/ARM/split-superreg.mir
@@ -113,7 +113,7 @@ body:             |
     tBL 14, _, @v, csr_ios, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $r0, implicit-def $r1, implicit-def $r2, implicit-def $r3, debug-location !19
     $d1 = VMOVDRR killed $r2, killed $r3, 14, _, implicit-def $q0, debug-location !19
     $d0 = VMOVDRR killed $r0, killed $r1, 14, _, implicit killed $q0, implicit-def $q0, debug-location !19
-    DBG_VALUE debug-use $q0, debug-use _, !14, !20, debug-location !21
+    DBG_VALUE $q0, _, !14, !20, debug-location !21
     $s4 = VMOVS $s1, 14, _, implicit-def $d2, debug-location !24
     $d0 = VADDfd $d0, killed $d2, 14, _, implicit killed $q0, debug-location !24
     $r0 = VMOVRS $s0, 14, _, implicit killed $d0, debug-location !25
diff --git a/test/DebugInfo/MIR/Mips/last-inst-bundled.mir b/test/DebugInfo/MIR/Mips/last-inst-bundled.mir
index e228c8876a534309a14f2b76d49353dd754c7ffb..b1239697b7b43f269f24eb75c8e7f349724402e0 100644
--- a/test/DebugInfo/MIR/Mips/last-inst-bundled.mir
+++ b/test/DebugInfo/MIR/Mips/last-inst-bundled.mir
@@ -21,7 +21,7 @@
 #
 # Check that last bundled instruction of block gets recognized as end of basic block.
 # CHECK: bb.2.if.end
-# CHECK-NEXT: DBG_VALUE debug-use $s0, debug-use $noreg, !12, !DIExpression(), debug-location !17
+# CHECK-NEXT: DBG_VALUE $s0, $noreg, !12, !DIExpression(), debug-location !17
 
 --- |
   ; ModuleID = '<stdin>'
@@ -161,15 +161,15 @@ body:             |
     SW killed $s0, $sp, 24 :: (store 4 into %stack.2)
     CFI_INSTRUCTION offset $ra_64, -4
     CFI_INSTRUCTION offset $s0_64, -8
-    DBG_VALUE debug-use $a0, debug-use $noreg, !12, !DIExpression(), debug-location !17
+    DBG_VALUE $a0, $noreg, !12, !DIExpression(), debug-location !17
     $s0 = OR $a0, $zero
-    DBG_VALUE debug-use $s0, debug-use $noreg, !12, !DIExpression(), debug-location !17
-    DBG_VALUE debug-use $sp, 0, !13, !DIExpression(DW_OP_plus_uconst, 20), debug-location !19
+    DBG_VALUE $s0, $noreg, !12, !DIExpression(), debug-location !17
+    DBG_VALUE $sp, 0, !13, !DIExpression(DW_OP_plus_uconst, 20), debug-location !19
     JAL @set_cond, csr_o32, implicit-def dead $ra, implicit $a0, implicit $a1, implicit-def $sp, debug-location !20 {
       renamable $a1 = LEA_ADDiu $sp, 20
     }
     renamable $at = LW $sp, 20, debug-location !21 :: (dereferenceable load 4 from %ir.condition, !tbaa !23)
-    DBG_VALUE debug-use $at, debug-use $noreg, !13, !DIExpression(), debug-location !19
+    DBG_VALUE $at, $noreg, !13, !DIExpression(), debug-location !19
     BEQ killed renamable $at, $zero, %bb.2, implicit-def $at, debug-location !27 {
       NOP debug-location !27
     }
diff --git a/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir b/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir
index 70a85c075ec3e8f9e3b9d0a938dc98ab3c5a320b..dd009b8de451d28015f52542b94335fad78c3b74 100644
--- a/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir
+++ b/test/DebugInfo/MIR/Mips/live-debug-values-reg-copy.mir
@@ -6,12 +6,12 @@
 #
 # CHECK: ![[ARG1:.*]] = !DILocalVariable(name: "arg1"
 # CHECK: ![[ARG2:.*]] = !DILocalVariable(name: "arg2"
-# CHECK: DBG_VALUE debug-use $s0_64, debug-use $noreg, ![[ARG2]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $s0_64, $noreg, ![[ARG2]], !DIExpression(), debug-location
 # CHECK: $s1_64 = OR64 killed $s0_64, $zero_64, debug-location
-# CHECK-NEXT: DBG_VALUE debug-use $s1_64, debug-use $noreg, ![[ARG2]], !DIExpression(), debug-location
-# CHECK: DBG_VALUE debug-use $f24, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $s1_64, $noreg, ![[ARG2]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $f24, $noreg, ![[ARG1]], !DIExpression(), debug-location
 # CHECK: $f26 = FMOV_S killed $f24, debug-location
-# CHECK-NEXT: DBG_VALUE debug-use $f26, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $f26, $noreg, ![[ARG1]], !DIExpression(), debug-location
 
 --- |
   ; ModuleID = 'live-debug-values-reg-copy.ll'
@@ -161,11 +161,11 @@ body:             |
     CFI_INSTRUCTION offset $d24_64, -12
     CFI_INSTRUCTION offset $ra_64, -24
     CFI_INSTRUCTION offset $s0_64, -32
-    DBG_VALUE debug-use $f12, debug-use $noreg, !14, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $a1_64, debug-use $noreg, !15, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $s0, debug-use $noreg, !15, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $s0_64, debug-use $noreg, !15, !DIExpression(), debug-location !19
-    DBG_VALUE debug-use $f12, debug-use $noreg, !14, !DIExpression(), debug-location !19
+    DBG_VALUE $f12, $noreg, !14, !DIExpression(), debug-location !19
+    DBG_VALUE $a1_64, $noreg, !15, !DIExpression(), debug-location !19
+    DBG_VALUE $s0, $noreg, !15, !DIExpression(), debug-location !19
+    DBG_VALUE $s0_64, $noreg, !15, !DIExpression(), debug-location !19
+    DBG_VALUE $f12, $noreg, !14, !DIExpression(), debug-location !19
     renamable $d0_64 = CVT_D64_S renamable $f12, debug-location !19
     renamable $at_64 = LUi64 target-flags(mips-highest) %const.0
     renamable $at_64 = DADDiu killed renamable $at_64, target-flags(mips-higher) %const.0
@@ -211,7 +211,7 @@ body:             |
     renamable $at_64 = DSLL killed renamable $at_64, 16
     renamable $f0 = LWC1 killed renamable $at_64, target-flags(mips-abs-lo) %const.1, debug-location !19 :: (load 4 from constant-pool)
     renamable $f24 = FADD_S killed renamable $f12, killed renamable $f0, debug-location !19
-    DBG_VALUE debug-use $f24, debug-use $noreg, !14, !DIExpression(), debug-location !19
+    DBG_VALUE $f24, $noreg, !14, !DIExpression(), debug-location !19
     JAL @externFunc2, csr_n64, implicit-def dead $ra, implicit $f12, implicit-def $sp, implicit-def $f0, debug-location !19 {
       $f12 = FMOV_S $f24, debug-location !19
     }
diff --git a/test/DebugInfo/MIR/X86/bit-piece-dh.mir b/test/DebugInfo/MIR/X86/bit-piece-dh.mir
index 8c74f8395fea0d6c97e2bc29178d8b320f919f19..e8100b71eff49d848c150eb00c14e4ce44d7009e 100644
--- a/test/DebugInfo/MIR/X86/bit-piece-dh.mir
+++ b/test/DebugInfo/MIR/X86/bit-piece-dh.mir
@@ -88,7 +88,7 @@ body:             |
     CFI_INSTRUCTION offset $rbp, -16
     $rbp = frame-setup MOV64rr $rsp
     CFI_INSTRUCTION def_cfa_register $rbp
-    DBG_VALUE debug-use $dh, debug-use _, !14, !15, debug-location !16
+    DBG_VALUE $dh, _, !14, !15, debug-location !16
     $edi = SHR32ri killed $edi, 8, implicit-def dead $eflags, debug-location !17
     $eax = MOVSX32rr8 $dil, implicit killed $edi, debug-location !20
     $rbp = POP64r implicit-def $rsp, implicit $rsp, debug-location !20
diff --git a/test/DebugInfo/MIR/X86/kill-after-spill.mir b/test/DebugInfo/MIR/X86/kill-after-spill.mir
index e9c03938f2b4874c0ae2aeb0c4e1ecd941afab00..5110dc349bea72a1fc9c2508b04ffbc0d1a50f35 100644
--- a/test/DebugInfo/MIR/X86/kill-after-spill.mir
+++ b/test/DebugInfo/MIR/X86/kill-after-spill.mir
@@ -14,8 +14,8 @@
 # ...
 #
 # CHECK: bb.1.if.end:
-# CHECK: DBG_VALUE debug-use $rbp, 0, !37, !DIExpression(DW_OP_constu, 44, DW_OP_minus), debug-location !58
-# CHECK-NOT: DBG_VALUE debug-use $rbp, 0, !36, !DIExpression(DW_OP_constu, 48, DW_OP_minus), debug-location !57
+# CHECK: DBG_VALUE $rbp, 0, !37, !DIExpression(DW_OP_constu, 44, DW_OP_minus), debug-location !58
+# CHECK-NOT: DBG_VALUE $rbp, 0, !36, !DIExpression(DW_OP_constu, 48, DW_OP_minus), debug-location !57
 
 --- |
   ; ModuleID = '<stdin>'
@@ -274,12 +274,12 @@ body:             |
     CFI_INSTRUCTION offset $r13, -40
     CFI_INSTRUCTION offset $r14, -32
     CFI_INSTRUCTION offset $r15, -24
-    DBG_VALUE debug-use $edi, debug-use $noreg, !36, !DIExpression(), debug-location !57
-    DBG_VALUE debug-use $esi, debug-use $noreg, !37, !DIExpression(), debug-location !58
+    DBG_VALUE $edi, $noreg, !36, !DIExpression(), debug-location !57
+    DBG_VALUE $esi, $noreg, !37, !DIExpression(), debug-location !58
     $ebx = MOV32rr $esi
-    DBG_VALUE debug-use $ebx, debug-use $noreg, !37, !DIExpression(), debug-location !58
+    DBG_VALUE $ebx, $noreg, !37, !DIExpression(), debug-location !58
     $r15d = MOV32rr $edi
-    DBG_VALUE debug-use $r15d, debug-use $noreg, !36, !DIExpression(), debug-location !57
+    DBG_VALUE $r15d, $noreg, !36, !DIExpression(), debug-location !57
     renamable $r14 = MOV64ri -9223372036854775808
     $edi = MOV32rr $ebx
     CALL64pcrel32 @func1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir b/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir
index 1f62a0f81363799d4ad2cb949401fa8be8fef2fc..8bc340721be9058ccb15fd16226da96695a334c9 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values-3preds.mir
@@ -31,9 +31,9 @@
 # DBG_VALUE for variables "x", "y" and "z" are extended into %bb.9 from its
 # predecessors %bb.0, %bb.2 and %bb.8.
 # CHECK:      bb.9.for.end:
-# CHECK-DAG:  DBG_VALUE debug-use $edi, debug-use $noreg, ![[X_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
-# CHECK-DAG:  DBG_VALUE debug-use $esi, debug-use $noreg, ![[Y_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
-# CHECK-DAG:  DBG_VALUE debug-use $edx, debug-use $noreg, ![[Z_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK-DAG:  DBG_VALUE $edi, $noreg, ![[X_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK-DAG:  DBG_VALUE $esi, $noreg, ![[Y_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK-DAG:  DBG_VALUE $edx, $noreg, ![[Z_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
 # CHECK:      RET
 
 --- |
@@ -186,10 +186,10 @@ body:             |
     successors: %bb.1.for.body.preheader(20), %bb.9.for.end(12)
     liveins: $ecx, $edi, $edx, $esi
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $r8d = MOV32rr $esi, debug-location !26
     $r8d = IMUL32rr killed $r8d, $edi, implicit-def dead $eflags, debug-location !26
@@ -200,10 +200,10 @@ body:             |
     successors: %bb.3.for.body(0)
     liveins: $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags
   
@@ -211,10 +211,10 @@ body:             |
     successors: %bb.4.if.then(4), %bb.5.if.end(124)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     TEST32rr $edi, $edi, implicit-def $eflags, debug-location !35
     JG_1 %bb.4.if.then, implicit $eflags
@@ -223,10 +223,10 @@ body:             |
     successors: %bb.6.if.then.4(4), %bb.7.if.end.6(124)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     TEST32rr $esi, $esi, implicit-def $eflags, debug-location !39
     JG_1 %bb.6.if.then.4, implicit $eflags
@@ -235,10 +235,10 @@ body:             |
     successors: %bb.8.if.then.8(4), %bb.2.for.cond(124)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     TEST32rr $edx, $edx, implicit-def $eflags, debug-location !45
     JG_1 %bb.8.if.then.8, implicit $eflags
@@ -247,13 +247,13 @@ body:             |
     successors: %bb.3.for.body(124), %bb.9.for.end(4)
     liveins: $eax, $ecx, $edi, $edx, $esi, $r8d
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $eax = INC32r killed $eax, implicit-def dead $eflags, debug-location !44
-    DBG_VALUE debug-use $eax, debug-use _, !13, !17, debug-location !25
+    DBG_VALUE $eax, _, !13, !17, debug-location !25
     CMP32rr $eax, $r8d, implicit-def $eflags, debug-location !31
     JL_1 %bb.3.for.body, implicit $eflags
     JMP_1 %bb.9.for.end
@@ -261,8 +261,8 @@ body:             |
   bb.4.if.then:
     liveins: $ecx, $edi
   
-    DBG_VALUE debug-use $edi, debug-use _, !9, !17, debug-location !18
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edi, _, !9, !17, debug-location !18
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $ecx = IMUL32rr killed $ecx, killed $edi, implicit-def dead $eflags, debug-location !36
     DBG_VALUE 0, 0, !13, !17, debug-location !25
@@ -272,8 +272,8 @@ body:             |
   bb.6.if.then.4:
     liveins: $ecx, $esi
   
-    DBG_VALUE debug-use $esi, debug-use _, !10, !17, debug-location !19
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $esi, _, !10, !17, debug-location !19
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $ecx = IMUL32rr killed $ecx, killed $esi, implicit-def dead $eflags, debug-location !40
     DBG_VALUE 0, 0, !13, !17, debug-location !25
@@ -284,8 +284,8 @@ body:             |
     successors: %bb.9.for.end(0)
     liveins: $ecx, $edx
   
-    DBG_VALUE debug-use $edx, debug-use _, !11, !17, debug-location !21
-    DBG_VALUE debug-use $ecx, debug-use _, !12, !17, debug-location !23
+    DBG_VALUE $edx, _, !11, !17, debug-location !21
+    DBG_VALUE $ecx, _, !12, !17, debug-location !23
     DBG_VALUE 0, 0, !13, !17, debug-location !25
     $ecx = IMUL32rr killed $ecx, killed $edx, implicit-def dead $eflags, debug-location !46
   
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir b/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir
index 3e3d0992ac85847ccbbca46f7ace8e4e83c83cd3..edc2a2624ee9ffd1fed648569a93fca1d8bc9198 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values-reg-copy.mir
@@ -5,9 +5,9 @@
 # to another. The altered instructions are labeled below.
 #
 # CHECK: ![[ARG1:.*]] = !DILocalVariable(name: "arg1"
-# CHECK: DBG_VALUE debug-use $ebx, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK: DBG_VALUE $ebx, $noreg, ![[ARG1]], !DIExpression(), debug-location
 # CHECK: $r12d = MOV32rr killed $ebx, implicit-def $r12
-# CHECK-NEXT: DBG_VALUE debug-use $r12d, debug-use $noreg, ![[ARG1]], !DIExpression(), debug-location
+# CHECK-NEXT: DBG_VALUE $r12d, $noreg, ![[ARG1]], !DIExpression(), debug-location
 --- |
   ; ModuleID = 'live-debug-values-reg-copy.ll'
   source_filename = "live-debug-values-reg-copy.c"
@@ -148,9 +148,9 @@ body:             |
     CFI_INSTRUCTION def_cfa_offset 32
     CFI_INSTRUCTION offset $rbx, -24
     CFI_INSTRUCTION offset $rbp, -16
-    DBG_VALUE debug-use $edi, debug-use $noreg, !12, !DIExpression(), debug-location !15
+    DBG_VALUE $edi, $noreg, !12, !DIExpression(), debug-location !15
     $ebx = MOV32rr $edi, implicit-def $rbx
-    DBG_VALUE debug-use $ebx, debug-use $noreg, !12, !DIExpression(), debug-location !15
+    DBG_VALUE $ebx, $noreg, !12, !DIExpression(), debug-location !15
     renamable $rdi = LEA64r $rsp, 1, $noreg, 4, $noreg
     CALL64pcrel32 @init, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, debug-location !15
     renamable $edi = MOV32rm $rsp, 1, $noreg, 4, $noreg :: (dereferenceable load 4 from %ir.local1, !tbaa !20)
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-spill.mir b/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
index fb83963a4e9e6ac925bf5bfca742a9d6938fec4a..78a9a01dda791d785826c42bb6a8eaac97341898 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
@@ -53,35 +53,35 @@
 #
 # GENERATE:      bb.1.if.end:
 # GENERATE:      MOV32mr $rbp, 1, $noreg, -48, $noreg, killed $edx :: (store 4 into %stack.5)
-# GENERATE-NEXT: DBG_VALUE debug-use $rbp, 0, ![[INT0]], !DIExpression(DW_OP_constu, 48, DW_OP_minus)
+# GENERATE-NEXT: DBG_VALUE $rbp, 0, ![[INT0]], !DIExpression(DW_OP_constu, 48, DW_OP_minus)
 # GENERATE:      MOV32mr $rbp, 1, $noreg, -52, $noreg, killed $r8d :: (store 4 into %stack.4)
-# GENERATE-NEXT: DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-NEXT: DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 # GENERATE:      MOV32mr $rbp, 1, $noreg, -56, $noreg, killed $esi :: (store 4 into %stack.3)
-# GENERATE-NEXT: DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-NEXT: DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
 #
 # Check that the spill locations that are valid at the end of bb.1.if.end are
 # propagated to subsequent BBs.
 #
 # GENERATE:      bb.2.if.then4:
 # GENERATE-NOT:  bb.3:
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 #
 # GENERATE:      bb.3:
 # GENERATE-NOT:  bb.4.if.end13:
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 #
 # GENERATE:      bb.4.if.end13:
 # GENERATE-NOT:  bb.5.cleanup:
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
-# GENERATE-DAG:  DBG_VALUE debug-use $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTD]], !DIExpression(DW_OP_constu, 56, DW_OP_minus)
+# GENERATE-DAG:  DBG_VALUE $rbp, 0, ![[INTB]], !DIExpression(DW_OP_constu, 52, DW_OP_minus)
 # 
 # Check that the spill location rbp-48 (the variable int0) is not propagated 
 # because int0 is redefined within the same basic block.
 #
 # TERMINATE:     bb.2.if.then4:
-# TERMINATE-NOT: DBG_VALUE debug-use $rbp, -48,
+# TERMINATE-NOT: DBG_VALUE $rbp, -48,
 --- |
   ; ModuleID = '<stdin>'
   source_filename = "spill1.c"
@@ -369,31 +369,31 @@ body:             |
     CFI_INSTRUCTION offset $r13, -40
     CFI_INSTRUCTION offset $r14, -32
     CFI_INSTRUCTION offset $r15, -24
-    DBG_VALUE debug-use $edi, debug-use _, !24, !38, debug-location !39
-    DBG_VALUE debug-use $esi, debug-use _, !25, !38, debug-location !40
-    DBG_VALUE debug-use $edx, debug-use _, !26, !38, debug-location !41
-    DBG_VALUE debug-use $ecx, debug-use _, !27, !38, debug-location !42
-    DBG_VALUE debug-use $r8d, debug-use _, !28, !38, debug-location !43
-    DBG_VALUE debug-use $r9d, debug-use _, !29, !38, debug-location !44
+    DBG_VALUE $edi, _, !24, !38, debug-location !39
+    DBG_VALUE $esi, _, !25, !38, debug-location !40
+    DBG_VALUE $edx, _, !26, !38, debug-location !41
+    DBG_VALUE $ecx, _, !27, !38, debug-location !42
+    DBG_VALUE $r8d, _, !28, !38, debug-location !43
+    DBG_VALUE $r9d, _, !29, !38, debug-location !44
     $r14d = MOV32rr $r8d
-    DBG_VALUE debug-use $r14d, debug-use _, !28, !38, debug-location !43
+    DBG_VALUE $r14d, _, !28, !38, debug-location !43
     $r12d = MOV32rr $esi
-    DBG_VALUE debug-use $r12d, debug-use _, !25, !38, debug-location !40
+    DBG_VALUE $r12d, _, !25, !38, debug-location !40
     $eax = MOV32rr $edi
-    DBG_VALUE debug-use $eax, debug-use _, !24, !38, debug-location !39
+    DBG_VALUE $eax, _, !24, !38, debug-location !39
     $r13d = MOV32rm $rip, 1, _, @glob0, _, debug-location !46 :: (dereferenceable load 4 from @glob0, !tbaa !47)
-    DBG_VALUE debug-use $r13d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r13d, _, !31, !38, debug-location !51
     $r8d = MOV32rm $rip, 1, _, @glob1, _, debug-location !52 :: (dereferenceable load 4 from @glob1, !tbaa !47)
-    DBG_VALUE debug-use $r8d, debug-use _, !32, !38, debug-location !53
+    DBG_VALUE $r8d, _, !32, !38, debug-location !53
     $r15d = MOV32rm $rip, 1, _, @glob2, _, debug-location !54 :: (dereferenceable load 4 from @glob2, !tbaa !47)
-    DBG_VALUE debug-use $r15d, debug-use _, !33, !38, debug-location !55
+    DBG_VALUE $r15d, _, !33, !38, debug-location !55
     $esi = MOV32rm $rip, 1, _, @glob3, _, debug-location !56 :: (dereferenceable load 4 from @glob3, !tbaa !47)
-    DBG_VALUE debug-use $esi, debug-use _, !34, !38, debug-location !57
+    DBG_VALUE $esi, _, !34, !38, debug-location !57
     $ebx = MOV32rm $rip, 1, _, @glob4, _, debug-location !59 :: (dereferenceable load 4 from @glob4, !tbaa !47)
-    DBG_VALUE debug-use $ebx, debug-use _, !35, !38, debug-location !60
+    DBG_VALUE $ebx, _, !35, !38, debug-location !60
     MOV32mr $rbp, 1, _, -44, _, $ebx, debug-location !60 :: (store 4 into %ir.inte, !tbaa !47)
     $edi = MOV32rm $rip, 1, _, @glob5, _, debug-location !62 :: (dereferenceable load 4 from @glob5, !tbaa !47)
-    DBG_VALUE debug-use $edi, debug-use _, !36, !38, debug-location !63
+    DBG_VALUE $edi, _, !36, !38, debug-location !63
     MOV32mr $rbp, 1, _, -60, _, $edi, debug-location !63 :: (store 4 into %ir.intf, !tbaa !47)
     TEST32rr killed $eax, $eax, implicit-def $eflags, debug-location !67
     JNE_1 %bb.5.cleanup, implicit $eflags
@@ -405,11 +405,11 @@ body:             |
     MOV32mr $rbp, 1, _, -48, _, killed $edx :: (store 4 into %stack.5)
     MOV32mr $rbp, 1, _, -52, _, killed $r8d :: (store 4 into %stack.4)
     MOV32mr $rbp, 1, _, -56, _, killed $esi :: (store 4 into %stack.3)
-    DBG_VALUE debug-use _, debug-use _, !30, !38, debug-location !45
+    DBG_VALUE _, _, !30, !38, debug-location !45
     $r14d = ADD32rr killed $r14d, killed $ecx, implicit-def dead $eflags, debug-location !68
     $r14d = ADD32rr killed $r14d, killed $r9d, implicit-def dead $eflags, debug-location !69
     $r14d = IMUL32rm killed $r14d, $rbp, 1, _, 16, _, implicit-def dead $eflags, debug-location !70 :: (load 4 from %fixed-stack.6, align 16)
-    DBG_VALUE debug-use $r14d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE $r14d, _, !26, !38, debug-location !41
     CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !72
     $edi = MOV32rr killed $ebx, debug-location !73
     CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !73
@@ -421,21 +421,21 @@ body:             |
     liveins: $r14d, $r15d, $rbp
   
     $rdi = LEA64r $rbp, 1, _, -44, _
-    DBG_VALUE debug-use $rbp, -44, !35, !38, debug-location !60
+    DBG_VALUE $rbp, -44, !35, !38, debug-location !60
     $rsi = LEA64r $rbp, 1, _, -60, _
-    DBG_VALUE debug-use $rbp, -60, !36, !38, debug-location !63
+    DBG_VALUE $rbp, -60, !36, !38, debug-location !63
     $rdx = LEA64r $rbp, 1, _, -64, _
-    DBG_VALUE debug-use $rbp, -64, !37, !38, debug-location !78
+    DBG_VALUE $rbp, -64, !37, !38, debug-location !78
     CALL64pcrel32 @set, csr_64, implicit $rsp, implicit $rdi, implicit $rsi, implicit $rdx, implicit-def $rsp, debug-location !79
     $eax = MOV32rm $rbp, 1, _, -44, _, debug-location !81 :: (dereferenceable load 4 from %ir.inte, !tbaa !47)
-    DBG_VALUE debug-use $eax, debug-use _, !35, !38, debug-location !60
+    DBG_VALUE $eax, _, !35, !38, debug-location !60
     $r15d = ADD32rm killed $r15d, $rbp, 1, _, -52, _, implicit-def dead $eflags, debug-location !82 :: (load 4 from %stack.4)
     $r15d = IMUL32rr killed $r15d, $eax, implicit-def dead $eflags, debug-location !82
     $r15d = ADD32rm killed $r15d, $rbp, 1, _, -56, _, implicit-def dead $eflags, debug-location !83 :: (load 4 from %stack.3)
     $r15d = IMUL32rr killed $r15d, killed $eax, implicit-def dead $eflags, debug-location !84
-    DBG_VALUE debug-use $r15d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r15d, _, !31, !38, debug-location !51
     $r13d = MOV32rr killed $r15d
-    DBG_VALUE debug-use $r13d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r13d, _, !31, !38, debug-location !51
     JMP_1 %bb.4.if.end13
   
   bb.2:
@@ -443,17 +443,17 @@ body:             |
     liveins: $r13d, $r14d, $rbp
   
     $r14d = ADD32rm killed $r14d, $rbp, 1, _, -48, _, implicit-def dead $eflags, debug-location !71 :: (load 4 from %stack.5)
-    DBG_VALUE debug-use $r14d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE $r14d, _, !26, !38, debug-location !41
   
   bb.4.if.end13:
     successors: %bb.5.cleanup(0x80000000)
     liveins: $r13d, $r14d, $rbp
   
-    DBG_VALUE debug-use $r14d, debug-use _, !26, !38, debug-location !41
-    DBG_VALUE debug-use $r13d, debug-use _, !31, !38, debug-location !51
+    DBG_VALUE $r14d, _, !26, !38, debug-location !41
+    DBG_VALUE $r13d, _, !31, !38, debug-location !51
     $r13d = IMUL32rm killed $r13d, $rbp, 1, _, 16, _, implicit-def dead $eflags, debug-location !86 :: (load 4 from %fixed-stack.6, align 16)
     $r13d = ADD32rr killed $r13d, killed $r14d, implicit-def dead $eflags, debug-location !87
-    DBG_VALUE debug-use $r13d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE $r13d, _, !26, !38, debug-location !41
     $edi = MOV32rr killed $r13d, debug-location !88
     CALL64pcrel32 @use, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !88
   
diff --git a/test/DebugInfo/MIR/X86/live-debug-values.mir b/test/DebugInfo/MIR/X86/live-debug-values.mir
index c3558aaed311a623f2daf5c197172af294ccbced..5245285da5e33df4a469b5eae4a4e1de9364adb4 100644
--- a/test/DebugInfo/MIR/X86/live-debug-values.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-values.mir
@@ -35,7 +35,7 @@
 # CHECK: ![[N_VAR:[0-9]+]] = !DILocalVariable(name: "n",{{.*}})
 #
 # CHECK:      bb.5.if.end.7:
-# CHECK:        DBG_VALUE debug-use $ebx, debug-use $noreg, ![[N_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
+# CHECK:        DBG_VALUE $ebx, $noreg, ![[N_VAR]], !DIExpression(), debug-location !{{[0-9]+}}
 
 
 --- |
@@ -193,10 +193,10 @@ body:             |
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION def_cfa_offset 16
     CFI_INSTRUCTION offset $rbx, -16
-    DBG_VALUE debug-use $edi, debug-use _, !12, !20, debug-location !21
-    DBG_VALUE debug-use $rsi, debug-use _, !13, !20, debug-location !22
+    DBG_VALUE $edi, _, !12, !20, debug-location !21
+    DBG_VALUE $rsi, _, !13, !20, debug-location !22
     $eax = MOV32rr $edi
-    DBG_VALUE debug-use $eax, debug-use _, !12, !20, debug-location !21
+    DBG_VALUE $eax, _, !12, !20, debug-location !21
     $edi = MOV32ri 2
     CMP32ri8 killed $eax, 2, implicit-def $eflags, debug-location !26
     JNE_1 %bb.2.if.end, implicit $eflags
@@ -205,12 +205,12 @@ body:             |
     successors: %bb.2.if.end(0)
     liveins: $rsi
   
-    DBG_VALUE debug-use $rsi, debug-use _, !13, !20, debug-location !22
+    DBG_VALUE $rsi, _, !13, !20, debug-location !22
     $rdi = MOV64rm killed $rsi, 1, _, 8, _, debug-location !27 :: (load 8 from %ir.arrayidx, !tbaa !28)
     dead $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags, implicit-def $al, debug-location !32
     CALL64pcrel32 @atoi, csr_64, implicit $rsp, implicit $rdi, implicit $al, implicit-def $rsp, implicit-def $eax, debug-location !32
     $edi = MOV32rr $eax, debug-location !32
-    DBG_VALUE debug-use $edi, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $edi, _, !14, !20, debug-location !33
   
   bb.2.if.end:
     successors: %bb.3.if.then.3(16), %bb.4.if.else.5(16)
@@ -218,7 +218,7 @@ body:             |
   
     CALL64pcrel32 @change, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, implicit-def $eax, debug-location !34
     $ebx = MOV32rr $eax, debug-location !34
-    DBG_VALUE debug-use $ebx, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $ebx, _, !14, !20, debug-location !33
     CMP32ri8 $ebx, 11, implicit-def $eflags, debug-location !37
     JL_1 %bb.4.if.else.5, implicit killed $eflags, debug-location !37
   
@@ -226,7 +226,7 @@ body:             |
     successors: %bb.5.if.end.7(0)
     liveins: $ebx
   
-    DBG_VALUE debug-use $ebx, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $ebx, _, !14, !20, debug-location !33
     $edi = MOV32rr $ebx, debug-location !38
     CALL64pcrel32 @modify, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, implicit-def $eax, debug-location !38
     $ecx = MOV32rr $eax, debug-location !38
@@ -237,7 +237,7 @@ body:             |
     successors: %bb.5.if.end.7(0)
     liveins: $ebx
   
-    DBG_VALUE debug-use $ebx, debug-use _, !14, !20, debug-location !33
+    DBG_VALUE $ebx, _, !14, !20, debug-location !33
     $edi = MOV32rr killed $ebx, debug-location !42
     CALL64pcrel32 @inc, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, implicit-def $eax, debug-location !42
     $ecx = MOV32rr $eax, debug-location !42
diff --git a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir
index f8d3603b79d9d72dcc4e5870e912a387d2cac176..4a2f96c5d5ee7537ee9fee3d469ec23e62a42cc9 100644
--- a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg-debugonly.mir
@@ -130,12 +130,12 @@ stack:
 constants:
 body:             |
   bb.0.entry:
-    DBG_VALUE debug-use $edi, debug-use _, !21, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use $rsi, debug-use _, !22, !DIExpression(), debug-location !26
+    DBG_VALUE $edi, _, !21, !DIExpression(), debug-location !25
+    DBG_VALUE $rsi, _, !22, !DIExpression(), debug-location !26
     %2 = MOV32rm $rip, 1, _, @bar, _, debug-location !27 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 0)`, !tbaa !28)
-    DBG_VALUE debug-use %2, debug-use _, !23, !DIExpression(), debug-location !32
+    DBG_VALUE %2, _, !23, !DIExpression(), debug-location !32
     %3 = MOV32rm $rip, 1, _, @bar + 4, _, debug-location !33 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 1)`, !tbaa !28)
-    DBG_VALUE debug-use %3, debug-use _, !24, !DIExpression(), debug-location !34
+    DBG_VALUE %3, _, !24, !DIExpression(), debug-location !34
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !35
     $edi = COPY %2, debug-location !35
     $esi = COPY %3, debug-location !35
@@ -153,11 +153,11 @@ body:             |
 # not cover the whole BB.
 #
 # CHECKDBG-LABEL: ********** EMITTING LIVE DEBUG VARIABLES **********
-# CHECKDBG-NEXT: !"argc,5"        [0B;0e):0 Loc0=debug-use $edi
+# CHECKDBG-NEXT: !"argc,5"        [0B;0e):0 Loc0=$edi
 # CHECKDBG-NEXT:         [0B;0e):0 %bb.0-160B
-# CHECKDBG-NEXT: !"argv,5"        [0B;0e):0 Loc0=debug-use $rsi
+# CHECKDBG-NEXT: !"argv,5"        [0B;0e):0 Loc0=$rsi
 # CHECKDBG-NEXT:         [0B;0e):0 %bb.0-160B
-# CHECKDBG-NEXT: !"a0,7"  [16r;64r):0 Loc0=debug-use %2
+# CHECKDBG-NEXT: !"a0,7"  [16r;64r):0 Loc0=%2
 # CHECKDBG-NEXT:         [16r;64r):0 %bb.0-160B
-# CHECKDBG-NEXT: !"a1,8"  [32r;80r):0 Loc0=debug-use %3
+# CHECKDBG-NEXT: !"a1,8"  [32r;80r):0 Loc0=%3
 # CHECKDBG-NEXT:         [32r;80r):0 %bb.0-160B
diff --git a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir
index 430dbb742d43b18cea44c6204492c1de1073356f..ac0d519ddfef6336887f3b1c54f32b16a1a3b2a8 100644
--- a/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir
+++ b/test/DebugInfo/MIR/X86/live-debug-vars-unused-arg.mir
@@ -128,12 +128,12 @@ stack:
 constants:
 body:             |
   bb.0.entry:
-    DBG_VALUE debug-use $edi, debug-use _, !21, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use $rsi, debug-use _, !22, !DIExpression(), debug-location !26
+    DBG_VALUE $edi, _, !21, !DIExpression(), debug-location !25
+    DBG_VALUE $rsi, _, !22, !DIExpression(), debug-location !26
     %2 = MOV32rm $rip, 1, _, @bar, _, debug-location !27 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 0)`, !tbaa !28)
-    DBG_VALUE debug-use %2, debug-use _, !23, !DIExpression(), debug-location !32
+    DBG_VALUE %2, _, !23, !DIExpression(), debug-location !32
     %3 = MOV32rm $rip, 1, _, @bar + 4, _, debug-location !33 :: (dereferenceable load 4 from `i32* getelementptr inbounds ([2 x i32], [2 x i32]* @bar, i64 0, i64 1)`, !tbaa !28)
-    DBG_VALUE debug-use %3, debug-use _, !24, !DIExpression(), debug-location !34
+    DBG_VALUE %3, _, !24, !DIExpression(), debug-location !34
     ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !35
     $edi = COPY %2, debug-location !35
     $esi = COPY %3, debug-location !35
@@ -150,9 +150,9 @@ body:             |
 # CHECKMIR: ![[ARGV:[0-9]+]] = !DILocalVariable(name: "argv", arg: 2
 # CHECKMIR: name:            main
 # CHECKMIR: body:
-# CHECKMIR: DBG_VALUE debug-use $edi, debug-use $noreg, ![[ARGC]]
-# CHECKMIR-NOT: DBG_VALUE debug-use %{{.*}}, debug-use $noreg, ![[ARGC]]
-# CHECKMIR: DBG_VALUE debug-use $rsi, debug-use $noreg, ![[ARGV]]
-# CHECKMIR-NOT: DBG_VALUE debug-use %{{.*}}, debug-use $noreg, ![[ARGC]]
-# CHECKMIR-NOT: DBG_VALUE debug-use %{{.*}}, debug-use $noreg, ![[ARGV]]
+# CHECKMIR: DBG_VALUE $edi, $noreg, ![[ARGC]]
+# CHECKMIR-NOT: DBG_VALUE %{{.*}}, $noreg, ![[ARGC]]
+# CHECKMIR: DBG_VALUE $rsi, $noreg, ![[ARGV]]
+# CHECKMIR-NOT: DBG_VALUE %{{.*}}, $noreg, ![[ARGC]]
+# CHECKMIR-NOT: DBG_VALUE %{{.*}}, $noreg, ![[ARGV]]
 
diff --git a/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir b/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir
index 6c78a76a3282a39b3395d971c3bfc9a8eea4d4d3..509d16a736cb64539faebf2785a11b2cbe08ae18 100644
--- a/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir
+++ b/test/DebugInfo/MIR/X86/livedebugvalues-limit.mir
@@ -25,13 +25,13 @@
   ; CHECK: ![[CS3]] = distinct !DILocation(line: 8, column: 3, scope: !{{[0-9]+}})
   ;
   ; CHECK:  bb.1.if.then:
-  ; CHECK:      DBG_VALUE debug-use $ebx, debug-use $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
-  ; CHECK-NOT:  DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location
-  ; CHECK:      DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS2]]
+  ; CHECK:      DBG_VALUE $ebx, $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
+  ; CHECK-NOT:  DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location
+  ; CHECK:      DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS2]]
   ; CHECK: bb.2.if.end:
-  ; CHECK:     DBG_VALUE debug-use $ebx, debug-use $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
-  ; CHECK-NOT: DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location
-  ; CHECK:     DBG_VALUE debug-use $ebx, debug-use $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS3]]
+  ; CHECK:     DBG_VALUE $ebx, $noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
+  ; CHECK-NOT: DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location
+  ; CHECK:     DBG_VALUE $ebx, $noreg, ![[A_VAR]], !DIExpression(), debug-location ![[INLCS3]]
   ;
   ; ModuleID = 'livedebugvalues-limit.ll'
   source_filename = "livedebugvalues-limit.c"
@@ -159,7 +159,7 @@ body:             |
     CFI_INSTRUCTION offset $rbp, -16
     $rbp = frame-setup MOV64rr $rsp
     CFI_INSTRUCTION def_cfa_register $rbp
-    DBG_VALUE debug-use $edi, debug-use _, !12, !13, debug-location !14
+    DBG_VALUE $edi, _, !12, !13, debug-location !14
     $rbp = POP64r implicit-def $rsp, implicit $rsp, debug-location !15
     TAILJMPd64 @sink, csr_64, implicit $rsp, implicit $rsp, implicit $edi, debug-location !15
 
@@ -208,10 +208,10 @@ body:             |
     frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp
     frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp
     CFI_INSTRUCTION offset $rbx, -24
-    DBG_VALUE debug-use $edi, debug-use _, !19, !13, debug-location !20
+    DBG_VALUE $edi, _, !19, !13, debug-location !20
     $ebx = MOV32rr $edi
-    DBG_VALUE debug-use $ebx, debug-use _, !12, !13, debug-location !21
-    DBG_VALUE debug-use $ebx, debug-use _, !19, !13, debug-location !20
+    DBG_VALUE $ebx, _, !12, !13, debug-location !21
+    DBG_VALUE $ebx, _, !19, !13, debug-location !20
     CALL64pcrel32 @sink, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !23
     TEST32rr $ebx, $ebx, implicit-def $eflags, debug-location !24
     JE_1 %bb.2.if.end, implicit $eflags
@@ -220,18 +220,18 @@ body:             |
     successors: %bb.2.if.end
     liveins: $ebx, $rbp
   
-    DBG_VALUE debug-use $ebx, debug-use _, !19, !13, debug-location !20
-    DBG_VALUE debug-use $ebx, debug-use _, !12, !13, debug-location !27
+    DBG_VALUE $ebx, _, !19, !13, debug-location !20
+    DBG_VALUE $ebx, _, !12, !13, debug-location !27
     $edi = MOV32rr $ebx, debug-location !29
     CALL64pcrel32 @sink, csr_64, implicit $rsp, implicit $edi, implicit-def $rsp, debug-location !29
   
   bb.2.if.end:
     liveins: $ebx, $rbp
   
-    DBG_VALUE debug-use $ebx, debug-use _, !19, !13, debug-location !20
+    DBG_VALUE $ebx, _, !19, !13, debug-location !20
     $edi = MOV32rr killed $ebx, debug-location !33
     $rsp = ADD64ri8 $rsp, 8, implicit-def dead $eflags, debug-location !33
-    DBG_VALUE debug-use $ebx, debug-use _, !12, !13, debug-location !31
+    DBG_VALUE $ebx, _, !12, !13, debug-location !31
     $rbx = POP64r implicit-def $rsp, implicit $rsp, debug-location !33
     $rbp = POP64r implicit-def $rsp, implicit $rsp, debug-location !33
     TAILJMPd64 @sink, csr_64, implicit $rsp, implicit $rsp, implicit $edi, debug-location !33
diff --git a/test/DebugInfo/MIR/X86/mlicm-hoist.mir b/test/DebugInfo/MIR/X86/mlicm-hoist.mir
index 2c2f4edad4f67e6e58d0f7d57d6b800ceaa7486a..0797e89d2c61b5fad4808272a9ad6c27b4842039 100644
--- a/test/DebugInfo/MIR/X86/mlicm-hoist.mir
+++ b/test/DebugInfo/MIR/X86/mlicm-hoist.mir
@@ -122,17 +122,17 @@ body:             |
     successors: %bb.1.while.body(0x80000000)
     liveins: $rdi
 
-    DBG_VALUE debug-use $rdi, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE $rdi, _, !16, !17, debug-location !18
     %2 = COPY $rdi
-    DBG_VALUE debug-use %2, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE %2, _, !16, !17, debug-location !18
 
   bb.1.while.body:
     successors: %bb.1.while.body(0x80000000)
 
     %0 = PHI %2, %bb.0.entry, %1, %bb.1.while.body
-    DBG_VALUE debug-use %0, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE %0, _, !16, !17, debug-location !18
     %1 = ADD64ri8 %0, 4, implicit-def dead $eflags, debug-location !20
-    DBG_VALUE debug-use %1, debug-use _, !16, !17, debug-location !18
+    DBG_VALUE %1, _, !16, !17, debug-location !18
     %3 = MOV32rm %0, 1, _, 0, _, debug-location !21 :: (load 4 from %ir.p.addr.0, !tbaa !22)
     %4 = MOV64rm $rip, 1, _, target-flags(x86-gotpcrel) @x, _, debug-location !26 :: (load 8 from got)
     MOV32mr killed %4, 1, _, 0, _, killed %3, debug-location !26 :: (store 4 into @x, !tbaa !22)
diff --git a/test/DebugInfo/MIR/X86/regcoalescer.mir b/test/DebugInfo/MIR/X86/regcoalescer.mir
index 4136d5ebe6354541c8b3ecbbe5b78642c873a4ea..8601893cdc7dffa61482858c2447b6395120fb4e 100644
--- a/test/DebugInfo/MIR/X86/regcoalescer.mir
+++ b/test/DebugInfo/MIR/X86/regcoalescer.mir
@@ -40,11 +40,11 @@ registers:
 body:             |
   bb.0.entry:
     %0 = MOV32r0 implicit-def dead $eflags, debug-location !19
-    DBG_VALUE debug-use %0, debug-use _, !18, !DIExpression(), debug-location !20
+    DBG_VALUE %0, _, !18, !DIExpression(), debug-location !20
     $eax = COPY killed %0, debug-location !21
     RET 0, killed $eax, debug-location !21
 
 ...
 
 # CHECK: $eax = MOV32r0
-# CHECK-NEXT: DBG_VALUE debug-use $eax
+# CHECK-NEXT: DBG_VALUE $eax
diff --git a/test/DebugInfo/MSP430/sdagsplit-1.ll b/test/DebugInfo/MSP430/sdagsplit-1.ll
index 7f2356a083fe1c0789e6fc9c9b2a269006464c49..9e77e950f837bc21d135679fb14b4adde840650b 100644
--- a/test/DebugInfo/MSP430/sdagsplit-1.ll
+++ b/test/DebugInfo/MSP430/sdagsplit-1.ll
@@ -13,10 +13,10 @@
 ;      return 0;
 ;    }
 ;
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 16), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 48, 16), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 16), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use $r{{[0-9]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 16, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 48, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 16), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE $r{{[0-9]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 16, 16), debug-location !{{[0-9]+}}
 
 ; ModuleID = 'sdagsplit-1.c'
 target datalayout = "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16"
diff --git a/test/DebugInfo/Mips/delay-slot.ll b/test/DebugInfo/Mips/delay-slot.ll
index f8959a2c52becf6e726231a3e319beabacd99b63..8f444bce30fd12a1e35629bbe8a58cf0c987fd27 100644
--- a/test/DebugInfo/Mips/delay-slot.ll
+++ b/test/DebugInfo/Mips/delay-slot.ll
@@ -16,7 +16,7 @@
 ; CHECK: 0x0000000000000004      2      0      1   0             0  is_stmt prologue_end
 ; CHECK: 0x0000000000000024      3      0      1   0             0  is_stmt
 ; CHECK: 0x0000000000000034      4      0      1   0             0  is_stmt
-; CHECK: 0x0000000000000044      5      0      1   0             0  is_stmt
+; CHECK: 0x0000000000000048      5      0      1   0             0  is_stmt
 ; CHECK: 0x0000000000000058      5      0      1   0             0  is_stmt end_sequence
 
 
diff --git a/test/DebugInfo/Mips/eh_frame.ll b/test/DebugInfo/Mips/eh_frame.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4687443cb1cff294e18c43d1bae4bae0cce47385
--- /dev/null
+++ b/test/DebugInfo/Mips/eh_frame.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple mips-unknown-linux-gnu -mattr=+micromips -O3 -filetype=obj -o - %s | llvm-readelf -r | FileCheck %s
+
+; CHECK: .rel.eh_frame
+; CHECK: DW.ref.__gxx_personality_v0
+; CHECK-NEXT: .text
+; CHECK-NEXT: .gcc_except_table
+
+@_ZTIi = external constant i8*
+
+define dso_local i32 @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  %exception.i = tail call i8* @__cxa_allocate_exception(i32 4) nounwind
+  %0 = bitcast i8* %exception.i to i32*
+  store i32 5, i32* %0, align 16
+  invoke void @__cxa_throw(i8* %exception.i, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+          to label %.noexc unwind label %return
+
+.noexc:
+  unreachable
+
+return:
+  %1 = landingpad { i8*, i32 }
+          catch i8* null
+  %2 = extractvalue { i8*, i32 } %1, 0
+  %3 = tail call i8* @__cxa_begin_catch(i8* %2) nounwind
+  tail call void @__cxa_end_catch()
+  ret i32 0
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i8* @__cxa_begin_catch(i8*) local_unnamed_addr
+
+declare void @__cxa_end_catch() local_unnamed_addr
+
+declare i8* @__cxa_allocate_exception(i32) local_unnamed_addr
+
+declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
diff --git a/test/DebugInfo/NVPTX/cu-range-hole.ll b/test/DebugInfo/NVPTX/cu-range-hole.ll
index 01d038477c0da1f59e9d32c4e6cb7d28212b49b5..c8ea509396a943fb7ab74d58417f479936a92f1c 100644
--- a/test/DebugInfo/NVPTX/cu-range-hole.ll
+++ b/test/DebugInfo/NVPTX/cu-range-hole.ll
@@ -148,75 +148,15 @@ entry:
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0xb0 DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 118
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 32
-; CHECK: // .b8 51
-; CHECK: // .b8 46
-; CHECK: // .b8 53
-; CHECK: // .b8 46
-; CHECK: // .b8 48
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 107
-; CHECK: // .b8 32
-; CHECK: // .b8 50
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 49
-; CHECK: // .b8 54
-; CHECK: // .b8 52
-; CHECK: // .b8 41
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 118
-; CHECK: // .b8 109
-; CHECK: // .b8 47
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 107
-; CHECK: // .b8 32
-; CHECK: // .b8 50
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 49
-; CHECK: // .b8 56
-; CHECK: // .b8 51
-; CHECK: // .b8 41
+; CHECK: // .b8 99,108,97,110,103,32,118,101,114,115,105,111,110,32,51,46,53,46,48,32,40,116,114,117,110,107,32,50,48,52,49,54,52,41,32,40,108,108,118,109 // DW_AT_producer
+; CHECK: // .b8 47,116,114,117,110,107,32,50,48,52,49,56,51,41
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 46
-; CHECK: // .b8 99
+; CHECK: // .b8 98,46,99                         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 114
-; CHECK: // .b8 99
-; CHECK: // .b8 101
+; CHECK: // .b8 47,115,111,117,114,99,101        // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end2                      // DW_AT_high_pc
@@ -259,9 +199,7 @@ entry:
 ; CHECK: // .b32 179                             // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 4                                // Abbrev [4] 0xb3:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
diff --git a/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
index 8a6fddddd8843e8f7d0d4b886b91a558d6488fef..9a4beed23d7d8150636393b5114bb19fbdbab2b1 100644
--- a/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
+++ b/test/DebugInfo/NVPTX/dbg-declare-alloca.ll
@@ -129,23 +129,14 @@
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x80 DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
+; CHECK: // .b8 99,108,97,110,103                // DW_AT_producer
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 46
-; CHECK: // .b8 99
+; CHECK: // .b8 116,46,99                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 116                              // DW_AT_comp_dir
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 116
+; CHECK: // .b8 116,101,115,116                  // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
@@ -154,21 +145,7 @@
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 100
-; CHECK: // .b8 98
-; CHECK: // .b8 103
-; CHECK: // .b8 95
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 99
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 101
+; CHECK: // .b8 117,115,101,95,100,98,103,95,100,101,99,108,97,114,101 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -187,9 +164,7 @@
 ; CHECK: // .b32 110                             // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 4                                // Abbrev [4] 0x6e:0x15 DW_TAG_structure_type
-; CHECK: // .b8 70                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 111
+; CHECK: // .b8 70,111,111                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 1                                // DW_AT_decl_file
@@ -205,9 +180,7 @@
 ; CHECK: // .b8 0
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 6                                // Abbrev [6] 0x83:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
diff --git a/test/DebugInfo/NVPTX/dbg-value-const-byref.ll b/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b120a406454d0b1836a47ef7982c12e5995c80c0
--- /dev/null
+++ b/test/DebugInfo/NVPTX/dbg-value-const-byref.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=nvptx64-nvidia-cuda < %s | FileCheck %s
+; Generated with -O1 from:
+; int f1();
+; void f2(int*);
+; int f3(int);
+;
+; int foo() {
+;   int i = 3;
+;   f3(i);
+;   i = 7;
+;   i = f1();
+;   f2(&i);
+;   return 0;
+; }
+;
+; Test that we generate valid debug info for optimized code,
+; particularly variables that are described as constants and passed
+; by reference.
+;
+; CHECK: DEBUG_VALUE: foo:i <- [DW_OP_deref] $vrdepot
+; CHECK: DEBUG_VALUE: foo:i <- 3
+; CHECK: DEBUG_VALUE: foo:i <- 7
+; CHECK: DEBUG_VALUE: foo:i <- %
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo() #0 !dbg !4 {
+entry:
+  %i = alloca i32, align 4
+  call void @llvm.dbg.value(metadata i32 3, metadata !10, metadata !DIExpression()), !dbg !15
+  %call = call i32 @f3(i32 3) #3, !dbg !16
+  call void @llvm.dbg.value(metadata i32 7, metadata !10, metadata !DIExpression()), !dbg !18
+  %call1 = call i32 (...) @f1() #3, !dbg !19
+  call void @llvm.dbg.value(metadata i32 %call1, metadata !10, metadata !DIExpression()), !dbg !19
+  store i32 %call1, i32* %i, align 4, !dbg !19, !tbaa !20
+  call void @llvm.dbg.value(metadata i32* %i, metadata !10, metadata !DIExpression(DW_OP_deref)), !dbg !24
+  call void @f2(i32* %i) #3, !dbg !24
+  ret i32 0, !dbg !25
+}
+
+declare i32 @f3(i32)
+
+declare i32 @f1(...)
+
+declare void @f2(i32*)
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #2
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!11, !12}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5.0 ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "dbg-value-const-byref.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: true, unit: !0, scopeLine: 5, file: !1, scope: !5, type: !6, retainedNodes: !9)
+!5 = !DIFile(filename: "dbg-value-const-byref.c", directory: "")
+!6 = !DISubroutineType(types: !7)
+!7 = !{!8}
+!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{!10}
+!10 = !DILocalVariable(name: "i", line: 6, scope: !4, file: !5, type: !8)
+!11 = !{i32 2, !"Dwarf Version", i32 2}
+!12 = !{i32 1, !"Debug Info Version", i32 3}
+!13 = !{!"clang version 3.5.0 "}
+!14 = !{i32 3}
+!15 = !DILocation(line: 6, scope: !4)
+!16 = !DILocation(line: 7, scope: !4)
+!17 = !{i32 7}
+!18 = !DILocation(line: 8, scope: !4)
+!19 = !DILocation(line: 9, scope: !4)
+!20 = !{!21, !21, i64 0}
+!21 = !{!"int", !22, i64 0}
+!22 = !{!"omnipotent char", !23, i64 0}
+!23 = !{!"Simple C/C++ TBAA"}
+!24 = !DILocation(line: 10, scope: !4)
+!25 = !DILocation(line: 11, scope: !4)
diff --git a/test/DebugInfo/NVPTX/debug-file-loc-only.ll b/test/DebugInfo/NVPTX/debug-file-loc-only.ll
new file mode 100644
index 0000000000000000000000000000000000000000..389a7c65781ae19990111331cd1f15a2bc1a7faa
--- /dev/null
+++ b/test/DebugInfo/NVPTX/debug-file-loc-only.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda | FileCheck %s
+
+; // Bitcode int this test case is reduced version of compiled code below:
+;extern "C" {
+;#line 1 "/source/dir/foo.h"
+;__device__ void foo() {}
+;#line 2 "/source/dir/bar.cu"
+;__device__ void bar() {}
+;}
+
+; CHECK: .target sm_{{[0-9]+$}}
+
+; CHECK: .visible .func foo()
+; CHECK: .loc [[FOO:[0-9]+]] 1 31
+; CHECK:  ret;
+; CHECK: .visible .func bar()
+; CHECK: .loc [[BAR:[0-9]+]] 2 31
+; CHECK:  ret;
+
+define void @foo() !dbg !4 {
+bb:
+  ret void, !dbg !10
+}
+
+define void @bar() !dbg !7 {
+bb:
+  ret void, !dbg !11
+}
+
+; CHECK-DAG: .file [[FOO]] "{{.*}}foo.h"
+; CHECK-DAG: .file [[BAR]] "{{.*}}bar.cu"
+
+; CHECK-NOT: .section .debug{{.*}}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2)
+!1 = !DIFile(filename: "bar.cu", directory: "/source/dir")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "foo", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!5 = !DIFile(filename: "foo.h", directory: "/source/dir")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 2}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = !DILocation(line: 1, column: 31, scope: !4)
+!11 = !DILocation(line: 2, column: 31, scope: !7)
diff --git a/test/DebugInfo/NVPTX/debug-file-loc.ll b/test/DebugInfo/NVPTX/debug-file-loc.ll
index 16753e763220a3f7f7d084c49619184654384b71..a9ea67c338867f806a29ffc11200766fa57de031 100644
--- a/test/DebugInfo/NVPTX/debug-file-loc.ll
+++ b/test/DebugInfo/NVPTX/debug-file-loc.ll
@@ -63,25 +63,10 @@ bb:
 ; CHECK: // .b8 0                                // DW_AT_producer
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 117
+; CHECK: // .b8 98,97,114,46,99,117              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 114
-; CHECK: // .b8 99
-; CHECK: // .b8 101
-; CHECK: // .b8 47
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 114
+; CHECK: // .b8 47,115,111,117,114,99,101,47,100,105,114                // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end1                      // DW_AT_high_pc
diff --git a/test/DebugInfo/NVPTX/debug-info.ll b/test/DebugInfo/NVPTX/debug-info.ll
index f80a8426286f91c36bee2591ff05cd32675f3555..02e6240aa3eea49d4499d768f7db6e5462b73e73 100644
--- a/test/DebugInfo/NVPTX/debug-info.ll
+++ b/test/DebugInfo/NVPTX/debug-info.ll
@@ -36,7 +36,6 @@
 ; CHECK: setp.ge.s32     %p{{.+}}, %r{{.+}}, %r{{.+}};
 ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7
 ; CHECK: @%p{{.+}} bra   [[BB:.+]];
-; CHECK: .loc [[DEBUG_INFO_CU]] 8 13
 ; CHECK: ld.param.f32    %f{{.+}}, [{{.+}}];
 ; CHECK: ld.param.u64    %rd{{.+}}, [{{.+}}];
 ; CHECK: cvta.to.global.u64      %rd{{.+}}, %rd{{.+}};
@@ -44,6 +43,7 @@
 ; CHECK: cvta.to.global.u64      %rd{{.+}}, %rd{{.+}};
 ; CHECK: mul.wide.u32    %rd{{.+}}, %r{{.+}}, 4;
 ; CHECK: add.s64         %rd{{.+}}, %rd{{.+}}, %rd{{.+}};
+; CHECK: .loc [[DEBUG_INFO_CU]] 8 13
 ; CHECK: ld.global.f32   %f{{.+}}, [%rd{{.+}}];
 ; CHECK: add.s64         %rd{{.+}}, %rd{{.+}}, %rd{{.+}};
 ; CHECK: .loc [[DEBUG_INFO_CU]] 8 19
@@ -158,8 +158,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 5                                // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -281,8 +280,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 63                               // DW_AT_external
 ; CHECK: // .b8 12                               // DW_FORM_flag
-; CHECK: // .b8 135                              // DW_AT_noreturn
-; CHECK: // .b8 1
+; CHECK: // .b8 135,1                            // DW_AT_noreturn
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 0                                // EOM(1)
 ; CHECK: // .b8 0                                // EOM(2)
@@ -351,8 +349,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 63                               // DW_AT_external
 ; CHECK: // .b8 12                               // DW_FORM_flag
-; CHECK: // .b8 135                              // DW_AT_noreturn
-; CHECK: // .b8 1
+; CHECK: // .b8 135,1                            // DW_AT_noreturn
 ; CHECK: // .b8 12                               // DW_FORM_flag
 ; CHECK: // .b8 0                                // EOM(1)
 ; CHECK: // .b8 0                                // EOM(2)
@@ -391,8 +388,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 25                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -411,8 +407,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 26                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -442,8 +437,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 28                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 0                                // DW_CHILDREN_no
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -488,8 +482,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 31                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -508,8 +501,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 32                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -574,8 +566,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 37                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -594,8 +585,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 38                               // Abbreviation Code
 ; CHECK: // .b8 46                               // DW_TAG_subprogram
 ; CHECK: // .b8 1                                // DW_CHILDREN_yes
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -631,8 +621,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_FORM_addr
 ; CHECK: // .b8 64                               // DW_AT_frame_base
 ; CHECK: // .b8 10                               // DW_FORM_block1
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -695,9 +684,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // EOM(1)
 ; CHECK: // .b8 0                                // EOM(2)
 ; CHECK: // .b8 0                                // EOM(3)
-; CHECK: //	}
-; CHECK: //	.section	.debug_info
-; CHECK: //	{
+; CHECK: // }
+; CHECK: // .section .debug_info
+; CHECK: // {
 ; CHECK: // .b32 10025                           // Length of Unit
 ; CHECK: // .b8 2                                // DWARF version number
 ; CHECK: // .b8 0
@@ -707,43 +696,15 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // DW_AT_producer
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 103
-; CHECK: // .b8 45
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 111
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 117
+; CHECK: // .b8 100,101,98,117,103,45,105,110,102,111,46,99,117 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 101
-; CHECK: // .b8 47
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 99
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 121
+; CHECK: // .b8 47,115,111,109,101,47,100,105,114,101,99,116,111,114,121 // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 2                                // Abbrev [2] 0x41:0x588 DW_TAG_namespace
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 100
+; CHECK: // .b8 115,116,100                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 3                                // Abbrev [3] 0x46:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 1                                // DW_AT_decl_file
@@ -1432,7 +1393,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 172                              // DW_AT_decl_line
 ; CHECK: // .b8 1
-; CHECK:  / .b32 6628                            // DW_AT_import
+; CHECK: // .b32 6628                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x4d0:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 173                              // DW_AT_decl_line
@@ -1472,7 +1433,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 180                              // DW_AT_decl_line
 ; CHECK: // .b8 1
-; CHECK:  / .b32 6931                            // DW_AT_import
+; CHECK: // .b32 6931                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x510:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 181                              // DW_AT_decl_line
@@ -1506,7 +1467,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 4                                // Abbrev [4] 0x540:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 187                              // DW_AT_decl_line
-; CHECK:  / .b8 1
+; CHECK: // .b8 1
 ; CHECK: // .b32 7163                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x548:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
@@ -1529,7 +1490,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7330                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x568:0x8 DW_TAG_imported_declaration
-; CHECK:  / .b8 10                               // DW_AT_decl_file
+; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 192                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7379                            // DW_AT_import
@@ -1554,7 +1515,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7538                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x590:0x8 DW_TAG_imported_declaration
-; CHECK:  / .b8 10                               // DW_AT_decl_file
+; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 197                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 7580                            // DW_AT_import
@@ -1577,7 +1538,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 201                              // DW_AT_decl_line
 ; CHECK: // .b8 1
-; CHECK:  / .b32 7704                            // DW_AT_import
+; CHECK: // .b32 7704                            // DW_AT_import
 ; CHECK: // .b8 4                                // Abbrev [4] 0x5b8:0x8 DW_TAG_imported_declaration
 ; CHECK: // .b8 10                               // DW_AT_decl_file
 ; CHECK: // .b8 202                              // DW_AT_decl_line
@@ -1590,20 +1551,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 7772                            // DW_AT_import
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x5c9:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 120
+; CHECK: // .b8 95,90,76,51,97,98,115,120        // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 97,98,115                        // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK:  / .b8 1                                // DW_AT_decl_file
+; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 44                               // DW_AT_decl_line
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -1611,37 +1563,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x5e4:0x11 DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,108,111,110,103,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x5f5:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,97,99,111,115,102    // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 97,99,111,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 46                               // DW_AT_decl_line
@@ -1651,32 +1580,15 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x612:0x9 DW_TAG_base_type
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK:  / .b8 111
-; CHECK: // .b8 97
-; CHECK: // .b8 116
+; CHECK: // .b8 102,108,111,97,116               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x61b:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,99,111,115,104,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 97,99,111,115,104                // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK:  / .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 48                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -1685,20 +1597,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x63a:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,97,115,105,110,102   // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 97,115,105,110                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 50                               // DW_AT_decl_line
@@ -1708,22 +1609,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x657:0x1f DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,115,105,110,104,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 97,115,105,110,104               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 52                               // DW_AT_decl_line
@@ -1731,22 +1619,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x670:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x676:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,97,116,97,110,102    // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 97,116,97,110                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 56                               // DW_AT_decl_line
@@ -1754,28 +1631,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x68d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x693:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,116,97,110,50,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
+; CHECK: // .b8 97,116,97,110,50                 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 54                               // DW_AT_decl_line
+; CHECK: // .b8 54                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x6ad:0x5 DW_TAG_formal_parameter
@@ -1784,22 +1647,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x6b8:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,116,97,110,104,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK:  / .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 97,116,97,110,104                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 58                               // DW_AT_decl_line
@@ -1809,20 +1659,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x6d7:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 99
-; CHECK: // .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 99                               // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,52,99,98,114,116,102    // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,98,114,116                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 60                               // DW_AT_decl_line
@@ -1832,20 +1671,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x6f4:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 99
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 99                               // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
+; CHECK: // .b8 95,90,76,52,99,101,105,108,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,101,105,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 62                               // DW_AT_decl_line
@@ -1855,29 +1683,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x711:0x2b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK:  / .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
+; CHECK: // .b8 95,90,76,56,99,111,112,121,115,105,103,110,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,112,121,115,105,103,110   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 64                               // DW_AT_decl_line
@@ -1889,18 +1697,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x73c:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,99,111,115,102       // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 99,111,115                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 66                               // DW_AT_decl_line
@@ -1910,20 +1709,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x757:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK:  / .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
+; CHECK: // .b8 95,90,76,52,99,111,115,104,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,115,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 68                               // DW_AT_decl_line
@@ -1933,18 +1721,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x774:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,101,114,102,102      // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 72                               // DW_AT_decl_line
@@ -1954,20 +1733,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x78f:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
+; CHECK: // .b8 95,90,76,52,101,114,102,99,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102,99                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 70                               // DW_AT_decl_line
@@ -1977,66 +1745,33 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x7ac:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,101,120,112,102      // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 101,120,112                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 76                               // DW_AT_decl_line
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x7c1:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x7c7:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
+; CHECK: // .b8 95,90,76,52,101,120,112,50,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,50                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x7de:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x7e4:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
+; CHECK: // .b8 95,90,76,53,101,120,112,109,49,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,109,49               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 78                               // DW_AT_decl_line
@@ -2044,22 +1779,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x7fd:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x803:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 95,90,76,52,102,97,98,115,102    // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 80                               // DW_AT_decl_line
@@ -2069,21 +1793,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x820:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK:  / .b8 105
-; CHECK: // .b8 109
+; CHECK: // .b8 95,90,76,52,102,100,105,109,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,100,105,109                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 82                               // DW_AT_decl_line
@@ -2095,45 +1807,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x843:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
+; CHECK: // .b8 95,90,76,53,102,108,111,111,114,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,108,111,111,114              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 84                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0x85c:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x85c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x862:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
+; CHECK: // .b8 95,90,76,51,102,109,97,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,97                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 86                               // DW_AT_decl_line
@@ -2146,22 +1834,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x883:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0x889:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 5                                // Abbrev [5] 0x889:0x23 DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,52,102,109,97,120,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
+; CHECK: // .b8 102,109,97,120                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 88                               // DW_AT_decl_line
@@ -2173,21 +1849,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x8ac:0x23 DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 95,90,76,52,102,109,105,110,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,105,110                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
@@ -2199,21 +1863,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x8cf:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK:  / .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,52,102,109,111,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,111,100                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 92                               // DW_AT_decl_line
@@ -2225,33 +1877,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x8f2:0x2a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
-; CHECK: // .b8 112
-; CHECK: // .b8 99
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 102
-; CHECK: // .b8 121
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 99
-; CHECK: // .b8 108
-; CHECK:  / .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 102
-; CHECK: // .b8 121
+; CHECK: // .b8 95,90,76,49,48,102,112,99,108,97,115,115,105,102,121,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,112,99,108,97,115,115,105,102,121 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 94                               // DW_AT_decl_line
@@ -2261,31 +1889,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x91c:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x923:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 95,90,76,53,102,114,101,120,112,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,114,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -2293,87 +1904,41 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x93e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x943:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x943:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x949:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 5                                // Abbrev [5] 0x94e:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 104
-; CHECK: // .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 104                              // DW_AT_name
-; CHECK: // .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,53,104,121,112,111,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 104,121,112,111,116              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 98                               // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x968:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x96d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x973:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,105,108,111,103,98,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
+; CHECK: // .b8 105,108,111,103,98               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 100                              // DW_AT_decl_line
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0x98c:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x98c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x992:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 105
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,105,115,102,105,110,105,116,101,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 105
-; CHECK: // .b8 116
-; CHECK: // .b8 101
+; CHECK: // .b8 105,115,102,105,110,105,116,101  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 102                              // DW_AT_decl_line
@@ -2383,39 +1948,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x9b7:0x8 DW_TAG_base_type
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 108
+; CHECK: // .b8 98,111,111,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_encoding
-; CHECK:  / .b8 1                                // DW_AT_byte_size
+; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0x9bf:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,57,105,115,103,114,101,97,116,101,114,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 105,115,103,114,101,97,116,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 106                              // DW_AT_decl_line
@@ -2427,42 +1967,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0x9ec:0x38 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 52
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK:  / .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,52,105,115,103,114,101,97,116,101,114,101,113,117,97,108,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK:  / .b8 97
-; CHECK: // .b8 108
+; CHECK: // .b8 105,115,103,114,101,97,116,101,114,101,113,117,97,108 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 105                              // DW_AT_decl_line
@@ -2474,50 +1981,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa24:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,105,115,105,110,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 105,115,105,110,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 108                              // DW_AT_decl_line
+; CHECK: // .b8 108                              // DW_AT_decl_line
 ; CHECK: // .b32 2487                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xa3d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa43:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,105,115,108,101,115,115,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
+; CHECK: // .b8 105,115,108,101,115,115          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 112                              // DW_AT_decl_line
@@ -2529,36 +2007,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa6a:0x32 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 49
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,49,105,115,108,101,115,115,101,113,117,97,108,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 113
-; CHECK:  / .b8 117
-; CHECK: // .b8 97
-; CHECK: // .b8 108
+; CHECK: // .b8 105,115,108,101,115,115,101,113,117,97,108 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 111                              // DW_AT_decl_line
@@ -2570,40 +2021,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xa9c:0x36 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 51
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,51,105,115,108,101,115,115,103,114,101,97,116,101,114,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK:  / .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 115
-; CHECK: // .b8 103
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 105,115,108,101,115,115,103,114,101,97,116,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 114                              // DW_AT_decl_line
@@ -2615,22 +2035,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xad2:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 97
-; CHECK:  / .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,105,115,110,97,110,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 105,115,110,97,110               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 116                              // DW_AT_decl_line
@@ -2640,28 +2047,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xaf1:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK:  / .b8 114
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,105,115,110,111,114,109,97,108,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 108
+; CHECK: // .b8 105,115,110,111,114,109,97,108   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 118                              // DW_AT_decl_line
@@ -2671,36 +2059,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xb16:0x32 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK:  / .b8 49
-; CHECK: // .b8 105
-; CHECK: // .b8 115
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,49,105,115,117,110,111,114,100,101,114,101,100,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK:  / .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 100
+; CHECK: // .b8 105,115,117,110,111,114,100,101,114,101,100 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 120                              // DW_AT_decl_line
@@ -2712,20 +2073,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xb48:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 108
-; CHECK:  / .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 95,90,76,52,108,97,98,115,108    // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 121                              // DW_AT_decl_line
@@ -2735,64 +2085,28 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0xb65:0xc DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,105,110,116   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
-; CHECK:  / .b8 5                                // Abbrev [5] 0xb71:0x25 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 5                                // Abbrev [5] 0xb71:0x25 DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,53,108,100,101,120,112,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,100,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 123                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xb8b:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0xb8b:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0xb90:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xb96:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK:  / .b8 97
+; CHECK: // .b8 95,90,76,54,108,103,97,109,109,97,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,103,97,109,109,97            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 125                              // DW_AT_decl_line
@@ -2802,22 +2116,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xbb7:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 120
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK:  / .b8 115
+; CHECK: // .b8 95,90,76,53,108,108,97,98,115,120 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,97,98,115                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 126                              // DW_AT_decl_line
@@ -2827,24 +2128,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xbd6:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK:  / .b8 116
+; CHECK: // .b8 95,90,76,54,108,108,114,105,110,116,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,105,110,116          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 128                              // DW_AT_decl_line
@@ -2854,18 +2140,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xbf7:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,108,111,103,102      // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
+; CHECK: // .b8 108,111,103                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 138                              // DW_AT_decl_line
@@ -2873,24 +2150,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xc0c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc12:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
+; CHECK: // .b8 95,90,76,53,108,111,103,49,48,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,48                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 130                              // DW_AT_decl_line
@@ -2898,24 +2162,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xc2b:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc31:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
+; CHECK: // .b8 95,90,76,53,108,111,103,49,112,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,112               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 132                              // DW_AT_decl_line
@@ -2925,20 +2176,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc50:0x1d DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
+; CHECK: // .b8 95,90,76,52,108,111,103,50,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,50                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 134                              // DW_AT_decl_line
@@ -2948,20 +2188,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc6d:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
+; CHECK: // .b8 95,90,76,52,108,111,103,98,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,98                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 136                              // DW_AT_decl_line
@@ -2971,22 +2200,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xc8a:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,53,108,114,105,110,116,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,105,110,116              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 140                              // DW_AT_decl_line
@@ -2996,24 +2212,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xca9:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK:  / .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,54,108,114,111,117,110,100,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,111,117,110,100          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 142                              // DW_AT_decl_line
@@ -3023,26 +2224,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xcca:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK:  / .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,55,108,108,114,111,117,110,100,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,111,117,110,100      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 143                              // DW_AT_decl_line
@@ -3052,22 +2236,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xced:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,109,111,100,102,102,80,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 109,111,100,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 145                              // DW_AT_decl_line
@@ -3081,35 +2252,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 8                                // Abbrev [8] 0xd11:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 5                                // Abbrev [5] 0xd16:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 110
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 80
-; CHECK: // .b8 75
-; CHECK: // .b8 99
+; CHECK: // .b8 95,90,76,51,110,97,110,80,75,99  // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 110,97,110                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 146                              // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xd2d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0xd33:0xa DW_TAG_base_type
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 101
+; CHECK: // .b8 100,111,117,98,108,101           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
@@ -3118,30 +2273,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 9                                // Abbrev [9] 0xd42:0x5 DW_TAG_const_type
 ; CHECK: // .b32 3399                            // DW_AT_type
 ; CHECK: // .b8 7                                // Abbrev [7] 0xd47:0x8 DW_TAG_base_type
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 104
-; CHECK: // .b8 97
-; CHECK: // .b8 114
+; CHECK: // .b8 99,104,97,114                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 8                                // DW_AT_encoding
 ; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 5                                // Abbrev [5] 0xd4f:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 110
-; CHECK:  / .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 75
-; CHECK: // .b8 99
+; CHECK: // .b8 95,90,76,52,110,97,110,102,80,75,99 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 110,97,110,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 147                              // DW_AT_decl_line
@@ -3151,30 +2290,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xd6e:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK:  / .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,57,110,101,97,114,98,121,105,110,116,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 110,101,97,114,98,121,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 149                              // DW_AT_decl_line
@@ -3183,56 +2301,24 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0xd8f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0xd95:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 5                                // Abbrev [5] 0xd95:0x2d DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,57,110,101,120,116,97,102,116,101,114,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 110,101,120,116,97,102,116,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 151                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xdb7:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0xdbc:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xdc2:0x21 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK: // .b8 102
-; CHECK: // .b8 105
+; CHECK: // .b8 95,90,76,51,112,111,119,102,105  // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 112                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 119
+; CHECK: // .b8 112,111,119                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 155                              // DW_AT_decl_line
@@ -3243,64 +2329,24 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0xddd:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0xde3:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
+; CHECK: // .b8 5                                // Abbrev [5] 0xde3:0x2d DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,57,114,101,109,97,105,110,100,101,114,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,97,105,110,100,101,114 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 157                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xe05:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0xe05:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0xe0a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe10:0x2e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK:  / .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
+; CHECK: // .b8 95,90,76,54,114,101,109,113,117,111,102,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,113,117,111          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 159                              // DW_AT_decl_line
@@ -3314,20 +2360,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe3e:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK:  / .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,52,114,105,110,116,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,105,110,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 161                              // DW_AT_decl_line
@@ -3337,22 +2372,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe5b:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK:  / .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 95,90,76,53,114,111,117,110,100,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,111,117,110,100              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 163                              // DW_AT_decl_line
@@ -3362,30 +2384,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xe7a:0x29 DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 0
-; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 165                              // DW_AT_decl_line
+; CHECK: // .b8 95,90,76,55,115,99,97,108,98,108,110,102,108 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,108,110         // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 1                                // DW_AT_decl_file
+; CHECK: // .b8 165                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xe98:0x5 DW_TAG_formal_parameter
@@ -3394,25 +2398,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xea3:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK:  / .b8 110
+; CHECK: // .b8 95,90,76,54,115,99,97,108,98,110,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,110             // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 167                              // DW_AT_decl_line
@@ -3424,48 +2412,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xeca:0x23 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 98
-; CHECK: // .b8 105
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 98
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 95,90,76,55,115,105,103,110,98,105,116,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,103,110,98,105,116       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 169                              // DW_AT_decl_line
 ; CHECK: // .b32 2487                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xee7:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 1554                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xeed:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 51
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK:  / .b8 110
-; CHECK:  / .b8 0
+; CHECK: // .b8 95,90,76,51,115,105,110,102      // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110                      // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 171                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -3474,43 +2436,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf08:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK:  / .b8 115
-; CHECK:  / .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 95,90,76,52,115,105,110,104,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110,104                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 173                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xf1f:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0xf1f:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf25:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 115
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 113
-; CHECK:  / .b8 114
-; CHECK:  / .b8 116
+; CHECK: // .b8 95,90,76,52,115,113,114,116,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,113,114,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 175                              // DW_AT_decl_line
@@ -3520,41 +2460,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf42:0x1b DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK:  / .b8 51
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,51,116,97,110,102       // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 116,97,110                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 177                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 6                                // Abbrev [6] 0xf57:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 6                                // Abbrev [6] 0xf57:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf5d:0x1d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK:  / .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 95,90,76,52,116,97,110,104,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,97,110,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 179                              // DW_AT_decl_line
@@ -3563,25 +2483,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0xf74:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 5                                // Abbrev [5] 0xf7a:0x21 DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 116
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 5                                // Abbrev [5] 0xf7a:0x21 DW_TAG_subprogram
+; CHECK: // .b8 95,90,76,54,116,103,97,109,109,97,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK:  / .b8 97
-; CHECK:  / .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
+; CHECK: // .b8 116,103,97,109,109,97            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 181                              // DW_AT_decl_line
@@ -3591,49 +2496,30 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 5                                // Abbrev [5] 0xf9b:0x1f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 0
-; CHECK: // .b8 1                                // DW_AT_decl_file
-; CHECK:  / .b8 183                              // DW_AT_decl_line
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b8 95,90,76,53,116,114,117,110,99,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,114,117,110,99               // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 1                                // DW_AT_decl_file
+; CHECK: // .b8 183                              // DW_AT_decl_line
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0xfb4:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0xfba:0x14 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 97,99,111,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 54                               // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0xfc8:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0xfc8:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0xfce:0x14 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 97,115,105,110                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 56                               // DW_AT_decl_line
@@ -3641,13 +2527,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0xfdc:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 3379                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0xfe2:0x14 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
+; CHECK: // .b8 97,116,97,110                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 58                               // DW_AT_decl_line
@@ -3656,13 +2539,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0xff0:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0xff6:0x1a DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0xff6:0x1a DW_TAG_subprogram
+; CHECK: // .b8 97,116,97,110,50                 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 60                               // DW_AT_decl_line
@@ -3670,15 +2549,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1005:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x100a:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 3379                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x100a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1010:0x14 DW_TAG_subprogram
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
+; CHECK: // .b8 99,101,105,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 178                              // DW_AT_decl_line
@@ -3687,11 +2563,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x101e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1024:0x13 DW_TAG_subprogram
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0x1024:0x13 DW_TAG_subprogram
+; CHECK: // .b8 99,111,115                       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 63                               // DW_AT_decl_line
@@ -3702,10 +2576,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1037:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 99                               // DW_AT_name
-; CHECK:  / .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
+; CHECK: // .b8 99,111,115,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 72                               // DW_AT_decl_line
@@ -3716,9 +2587,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x104b:0x13 DW_TAG_subprogram
-; CHECK:  / .b8 101                              // DW_AT_name
-; CHECK:  / .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 101,120,112                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 100                              // DW_AT_decl_line
@@ -3729,10 +2598,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x105e:0x14 DW_TAG_subprogram
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK:  / .b8 97
-; CHECK:  / .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 102,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 181                              // DW_AT_decl_line
@@ -3743,11 +2609,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1072:0x15 DW_TAG_subprogram
-; CHECK:  / .b8 102                              // DW_AT_name
-; CHECK:  / .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
+; CHECK: // .b8 102,108,111,111,114              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 184                              // DW_AT_decl_line
@@ -3757,11 +2619,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1081:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1087:0x19 DW_TAG_subprogram
-; CHECK:  / .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
+; CHECK: // .b8 10                               // Abbrev [10] 0x1087:0x19 DW_TAG_subprogram
+; CHECK: // .b8 102,109,111,100                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 187                              // DW_AT_decl_line
@@ -3773,12 +2632,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x109a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x10a0:0x1a DW_TAG_subprogram
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 10                               // Abbrev [10] 0x10a0:0x1a DW_TAG_subprogram
+; CHECK: // .b8 102,114,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 103                              // DW_AT_decl_line
@@ -3791,11 +2646,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x10ba:0x1a DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
+; CHECK: // .b8 108,100,101,120,112              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 106                              // DW_AT_decl_line
@@ -3808,9 +2659,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x10d4:0x13 DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
+; CHECK: // .b8 108,111,103                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 109                              // DW_AT_decl_line
@@ -3819,27 +2668,20 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x10e1:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x10e7:0x15 DW_TAG_subprogram
-; CHECK:  / .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0x10e7:0x15 DW_TAG_subprogram
+; CHECK: // .b8 108,111,103,49,48                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 112                              // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x10f6:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_external
+; CHECK: // .b8 6                                // Abbrev [6] 0x10f6:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x10fc:0x19 DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 109,111,100,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 115                              // DW_AT_decl_line
@@ -3848,15 +2690,13 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x110a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x110f:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4373                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x110f:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 4373                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x1115:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 10                               // Abbrev [10] 0x111a:0x18 DW_TAG_subprogram
-; CHECK: // .b8 112                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 119
+; CHECK: // .b8 112,111,119                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 153                              // DW_AT_decl_line
@@ -3865,13 +2705,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1127:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x112c:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3379                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x112c:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1132:0x13 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
+; CHECK: // .b8 115,105,110                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 65                               // DW_AT_decl_line
@@ -3880,12 +2718,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x113f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1145:0x14 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 0                                // End Of Children Mark
+; CHECK: // .b8 10                               // Abbrev [10] 0x1145:0x14 DW_TAG_subprogram
+; CHECK: // .b8 115,105,110,104                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -3895,11 +2730,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1153:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1159:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 115                              // DW_AT_name
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
+; CHECK: // .b8 10                               // Abbrev [10] 0x1159:0x14 DW_TAG_subprogram
+; CHECK: // .b8 115,113,114,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 156                              // DW_AT_decl_line
@@ -3910,10 +2742,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x116d:0x13 DW_TAG_subprogram
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK:  / .b8 110
-; CHECK:  / .b8 0
+; CHECK: // .b8 116,97,110                       // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 67                               // DW_AT_decl_line
 ; CHECK: // .b32 3379                            // DW_AT_type
@@ -3923,10 +2753,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3379                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1180:0x14 DW_TAG_subprogram
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
+; CHECK: // .b8 116,97,110,104                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 76                               // DW_AT_decl_line
@@ -3938,11 +2765,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 11                               // Abbrev [11] 0x1194:0xd DW_TAG_typedef
 ; CHECK: // .b32 4513                            // DW_AT_type
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 100,105,118,95,116               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 101                              // DW_AT_decl_line
@@ -3950,12 +2773,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 11                               // Abbrev [11] 0x11a3:0xe DW_TAG_typedef
 ; CHECK: // .b32 4529                            // DW_AT_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 108,100,105,118,95,116           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 109                              // DW_AT_decl_line
@@ -3964,10 +2782,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 105                              // DW_AT_decl_line
 ; CHECK: // .b8 14                               // Abbrev [14] 0x11b5:0xf DW_TAG_member
-; CHECK: // .b8 113                              // DW_AT_name
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 116
+; CHECK: // .b8 113,117,111,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 4                                // DW_AT_decl_file
@@ -3976,9 +2791,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 35
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // Abbrev [14] 0x11c4:0xe DW_TAG_member
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
+; CHECK: // .b8 114,101,109                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 4                                // DW_AT_decl_file
@@ -3988,11 +2801,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 8
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 15                               // Abbrev [15] 0x11d3:0xd DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 116
+; CHECK: // .b8 97,98,111,114,116                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -4001,9 +2810,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 1                                // DW_AT_noreturn
 ; CHECK: // .b8 16                               // Abbrev [16] 0x11e0:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 97                               // DW_AT_name
-; CHECK:  / .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 97,98,115                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 7                                // DW_AT_decl_line
@@ -4015,12 +2822,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x11f4:0x17 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK:  / .b8 101
-; CHECK:  / .b8 120
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 97,116,101,120,105,116           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 7                                // DW_AT_decl_line
@@ -4033,12 +2835,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x120b:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 4624                            // DW_AT_type
-; CHECK:  / .b8 17                               // Abbrev [17] 0x1210:0x1 DW_TAG_subroutine_type
-; CHECK:  / .b8 10                               // Abbrev [10] 0x1211:0x14 DW_TAG_subprogram
-; CHECK:  / .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 102
+; CHECK: // .b8 17                               // Abbrev [17] 0x1210:0x1 DW_TAG_subroutine_type
+; CHECK: // .b8 10                               // Abbrev [10] 0x1211:0x14 DW_TAG_subprogram
+; CHECK: // .b8 97,116,111,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 6                                // DW_AT_decl_file
 ; CHECK: // .b8 26                               // DW_AT_decl_line
@@ -4048,11 +2847,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x121f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 16                               // Abbrev [16] 0x1225:0x15 DW_TAG_subprogram
-; CHECK:  / .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 105
+; CHECK: // .b8 16                               // Abbrev [16] 0x1225:0x15 DW_TAG_subprogram
+; CHECK: // .b8 97,116,111,105                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 22                               // DW_AT_decl_line
@@ -4064,10 +2860,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x123a:0x15 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK:  / .b8 116
-; CHECK:  / .b8 111
-; CHECK: // .b8 108
+; CHECK: // .b8 97,116,111,108                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 27                               // DW_AT_decl_line
@@ -4079,13 +2872,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x124f:0x2b DW_TAG_subprogram
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK:  / .b8 97
-; CHECK:  / .b8 114
-; CHECK: // .b8 99
-; CHECK: // .b8 104
+; CHECK: // .b8 98,115,101,97,114,99,104         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_decl_file
 ; CHECK: // .b8 20                               // DW_AT_decl_line
@@ -4097,8 +2884,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1265:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4731                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x126a:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4737                            // DW_AT_type
-; CHECK:  / .b8 6                                // Abbrev [6] 0x126f:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 4737                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x126f:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1274:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4772                            // DW_AT_type
@@ -4109,52 +2896,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 19                               // Abbrev [19] 0x1280:0x1 DW_TAG_const_type
 ; CHECK: // .b8 11                               // Abbrev [11] 0x1281:0xe DW_TAG_typedef
 ; CHECK: // .b32 4751                            // DW_AT_type
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 122
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 0
-; CHECK:  / .b8 8                                // DW_AT_decl_file
-; CHECK:  / .b8 62                               // DW_AT_decl_line
+; CHECK: // .b8 115,105,122,101,95,116           // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 8                                // DW_AT_decl_file
+; CHECK: // .b8 62                               // DW_AT_decl_line
 ; CHECK: // .b8 7                                // Abbrev [7] 0x128f:0x15 DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK:  / .b8 32
-; CHECK:  / .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,117,110,115,105,103,110,101,100,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 20                               // Abbrev [20] 0x12a4:0x16 DW_TAG_typedef
 ; CHECK: // .b32 4794                            // DW_AT_type
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 112
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK:  / .b8 116
-; CHECK:  / .b8 0
+; CHECK: // .b8 95,95,99,111,109,112,97,114,95,102,110,95,116 // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 230                              // DW_AT_decl_line
 ; CHECK: // .b8 2
@@ -4168,12 +2922,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4731                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x12cf:0x1c DW_TAG_subprogram
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
+; CHECK: // .b8 99,97,108,108,111,99             // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 212                              // DW_AT_decl_line
@@ -4187,12 +2936,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x12eb:0x19 DW_TAG_subprogram
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 118
+; CHECK: // .b8 100,105,118                      // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK:  / .b8 4                                // DW_AT_decl_file
-; CHECK:  / .b8 21                               // DW_AT_decl_line
+; CHECK: // .b8 4                                // DW_AT_decl_file
+; CHECK: // .b8 21                               // DW_AT_decl_line
 ; CHECK: // .b8 3
 ; CHECK: // .b32 4500                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -4203,25 +2950,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 22                               // Abbrev [22] 0x1304:0x12 DW_TAG_subprogram
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 101,120,105,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 31                               // DW_AT_decl_line
-; CHECK:  / .b8 2
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 2
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 1                                // DW_AT_noreturn
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1310:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x1316:0x11 DW_TAG_subprogram
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 101
+; CHECK: // .b8 102,114,101,101                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 227                              // DW_AT_decl_line
@@ -4231,13 +2972,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1321:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4730                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 16                               // Abbrev [16] 0x1327:0x17 DW_TAG_subprogram
-; CHECK:  / .b8 103                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 110
-; CHECK: // .b8 118
+; CHECK: // .b8 16                               // Abbrev [16] 0x1327:0x17 DW_TAG_subprogram
+; CHECK: // .b8 103,101,116,101,110,118          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 52                               // DW_AT_decl_line
@@ -4246,15 +2982,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1338:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 3389                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 3389                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x133e:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 3399                            // DW_AT_type
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1343:0x15 DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 108,97,98,115                    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 8                                // DW_AT_decl_line
@@ -4263,13 +2996,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1352:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 2917                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 2917                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1358:0x1a DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
+; CHECK: // .b8 108,100,105,118                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 23                               // DW_AT_decl_line
@@ -4283,28 +3013,19 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1372:0x17 DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
+; CHECK: // .b8 109,97,108,108,111,99            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 210                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 4730                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x1383:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_external
+; CHECK: // .b8 6                                // Abbrev [6] 0x1383:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1389:0x1b DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 101
-; CHECK: // .b8 110
+; CHECK: // .b8 109,98,108,101,110               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 95                               // DW_AT_decl_line
@@ -4317,15 +3038,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x139e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
-; CHECK:  / .b8 16                               // Abbrev [16] 0x13a4:0x23 DW_TAG_subprogram
-; CHECK:  / .b8 109                              // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK: // .b8 99
-; CHECK: // .b8 115
+; CHECK: // .b8 16                               // Abbrev [16] 0x13a4:0x23 DW_TAG_subprogram
+; CHECK: // .b8 109,98,115,116,111,119,99,115    // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 106                              // DW_AT_decl_line
@@ -4338,29 +3052,18 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x13bc:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x13c1:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4737                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b32 4737                            // DW_AT_type
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x13c7:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 5068                            // DW_AT_type
 ; CHECK: // .b8 7                                // Abbrev [7] 0x13cc:0xb DW_TAG_base_type
-; CHECK: // .b8 119                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 119,99,104,97,114,95,116         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 16                               // Abbrev [16] 0x13d7:0x21 DW_TAG_subprogram
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 98
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK:  / .b8 99
-; CHECK:  / .b8 0
+; CHECK: // .b8 109,98,116,111,119,99            // DW_AT_name
+; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 98                               // DW_AT_decl_line
 ; CHECK: // .b8 3
@@ -4375,11 +3078,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x13f8:0x21 DW_TAG_subprogram
-; CHECK: // .b8 113                              // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 111
-; CHECK:  / .b8 114
-; CHECK:  / .b8 116
+; CHECK: // .b8 113,115,111,114,116              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 253                              // DW_AT_decl_line
@@ -4396,10 +3095,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4772                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 24                               // Abbrev [24] 0x1419:0xf DW_TAG_subprogram
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 114,97,110,100                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 118                              // DW_AT_decl_line
@@ -4407,14 +3103,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 16                               // Abbrev [16] 0x1428:0x1d DW_TAG_subprogram
-; CHECK:  / .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
+; CHECK: // .b8 16                               // Abbrev [16] 0x1428:0x1d DW_TAG_subprogram
+; CHECK: // .b8 114,101,97,108,108,111,99        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 224                              // DW_AT_decl_line
@@ -4428,11 +3118,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 4737                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x1445:0x12 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 100
+; CHECK: // .b8 115,114,97,110,100               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 120                              // DW_AT_decl_line
@@ -4443,28 +3129,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x1457:0x10 DW_TAG_base_type
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 117,110,115,105,103,110,101,100,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1467:0x1b DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 100
+; CHECK: // .b8 115,116,114,116,111,100          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 164                              // DW_AT_decl_line
@@ -4479,19 +3149,14 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 8                                // Abbrev [8] 0x1482:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 4926                            // DW_AT_type
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1487:0x20 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,108          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 183                              // DW_AT_decl_line
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x1497:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 1                                // DW_AT_external
+; CHECK: // .b8 6                                // Abbrev [6] 0x1497:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x149c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 5250                            // DW_AT_type
@@ -4499,13 +3164,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x14a7:0x21 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,117,108      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 187                              // DW_AT_decl_line
@@ -4520,12 +3179,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x14c8:0x17 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 109
+; CHECK: // .b8 115,121,115,116,101,109          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 205                              // DW_AT_decl_line
@@ -4537,16 +3191,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x14df:0x23 DW_TAG_subprogram
-; CHECK: // .b8 119                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 115
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 98
-; CHECK:  / .b8 115
-; CHECK:  / .b8 0
-; CHECK:  / .b8 4                                // DW_AT_decl_file
+; CHECK: // .b8 119,99,115,116,111,109,98,115    // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 109                              // DW_AT_decl_line
 ; CHECK: // .b8 3
 ; CHECK: // .b32 4737                            // DW_AT_type
@@ -4561,15 +3208,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 8                                // Abbrev [8] 0x1502:0x5 DW_TAG_pointer_type
 ; CHECK: // .b32 5383                            // DW_AT_type
-; CHECK:  / .b8 9                                // Abbrev [9] 0x1507:0x5 DW_TAG_const_type
-; CHECK:  / .b32 5068                            // DW_AT_type
+; CHECK: // .b8 9                                // Abbrev [9] 0x1507:0x5 DW_TAG_const_type
+; CHECK: // .b32 5068                            // DW_AT_type
 ; CHECK: // .b8 16                               // Abbrev [16] 0x150c:0x1c DW_TAG_subprogram
-; CHECK: // .b8 119                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 109
-; CHECK: // .b8 98
+; CHECK: // .b8 119,99,116,111,109,98            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 102                              // DW_AT_decl_line
@@ -4577,26 +3219,18 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x151d:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 4926                            // DW_AT_type
+; CHECK: // .b8 6                                // Abbrev [6] 0x151d:0x5 DW_TAG_formal_parameter
+; CHECK: // .b32 4926                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1522:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 5068                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 2                                // Abbrev [2] 0x1528:0x78 DW_TAG_namespace
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 117
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 120
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,103,110,117,95,99,120,120  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 3                                // Abbrev [3] 0x1533:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
-; CHECK:  / .b8 201                              // DW_AT_decl_line
-; CHECK:  / .b32 5536                            // DW_AT_import
+; CHECK: // .b8 201                              // DW_AT_decl_line
+; CHECK: // .b32 5536                            // DW_AT_import
 ; CHECK: // .b8 3                                // Abbrev [3] 0x153a:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 207                              // DW_AT_decl_line
@@ -4612,8 +3246,8 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 3                                // Abbrev [3] 0x154f:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 228                              // DW_AT_decl_line
-; CHECK:  / .b32 5653                            // DW_AT_import
-; CHECK:  / .b8 3                                // Abbrev [3] 0x1556:0x7 DW_TAG_imported_declaration
+; CHECK: // .b32 5653                            // DW_AT_import
+; CHECK: // .b8 3                                // Abbrev [3] 0x1556:0x7 DW_TAG_imported_declaration
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 229                              // DW_AT_decl_line
 ; CHECK: // .b32 5675                            // DW_AT_import
@@ -4630,30 +3264,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 233                              // DW_AT_decl_line
 ; CHECK: // .b32 5795                            // DW_AT_import
 ; CHECK: // .b8 25                               // Abbrev [25] 0x1572:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 57
-; CHECK:  / .b8 95
-; CHECK:  / .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 117
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 120
-; CHECK: // .b8 120
-; CHECK: // .b8 51
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 69
-; CHECK: // .b8 120
-; CHECK: // .b8 120
+; CHECK: // .b8 95,90,78,57,95,95,103,110,117,95,99,120,120,51,100,105,118,69,120,120 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 118
+; CHECK: // .b8 100,105,118                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_decl_file
 ; CHECK: // .b8 214                              // DW_AT_decl_line
@@ -4668,25 +3281,16 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 11                               // Abbrev [11] 0x15a0:0xf DW_TAG_typedef
 ; CHECK: // .b32 5551                            // DW_AT_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 118
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 108,108,100,105,118,95,116       // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 121                              // DW_AT_decl_line
 ; CHECK: // .b8 13                               // Abbrev [13] 0x15af:0x22 DW_TAG_structure_type
 ; CHECK: // .b8 16                               // DW_AT_byte_size
-; CHECK:  / .b8 4                                // DW_AT_decl_file
-; CHECK:  / .b8 117                              // DW_AT_decl_line
+; CHECK: // .b8 4                                // DW_AT_decl_file
+; CHECK: // .b8 117                              // DW_AT_decl_line
 ; CHECK: // .b8 14                               // Abbrev [14] 0x15b3:0xf DW_TAG_member
-; CHECK: // .b8 113                              // DW_AT_name
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 116
+; CHECK: // .b8 113,117,111,116                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 4                                // DW_AT_decl_file
@@ -4695,23 +3299,17 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 35
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // Abbrev [14] 0x15c2:0xe DW_TAG_member
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
+; CHECK: // .b8 114,101,109                      // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK:  / .b32 1508                            // DW_AT_type
-; CHECK:  / .b8 4                                // DW_AT_decl_file
+; CHECK: // .b32 1508                            // DW_AT_type
+; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 120                              // DW_AT_decl_line
 ; CHECK: // .b8 2                                // DW_AT_data_member_location
 ; CHECK: // .b8 35
 ; CHECK: // .b8 8
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 22                               // Abbrev [22] 0x15d1:0x13 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 69
-; CHECK: // .b8 120
-; CHECK: // .b8 105
-; CHECK: // .b8 116
+; CHECK: // .b8 95,69,120,105,116                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 45                               // DW_AT_decl_line
@@ -4723,11 +3321,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x15e4:0x16 DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
+; CHECK: // .b8 108,108,97,98,115                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 12                               // DW_AT_decl_line
@@ -4739,11 +3333,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x15fa:0x1b DW_TAG_subprogram
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK:  / .b8 118
+; CHECK: // .b8 108,108,100,105,118              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 29                               // DW_AT_decl_line
@@ -4757,11 +3347,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1508                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 16                               // Abbrev [16] 0x1615:0x16 DW_TAG_subprogram
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
-; CHECK: // .b8 108
+; CHECK: // .b8 97,116,111,108,108               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 36                               // DW_AT_decl_line
@@ -4771,15 +3357,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1625:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x162b:0x21 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,108,108      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 209                              // DW_AT_decl_line
@@ -4794,14 +3374,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x164c:0x22 DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK:  / .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 108
-; CHECK: // .b8 108
+; CHECK: // .b8 115,116,114,116,111,117,108,108  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 214                              // DW_AT_decl_line
@@ -4816,57 +3389,25 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x166e:0x1a DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK:  / .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 100
-; CHECK: // .b8 32
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 108,111,110,103,32,108,111,110,103,32,117,110,115,105,103,110,101,100,32,105,110,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 7                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 10                               // Abbrev [10] 0x1688:0x1b DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 102
+; CHECK: // .b8 115,116,114,116,111,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 172                              // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
-; CHECK:  / .b8 6                                // Abbrev [6] 0x1698:0x5 DW_TAG_formal_parameter
+; CHECK: // .b8 6                                // Abbrev [6] 0x1698:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 3389                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x169d:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 5250                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 10                               // Abbrev [10] 0x16a3:0x1c DW_TAG_subprogram
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 108
-; CHECK: // .b8 100
+; CHECK: // .b8 115,116,114,116,111,108,100      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_decl_file
 ; CHECK: // .b8 175                              // DW_AT_decl_line
@@ -4879,38 +3420,15 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 5250                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x16bf:0xf DW_TAG_base_type
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK:  / .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 100
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 101
+; CHECK: // .b8 108,111,110,103,32,100,111,117,98,108,101 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_encoding
 ; CHECK: // .b8 8                                // DW_AT_byte_size
 ; CHECK: // .b8 26                               // Abbrev [26] 0x16ce:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,99,111,115,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 97,99,111,115,102                // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 62                               // DW_AT_decl_line
 ; CHECK: // .b8 5
@@ -4920,24 +3438,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x16ee:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,99,111,115,104,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK:  / .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 97,99,111,115,104,102            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
@@ -4948,22 +3451,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1710:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,115,105,110,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK:  / .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 97,115,105,110,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 57                               // DW_AT_decl_line
@@ -4974,56 +3464,25 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1730:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,115,105,110,104,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 97,115,105,110,104,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 95                               // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x174c:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1752:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,116,97,110,50,102,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 50
-; CHECK: // .b8 102
+; CHECK: // .b8 97,116,97,110,50,102             // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
-; CHECK:  / .b8 47                               // DW_AT_decl_line
+; CHECK: // .b8 47                               // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -5033,23 +3492,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x177a:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,97,116,97,110,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 97,116,97,110,102                // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 52                               // DW_AT_decl_line
 ; CHECK: // .b8 5
@@ -5059,24 +3505,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x179a:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,97,116,97,110,104,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 97                               // DW_AT_name
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK:  / .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 97,116,97,110,104,102            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 100                              // DW_AT_decl_line
@@ -5087,22 +3518,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x17bc:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 99
-; CHECK: // .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK:  / .b8 98
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,99,98,114,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,98,114,116,102                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 150                              // DW_AT_decl_line
@@ -5113,22 +3531,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x17dc:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 99
-; CHECK: // .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK:  / .b8 101
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,99,101,105,108,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,101,105,108,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 155                              // DW_AT_decl_line
@@ -5139,31 +3544,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x17fc:0x2e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 57
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK:  / .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 103
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,57,99,111,112,121,115,105,103,110,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,112,121,115,105,103,110,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 165                              // DW_AT_decl_line
@@ -5176,20 +3559,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x182a:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,99,111,115,102,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,115,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 219                              // DW_AT_decl_line
@@ -5200,22 +3572,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1848:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK:  / .b8 53
-; CHECK: // .b8 99
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 99                               // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 115
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,99,111,115,104,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 99,111,115,104,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 32                               // DW_AT_decl_line
@@ -5226,22 +3585,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1868:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK:  / .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 99
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,101,114,102,99,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102,99,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 210                              // DW_AT_decl_line
@@ -5252,20 +3598,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1888:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,101,114,102,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,114,102,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 200                              // DW_AT_decl_line
@@ -5276,22 +3611,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x18a6:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 50
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,101,120,112,50,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,50,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 145                              // DW_AT_decl_line
@@ -5302,22 +3624,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x18c6:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 9                                // DW_AT_decl_file
+; CHECK: // .b8 95,90,76,52,101,120,112,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,102                  // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 14                               // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -5326,27 +3637,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x18e4:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 101                              // DW_AT_name
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 109
-; CHECK: // .b8 49
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,101,120,112,109,49,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 101,120,112,109,49,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
-; CHECK:  / .b8 105                              // DW_AT_decl_line
+; CHECK: // .b8 105                              // DW_AT_decl_line
 ; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
@@ -5354,49 +3650,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1906:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 98
-; CHECK: // .b8 115
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,97,98,115,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,97,98,115,102                // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 95                               // DW_AT_decl_line
 ; CHECK: // .b8 2
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1920:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1926:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,100,105,109,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,100,105,109,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 80                               // DW_AT_decl_line
@@ -5407,26 +3676,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1946:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x194c:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,102,108,111,111,114,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,108,111,111,114,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -5437,22 +3691,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x196e:0x2a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,102,109,97,102,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,97,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 32                               // DW_AT_decl_line
@@ -5467,23 +3708,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1998:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,109,97,120,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,97,120,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 110                              // DW_AT_decl_line
@@ -5496,23 +3723,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x19be:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,109,105,110,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,105,110,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 105                              // DW_AT_decl_line
@@ -5525,23 +3738,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x19e4:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK:  / .b8 102
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,102,109,111,100,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,109,111,100,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 17                               // DW_AT_decl_line
@@ -5554,26 +3753,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a0a:0x29 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 102
-; CHECK:  / .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 102                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,102,114,101,120,112,102,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 102,114,101,120,112,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 7                                // DW_AT_decl_line
@@ -5586,25 +3768,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a33:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 104
-; CHECK:  / .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 104                              // DW_AT_name
-; CHECK: // .b8 121
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,104,121,112,111,116,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 104,121,112,111,116,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 110                              // DW_AT_decl_line
@@ -5617,24 +3783,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a5b:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,105,108,111,103,98,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
+; CHECK: // .b8 105,108,111,103,98,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -5645,25 +3796,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1a7d:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,100,101,120,112,102,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,100,101,120,112,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 240                              // DW_AT_decl_line
@@ -5673,59 +3808,25 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1a9a:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1a9f:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 2332                            // DW_AT_type
+; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1aa5:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,108,103,97,109,109,97,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,103,97,109,109,97,102        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 235                              // DW_AT_decl_line
-; CHECK:  / .b8 5
+; CHECK: // .b8 5
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1ac3:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1ac9:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK:  / .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,108,108,114,105,110,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,105,110,116,102      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 125                              // DW_AT_decl_line
@@ -5736,28 +3837,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1aed:0x26 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,108,108,114,111,117,110,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,108,114,111,117,110,100,102  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 66                               // DW_AT_decl_line
@@ -5768,24 +3850,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b13:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,111,103,49,48,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,48,102            // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 76                               // DW_AT_decl_line
@@ -5796,24 +3863,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b35:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 49
-; CHECK: // .b8 112
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,111,103,49,112,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,49,112,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -5824,22 +3876,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b57:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 50
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,108,111,103,50,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,50,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 5                                // DW_AT_decl_line
@@ -5848,24 +3887,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1b71:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 0                                // End Of Children Mark
+; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b77:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 98
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,108,111,103,98,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,98,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
@@ -5876,20 +3902,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1b97:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 108                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 103
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,108,111,103,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,111,103,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 67                               // DW_AT_decl_line
@@ -5900,24 +3915,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1bb5:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK:  / .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,108,114,105,110,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,105,110,116,102          // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 116                              // DW_AT_decl_line
@@ -5928,54 +3928,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1bd7:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 108
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 108                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,108,114,111,117,110,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 108,114,111,117,110,100,102      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 71                               // DW_AT_decl_line
 ; CHECK: // .b8 6
 ; CHECK: // .b32 2917                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1bf5:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1bfb:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 109
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 109                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,109,111,100,102,102,102,80,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 109,111,100,102,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 12                               // DW_AT_decl_line
@@ -5988,71 +3956,22 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 3345                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1c22:0x2b DW_TAG_subprogram
-; CHECK:  / .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,48,110,101,97,114,98,121,105,110,116,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 98
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 110,101,97,114,98,121,105,110,116,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 130                              // DW_AT_decl_line
-; CHECK:  / .b8 4
+; CHECK: // .b8 4
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1c47:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1c4d:0x31 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 110
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,48,110,101,120,116,97,102,116,101,114,102,102,102 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 110                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 120
-; CHECK:  / .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 116
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 110,101,120,116,97,102,116,101,114,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 194                              // DW_AT_decl_line
@@ -6065,21 +3984,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1c7e:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 112
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 112                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 119
-; CHECK:  / .b8 102
+; CHECK: // .b8 95,90,76,52,112,111,119,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 112,111,119,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 47                               // DW_AT_decl_line
@@ -6092,34 +3999,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1ca2:0x31 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK:  / .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,49,48,114,101,109,97,105,110,100,101,114,102,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,97,105,110,100,101,114,102 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 22                               // DW_AT_decl_line
@@ -6132,29 +4014,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1cd3:0x31 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK:  / .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 109
-; CHECK: // .b8 113
-; CHECK: // .b8 117
-; CHECK: // .b8 111
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,114,101,109,113,117,111,102,102,102,80,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,109,113,117,111,102      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 27                               // DW_AT_decl_line
@@ -6164,57 +4026,29 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1cf4:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1cf9:0x5 DW_TAG_formal_parameter
-; CHECK:  / .b32 1554                            // DW_AT_type
+; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1cfe:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 2377                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d04:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 114
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,114,105,110,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,105,110,116,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 111                              // DW_AT_decl_line
 ; CHECK: // .b8 4
 ; CHECK: // .b32 1554                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_declaration
+; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 6                                // Abbrev [6] 0x1d1e:0x5 DW_TAG_formal_parameter
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d24:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 114
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 111
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 100
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK:  / .b8 9                                // DW_AT_decl_file
+; CHECK: // .b8 95,90,76,54,114,111,117,110,100,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,111,117,110,100,102          // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 61                               // DW_AT_decl_line
 ; CHECK: // .b8 6
 ; CHECK: // .b32 1554                            // DW_AT_type
@@ -6223,29 +4057,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d46:0x2c DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 56
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 108
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK:  / .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,56,115,99,97,108,98,108,110,102,102,108 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,108,110,102     // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 250                              // DW_AT_decl_line
@@ -6258,27 +4072,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2917                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d72:0x2a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 115
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK:  / .b8 105
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 99
-; CHECK: // .b8 97
-; CHECK: // .b8 108
-; CHECK: // .b8 98
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,115,99,97,108,98,110,102,102,105 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,99,97,108,98,110,102         // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 245                              // DW_AT_decl_line
@@ -6291,20 +4087,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 2332                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1d9c:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK:  / .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,115,105,110,102,102  // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110,102                  // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 210                              // DW_AT_decl_line
@@ -6315,22 +4100,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1dba:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,115,105,110,104,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,105,110,104,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 37                               // DW_AT_decl_line
@@ -6341,22 +4113,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1dda:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 115
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 113
-; CHECK: // .b8 114
-; CHECK: // .b8 116
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,115,113,114,116,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,113,114,116,102              // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 139                              // DW_AT_decl_line
@@ -6367,20 +4126,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1dfa:0x1e DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 52
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,52,116,97,110,102,102   // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,97,110,102                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 252                              // DW_AT_decl_line
@@ -6391,22 +4139,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1e18:0x20 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 53
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 104
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,53,116,97,110,104,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,97,110,104,102               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 42                               // DW_AT_decl_line
@@ -6417,26 +4152,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1e38:0x24 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 55
-; CHECK: // .b8 116
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 103
-; CHECK: // .b8 97
-; CHECK: // .b8 109
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,55,116,103,97,109,109,97,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,103,97,109,109,97,102        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 9                                // DW_AT_decl_file
 ; CHECK: // .b8 56                               // DW_AT_decl_line
@@ -6447,24 +4165,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 26                               // Abbrev [26] 0x1e5c:0x22 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 76
-; CHECK: // .b8 54
-; CHECK: // .b8 116
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 116                              // DW_AT_name
-; CHECK: // .b8 114
-; CHECK: // .b8 117
-; CHECK: // .b8 110
-; CHECK: // .b8 99
-; CHECK: // .b8 102
+; CHECK: // .b8 95,90,76,54,116,114,117,110,99,102,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 116,114,117,110,99,102           // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 11                               // DW_AT_decl_file
 ; CHECK: // .b8 150                              // DW_AT_decl_line
@@ -6475,181 +4178,27 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 27                               // Abbrev [27] 0x1e7e:0x22a DW_TAG_structure_type
-; CHECK:  / .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_byte_size
-; CHECK:  / .b8 13                               // DW_AT_decl_file
+; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 77                               // DW_AT_decl_line
 ; CHECK: // .b8 28                               // Abbrev [28] 0x1e9c:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK:  / .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,120,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,120 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 78                               // DW_AT_decl_line
-; CHECK:  / .b32 5207                            // DW_AT_type
+; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x1eeb:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK:  / .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,121,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK:  / .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,121 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 79                               // DW_AT_decl_line
@@ -6657,138 +4206,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x1f3a:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK:  / .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK:  / .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,122,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 0
-; CHECK:  / .b8 13                               // DW_AT_decl_file
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,122 // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 80                               // DW_AT_decl_line
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 25                               // Abbrev [25] 0x1f89:0x49 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK:  / .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 53
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 69
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,99,118,53,117,105,110,116,51,69 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 118
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK:  / .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,117,105,110,116,51 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 83                               // DW_AT_decl_line
@@ -6800,36 +4232,12 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x1fd2:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK:  / .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
 ; CHECK: // .b8 1                                // DW_AT_declaration
-; CHECK:  / .b8 1                                // DW_AT_external
+; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 3                                // DW_AT_accessibility
 ; CHECK:                                         // DW_ACCESS_private
 ; CHECK: // .b8 29                               // Abbrev [29] 0x1ff2:0x6 DW_TAG_formal_parameter
@@ -6837,31 +4245,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x1ff9:0x2c DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK:  / .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -6876,54 +4260,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8422                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 31                               // Abbrev [31] 0x2025:0x43 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK:  / .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 83
-; CHECK: // .b8 69
-; CHECK: // .b8 82
-; CHECK: // .b8 75
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 61
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,97,83,69,82,75,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,61 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -6938,51 +4277,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8422                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 32                               // Abbrev [32] 0x2068:0x3f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,73,100,120,95,116,97,100,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 38
+; CHECK: // .b8 111,112,101,114,97,116,111,114,38 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 85                               // DW_AT_decl_line
@@ -6997,11 +4294,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 27                               // Abbrev [27] 0x20a8:0x2f DW_TAG_structure_type
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
+; CHECK: // .b8 117,105,110,116,51               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_byte_size
 ; CHECK: // .b8 14                               // DW_AT_decl_file
@@ -7048,105 +4341,16 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 7836                            // DW_AT_specification
 ; CHECK: // .b8 1                                // DW_AT_inline
 ; CHECK: // .b8 27                               // Abbrev [27] 0x20f6:0x228 DW_TAG_structure_type
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK:  / .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 88                               // DW_AT_decl_line
 ; CHECK: // .b8 28                               // Abbrev [28] 0x2114:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK:  / .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,120,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK:  / .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,120 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 89                               // DW_AT_decl_line
@@ -7154,151 +4358,21 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x2163:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK:  / .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK:  / .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,121,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 0
-; CHECK:  / .b8 13                               // DW_AT_decl_file
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,121 // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 90                               // DW_AT_decl_line
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x21b2:0x4f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK:  / .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,49,55,95,95,102,101,116,99,104,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 98,117,105,108,116,105,110,95,122,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK:  / .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,122 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 91                               // DW_AT_decl_line
@@ -7306,60 +4380,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 25                               // Abbrev [25] 0x2201:0x47 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK:  / .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 52
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK:  / .b8 109
-; CHECK: // .b8 51
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,99,118,52,100,105,109,51,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,100,105,109,51 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 94                               // DW_AT_decl_line
@@ -7368,34 +4391,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 29                               // Abbrev [29] 0x2241:0x6 DW_TAG_formal_parameter
 ; CHECK: // .b32 9166                            // DW_AT_type
-; CHECK:  / .b8 1                                // DW_AT_artificial
+; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x2248:0x27 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK:  / .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7408,31 +4407,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x226f:0x2c DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK:  / .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7447,54 +4422,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9181                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 31                               // Abbrev [31] 0x229b:0x43 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK:  / .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 83
-; CHECK: // .b8 69
-; CHECK: // .b8 82
-; CHECK: // .b8 75
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 61
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,97,83,69,82,75,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,61 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7509,51 +4439,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9181                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 32                               // Abbrev [32] 0x22de:0x3f DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 53
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 107
-; CHECK: // .b8 68
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,53,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,98,108,111,99,107,68,105,109,95,116,97,100,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 38
+; CHECK: // .b8 111,112,101,114,97,116,111,114,38 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 96                               // DW_AT_decl_line
@@ -7568,10 +4456,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 35                               // Abbrev [35] 0x231e:0x9d DW_TAG_structure_type
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 100,105,109,51                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_byte_size
 ; CHECK: // .b8 14                               // DW_AT_decl_file
@@ -7608,10 +4493,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 35
 ; CHECK: // .b8 8
 ; CHECK: // .b8 23                               // Abbrev [23] 0x234f:0x21 DW_TAG_subprogram
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 100,105,109,51                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 165                              // DW_AT_decl_line
@@ -7629,10 +4511,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 5207                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 23                               // Abbrev [23] 0x2370:0x17 DW_TAG_subprogram
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK: // .b8 51
+; CHECK: // .b8 100,105,109,51                   // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 166                              // DW_AT_decl_line
@@ -7646,41 +4525,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9152                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 37                               // Abbrev [37] 0x2387:0x33 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 52
-; CHECK: // .b8 100
-; CHECK: // .b8 105
-; CHECK: // .b8 109
-; CHECK:  / .b8 51
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 53
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,52,100,105,109,51,99,118,53,117,105,110,116,51,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 0
-; CHECK:  / .b8 14                               // DW_AT_decl_file
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,117,105,110,116,51 // DW_AT_name
+; CHECK: // .b8 0
+; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 167                              // DW_AT_decl_line
 ; CHECK: // .b8 1
 ; CHECK: // .b32 9152                            // DW_AT_type
@@ -7695,11 +4544,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8990                            // DW_AT_type
 ; CHECK: // .b8 20                               // Abbrev [20] 0x23c0:0xe DW_TAG_typedef
 ; CHECK: // .b32 8360                            // DW_AT_type
-; CHECK: // .b8 117                              // DW_AT_name
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
+; CHECK: // .b8 117,105,110,116,51               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 14                               // DW_AT_decl_file
 ; CHECK: // .b8 127                              // DW_AT_decl_line
@@ -7718,107 +4563,16 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 8468                            // DW_AT_specification
 ; CHECK: // .b8 1                                // DW_AT_inline
 ; CHECK: // .b8 27                               // Abbrev [27] 0x23ed:0x233 DW_TAG_structure_type
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK:  / .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_byte_size
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 66                               // DW_AT_decl_line
 ; CHECK: // .b8 28                               // Abbrev [28] 0x240c:0x50 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK:  / .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK:  / .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,49,55,95,95,102,101,116,99,104 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 95,98,117,105,108,116,105,110,95,120,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK:  / .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 120
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,120 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 67                               // DW_AT_decl_line
@@ -7826,76 +4580,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x245c:0x50 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK:  / .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,49,55,95,95,102,101,116,99,104 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 95,98,117,105,108,116,105,110,95,121,69,118
 ; CHECK: // .b8 0
-; CHECK:  / .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 121
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,121 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 68                               // DW_AT_decl_line
@@ -7903,76 +4591,10 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 28                               // Abbrev [28] 0x24ac:0x50 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK:  / .b8 78
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK:  / .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 55
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,49,55,95,95,102,101,116,99,104 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 95,98,117,105,108,116,105,110,95,122,69,118
 ; CHECK: // .b8 0
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK:  / .b8 95
-; CHECK: // .b8 102
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 104
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 122
+; CHECK: // .b8 95,95,102,101,116,99,104,95,98,117,105,108,116,105,110,95,122 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 69                               // DW_AT_decl_line
@@ -7980,64 +4602,11 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_declaration
 ; CHECK: // .b8 1                                // DW_AT_external
 ; CHECK: // .b8 25                               // Abbrev [25] 0x24fc:0x4a DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK:  / .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK:  / .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 99
-; CHECK: // .b8 118
-; CHECK: // .b8 53
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,99,118,53,117,105,110,116,51 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 69,118
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,32,117,105,110,116,51 // DW_AT_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 32
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 116
-; CHECK: // .b8 51
-; CHECK:  / .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 72                               // DW_AT_decl_line
 ; CHECK: // .b32 8360                            // DW_AT_type
@@ -8048,32 +4617,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x2546:0x28 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK:  / .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8086,32 +4630,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 1                                // DW_AT_artificial
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 30                               // Abbrev [30] 0x256e:0x2d DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_name
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
+; CHECK: // .b8 95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8126,55 +4645,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9775                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 31                               // Abbrev [31] 0x259b:0x44 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 83
-; CHECK: // .b8 69
-; CHECK: // .b8 82
-; CHECK: // .b8 75
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 61
+; CHECK: // .b8 95,90,78,75,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,97,83,69,82,75,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 111,112,101,114,97,116,111,114,61 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8189,52 +4662,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9775                            // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 32                               // Abbrev [32] 0x25df:0x40 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 78
-; CHECK: // .b8 75
-; CHECK: // .b8 50
-; CHECK: // .b8 54
-; CHECK: // .b8 95
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 117
-; CHECK: // .b8 100
-; CHECK: // .b8 97
-; CHECK: // .b8 95
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 105
-; CHECK: // .b8 108
-; CHECK: // .b8 116
-; CHECK: // .b8 105
-; CHECK: // .b8 110
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 104
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 73
-; CHECK: // .b8 100
-; CHECK: // .b8 120
-; CHECK: // .b8 95
-; CHECK: // .b8 116
-; CHECK: // .b8 97
-; CHECK: // .b8 100
-; CHECK: // .b8 69
-; CHECK: // .b8 118
+; CHECK: // .b8 95,90,78,75,50,54,95,95,99,117,100,97,95,98,117,105,108,116,105,110,95,116,104,114,101,97,100,73,100,120,95,116,97,100,69,118 // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 111                              // DW_AT_name
-; CHECK: // .b8 112
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 97
-; CHECK: // .b8 116
-; CHECK: // .b8 111
-; CHECK: // .b8 114
-; CHECK: // .b8 38
+; CHECK: // .b8 111,112,101,114,97,116,111,114,38 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 13                               // DW_AT_decl_file
 ; CHECK: // .b8 74                               // DW_AT_decl_line
@@ -8262,20 +4692,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b32 9228                            // DW_AT_specification
 ; CHECK: // .b8 1                                // DW_AT_inline
 ; CHECK: // .b8 38                               // Abbrev [38] 0x263f:0x32 DW_TAG_subprogram
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 51
-; CHECK: // .b8 114
-; CHECK: // .b8 101
-; CHECK: // .b8 115
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK: // .b8 0
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 115
+; CHECK: // .b8 95,90,51,114,101,115,102,102,80,102 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 114,101,115                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -8294,9 +4713,7 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b8 3                                // DW_AT_decl_line
 ; CHECK: // .b32 1554                            // DW_AT_type
 ; CHECK: // .b8 39                               // Abbrev [39] 0x2665:0xb DW_TAG_formal_parameter
-; CHECK: // .b8 114                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 115
+; CHECK: // .b8 114,101,115                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_decl_file
 ; CHECK: // .b8 3                                // DW_AT_decl_line
@@ -8307,26 +4724,9 @@ if.end:                                           ; preds = %if.then, %entry
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 53
-; CHECK: // .b8 115
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 121
-; CHECK: // .b8 105
-; CHECK: // .b8 102
-; CHECK: // .b8 80
-; CHECK: // .b8 102
-; CHECK: // .b8 83
-; CHECK: // .b8 95
-; CHECK: // .b8 0
-; CHECK: // .b8 115                              // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 120
-; CHECK: // .b8 112
-; CHECK: // .b8 121
+; CHECK: // .b8 95,90,53,115,97,120,112,121,105,102,80,102,83,95 // DW_AT_MIPS_linkage_name
+; CHECK: // .b8 0
+; CHECK: // .b8 115,97,120,112,121               // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 12                               // DW_AT_decl_file
 ; CHECK: // .b8 5                                // DW_AT_decl_line
diff --git a/test/DebugInfo/NVPTX/debug-loc-offset.ll b/test/DebugInfo/NVPTX/debug-loc-offset.ll
index 53c5fd9dff198e2a10accb8f369a391ef03673ce..91926517bbc6cd826f0b796bc5741171d779fad6 100644
--- a/test/DebugInfo/NVPTX/debug-loc-offset.ll
+++ b/test/DebugInfo/NVPTX/debug-loc-offset.ll
@@ -166,8 +166,7 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b8 1                                // DW_FORM_addr
 ; CHECK: // .b8 64                               // DW_AT_frame_base
 ; CHECK: // .b8 10                               // DW_FORM_block1
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -201,8 +200,7 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b8 1                                // DW_FORM_addr
 ; CHECK: // .b8 64                               // DW_AT_frame_base
 ; CHECK: // .b8 10                               // DW_FORM_block1
-; CHECK: // .b8 135                              // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 64
+; CHECK: // .b8 135,64                           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 8                                // DW_FORM_string
 ; CHECK: // .b8 3                                // DW_AT_name
 ; CHECK: // .b8 8                                // DW_FORM_string
@@ -250,74 +248,14 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x8f DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 118
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 32
-; CHECK: // .b8 51
-; CHECK: // .b8 46
-; CHECK: // .b8 53
-; CHECK: // .b8 46
-; CHECK: // .b8 48
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 50
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 55
-; CHECK: // .b8 57
-; CHECK: // .b8 41
+; CHECK: // .b8 99,108,97,110,103,32,118,101,114,115,105,111,110,32,51,46,53,46,48,32,40,50,49,48,52,55,57,41 // DW_AT_producer
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 103
-; CHECK: // .b8 45
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 45
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 50
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 100,101,98,117,103,45,108,111,99,45,111,102,102,115,101,116,50,46,99,99 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 118
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 107
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 47,108,108,118,109,95,99,109,97,107,101,95,103,99,99 // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin1                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end1                      // DW_AT_high_pc
@@ -330,18 +268,9 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b64 Lfunc_end1                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 51
-; CHECK: // .b8 98
-; CHECK: // .b8 97
-; CHECK: // .b8 122
-; CHECK: // .b8 49
-; CHECK: // .b8 65
+; CHECK: // .b8 95,90,51,98,97,122,49,65         // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 122
+; CHECK: // .b8 98,97,122                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 2                                // DW_AT_decl_file
 ; CHECK: // .b8 6                                // DW_AT_decl_line
@@ -360,74 +289,14 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b32 .debug_abbrev                   // Offset Into Abbrev. Section
 ; CHECK: // .b8 8                                // Address Size (in bytes)
 ; CHECK: // .b8 1                                // Abbrev [1] 0xb:0x91 DW_TAG_compile_unit
-; CHECK: // .b8 99                               // DW_AT_producer
-; CHECK: // .b8 108
-; CHECK: // .b8 97
-; CHECK: // .b8 110
-; CHECK: // .b8 103
-; CHECK: // .b8 32
-; CHECK: // .b8 118
-; CHECK: // .b8 101
-; CHECK: // .b8 114
-; CHECK: // .b8 115
-; CHECK: // .b8 105
-; CHECK: // .b8 111
-; CHECK: // .b8 110
-; CHECK: // .b8 32
-; CHECK: // .b8 51
-; CHECK: // .b8 46
-; CHECK: // .b8 53
-; CHECK: // .b8 46
-; CHECK: // .b8 48
-; CHECK: // .b8 32
-; CHECK: // .b8 40
-; CHECK: // .b8 50
-; CHECK: // .b8 49
-; CHECK: // .b8 48
-; CHECK: // .b8 52
-; CHECK: // .b8 55
-; CHECK: // .b8 57
-; CHECK: // .b8 41
+; CHECK: // .b8 99,108,97,110,103,32,118,101,114,115,105,111,110,32,51,46,53,46,48,32,40,50,49,48,52,55,57,41 // DW_AT_producer
 ; CHECK: // .b8 0
 ; CHECK: // .b8 4                                // DW_AT_language
 ; CHECK: // .b8 0
-; CHECK: // .b8 100                              // DW_AT_name
-; CHECK: // .b8 101
-; CHECK: // .b8 98
-; CHECK: // .b8 117
-; CHECK: // .b8 103
-; CHECK: // .b8 45
-; CHECK: // .b8 108
-; CHECK: // .b8 111
-; CHECK: // .b8 99
-; CHECK: // .b8 45
-; CHECK: // .b8 111
-; CHECK: // .b8 102
-; CHECK: // .b8 102
-; CHECK: // .b8 115
-; CHECK: // .b8 101
-; CHECK: // .b8 116
-; CHECK: // .b8 49
-; CHECK: // .b8 46
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 100,101,98,117,103,45,108,111,99,45,111,102,102,115,101,116,49,46,99,99 // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b32 .debug_line                     // DW_AT_stmt_list
-; CHECK: // .b8 47                               // DW_AT_comp_dir
-; CHECK: // .b8 108
-; CHECK: // .b8 108
-; CHECK: // .b8 118
-; CHECK: // .b8 109
-; CHECK: // .b8 95
-; CHECK: // .b8 99
-; CHECK: // .b8 109
-; CHECK: // .b8 97
-; CHECK: // .b8 107
-; CHECK: // .b8 101
-; CHECK: // .b8 95
-; CHECK: // .b8 103
-; CHECK: // .b8 99
-; CHECK: // .b8 99
+; CHECK: // .b8 47,108,108,118,109,95,99,109,97,107,101,95,103,99,99 // DW_AT_comp_dir
 ; CHECK: // .b8 0
 ; CHECK: // .b64 Lfunc_begin0                    // DW_AT_low_pc
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
@@ -436,17 +305,9 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b64 Lfunc_end0                      // DW_AT_high_pc
 ; CHECK: // .b8 1                                // DW_AT_frame_base
 ; CHECK: // .b8 156
-; CHECK: // .b8 95                               // DW_AT_MIPS_linkage_name
-; CHECK: // .b8 90
-; CHECK: // .b8 51
-; CHECK: // .b8 98
-; CHECK: // .b8 97
-; CHECK: // .b8 114
-; CHECK: // .b8 105
+; CHECK: // .b8 95,90,51,98,97,114,105           // DW_AT_MIPS_linkage_name
 ; CHECK: // .b8 0
-; CHECK: // .b8 98                               // DW_AT_name
-; CHECK: // .b8 97
-; CHECK: // .b8 114
+; CHECK: // .b8 98,97,114                        // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 1                                // DW_AT_decl_file
 ; CHECK: // .b8 1                                // DW_AT_decl_line
@@ -460,9 +321,7 @@ attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "n
 ; CHECK: // .b32 148                             // DW_AT_type
 ; CHECK: // .b8 0                                // End Of Children Mark
 ; CHECK: // .b8 7                                // Abbrev [7] 0x94:0x7 DW_TAG_base_type
-; CHECK: // .b8 105                              // DW_AT_name
-; CHECK: // .b8 110
-; CHECK: // .b8 116
+; CHECK: // .b8 105,110,116                      // DW_AT_name
 ; CHECK: // .b8 0
 ; CHECK: // .b8 5                                // DW_AT_encoding
 ; CHECK: // .b8 4                                // DW_AT_byte_size
diff --git a/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll b/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll
index b79ef23fc96ee6eb202ed6f2863c5fe8c0ab5228..1754d2a7a21c7d7e17ebb1699bb3c03888acaf77 100644
--- a/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll
+++ b/test/DebugInfo/WebAssembly/dbg-value-live-interval.ll
@@ -3,7 +3,7 @@
 ; CHECK: After WebAssembly Optimize Live Intervals:
 ; CHECK: bb.3.for.body.for.body_crit_edge:
 ; CHECK: [[REG:%[0-9]+]]:i32 = nsw ADD_I32 {{.*}} fib.c:7:7
-; CHECK: DBG_VALUE debug-use [[REG]]:i32, debug-use $noreg, !"a", {{.*}} fib.c:5:13
+; CHECK: DBG_VALUE [[REG]]:i32, $noreg, !"a", {{.*}} fib.c:5:13
 ; CHECK: After WebAssembly Store Results:
 
 ; ModuleID = 'fib.bc'
diff --git a/test/DebugInfo/WebAssembly/dbg-value-move-2.ll b/test/DebugInfo/WebAssembly/dbg-value-move-2.ll
index 30a87d1f3cd7b358481e338a590d7dbfb9883ddd..90e8b66609bf5335e6b73567cdc14b579be064cf 100644
--- a/test/DebugInfo/WebAssembly/dbg-value-move-2.ll
+++ b/test/DebugInfo/WebAssembly/dbg-value-move-2.ll
@@ -3,7 +3,7 @@
 ; CHECK: After WebAssembly Register Stackify:
 ; CHECK: bb.2.for.body:
 ; CHECK: [[REG:%[0-9]+]]:i32 = TEE_I32 {{.*}} fib2.c:6:7
-; CHECK-NEXT: DBG_VALUE debug-use [[REG]]:i32, debug-use $noreg, !"a", {{.*}} fib2.c:2:13
+; CHECK-NEXT: DBG_VALUE [[REG]]:i32, $noreg, !"a", {{.*}} fib2.c:2:13
 ; CHECK: After WebAssembly Register Coloring:
 
 ; ModuleID = 'fib2.bc'
diff --git a/test/DebugInfo/WebAssembly/dbg-value-move.ll b/test/DebugInfo/WebAssembly/dbg-value-move.ll
index 7644b97a7b7760750b8bb0f3c853723e8b9c00f2..8514f3dcaa74a783483385b28d3f77a1b6d99e6c 100644
--- a/test/DebugInfo/WebAssembly/dbg-value-move.ll
+++ b/test/DebugInfo/WebAssembly/dbg-value-move.ll
@@ -3,7 +3,7 @@
 ; CHECK: After WebAssembly Register Stackify:
 ; CHECK: bb.3.for.body.for.body_crit_edge:
 ; CHECK: [[REG:%[0-9]+]]:i32 = nsw ADD_I32 {{.*}} fib.c:7:7
-; CHECK-NEXT: DBG_VALUE debug-use [[REG]]:i32, debug-use $noreg, !"a", {{.*}} fib.c:5:13
+; CHECK-NEXT: DBG_VALUE [[REG]]:i32, $noreg, !"a", {{.*}} fib.c:5:13
 ; CHECK: After WebAssembly Register Coloring:
 
 ; ModuleID = 'fib.bc'
diff --git a/test/DebugInfo/X86/bbjoin.ll b/test/DebugInfo/X86/bbjoin.ll
index b3f20a9b8e36aa3e27855751913590b73a6b8660..c175108f384220d702223b62e9f393eafdd41f2d 100644
--- a/test/DebugInfo/X86/bbjoin.ll
+++ b/test/DebugInfo/X86/bbjoin.ll
@@ -11,12 +11,12 @@
 ; }
 ; CHECK: ![[X:.*]] = !DILocalVariable(name: "x",
 ; CHECK: bb.0.entry:
-; CHECK:   DBG_VALUE 23, debug-use $noreg, ![[X]],
-; CHECK:   DBG_VALUE debug-use $rsp, debug-use $noreg, ![[X]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_deref),
+; CHECK:   DBG_VALUE 23, $noreg, ![[X]],
+; CHECK:   DBG_VALUE $rsp, $noreg, ![[X]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_deref),
 ; CHECK: bb.1.if.then:
-; CHECK:   DBG_VALUE 43, debug-use $noreg, ![[X]],
+; CHECK:   DBG_VALUE 43, $noreg, ![[X]],
 ; CHECK: bb.2.if.end:
-; CHECK-NOT:  DBG_VALUE 23, debug-use $noreg, ![[X]],
+; CHECK-NOT:  DBG_VALUE 23, $noreg, ![[X]],
 ; CHECK:   RETQ $eax
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/DebugInfo/X86/debug-loc-asan.ll b/test/DebugInfo/X86/debug-loc-asan.ll
deleted file mode 100644
index 3e54035b7d73c0ff29b3a014362d5d3b46623066..0000000000000000000000000000000000000000
--- a/test/DebugInfo/X86/debug-loc-asan.ll
+++ /dev/null
@@ -1,190 +0,0 @@
-; RUN: llc -fast-isel-sink-local-values -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-; RUN: llc -fast-isel-sink-local-values  -O0 -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \
-; RUN:   | llvm-dwarfdump -debug-info - | FileCheck %s --check-prefix=DWARF
-
-; Verify that we have correct debug info for local variables in code
-; instrumented with AddressSanitizer.
-
-; Generated from the source file test.cc:
-; int bar(int y) {
-;   return y + 2;
-; }
-; with "clang++ -S -emit-llvm -mllvm -asan-skip-promotable-allocas=0 -fsanitize=address -O0 -g test.cc"
-
-; The address of the (potentially now malloc'ed) alloca ends up
-; in rdi, after which it is spilled to the stack. We record the
-; spill OFFSET on the stack for checking the debug info below.
-; CHECK: #DEBUG_VALUE: bar:y <- [DW_OP_deref] [$rcx+0]
-; CHECK: movq %rcx, [[OFFSET:[0-9]+]](%rsp)
-; CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]
-; CHECK-NEXT: #DEBUG_VALUE: bar:y <- [DW_OP_plus_uconst [[OFFSET]], DW_OP_deref, DW_OP_deref]
-; This location should be valid until the end of the function.
-
-; CHECK:        movq    %rbp, %rsp
-; CHECK-NEXT: [[END_LABEL:.Ltmp[0-9]+]]:
-
-; CHECK: .Ldebug_loc{{[0-9]+}}:
-; We expect two location ranges for the variable.
-
-; First, its address is stored in %rcx:
-; CHECK:      .quad .Lfunc_begin0-.Lfunc_begin0
-; CHECK-NEXT: .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK: DW_OP_breg2
-; DWARF:       DW_TAG_formal_parameter
-; DWARF:         DW_AT_location
-; DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg2 RCX+0, DW_OP_deref
-
-; Then it's addressed via %rsp:
-; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad [[END_LABEL]]-.Lfunc_begin0
-; CHECK: DW_OP_breg7
-; CHECK-NEXT: [[OFFSET]]
-; CHECK: DW_OP_deref
-; DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg7 RSP+{{[0-9]+}}, DW_OP_deref, DW_OP_deref)
-
-; ModuleID = 'test.cc'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 1, void ()* @asan.module_ctor }]
-@__asan_option_detect_stack_use_after_return = external global i32
-@___asan_gen_ = private unnamed_addr constant [16 x i8] c"1 32 4 6 y.addr\00", align 1
-
-; Function Attrs: nounwind sanitize_address uwtable
-define i32 @_Z3bari(i32 %y) #0 !dbg !4 {
-entry:
-  %MyAlloca = alloca [64 x i8], align 32
-  %0 = ptrtoint [64 x i8]* %MyAlloca to i64
-  %1 = load i32, i32* @__asan_option_detect_stack_use_after_return
-  %2 = icmp ne i32 %1, 0
-  br i1 %2, label %3, label %5
-
-; <label>:3                                       ; preds = %entry
-  %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0)
-  br label %5
-
-; <label>:5                                       ; preds = %entry, %3
-  %6 = phi i64 [ %0, %entry ], [ %4, %3 ]
-  %7 = add i64 %6, 32
-  %8 = inttoptr i64 %7 to i32*
-  %9 = inttoptr i64 %6 to i64*
-  store i64 1102416563, i64* %9
-  %10 = add i64 %6, 8
-  %11 = inttoptr i64 %10 to i64*
-  store i64 ptrtoint ([16 x i8]* @___asan_gen_ to i64), i64* %11
-  %12 = add i64 %6, 16
-  %13 = inttoptr i64 %12 to i64*
-  store i64 ptrtoint (i32 (i32)* @_Z3bari to i64), i64* %13
-  %14 = lshr i64 %6, 3
-  %15 = add i64 %14, 2147450880
-  %16 = add i64 %15, 0
-  %17 = inttoptr i64 %16 to i64*
-  store i64 -868083100587789839, i64* %17
-  %18 = ptrtoint i32* %8 to i64
-  %19 = lshr i64 %18, 3
-  %20 = add i64 %19, 2147450880
-  %21 = inttoptr i64 %20 to i8*
-  %22 = load i8, i8* %21
-  %23 = icmp ne i8 %22, 0
-  call void @llvm.dbg.declare(metadata i32* %8, metadata !12, metadata !14), !dbg !DILocation(scope: !4)
-  br i1 %23, label %24, label %30
-
-; <label>:24                                      ; preds = %5
-  %25 = and i64 %18, 7
-  %26 = add i64 %25, 3
-  %27 = trunc i64 %26 to i8
-  %28 = icmp sge i8 %27, %22
-  br i1 %28, label %29, label %30
-
-; <label>:29                                      ; preds = %24
-  call void @__asan_report_store4(i64 %18)
-  call void asm sideeffect "", ""()
-  unreachable
-
-; <label>:30                                      ; preds = %24, %5
-  store i32 %y, i32* %8, align 4
-  %31 = ptrtoint i32* %8 to i64, !dbg !13
-  %32 = lshr i64 %31, 3, !dbg !13
-  %33 = add i64 %32, 2147450880, !dbg !13
-  %34 = inttoptr i64 %33 to i8*, !dbg !13
-  %35 = load i8, i8* %34, !dbg !13
-  %36 = icmp ne i8 %35, 0, !dbg !13
-  br i1 %36, label %37, label %43, !dbg !13
-
-; <label>:37                                      ; preds = %30
-  %38 = and i64 %31, 7, !dbg !13
-  %39 = add i64 %38, 3, !dbg !13
-  %40 = trunc i64 %39 to i8, !dbg !13
-  %41 = icmp sge i8 %40, %35, !dbg !13
-  br i1 %41, label %42, label %43
-
-; <label>:42                                      ; preds = %37
-  call void @__asan_report_load4(i64 %31), !dbg !13
-  call void asm sideeffect "", ""()
-  unreachable
-
-; <label>:43                                      ; preds = %37, %30
-  %44 = load i32, i32* %8, align 4, !dbg !13
-  %add = add nsw i32 %44, 2, !dbg !13
-  store i64 1172321806, i64* %9, !dbg !13
-  %45 = icmp ne i64 %6, %0, !dbg !13
-  br i1 %45, label %46, label %53, !dbg !13
-
-; <label>:46                                      ; preds = %43
-  %47 = add i64 %15, 0, !dbg !13
-  %48 = inttoptr i64 %47 to i64*, !dbg !13
-  store i64 -723401728380766731, i64* %48, !dbg !13
-  %49 = add i64 %6, 56, !dbg !13
-  %50 = inttoptr i64 %49 to i64*, !dbg !13
-  %51 = load i64, i64* %50, !dbg !13
-  %52 = inttoptr i64 %51 to i8*, !dbg !13
-  store i8 0, i8* %52, !dbg !13
-  br label %56, !dbg !13
-
-; <label>:53                                      ; preds = %43
-  %54 = add i64 %15, 0, !dbg !13
-  %55 = inttoptr i64 %54 to i64*, !dbg !13
-  store i64 0, i64* %55, !dbg !13
-  br label %56, !dbg !13
-
-; <label>:56                                      ; preds = %53, %46
-  ret i32 %add, !dbg !13
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-define internal void @asan.module_ctor() {
-  call void @__asan_init_v3()
-  ret void
-}
-
-declare void @__asan_init_v3()
-
-declare void @__asan_report_load4(i64)
-
-declare void @__asan_report_store4(i64)
-
-declare i64 @__asan_stack_malloc_0(i64, i64)
-
-attributes #0 = { nounwind sanitize_address uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!9, !10}
-!llvm.ident = !{!11}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (209308)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
-!1 = !DIFile(filename: "test.cc", directory: "/llvm_cmake_gcc")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
-!5 = !DIFile(filename: "test.cc", directory: "/llvm_cmake_gcc")
-!6 = !DISubroutineType(types: !7)
-!7 = !{!8, !8}
-!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!9 = !{i32 2, !"Dwarf Version", i32 4}
-!10 = !{i32 2, !"Debug Info Version", i32 3}
-!11 = !{!"clang version 3.5.0 (209308)"}
-!12 = !DILocalVariable(name: "y", line: 1, arg: 1, scope: !4, file: !5, type: !8)
-!13 = !DILocation(line: 2, scope: !4)
-!14 = !DIExpression(DW_OP_deref)
diff --git a/test/DebugInfo/X86/debug-loc-asan.mir b/test/DebugInfo/X86/debug-loc-asan.mir
new file mode 100644
index 0000000000000000000000000000000000000000..e4a6057deefbe05a69dd0c072a86536fbf982535
--- /dev/null
+++ b/test/DebugInfo/X86/debug-loc-asan.mir
@@ -0,0 +1,346 @@
+# RUN: llc -o - %s -start-after=patchable-function -O0 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+# RUN: llc -o - %s -start-after=patchable-function  -O0 -mtriple=x86_64-unknown-linux-gnu -filetype=obj \
+# RUN:   | llvm-dwarfdump -debug-info - | FileCheck %s --check-prefix=DWARF
+#
+# Verify that we have correct debug info for local variables in code
+# instrumented with AddressSanitizer.
+#
+# Generated from the source file test.cc:
+# int bar(int y) {
+#   return y + 2;
+# }
+# with "clang++ -S -emit-llvm -mllvm -asan-skip-promotable-allocas=0 -fsanitize=address -O0 -g test.cc"
+#
+# The address of the (potentially now malloc'ed) alloca ends up
+# in rdi, after which it is spilled to the stack. We record the
+# spill OFFSET on the stack for checking the debug info below.
+# CHECK: #DEBUG_VALUE: bar:y <- [DW_OP_deref] [$rcx+0]
+# CHECK: movq %rcx, [[OFFSET:[0-9]+]](%rsp)
+# CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]
+# CHECK-NEXT: #DEBUG_VALUE: bar:y <- [DW_OP_plus_uconst [[OFFSET]], DW_OP_deref, DW_OP_deref]
+# This location should be valid until the end of the function.
+#
+# CHECK:        movq    %rbp, %rsp
+# CHECK-NEXT: [[END_LABEL:.Ltmp[0-9]+]]:
+#
+# CHECK: .Ldebug_loc{{[0-9]+}}:
+# We expect two location ranges for the variable.
+#
+# First, its address is stored in %rcx:
+# CHECK:      .quad .Lfunc_begin0-.Lfunc_begin0
+# CHECK-NEXT: .quad [[START_LABEL]]-.Lfunc_begin0
+# CHECK: DW_OP_breg2
+# DWARF:       DW_TAG_formal_parameter
+# DWARF:         DW_AT_location
+# DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg2 RCX+0, DW_OP_deref
+#
+# Then it's addressed via %rsp:
+# CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
+# CHECK-NEXT: .quad [[END_LABEL]]-.Lfunc_begin0
+# CHECK: DW_OP_breg7
+# CHECK-NEXT: [[OFFSET]]
+# CHECK: DW_OP_deref
+# DWARF-NEXT:      [{{.*}}, {{.*}}): DW_OP_breg7 RSP+{{[0-9]+}}, DW_OP_deref, DW_OP_deref)
+--- |
+  @__asan_option_detect_stack_use_after_return = external global i32
+  @___asan_gen_ = private unnamed_addr constant [16 x i8] c"1 32 4 6 y.addr\00", align 1
+  
+  ; Function Attrs: nounwind sanitize_address uwtable
+  define i32 @_Z3bari(i32 %y) #0 !dbg !6 {
+  entry:
+    %MyAlloca = alloca [64 x i8], align 32
+    %0 = ptrtoint [64 x i8]* %MyAlloca to i64
+    %1 = load i32, i32* @__asan_option_detect_stack_use_after_return
+    %2 = icmp ne i32 %1, 0
+    br i1 %2, label %3, label %5
+  
+  ; <label>:3:                                      ; preds = %entry
+    %4 = call i64 @__asan_stack_malloc_0(i64 64, i64 %0)
+    br label %5
+  
+  ; <label>:5:                                      ; preds = %3, %entry
+    %6 = phi i64 [ %0, %entry ], [ %4, %3 ]
+    %7 = add i64 %6, 32
+    %8 = inttoptr i64 %7 to i32*
+    %9 = inttoptr i64 %6 to i64*
+    store i64 1102416563, i64* %9
+    %10 = add i64 %6, 8
+    %11 = inttoptr i64 %10 to i64*
+    store i64 ptrtoint ([16 x i8]* @___asan_gen_ to i64), i64* %11
+    %12 = add i64 %6, 16
+    %13 = inttoptr i64 %12 to i64*
+    store i64 ptrtoint (i32 (i32)* @_Z3bari to i64), i64* %13
+    %14 = lshr i64 %6, 3
+    %15 = add i64 %14, 2147450880
+    %16 = add i64 %15, 0
+    %17 = inttoptr i64 %16 to i64*
+    store i64 -868083100587789839, i64* %17
+    %18 = ptrtoint i32* %8 to i64
+    %19 = lshr i64 %18, 3
+    %20 = add i64 %19, 2147450880
+    %21 = inttoptr i64 %20 to i8*
+    %22 = load i8, i8* %21
+    %23 = icmp ne i8 %22, 0
+    call void @llvm.dbg.declare(metadata i32* %8, metadata !10, metadata !DIExpression(DW_OP_deref)), !dbg !11
+    br i1 %23, label %24, label %30
+  
+  ; <label>:24:                                     ; preds = %5
+    %25 = and i64 %18, 7
+    %26 = add i64 %25, 3
+    %27 = trunc i64 %26 to i8
+    %28 = icmp sge i8 %27, %22
+    br i1 %28, label %29, label %30
+  
+  ; <label>:29:                                     ; preds = %24
+    call void @__asan_report_store4(i64 %18)
+    call void asm sideeffect "", ""()
+    unreachable
+  
+  ; <label>:30:                                     ; preds = %24, %5
+    store i32 %y, i32* %8, align 4
+    %31 = ptrtoint i32* %8 to i64, !dbg !12
+    %32 = lshr i64 %31, 3, !dbg !12
+    %33 = add i64 %32, 2147450880, !dbg !12
+    %34 = inttoptr i64 %33 to i8*, !dbg !12
+    %35 = load i8, i8* %34, !dbg !12
+    %36 = icmp ne i8 %35, 0, !dbg !12
+    br i1 %36, label %37, label %43, !dbg !12
+  
+  ; <label>:37:                                     ; preds = %30
+    %38 = and i64 %31, 7, !dbg !12
+    %39 = add i64 %38, 3, !dbg !12
+    %40 = trunc i64 %39 to i8, !dbg !12
+    %41 = icmp sge i8 %40, %35, !dbg !12
+    br i1 %41, label %42, label %43
+  
+  ; <label>:42:                                     ; preds = %37
+    call void @__asan_report_load4(i64 %31), !dbg !12
+    call void asm sideeffect "", ""()
+    unreachable
+  
+  ; <label>:43:                                     ; preds = %37, %30
+    %44 = load i32, i32* %8, align 4, !dbg !12
+    %add = add nsw i32 %44, 2, !dbg !12
+    store i64 1172321806, i64* %9, !dbg !12
+    %45 = icmp ne i64 %6, %0, !dbg !12
+    br i1 %45, label %46, label %53, !dbg !12
+  
+  ; <label>:46:                                     ; preds = %43
+    %47 = add i64 %15, 0, !dbg !12
+    %48 = inttoptr i64 %47 to i64*, !dbg !12
+    store i64 -723401728380766731, i64* %48, !dbg !12
+    %49 = add i64 %6, 56, !dbg !12
+    %50 = inttoptr i64 %49 to i64*, !dbg !12
+    %51 = load i64, i64* %50, !dbg !12
+    %52 = inttoptr i64 %51 to i8*, !dbg !12
+    store i8 0, i8* %52, !dbg !12
+    br label %56, !dbg !12
+  
+  ; <label>:53:                                     ; preds = %43
+    %54 = add i64 %15, 0, !dbg !12
+    %55 = inttoptr i64 %54 to i64*, !dbg !12
+    store i64 0, i64* %55, !dbg !12
+    br label %56, !dbg !12
+  
+  ; <label>:56:                                     ; preds = %53, %46
+    ret i32 %add, !dbg !12
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  declare void @__asan_init_v3()
+  
+  declare void @__asan_report_load4(i64)
+  
+  declare void @__asan_report_store4(i64)
+  
+  declare i64 @__asan_stack_malloc_0(i64, i64)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #2
+  
+  attributes #0 = { nounwind sanitize_address uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+  !llvm.ident = !{!5}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5.0 (209308)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+  !1 = !DIFile(filename: "test.cc", directory: "/llvm_cmake_gcc")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{!"clang version 3.5.0 (209308)"}
+  !6 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !7 = !DISubroutineType(types: !8)
+  !8 = !{!9, !9}
+  !9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !10 = !DILocalVariable(name: "y", arg: 1, scope: !6, file: !1, line: 1, type: !9)
+  !11 = !DILocation(line: 0, scope: !6)
+  !12 = !DILocation(line: 2, scope: !6)
+
+...
+---
+name:            _Z3bari
+alignment:       4
+tracksRegLiveness: true
+liveins:         
+  - { reg: '$edi' }
+frameInfo:       
+  stackSize:       152
+  offsetAdjustment: -160
+  maxAlignment:    32
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: 0 }
+stack:           
+  - { id: 0, name: MyAlloca, offset: -96, size: 64, alignment: 32, stack-id: 0 }
+  - { id: 1, type: spill-slot, offset: -100, size: 4, alignment: 4, stack-id: 0 }
+  - { id: 2, type: spill-slot, offset: -112, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 3, type: spill-slot, offset: -120, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 4, type: spill-slot, offset: -128, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 5, type: spill-slot, offset: -136, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 6, type: spill-slot, offset: -144, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 7, type: spill-slot, offset: -145, size: 1, alignment: 1, stack-id: 0 }
+  - { id: 8, type: spill-slot, offset: -146, size: 1, alignment: 1, stack-id: 0 }
+  - { id: 9, type: spill-slot, offset: -152, size: 4, alignment: 4, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    liveins: $edi
+  
+    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset $rbp, -16
+    $rbp = frame-setup MOV64rr $rsp
+    CFI_INSTRUCTION def_cfa_register $rbp
+    $rsp = frame-setup AND64ri8 $rsp, -32, implicit-def dead $eflags
+    $rsp = frame-setup SUB64ri32 $rsp, 160, implicit-def dead $eflags
+    renamable $rax = LEA64r $rsp, 1, $noreg, 64, $noreg
+    CMP32mi8 $noreg, 1, $noreg, @__asan_option_detect_stack_use_after_return, $noreg, 0, implicit-def $eflags :: (load 4 from @__asan_option_detect_stack_use_after_return)
+    $rcx = MOV64rr $rax
+    MOV32mr $rsp, 1, $noreg, 60, $noreg, killed $edi :: (store 4 into %stack.1)
+    MOV64mr $rsp, 1, $noreg, 48, $noreg, killed $rax :: (store 8 into %stack.2)
+    MOV64mr $rsp, 1, $noreg, 40, $noreg, killed $rcx :: (store 8 into %stack.3)
+    JE_1 %bb.2, implicit $eflags
+  
+  bb.1 (%ir-block.3):
+    $edi = MOV32ri 64, implicit-def $rdi
+    $rsi = MOV64rm $rsp, 1, $noreg, 48, $noreg :: (load 8 from %stack.2)
+    CALL64pcrel32 @__asan_stack_malloc_0, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit-def $rax
+    MOV64mr $rsp, 1, $noreg, 40, $noreg, killed $rax :: (store 8 into %stack.3)
+  
+  bb.2 (%ir-block.5):
+    $rax = MOV64rm $rsp, 1, $noreg, 40, $noreg :: (load 8 from %stack.3)
+    $rcx = MOV64rr $rax
+    renamable $rcx = ADD64ri8 renamable $rcx, 32, implicit-def $eflags
+    MOV64mi32 renamable $rax, 1, $noreg, 0, $noreg, 1102416563 :: (store 8 into %ir.9)
+    renamable $rdx = MOV64ri @___asan_gen_
+    MOV64mr renamable $rax, 1, $noreg, 8, $noreg, killed renamable $rdx :: (store 8 into %ir.11)
+    renamable $rdx = MOV64ri @_Z3bari
+    MOV64mr renamable $rax, 1, $noreg, 16, $noreg, killed renamable $rdx :: (store 8 into %ir.13)
+    $rdx = MOV64rr $rax
+    renamable $rdx = SHR64ri renamable $rdx, 3, implicit-def $eflags
+    $rsi = MOV64rr $rdx
+    renamable $rsi = ADD64ri32 renamable $rsi, 2147450880, implicit-def $eflags
+    renamable $rdi = MOV64ri -868083100587789839
+    MOV64mr killed renamable $rdx, 1, $noreg, 2147450880, $noreg, killed renamable $rdi :: (store 8 into %ir.17)
+    $rdx = MOV64rr $rcx
+    renamable $rdx = SHR64ri renamable $rdx, 3, implicit-def $eflags
+    renamable $r8b = MOV8rm killed renamable $rdx, 1, $noreg, 2147450880, $noreg :: (load 1 from %ir.21)
+    DBG_VALUE renamable $rcx, 0, !10, !DIExpression(DW_OP_deref), debug-location !11
+    CMP8ri renamable $r8b, 0, implicit-def $eflags
+    MOV64mr $rsp, 1, $noreg, 32, $noreg, killed $rax :: (store 8 into %stack.4)
+    MOV64mr $rsp, 1, $noreg, 24, $noreg, killed $rcx :: (store 8 into %stack.5)
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    MOV64mr $rsp, 1, $noreg, 16, $noreg, killed $rsi :: (store 8 into %stack.6)
+    MOV8mr $rsp, 1, $noreg, 15, $noreg, killed $r8b :: (store 1 into %stack.7)
+    JE_1 %bb.5, implicit $eflags
+  
+  bb.3 (%ir-block.24):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    renamable $rax = AND64ri8 renamable $rax, 7, implicit-def $eflags
+    renamable $rax = ADD64ri8 renamable $rax, 3, implicit-def $eflags
+    $cl = MOV8rr $al, implicit killed $rax
+    $dl = MOV8rm $rsp, 1, $noreg, 15, $noreg :: (load 1 from %stack.7)
+    CMP8rr killed renamable $cl, killed renamable $dl, implicit-def $eflags
+    JL_1 %bb.5, implicit $eflags
+  
+  bb.4 (%ir-block.29):
+    successors: 
+  
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rdi = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    CALL64pcrel32 @__asan_report_store4, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi
+    INLINEASM &"", 1
+  
+  bb.5 (%ir-block.30):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    $ecx = MOV32rm $rsp, 1, $noreg, 60, $noreg :: (load 4 from %stack.1)
+    MOV32mr renamable $rax, 1, $noreg, 0, $noreg, killed renamable $ecx :: (store 4 into %ir.8)
+    renamable $rax = SHR64ri renamable $rax, 3, implicit-def $eflags, debug-location !12
+    renamable $dl = MOV8rm killed renamable $rax, 1, $noreg, 2147450880, $noreg, debug-location !12 :: (load 1 from %ir.34)
+    CMP8ri renamable $dl, 0, implicit-def $eflags, debug-location !12
+    MOV8mr $rsp, 1, $noreg, 14, $noreg, killed $dl :: (store 1 into %stack.8)
+    JE_1 %bb.8, implicit $eflags, debug-location !12
+  
+  bb.6 (%ir-block.37):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    renamable $rax = AND64ri8 renamable $rax, 7, implicit-def $eflags, debug-location !12
+    renamable $rax = ADD64ri8 renamable $rax, 3, implicit-def $eflags, debug-location !12
+    $cl = MOV8rr $al, implicit killed $rax, debug-location !12
+    $dl = MOV8rm $rsp, 1, $noreg, 14, $noreg :: (load 1 from %stack.8)
+    CMP8rr killed renamable $cl, killed renamable $dl, implicit-def $eflags, debug-location !12
+    JL_1 %bb.8, implicit $eflags
+  
+  bb.7 (%ir-block.42):
+    successors: 
+  
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rdi = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    CALL64pcrel32 @__asan_report_load4, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, debug-location !12
+    INLINEASM &"", 1
+  
+  bb.8 (%ir-block.43):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 24, $noreg :: (load 8 from %stack.5)
+    renamable $ecx = MOV32rm killed renamable $rax, 1, $noreg, 0, $noreg, debug-location !12 :: (load 4 from %ir.8)
+    renamable $ecx = ADD32ri8 renamable $ecx, 2, implicit-def $eflags, debug-location !12
+    $rdx = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load 8 from %stack.4)
+    MOV64mi32 renamable $rdx, 1, $noreg, 0, $noreg, 1172321806, debug-location !12 :: (store 8 into %ir.9)
+    $rsi = MOV64rm $rsp, 1, $noreg, 48, $noreg :: (load 8 from %stack.2)
+    CMP64rr killed renamable $rdx, killed renamable $rsi, implicit-def $eflags, debug-location !12
+    MOV32mr $rsp, 1, $noreg, 8, $noreg, killed $ecx :: (store 4 into %stack.9)
+    JE_1 %bb.10, implicit $eflags, debug-location !12
+  
+  bb.9 (%ir-block.46):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    renamable $rax = MOV64ri -723401728380766731, debug-location !12
+    $rcx = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %stack.6)
+    MOV64mr killed renamable $rcx, 1, $noreg, 0, $noreg, killed renamable $rax, debug-location !12 :: (store 8 into %ir.48)
+    $rax = MOV64rm $rsp, 1, $noreg, 32, $noreg :: (load 8 from %stack.4)
+    renamable $rdx = MOV64rm killed renamable $rax, 1, $noreg, 56, $noreg, debug-location !12 :: (load 8 from %ir.50)
+    MOV8mi killed renamable $rdx, 1, $noreg, 0, $noreg, 0, debug-location !12 :: (store 1 into %ir.52)
+    JMP_1 %bb.11, debug-location !12
+  
+  bb.10 (%ir-block.53):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $rax = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %stack.6)
+    MOV64mi32 killed renamable $rax, 1, $noreg, 0, $noreg, 0, debug-location !12 :: (store 8 into %ir.55)
+  
+  bb.11 (%ir-block.56):
+    DBG_VALUE $rsp, 0, !10, !DIExpression(DW_OP_plus_uconst, 24, DW_OP_deref, DW_OP_deref), debug-location !11
+    $eax = MOV32rm $rsp, 1, $noreg, 8, $noreg :: (load 4 from %stack.9)
+    $rsp = MOV64rr $rbp, debug-location !12
+    $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !12
+    CFI_INSTRUCTION def_cfa $rsp, 8, debug-location !12
+    RETQ implicit killed $eax, debug-location !12
+
+...
diff --git a/test/DebugInfo/X86/debug-loc-offset.ll b/test/DebugInfo/X86/debug-loc-offset.ll
deleted file mode 100644
index 521282bdcd73eca0eb7ec786f146abf9ac254407..0000000000000000000000000000000000000000
--- a/test/DebugInfo/X86/debug-loc-offset.ll
+++ /dev/null
@@ -1,171 +0,0 @@
-; RUN: llc %s -filetype=obj -O0 -mtriple=i386-unknown-linux-gnu -dwarf-version=4 -o %t
-; RUN: llvm-dwarfdump -v %t | FileCheck %s
-
-; From the code:
-
-; debug-loc-offset1.cc
-; int bar (int b) {
-;   return b+4;
-; }
-
-; debug-loc-offset2.cc
-; struct A {
-;   int var;
-;   virtual char foo();
-; };
-
-; void baz(struct A a) {
-;   int z = 2;
-;   if (a.var > 2)
-;     z++;
-;   if (a.foo() == 'a')
-;     z++;
-; }
-
-; Compiled separately for i386-pc-linux-gnu and linked together.
-; This ensures that we have multiple compile units and multiple location lists
-; so that we can verify that
-; debug_loc entries are relative to the low_pc of the CU. The loc entry for
-; the byval argument in foo.cpp is in the second CU and so should have
-; an offset relative to that CU rather than from the beginning of the text
-; section.
-
-; Checking that we have two compile units with two sets of high/lo_pc.
-; CHECK: .debug_info contents
-; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_low_pc {{.*}} (0x0000000000000020)
-; CHECK: DW_AT_high_pc
-
-; CHECK: DW_TAG_subprogram
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3baz1A"
-; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
-; CHECK-NEXT:    [0x00000020, 0x00000037): DW_OP_breg0 EAX+0, DW_OP_deref
-; CHECK-NEXT:    [0x00000037, 0x00000063): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
-; CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"a"
-
-; CHECK: DW_TAG_variable
-; CHECK: DW_AT_location [DW_FORM_exprloc]
-; CHECK-NOT: DW_AT_location
-
-; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_low_pc {{.*}} (0x0000000000000000)
-; CHECK: DW_AT_high_pc
-
-; CHECK: DW_TAG_subprogram
-; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3bari"
-; CHECK-NOT: {{DW_TAG|NULL}}
-; CHECK: DW_TAG_formal_parameter
-; CHECK-NOT: DW_TAG
-; CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
-; CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
-; CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value)
-; CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"b"
-
-; CHECK: .debug_loc contents:
-; CHECK:       0x00000000:
-; CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
-; CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value
-; CHECK:       0x00000022:
-; CHECK-NEXT:    [0x00000000, 0x00000017): DW_OP_breg0 EAX+0, DW_OP_deref
-; CHECK-NEXT:    [0x00000017, 0x00000043): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
-
-%struct.A = type { i32 (...)**, i32 }
-
-; Function Attrs: nounwind
-define i32 @_Z3bari(i32 %b) #0 !dbg !4 {
-entry:
-  %b.addr = alloca i32, align 4
-  store i32 %b, i32* %b.addr, align 4
-  call void @llvm.dbg.value(metadata i32 0, metadata !21, metadata !DIExpression()), !dbg !22
-  %0 = load i32, i32* %b.addr, align 4, !dbg !23
-  call void @llvm.dbg.value(metadata i32 1, metadata !21, metadata !DIExpression()), !dbg !22
-  %add = add nsw i32 %0, 4, !dbg !23
-  ret i32 %add, !dbg !23
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-
-define void @_Z3baz1A(%struct.A* %a) #2 !dbg !14 {
-entry:
-  %z = alloca i32, align 4
-  call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !24, metadata !DIExpression(DW_OP_deref)), !dbg !25
-  call void @llvm.dbg.declare(metadata i32* %z, metadata !26, metadata !DIExpression()), !dbg !27
-  store i32 2, i32* %z, align 4, !dbg !27
-  %var = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 1, !dbg !28
-  %0 = load i32, i32* %var, align 4, !dbg !28
-  %cmp = icmp sgt i32 %0, 2, !dbg !28
-  br i1 %cmp, label %if.then, label %if.end, !dbg !28
-
-if.then:                                          ; preds = %entry
-  %1 = load i32, i32* %z, align 4, !dbg !30
-  %inc = add nsw i32 %1, 1, !dbg !30
-  store i32 %inc, i32* %z, align 4, !dbg !30
-  br label %if.end, !dbg !30
-
-if.end:                                           ; preds = %if.then, %entry
-  %call = call signext i8 @_ZN1A3fooEv(%struct.A* %a), !dbg !31
-  %conv = sext i8 %call to i32, !dbg !31
-  %cmp1 = icmp eq i32 %conv, 97, !dbg !31
-  br i1 %cmp1, label %if.then2, label %if.end4, !dbg !31
-
-if.then2:                                         ; preds = %if.end
-  %2 = load i32, i32* %z, align 4, !dbg !33
-  %inc3 = add nsw i32 %2, 1, !dbg !33
-  store i32 %inc3, i32* %z, align 4, !dbg !33
-  br label %if.end4, !dbg !33
-
-if.end4:                                          ; preds = %if.then2, %if.end
-  ret void, !dbg !34
-}
-
-declare signext i8 @_ZN1A3fooEv(%struct.A*) #2
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.dbg.cu = !{!0, !9}
-!llvm.module.flags = !{!18, !19}
-!llvm.ident = !{!20, !20}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (210479)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
-!1 = !DIFile(filename: "debug-loc-offset1.cc", directory: "/llvm_cmake_gcc")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
-!5 = !DIFile(filename: "debug-loc-offset1.cc", directory: "/llvm_cmake_gcc")
-!6 = !DISubroutineType(types: !7)
-!7 = !{!8, !8}
-!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!9 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (210479)", isOptimized: false, emissionKind: FullDebug, file: !10, enums: !2, retainedTypes: !11, globals: !2, imports: !2)
-!10 = !DIFile(filename: "debug-loc-offset2.cc", directory: "/llvm_cmake_gcc")
-!11 = !{!12}
-!12 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", line: 1, flags: DIFlagFwdDecl, file: !10, identifier: "_ZTS1A")
-!14 = distinct !DISubprogram(name: "baz", linkageName: "_Z3baz1A", line: 6, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !9, scopeLine: 6, file: !10, scope: !15, type: !16, retainedNodes: !2)
-!15 = !DIFile(filename: "debug-loc-offset2.cc", directory: "/llvm_cmake_gcc")
-!16 = !DISubroutineType(types: !17)
-!17 = !{null, !12}
-!18 = !{i32 2, !"Dwarf Version", i32 4}
-!19 = !{i32 2, !"Debug Info Version", i32 3}
-!20 = !{!"clang version 3.5.0 (210479)"}
-!21 = !DILocalVariable(name: "b", line: 1, arg: 1, scope: !4, file: !5, type: !8)
-!22 = !DILocation(line: 1, scope: !4)
-!23 = !DILocation(line: 2, scope: !4)
-!24 = !DILocalVariable(name: "a", line: 6, arg: 1, scope: !14, file: !15, type: !12)
-!25 = !DILocation(line: 6, scope: !14)
-!26 = !DILocalVariable(name: "z", line: 7, scope: !14, file: !15, type: !8)
-!27 = !DILocation(line: 7, scope: !14)
-!28 = !DILocation(line: 8, scope: !29)
-!29 = distinct !DILexicalBlock(line: 8, column: 0, file: !10, scope: !14)
-!30 = !DILocation(line: 9, scope: !29)
-!31 = !DILocation(line: 10, scope: !32)
-!32 = distinct !DILexicalBlock(line: 10, column: 0, file: !10, scope: !14)
-!33 = !DILocation(line: 11, scope: !32)
-!34 = !DILocation(line: 12, scope: !14)
diff --git a/test/DebugInfo/X86/debug-loc-offset.mir b/test/DebugInfo/X86/debug-loc-offset.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c5f80d062970e02321981c11f485ed8f6a6cc130
--- /dev/null
+++ b/test/DebugInfo/X86/debug-loc-offset.mir
@@ -0,0 +1,276 @@
+# RUN: llc -o - %s -start-after=patchable-function -filetype=obj -O0 -mtriple=i386-unknown-linux-gnu -dwarf-version=4 | llvm-dwarfdump -v - | FileCheck %s
+
+# From the code:
+#
+# debug-loc-offset1.cc
+# int bar (int b) {
+#   return b+4;
+# }
+#
+# debug-loc-offset2.cc
+# struct A {
+#   int var;
+#   virtual char foo();
+# };
+#
+# void baz(struct A a) {
+#   int z = 2;
+#   if (a.var > 2)
+#     z++;
+#   if (a.foo() == 'a')
+#     z++;
+# }
+#
+# Compiled separately for i386-pc-linux-gnu and linked together.
+# This ensures that we have multiple compile units and multiple location lists
+# so that we can verify that
+# debug_loc entries are relative to the low_pc of the CU. The loc entry for
+# the byval argument in foo.cpp is in the second CU and so should have
+# an offset relative to that CU rather than from the beginning of the text
+# section.
+#
+# Checking that we have two compile units with two sets of high/lo_pc.
+# CHECK: .debug_info contents
+# CHECK: DW_TAG_compile_unit
+# CHECK: DW_AT_low_pc {{.*}} (0x0000000000000020)
+# CHECK: DW_AT_high_pc
+#
+# CHECK: DW_TAG_subprogram
+# CHECK-NOT: DW_TAG
+# CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3baz1A"
+# CHECK-NOT: {{DW_TAG|NULL}}
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NOT: DW_TAG
+# CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
+# CHECK-NEXT:    [0x00000020, 0x00000037): DW_OP_breg0 EAX+0, DW_OP_deref
+# CHECK-NEXT:    [0x00000037, 0x00000063): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
+# CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"a"
+#
+# CHECK: DW_TAG_variable
+# CHECK: DW_AT_location [DW_FORM_exprloc]
+# CHECK-NOT: DW_AT_location
+#
+# CHECK: DW_TAG_compile_unit
+# CHECK: DW_AT_low_pc {{.*}} (0x0000000000000000)
+# CHECK: DW_AT_high_pc
+#
+# CHECK: DW_TAG_subprogram
+# CHECK-NOT: DW_TAG
+# CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3bari"
+# CHECK-NOT: {{DW_TAG|NULL}}
+# CHECK: DW_TAG_formal_parameter
+# CHECK-NOT: DW_TAG
+# CHECK:       DW_AT_location [DW_FORM_sec_offset]   ({{.*}}
+# CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
+# CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value)
+# CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}}"b"
+#
+# CHECK: .debug_loc contents:
+# CHECK:       0x00000000:
+# CHECK-NEXT:    [0x00000000, 0x0000000a): DW_OP_consts +0, DW_OP_stack_value
+# CHECK-NEXT:    [0x0000000a, 0x00000017): DW_OP_consts +1, DW_OP_stack_value
+# CHECK:       0x00000022:
+# CHECK-NEXT:    [0x00000000, 0x00000017): DW_OP_breg0 EAX+0, DW_OP_deref
+# CHECK-NEXT:    [0x00000017, 0x00000043): DW_OP_breg5 EBP-8, DW_OP_deref, DW_OP_deref
+--- |
+  target triple = "i386-unknown-linux-gnu"
+  
+  %struct.A = type { i32 (...)**, i32 }
+  
+  ; Function Attrs: nounwind
+  define i32 @_Z3bari(i32 %b) #0 !dbg !10 {
+  entry:
+    %b.addr = alloca i32, align 4
+    store i32 %b, i32* %b.addr, align 4
+    call void @llvm.dbg.value(metadata i32 0, metadata !14, metadata !DIExpression()), !dbg !15
+    %0 = load i32, i32* %b.addr, align 4, !dbg !16
+    call void @llvm.dbg.value(metadata i32 1, metadata !14, metadata !DIExpression()), !dbg !15
+    %add = add nsw i32 %0, 4, !dbg !16
+    ret i32 %add, !dbg !16
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+  
+  define void @_Z3baz1A(%struct.A* %a) #2 !dbg !17 {
+  entry:
+    %z = alloca i32, align 4
+    call void @llvm.dbg.declare(metadata %struct.A* %a, metadata !20, metadata !DIExpression(DW_OP_deref)), !dbg !21
+    call void @llvm.dbg.declare(metadata i32* %z, metadata !22, metadata !DIExpression()), !dbg !23
+    store i32 2, i32* %z, align 4, !dbg !23
+    %var = getelementptr inbounds %struct.A, %struct.A* %a, i32 0, i32 1, !dbg !24
+    %0 = load i32, i32* %var, align 4, !dbg !24
+    %cmp = icmp sgt i32 %0, 2, !dbg !24
+    br i1 %cmp, label %if.then, label %if.end, !dbg !24
+  
+  if.then:                                          ; preds = %entry
+    %1 = load i32, i32* %z, align 4, !dbg !26
+    %inc = add nsw i32 %1, 1, !dbg !26
+    store i32 %inc, i32* %z, align 4, !dbg !26
+    br label %if.end, !dbg !26
+  
+  if.end:                                           ; preds = %if.then, %entry
+    %call = call signext i8 @_ZN1A3fooEv(%struct.A* %a), !dbg !27
+    %conv = sext i8 %call to i32, !dbg !27
+    %cmp1 = icmp eq i32 %conv, 97, !dbg !27
+    br i1 %cmp1, label %if.then2, label %if.end4, !dbg !27
+  
+  if.then2:                                         ; preds = %if.end
+    %2 = load i32, i32* %z, align 4, !dbg !29
+    %inc3 = add nsw i32 %2, 1, !dbg !29
+    store i32 %inc3, i32* %z, align 4, !dbg !29
+    br label %if.end4, !dbg !29
+  
+  if.end4:                                          ; preds = %if.then2, %if.end
+    ret void, !dbg !30
+  }
+  
+  declare signext i8 @_ZN1A3fooEv(%struct.A*) #2
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+  
+  attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0, !3}
+  !llvm.module.flags = !{!7, !8}
+  !llvm.ident = !{!9, !9}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5.0 (210479)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+  !1 = !DIFile(filename: "debug-loc-offset1.cc", directory: "/llvm_cmake_gcc")
+  !2 = !{}
+  !3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 3.5.0 (210479)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !2, imports: !2)
+  !4 = !DIFile(filename: "debug-loc-offset2.cc", directory: "/llvm_cmake_gcc")
+  !5 = !{!6}
+  !6 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !4, line: 1, flags: DIFlagFwdDecl, identifier: "_ZTS1A")
+  !7 = !{i32 2, !"Dwarf Version", i32 4}
+  !8 = !{i32 2, !"Debug Info Version", i32 3}
+  !9 = !{!"clang version 3.5.0 (210479)"}
+  !10 = distinct !DISubprogram(name: "bar", linkageName: "_Z3bari", scope: !1, file: !1, line: 1, type: !11, isLocal: false, isDefinition: true, scopeLine: 1, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !11 = !DISubroutineType(types: !12)
+  !12 = !{!13, !13}
+  !13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !14 = !DILocalVariable(name: "b", arg: 1, scope: !10, file: !1, line: 1, type: !13)
+  !15 = !DILocation(line: 1, scope: !10)
+  !16 = !DILocation(line: 2, scope: !10)
+  !17 = distinct !DISubprogram(name: "baz", linkageName: "_Z3baz1A", scope: !4, file: !4, line: 6, type: !18, isLocal: false, isDefinition: true, scopeLine: 6, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, retainedNodes: !2)
+  !18 = !DISubroutineType(types: !19)
+  !19 = !{null, !6}
+  !20 = !DILocalVariable(name: "a", arg: 1, scope: !17, file: !4, line: 6, type: !6)
+  !21 = !DILocation(line: 6, scope: !17)
+  !22 = !DILocalVariable(name: "z", scope: !17, file: !4, line: 7, type: !13)
+  !23 = !DILocation(line: 7, scope: !17)
+  !24 = !DILocation(line: 8, scope: !25)
+  !25 = distinct !DILexicalBlock(scope: !17, file: !4, line: 8)
+  !26 = !DILocation(line: 9, scope: !25)
+  !27 = !DILocation(line: 10, scope: !28)
+  !28 = distinct !DILexicalBlock(scope: !17, file: !4, line: 10)
+  !29 = !DILocation(line: 11, scope: !28)
+  !30 = !DILocation(line: 12, scope: !17)
+
+...
+---
+name:            _Z3bari
+alignment:       4
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       8
+  offsetAdjustment: -4
+  maxAlignment:    4
+  maxCallFrameSize: 0
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: 0 }
+  - { id: 1, size: 4, alignment: 16, stack-id: 0 }
+stack:           
+  - { id: 0, type: spill-slot, offset: -12, size: 4, alignment: 4, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    frame-setup PUSH32r killed $ebp, implicit-def $esp, implicit $esp
+    CFI_INSTRUCTION def_cfa_offset 8
+    CFI_INSTRUCTION offset $ebp, -8
+    $ebp = frame-setup MOV32rr $esp
+    CFI_INSTRUCTION def_cfa_register $ebp
+    frame-setup PUSH32r undef $eax, implicit-def $esp, implicit $esp
+    renamable $eax = MOV32rm $ebp, 1, $noreg, 8, $noreg :: (load 4 from %fixed-stack.1)
+    DBG_VALUE 0, 0, !14, !DIExpression(), debug-location !15
+    renamable $ecx = MOV32rm $ebp, 1, $noreg, 8, $noreg, debug-location !16 :: (load 4 from %ir.b.addr)
+    DBG_VALUE 1, 0, !14, !DIExpression(), debug-location !15
+    renamable $ecx = ADD32ri8 renamable $ecx, 4, implicit-def $eflags, debug-location !16
+    MOV32mr $ebp, 1, $noreg, -4, $noreg, killed $eax :: (store 4 into %fixed-stack.1)
+    $eax = MOV32rr killed $ecx, debug-location !16
+    $esp = frame-destroy ADD32ri8 $esp, 4, implicit-def dead $eflags, debug-location !16
+    $ebp = frame-destroy POP32r implicit-def $esp, implicit $esp, debug-location !16
+    CFI_INSTRUCTION def_cfa $esp, 4, debug-location !16
+    RETL implicit killed $eax, debug-location !16
+
+...
+---
+name:            _Z3baz1A
+alignment:       4
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       28
+  offsetAdjustment: -24
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 4
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -8, size: 4, alignment: 8, stack-id: 0 }
+  - { id: 1, size: 4, alignment: 16, stack-id: 0, isImmutable: true }
+stack:           
+  - { id: 0, name: z, offset: -12, size: 4, alignment: 4, stack-id: 0, 
+      debug-info-variable: '!22', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!23' }
+  - { id: 1, type: spill-slot, offset: -16, size: 4, alignment: 4, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    frame-setup PUSH32r killed $ebp, implicit-def $esp, implicit $esp
+    CFI_INSTRUCTION def_cfa_offset 8
+    CFI_INSTRUCTION offset $ebp, -8
+    $ebp = frame-setup MOV32rr $esp
+    CFI_INSTRUCTION def_cfa_register $ebp
+    $esp = frame-setup SUB32ri8 $esp, 24, implicit-def dead $eflags
+    renamable $eax = MOV32rm $ebp, 1, $noreg, 8, $noreg :: (load 4 from %fixed-stack.1)
+    DBG_VALUE renamable $eax, 0, !20, !DIExpression(DW_OP_deref), debug-location !21
+    MOV32mi $ebp, 1, $noreg, -4, $noreg, 2, debug-location !23 :: (store 4 into %ir.z)
+    CMP32mi8 renamable $eax, 1, $noreg, 4, $noreg, 2, implicit-def $eflags, debug-location !24 :: (load 4 from %ir.var)
+    MOV32mr $ebp, 1, $noreg, -8, $noreg, killed $eax :: (store 4 into %stack.1)
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    JLE_1 %bb.2, implicit $eflags, debug-location !24
+  
+  bb.1.if.then:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    renamable $eax = MOV32rm $ebp, 1, $noreg, -4, $noreg, debug-location !26 :: (load 4 from %ir.z)
+    renamable $eax = ADD32ri8 renamable $eax, 1, implicit-def $eflags, debug-location !26
+    MOV32mr $ebp, 1, $noreg, -4, $noreg, killed renamable $eax, debug-location !26 :: (store 4 into %ir.z)
+  
+  bb.2.if.end:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    $eax = MOV32rm $ebp, 1, $noreg, -8, $noreg :: (load 4 from %stack.1)
+    MOV32mr $esp, 1, $noreg, 0, $noreg, killed renamable $eax, debug-location !27 :: (store 4 into stack)
+    CALLpcrel32 @_ZN1A3fooEv, csr_32, implicit $esp, implicit $ssp, implicit-def $al, debug-location !27
+    renamable $ecx = MOVSX32rr8 killed renamable $al, debug-location !27
+    CMP32ri8 killed renamable $ecx, 97, implicit-def $eflags, debug-location !27
+    JNE_1 %bb.4, implicit $eflags, debug-location !27
+  
+  bb.3.if.then2:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    renamable $eax = MOV32rm $ebp, 1, $noreg, -4, $noreg, debug-location !29 :: (load 4 from %ir.z)
+    renamable $eax = ADD32ri8 renamable $eax, 1, implicit-def $eflags, debug-location !29
+    MOV32mr $ebp, 1, $noreg, -4, $noreg, killed renamable $eax, debug-location !29 :: (store 4 into %ir.z)
+  
+  bb.4.if.end4:
+    DBG_VALUE $ebp, 0, !20, !DIExpression(DW_OP_constu, 8, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !21
+    $esp = frame-destroy ADD32ri8 $esp, 24, implicit-def dead $eflags, debug-location !30
+    $ebp = frame-destroy POP32r implicit-def $esp, implicit $esp, debug-location !30
+    CFI_INSTRUCTION def_cfa $esp, 4, debug-location !30
+    RETL debug-location !30
+
+...
diff --git a/test/DebugInfo/X86/debug_addr.ll b/test/DebugInfo/X86/debug_addr.ll
index ea7c8bda7bc4dde0be86bd4e26dbffab8140ce4c..b50428a282cf856bce395eac4282cea6c799e5b6 100644
--- a/test/DebugInfo/X86/debug_addr.ll
+++ b/test/DebugInfo/X86/debug_addr.ll
@@ -31,7 +31,8 @@
 ; DWARF5: DW_TAG_compile_unit
 ; DWARF5-NOT: DW_TAG_{{.*}}
 ; DWARF5: DW_AT_GNU_dwo_name{{.*}}test.dwo
-; DWARF5: DW_AT_GNU_addr_base{{.*}}0x00000008
+; DWARF5: DW_AT_addr_base{{.*}}0x00000008
+; DWARF5: DW_AT_low_pc [DW_FORM_addrx] ( indexed (00000000) address = 0x0000000000000000)
 ; DWARF5: .debug_addr contents:
 ; DWARF5-NEXT: 0x00000000: Addr Section: length = 0x0000000c, version = 0x0005, addr_size = 0x04, seg_size = 0x00
 ; DWARF5-NEXT: Addrs: [
diff --git a/test/DebugInfo/X86/dw_op_minus.ll b/test/DebugInfo/X86/dw_op_minus.ll
deleted file mode 100644
index 8013c2cd0237197a5ebd098eba60bdcd9041a6b5..0000000000000000000000000000000000000000
--- a/test/DebugInfo/X86/dw_op_minus.ll
+++ /dev/null
@@ -1,80 +0,0 @@
-; Test dwarf codegen of DW_OP_minus.
-; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-; This was built by compiling the following source with SafeStack and
-; simplifying the result a little.
-; extern "C" {
-; void Capture(int *);
-; void f() {
-;   int buf[100];
-;   Capture(buf);
-; }
-; }
-; The interesting part is !DIExpression(DW_OP_constu, 400, DW_OP_minus)
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@__safestack_unsafe_stack_ptr = external thread_local(initialexec) global i8*
-
-define void @f() !dbg !4 {
-entry:
-  %unsafe_stack_ptr = load i8*, i8** @__safestack_unsafe_stack_ptr
-  %unsafe_stack_static_top = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
-  store i8* %unsafe_stack_static_top, i8** @__safestack_unsafe_stack_ptr
-  %0 = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
-  %buf = bitcast i8* %0 to [100 x i32]*
-  %1 = bitcast [100 x i32]* %buf to i8*, !dbg !16
-  call void @llvm.dbg.declare(metadata i8* %unsafe_stack_ptr, metadata !8, metadata !17), !dbg !18
-  %arraydecay = getelementptr inbounds [100 x i32], [100 x i32]* %buf, i64 0, i64 0, !dbg !19
-  call void @Capture(i32* %arraydecay), !dbg !20
-  store i8* %unsafe_stack_ptr, i8** @__safestack_unsafe_stack_ptr, !dbg !21
-  ret void, !dbg !21
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
-
-declare void @Capture(i32*)
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!13, !14}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
-!1 = !DIFile(filename: "1.cc", directory: "/tmp")
-!2 = !{}
-!4 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !5, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !7)
-!5 = !DISubroutineType(types: !6)
-!6 = !{null}
-!7 = !{!8}
-!8 = !DILocalVariable(name: "buf", scope: !4, file: !1, line: 5, type: !9)
-!9 = !DICompositeType(tag: DW_TAG_array_type, baseType: !10, size: 3200, align: 32, elements: !11)
-!10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!11 = !{!12}
-!12 = !DISubrange(count: 100)
-!13 = !{i32 2, !"Dwarf Version", i32 4}
-!14 = !{i32 2, !"Debug Info Version", i32 3}
-!15 = !{!"clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)"}
-!16 = !DILocation(line: 5, column: 3, scope: !4)
-!17 = !DIExpression(DW_OP_constu, 400, DW_OP_minus)
-!18 = !DILocation(line: 5, column: 7, scope: !4)
-!19 = !DILocation(line: 6, column: 11, scope: !4)
-!20 = !DILocation(line: 6, column: 3, scope: !4)
-!21 = !DILocation(line: 7, column: 1, scope: !4)
-
-; RCX - 400
-; CHECK:      .short	3                       # Loc expr size
-; CHECK-NEXT: .byte	114                     # DW_OP_breg2
-; CHECK-NEXT: .byte	240                     # -400
-; CHECK-NEXT: .byte	124
-
-; RCX is clobbered in call @Capture, but there is a spilled copy.
-; *(RSP + 8) - 400
-; CHECK:      .short	7                       # Loc expr size
-; CHECK-NEXT: .byte	119                     # DW_OP_breg7
-; CHECK-NEXT: .byte	8                       # 8
-; CHECK-NEXT: .byte	6                       # DW_OP_deref
-; CHECK-NEXT: .byte	16                      # DW_OP_constu
-; CHECK-NEXT: .byte	144                     # 400
-; CHECK-NEXT: .byte	3                       #
-; CHECK-NEXT: .byte	28                      # DW_OP_minus
diff --git a/test/DebugInfo/X86/dw_op_minus.mir b/test/DebugInfo/X86/dw_op_minus.mir
new file mode 100644
index 0000000000000000000000000000000000000000..574e5aed44257ae3b68a19165f001f418a41a20e
--- /dev/null
+++ b/test/DebugInfo/X86/dw_op_minus.mir
@@ -0,0 +1,119 @@
+# RUN: llc -o - %s -start-after=patchable-function -O0 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+# Test dwarf codegen of DW_OP_minus.
+#
+# This was built by compiling the following source with SafeStack and
+# simplifying the result a little.
+# extern "C" {
+# void Capture(int *);
+# void f() {
+#   int buf[100];
+#   Capture(buf);
+# }
+# }
+# The interesting part is !DIExpression(DW_OP_constu, 400, DW_OP_minus)
+#
+# RCX - 400
+# CHECK:      .short    3                       # Loc expr size
+# CHECK-NEXT: .byte     114                     # DW_OP_breg2
+# CHECK-NEXT: .byte     240                     # -400
+# CHECK-NEXT: .byte     124
+#
+# RCX is clobbered in call @Capture, but there is a spilled copy.
+# *(RSP + 8) - 400
+# CHECK:      .short    7                       # Loc expr size
+# CHECK-NEXT: .byte     119                     # DW_OP_breg7
+# CHECK-NEXT: .byte     8                       # 8
+# CHECK-NEXT: .byte     6                       # DW_OP_deref
+# CHECK-NEXT: .byte     16                      # DW_OP_constu
+# CHECK-NEXT: .byte     144                     # 400
+# CHECK-NEXT: .byte     3                       #
+# CHECK-NEXT: .byte     28                      # DW_OP_minus
+--- |
+  @__safestack_unsafe_stack_ptr = external thread_local(initialexec) global i8*
+  
+  define void @f() !dbg !5 {
+  entry:
+    %unsafe_stack_ptr = load i8*, i8** @__safestack_unsafe_stack_ptr
+    %unsafe_stack_static_top = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
+    store i8* %unsafe_stack_static_top, i8** @__safestack_unsafe_stack_ptr
+    %0 = getelementptr i8, i8* %unsafe_stack_ptr, i32 -400
+    %buf = bitcast i8* %0 to [100 x i32]*
+    %1 = bitcast [100 x i32]* %buf to i8*, !dbg !14
+    call void @llvm.dbg.declare(metadata i8* %unsafe_stack_ptr, metadata !9, metadata !DIExpression(DW_OP_constu, 400, DW_OP_minus)), !dbg !15
+    %arraydecay = getelementptr inbounds [100 x i32], [100 x i32]* %buf, i64 0, i64 0, !dbg !16
+    call void @Capture(i32* %arraydecay), !dbg !17
+    store i8* %unsafe_stack_ptr, i8** @__safestack_unsafe_stack_ptr, !dbg !18
+    ret void, !dbg !18
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+  
+  declare void @Capture(i32*)
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { nounwind readnone speculatable }
+  attributes #1 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.8.0 (trunk 248518) (llvm/trunk 248512)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "1.cc", directory: "/tmp")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8)
+  !6 = !DISubroutineType(types: !7)
+  !7 = !{null}
+  !8 = !{!9}
+  !9 = !DILocalVariable(name: "buf", scope: !5, file: !1, line: 5, type: !10)
+  !10 = !DICompositeType(tag: DW_TAG_array_type, baseType: !11, size: 3200, align: 32, elements: !12)
+  !11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !12 = !{!13}
+  !13 = !DISubrange(count: 100)
+  !14 = !DILocation(line: 5, column: 3, scope: !5)
+  !15 = !DILocation(line: 5, column: 7, scope: !5)
+  !16 = !DILocation(line: 6, column: 11, scope: !5)
+  !17 = !DILocation(line: 6, column: 3, scope: !5)
+  !18 = !DILocation(line: 7, column: 1, scope: !5)
+
+...
+---
+name:            f
+alignment:       4
+tracksRegLiveness: true
+frameInfo:       
+  stackSize:       24
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+stack:           
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    $rsp = frame-setup SUB64ri8 $rsp, 24, implicit-def dead $eflags
+    CFI_INSTRUCTION def_cfa_offset 32
+    renamable $rax = MOV64rm $rip, 1, $noreg, target-flags(x86-gottpoff) @__safestack_unsafe_stack_ptr, $noreg :: (load 8 from got)
+    renamable $rcx = MOV64rm renamable $rax, 1, $noreg, 0, $fs :: (dereferenceable load 8 from @__safestack_unsafe_stack_ptr)
+    DBG_VALUE renamable $rcx, 0, !9, !DIExpression(DW_OP_constu, 400, DW_OP_minus), debug-location !15
+    $rdx = MOV64rr $rcx
+    renamable $rdx = ADD64ri32 renamable $rdx, -400, implicit-def dead $eflags
+    MOV64mr renamable $rax, 1, $noreg, 0, $fs, renamable $rdx :: (store 8 into @__safestack_unsafe_stack_ptr)
+    $rdi = MOV64rr killed $rdx, debug-location !17
+    MOV64mr $rsp, 1, $noreg, 16, $noreg, killed $rax :: (store 8 into %stack.0)
+    MOV64mr $rsp, 1, $noreg, 8, $noreg, killed $rcx :: (store 8 into %stack.1)
+    DBG_VALUE $rsp, 0, !9, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_deref, DW_OP_constu, 400, DW_OP_minus), debug-location !15
+    CALL64pcrel32 @Capture, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, debug-location !17
+    $rax = MOV64rm $rsp, 1, $noreg, 16, $noreg :: (load 8 from %stack.0)
+    $rcx = MOV64rm $rsp, 1, $noreg, 8, $noreg :: (load 8 from %stack.1)
+    MOV64mr killed renamable $rax, 1, $noreg, 0, $fs, killed renamable $rcx, debug-location !18 :: (store 8 into @__safestack_unsafe_stack_ptr)
+    $rsp = frame-destroy ADD64ri8 $rsp, 24, implicit-def dead $eflags, debug-location !18
+    CFI_INSTRUCTION def_cfa_offset 8, debug-location !18
+    RETQ debug-location !18
+
+...
diff --git a/test/DebugInfo/X86/dwarf-no-source-loc.ll b/test/DebugInfo/X86/dwarf-no-source-loc.ll
index 19695ab126b592912781492ad2d512660ee75a11..60d50a391a1b657d106b00d08a8852b8a2490b0d 100644
--- a/test/DebugInfo/X86/dwarf-no-source-loc.ll
+++ b/test/DebugInfo/X86/dwarf-no-source-loc.ll
@@ -40,14 +40,15 @@ if.end:                                           ; preds = %if.then, %entry
   ret void, !dbg !14
 }
 
-; CHECK:      .loc 1 7 7 prologue_end
+; CHECK:      .loc 1 7 7
 ; CHECK-NOT:  .loc
-; CHECK:      # %bb.1
-; CHECK-NEXT: .file 2 "/tests{{[/\]+}}include.h"
-; CHECK-NEXT: .loc 2 20 5
+; CHECK:      .loc 1 0 7 is_stmt 0
 ; CHECK-NOT:  .loc
+; CHECK:      .loc 2 20 5 is_stmt 1
 ; CHECK:      .LBB0_2:
-; CHECK:      .loc 1 10 3
+; CHECK-NEXT: .loc 2 0 5 is_stmt 0
+; CHECK-NOT:  .loc
+; CHECK:      .loc 1 10 3 is_stmt 1
 ;
 ; DISABLE-NOT: .loc 1 0
 
diff --git a/test/DebugInfo/X86/dwarf-no-source-loc.mir b/test/DebugInfo/X86/dwarf-no-source-loc.mir
deleted file mode 100644
index f6ad6ee6d4c8d98e1f9b65d05f378cccf2f24eaa..0000000000000000000000000000000000000000
--- a/test/DebugInfo/X86/dwarf-no-source-loc.mir
+++ /dev/null
@@ -1,74 +0,0 @@
-# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Default | FileCheck %s --check-prefixes=CHECK,DEFAULT
-# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Enable | FileCheck %s --check-prefixes=CHECK,ENABLE
-# RUN: llc -o - %s -start-before=patchable-function -use-unknown-locations=Disable | FileCheck %s --check-prefixes=CHECK,DISABLE
---- |
-  target triple = "x86_64--"
-  
-  !0 = !DIFile(filename: "dwarf-no-source-loc.mir", directory: "/")
-  !1 = distinct !DICompileUnit(file: !0, language: DW_LANG_C, emissionKind: LineTablesOnly)
-  !2 = distinct !DISubprogram(name: "func", unit: !1)
-  !3 = !DILocation(line: 17, scope: !2)
-  !4 = !DILocation(line: 42, scope: !2)
-
-  !llvm.dbg.cu = !{!1}
-  !llvm.module.flags = !{!10, !11}
-  !10 = !{i32 2, !"Dwarf Version", i32 4}
-  !11 = !{i32 2, !"Debug Info Version", i32 3}
-  
-  define void @func() !dbg !2 {
-    unreachable
-  }
-...
----
-name: func
-body: |
-  bb.0:
-    NOOP
-    NOOP
-    $eax = MOV32ri 1, debug-location !3
-    ; CHECK-LABEL: bb.0
-    ; CHECK: nop
-    ; CHECK: nop
-    ; CHECK: .loc 1 17 0 prologue_end
-    ; CHECK: movl $1, %eax
-
-  bb.1:
-    NOOP
-    $ebx = MOV32ri 2, debug-location !4
-    ; CHECK-LABEL: bb.1
-    ; DEFAULT: .loc 1 42 0
-    ; ENABLE: .loc 1 0
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: nop
-    ; ENABLE: .loc 1 42 0
-    ; CHECK: movl $2, %ebx
-
-  bb.2:
-    NOOP
-    ; CHECK-LABEL: bb.2
-    ; DEFAULT: .loc 1 0 0 is_stmt 0
-    ; ENABLE: .loc 1 0 0 is_stmt 0
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: nop
-
-  bb.3:
-    NOOP
-    $ecx = MOV32ri 3, debug-location !3
-    ; CHECK-LABEL: bb.3
-    ; CHECK: nop
-    ; DEFAULT: .loc 1 17 0 is_stmt 1
-    ; ENABLE: .loc 1 17 0 is_stmt 1
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: movl $3, %ecx
-
-  bb.4:
-    NOOP
-    $edx = MOV32ri 4, debug-location !4
-    ; CHECK: bb.4
-    ; DEFAULT: .loc 1 42 0
-    ; ENABLE: .loc 1 0 0 is_stmt 0
-    ; DISABLE-NOT: .loc 1 0
-    ; CHECK: nop
-    ; ENABLE: .loc 1 42 0 is_stmt 1
-    ; CHECK: movl $4, %edx
-...
diff --git a/test/DebugInfo/X86/dwarfdump-debug-loclists.test b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
new file mode 100644
index 0000000000000000000000000000000000000000..669607fe557a36a3ed0465f0b076c90094474ca1
--- /dev/null
+++ b/test/DebugInfo/X86/dwarfdump-debug-loclists.test
@@ -0,0 +1,167 @@
+# RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# CHECK:      .debug_info
+# CHECK:       DW_AT_name{{.*}}"stub"
+# CHECK:       DW_AT_location [DW_FORM_sec_offset]   (0x0000000c
+# CHECK-NEXT:    [0x0000000000000010, 0x0000000000000020): DW_OP_breg5 RDI+0
+# CHECK-NEXT:    [0x0000000000000530, 0x0000000000000540): DW_OP_breg6 RBP-8, DW_OP_deref
+# CHECK-NEXT:    [0x0000000000000700, 0x0000000000000710): DW_OP_breg5 RDI+0
+
+# CHECK:      .debug_loclists contents:
+# CHECK-NEXT: 0x00000000: locations list header: length = 0x0000002f, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+# CHECK-NEXT: 0x00000000:
+# CHECK-NEXT:   [0x0000000000000000, 0x0000000000000010): DW_OP_breg5 RDI+0
+# CHECK-NEXT:   [0x0000000000000530, 0x0000000000000540): DW_OP_breg6 RBP-8, DW_OP_deref
+# CHECK-NEXT:   [0x0000000000000700, 0x0000000000000710): DW_OP_breg5 RDI+0
+
+.section  .debug_str,"MS",@progbits,1
+  .asciz  "stub"
+
+.section  .debug_str_offsets,"",@progbits
+  .long  68
+  .short  5
+  .short  0
+.Lstr_offsets_base0:
+  .zero 64
+
+.section  .debug_loclists,"",@progbits
+  .long  .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0
+.Ldebug_loclist_table_start0:
+ .short 5                        # Version.
+ .byte 8                         # Address size.
+ .byte 0                         # Segmen selector size.
+ .long 0                         # Offset entry count.
+.Lloclists_table_base0:
+.Ldebug_loc0:
+  .byte  4                       # DW_LLE_offset_pair
+  .uleb128 0x0                   #   starting offset
+  .uleb128 0x10                  #   ending offset
+  .short  2                      # Loc expr size
+  .byte  117                     # DW_OP_breg5
+  .byte  0                       # 0
+  
+  .byte  6                       # DW_LLE_base_address
+  .quad  0x500                   # Some address
+  
+  .byte  4                       # DW_LLE_offset_pair
+  .uleb128 0x30                  #   starting offset
+  .uleb128 0x40                  #   ending offset
+  .short  3                      # Loc expr size
+  .byte  118                     # DW_OP_breg6
+  .byte  120                     # -8
+  .byte  6                       # DW_OP_deref
+
+  .byte  8                       # DW_LLE_start_length
+  .quad  0x700                   # Some address
+  .uleb128 0x10                  #   length
+  .short  2                      # Loc expr size
+  .byte  117                     # DW_OP_breg5
+  .byte  0                       # 0
+  
+  .byte  0                       # DW_LLE_end_of_list
+
+.Ldebug_loclist_table_end0:
+
+.section  .debug_abbrev,"",@progbits
+  .byte  1                       # Abbreviation Code
+  .byte  17                      # DW_TAG_compile_unit
+  .byte  1                       # DW_CHILDREN_yes
+  .byte  37                      # DW_AT_producer
+  .byte  37                      # DW_FORM_strx1
+  .byte  19                      # DW_AT_language
+  .byte  5                       # DW_FORM_data2
+  .byte  3                       # DW_AT_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  114                     # DW_AT_str_offsets_base
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  16                      # DW_AT_stmt_list
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  27                      # DW_AT_comp_dir
+  .byte  37                      # DW_FORM_strx1
+  .byte  17                      # DW_AT_low_pc
+  .byte  1                       # DW_FORM_addr
+  .byte  18                      # DW_AT_high_pc
+  .byte  6                       # DW_FORM_data4
+  .ascii  "\214\001"             # DW_AT_loclists_base
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  0                       # EOM(1)
+  .byte  0                       # EOM(2)
+  .byte  2                       # Abbreviation Code
+  .byte  46                      # DW_TAG_subprogram
+  .byte  1                       # DW_CHILDREN_yes
+  .byte  17                      # DW_AT_low_pc
+  .byte  1                       # DW_FORM_addr
+  .byte  18                      # DW_AT_high_pc
+  .byte  6                       # DW_FORM_data4
+  .byte  64                      # DW_AT_frame_base
+  .byte  24                      # DW_FORM_exprloc
+  .byte  110                     # DW_AT_linkage_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  3                       # DW_AT_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  58                      # DW_AT_decl_file
+  .byte  11                      # DW_FORM_data1
+  .byte  59                      # DW_AT_decl_line
+  .byte  11                      # DW_FORM_data1
+  .byte  63                      # DW_AT_external
+  .byte  25                      # DW_FORM_flag_present
+  .byte  0                       # EOM(1)
+  .byte  0                       # EOM(2)
+  .byte  3                       # Abbreviation Code
+  .byte  52                      # DW_TAG_variable
+  .byte  0                       # DW_CHILDREN_no
+  .byte  2                       # DW_AT_location
+  .byte  23                      # DW_FORM_sec_offset
+  .byte  3                       # DW_AT_name
+  .byte  37                      # DW_FORM_strx1
+  .byte  58                      # DW_AT_decl_file
+  .byte  11                      # DW_FORM_data1
+  .byte  59                      # DW_AT_decl_line
+  .byte  11                      # DW_FORM_data1
+  .byte  73                      # DW_AT_type
+  .byte  19                      # DW_FORM_ref4
+  .byte  0                       # EOM(1)
+  .byte  0                       # EOM(2)
+  .byte  0                       # EOM(3)
+
+.section  .debug_info,"",@progbits
+.Lcu_begin0:
+  .long  70                      # Length of Unit
+  .short  5                      # DWARF version number
+  .byte  1                       # DWARF Unit Type
+  .byte  8                       # Address Size (in bytes)
+  .long  .debug_abbrev           # Offset Into Abbrev. Section
+  .byte  1                       # Abbrev [1] 0xc:0xef DW_TAG_compile_unit
+  .byte  0                       # DW_AT_producer
+  .short  4                      # DW_AT_language
+  .byte  1                       # DW_AT_name
+  .long  .Lstr_offsets_base0     # DW_AT_str_offsets_base
+  .long  .Lline_table_start0     # DW_AT_stmt_list
+  .byte  2                       # DW_AT_comp_dir
+  .quad  0x10                    # DW_AT_low_pc
+  .long  0                       # DW_AT_high_pc
+  .long  .Lloclists_table_base0  # DW_AT_loclists_base
+  .byte  2                       # Abbrev [2] 0x2a:0x20 DW_TAG_subprogram
+  .quad  0                       # DW_AT_low_pc
+  .long  0                       # DW_AT_high_pc
+  .byte  1                       # DW_AT_frame_base
+  .byte  86
+  .byte  11                      # DW_AT_linkage_name
+  .byte  12                      # DW_AT_name
+  .byte  1                       # DW_AT_decl_file
+  .byte  6                       # DW_AT_decl_line
+                                 # DW_AT_external
+  .byte 3                        # Abbrev [3] 0x40:0xb DW_TAG_variable
+  .long .Ldebug_loc0             # DW_AT_location
+  .byte 7                        # DW_AT_name
+  .byte 1                        # DW_AT_decl_file
+  .byte 6                        # DW_AT_decl_line
+  .long 76                       # DW_AT_type
+  .byte 0                        # End Of Children Mark
+  .byte 0                        # End Of Children Mark
+  .byte  0                       # End Of Children Mark
+
+.section .debug_line,"",@progbits
+.Lline_table_start0:
+
diff --git a/test/DebugInfo/X86/dwarfdump-header.s b/test/DebugInfo/X86/dwarfdump-header.s
index 7daba5f69612a02536b7d2ae707cff7b9eb4a1ec..daf03614f9daa0fa2ca66a368840cc924cd3fda3 100644
--- a/test/DebugInfo/X86/dwarfdump-header.s
+++ b/test/DebugInfo/X86/dwarfdump-header.s
@@ -152,35 +152,13 @@ CU_split_5_end:
 # CHECK-NEXT: DW_AT_producer {{.*}} "Handmade DWO producer"
 # CHECK-NEXT: DW_AT_name {{.*}} "V5_dwo_compile_unit"
 
-        .section .debug_types,"",@progbits
-# CHECK-LABEL: .debug_types contents:
-
-# DWARF v4 Type unit header. Normal/split are identical so we do only one.
-TU_4_start:
-        .long  TU_4_end-TU_4_version  # Length of Unit
-TU_4_version:
-        .short 4               # DWARF version number
-        .long .debug_abbrev    # Offset Into Abbrev. Section
-        .byte 8                # Address Size (in bytes)
-        .quad 0x0011223344556677 # Type Signature
-        .long TU_4_type-TU_4_start # Type offset
-# The type-unit DIE, which has a name.
-        .byte 2
-        .long str_TU_4
-# The type DIE, which has a name.
-TU_4_type:
-        .byte 3
-        .long str_TU_4
-        .byte 0 # NULL
-        .byte 0 # NULL
-TU_4_end:
-
-# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
-# CHECK: 0x00000017: DW_TAG_type_unit
+# Now a DWARF v5 type unit, which goes in a .debug_info.dwo comdat.
+# Note there will not be another ".debug_info.dwo contents:" line, even though
+# there is a separate ELF section header; it's dumped along with the previous
+# unit as if they were in a single section.
 
-        .section .debug_types.dwo,"",@progbits
-# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
-# CHECK: .debug_types.dwo contents:
+        .section .debug_info.dwo,"G",@progbits,5555,comdat
+# CHECK-NOT: .debug_info.dwo
 
 # DWARF v5 split type unit header.
 TU_split_5_start:
@@ -206,6 +184,32 @@ TU_split_5_end:
 # CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
 # CHECK: 0x00000018: DW_TAG_type_unit
 
+        .section .debug_types,"",@progbits
+# CHECK-LABEL: .debug_types contents:
+
+# DWARF v4 Type unit header. Normal/split are identical so we do only one.
+TU_4_start:
+        .long  TU_4_end-TU_4_version  # Length of Unit
+TU_4_version:
+        .short 4               # DWARF version number
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+        .byte 8                # Address Size (in bytes)
+        .quad 0x0011223344556677 # Type Signature
+        .long TU_4_type-TU_4_start # Type offset
+# The type-unit DIE, which has a name.
+        .byte 2
+        .long str_TU_4
+# The type DIE, which has a name.
+TU_4_type:
+        .byte 3
+        .long str_TU_4
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_4_end:
+
+# CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
+# CHECK: 0x00000017: DW_TAG_type_unit
+
         .section .debug_line,"",@progbits
 # CHECK-LABEL: .debug_line contents:
 
diff --git a/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s b/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s
index 2bb46707cc85e7c6c5cb53d436aa5ce90cc1e507..d779dac1a41f069d31eb2260653128c9e43e5d2d 100644
--- a/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s
+++ b/test/DebugInfo/X86/dwarfdump-ranges-unrelocated.s
@@ -21,6 +21,16 @@
 # BRIEF-NEXT:  [0x0000000000000000, 0x0000000000000002)
 # BRIEF-NEXT:  [0x0000000000000000, 0x0000000000000003))
 
+# RUN: llvm-dwarfdump -diff %t | FileCheck %s --check-prefix=DIFF
+# DIFF: DW_TAG_compile_unit
+# DIFF-NEXT: DW_AT_producer	()
+# DIFF-NEXT: DW_AT_language	(DW_LANG_C_plus_plus)
+# DIFF-NEXT: DW_AT_name	()
+# DIFF-NEXT: DW_AT_stmt_list	()
+# DIFF-NEXT: DW_AT_comp_dir	()
+# DIFF-NEXT: DW_AT_low_pc	()
+# DIFF-NEXT: DW_AT_ranges	()
+
 ## Asm code for testcase is a reduced and modified output from next
 ## invocation and source:
 # clang test.cpp -S -o test.s -gmlt -ffunction-sections
diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets.s b/test/DebugInfo/X86/dwarfdump-str-offsets.s
index e68f08b9c7a25d9a9da09d52b9959e6def1272d2..2f4215a04ba94b8ec6a863f8615310341a0a33e4 100644
--- a/test/DebugInfo/X86/dwarfdump-str-offsets.s
+++ b/test/DebugInfo/X86/dwarfdump-str-offsets.s
@@ -1,5 +1,6 @@
 # RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
 # RUN: llvm-dwarfdump -v %t.o 2> %t.err | FileCheck --check-prefix=COMMON --check-prefix=SPLIT %s
+# RUN: llvm-dwarfdump -verify %t.o | FileCheck --check-prefix=VERIFY %s
 # 
 # Check that we don't report an error on a non-existent range list table.
 # RUN: FileCheck -allow-empty --check-prefix ERR %s < %t.err
@@ -136,6 +137,8 @@ dwo_str_TU_5_type:
         .byte 0x00  # DW_CHILDREN_no
         .byte 0x03  # DW_AT_name
         .byte 0x26  # DW_FORM_strx2
+        .byte 0x49  # DW_AT_type
+        .byte 0x13  # DW_FORM_ref4
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x06  # Abbrev code
@@ -143,6 +146,8 @@ dwo_str_TU_5_type:
         .byte 0x00  # DW_CHILDREN_no
         .byte 0x03  # DW_AT_name
         .byte 0x27  # DW_FORM_strx3
+        .byte 0x49  # DW_AT_type
+        .byte 0x13  # DW_FORM_ref4
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x07  # Abbrev code
@@ -150,6 +155,15 @@ dwo_str_TU_5_type:
         .byte 0x00  # DW_CHILDREN_no
         .byte 0x03  # DW_AT_name
         .byte 0x28  # DW_FORM_strx4
+        .byte 0x49  # DW_AT_type
+        .byte 0x13  # DW_FORM_ref4
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x08  # Abbrev code
+        .byte 0x24  # DW_TAG_base_type
+        .byte 0x00  # DW_CHILDREN_no
+        .byte 0x3e  # DW_AT_encoding
+        .byte 0x0b  # DW_FORM_data1
         .byte 0x00  # EOM(1)
         .byte 0x00  # EOM(2)
         .byte 0x00  # EOM(3)
@@ -202,17 +216,24 @@ CU1_5_version:
 # A subprogram DIE with DW_AT_name, using DW_FORM_strx1.
         .byte 4                # Abbreviation code
         .byte 3                # Subprogram name string (DW_FORM_strx1)
-# A variable DIE with DW_AT_name, using DW_FORM_strx2.
+# A variable DIE with DW_AT_name, using DW_FORM_strx2, and DW_AT_type.
         .byte 5                # Abbreviation code
         .short 0x0004          # Subprogram name string (DW_FORM_strx2)
-# A variable DIE with DW_AT_name, using DW_FORM_strx3.
+        .long TypeDie-.debug_info
+# A variable DIE with DW_AT_name, using DW_FORM_strx3, and DW_AT_type.
         .byte 6                # Abbreviation code
         .byte 5                # Subprogram name string (DW_FORM_strx3)
         .short 0               # Subprogram name string (DW_FORM_strx3)
-# A variable DIE with DW_AT_name, using DW_FORM_strx4.
+        .long TypeDie-.debug_info
+# A variable DIE with DW_AT_name, using DW_FORM_strx4, and DW_AT_type.
         .byte 7                # Abbreviation code
-        .quad 0x00000006       # Subprogram name string (DW_FORM_strx4)
+        .long 6                # Subprogram name string (DW_FORM_strx4)
+        .long TypeDie-.debug_info
         .byte 0 # NULL
+# A base type DIE with DW_AT_encoding.
+TypeDie:
+        .byte 8                # Abbreviation code
+        .byte 5                # DW_ATE_signed
         .byte 0 # NULL
         .byte 0 # NULL
 CU1_5_end:
@@ -386,4 +407,6 @@ TU_split_5_end:
 # SPLIT-NEXT:  0x00000014: 00000047 "V5_split_type_unit"
 # SPLIT-NEXT:  0x00000018: 0000005a "V5_split_Mystruct"
 
+# VERIFY: No errors.
+
 # ERR-NOT: parsing a range list table:
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index 3ea5aa2e350fa0063ee925881e90ddb0604bce3a..5883d2b0c4087fb82b673ca180e104ad16fce534 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -11,6 +11,7 @@
 ; CHECK-NEXT: DW_AT_GNU_dwo_name
 ; CHECK-NEXT: DW_AT_comp_dir
 ; CHECK-NEXT: DW_AT_GNU_dwo_id
+; CHECK-NEXT: DW_AT_GNU_ranges_base
 ; CHECK-NEXT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]                   (0x00000000)
 
 ; CHECK: .debug_info.dwo contents:
@@ -55,11 +56,14 @@
 ; V5RNGLISTS-NOT:  DW_TAG
 ; V5RNGLISTS:      DW_AT_rnglists_base [DW_FORM_sec_offset]  (0x0000000c)
 ; V5RNGLISTS:      .debug_rnglists contents:
-; V5RNGLISTS-NEXT: 0x00000000: range list header: length = 0x00000015, version = 0x0005,
-; V5RNGLISTS-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+; V5RNGLISTS-NEXT: 0x00000000: range list header: length = 0x00000019, version = 0x0005,
+; V5RNGLISTS-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
+; V5RNGLISTS-NEXT: offsets: [
+; V5RNGLISTS-NEXT: => 0x00000010
+; V5RNGLISTS-NEXT: ]
 ; V5RNGLISTS-NEXT: ranges:
-; V5RNGLISTS-NEXT: 0x0000000c: [DW_RLE_offset_pair]:
-; V5RNGLISTS-NEXT: 0x0000000f: [DW_RLE_offset_pair]:
+; V5RNGLISTS-NEXT: 0x00000010: [DW_RLE_offset_pair]:
+; V5RNGLISTS-NEXT: 0x00000013: [DW_RLE_offset_pair]:
 ; V5RNGLISTS:      0x{{[0-9a-f]+}}: [DW_RLE_end_of_list]
 
 ; From the code:
diff --git a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
index c6d743171a5718c55de7a9e8cf0104cad3d112a5..92fc740b77ea2a30e54d8019c09817a2f5d1dc10 100644
--- a/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
+++ b/test/DebugInfo/X86/live-debug-vars-discard-invalid.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -start-before greedy -stop-after virtregrewriter -o - %s | FileCheck %s
+# FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39481.
+# RUN: llc -mtriple=x86_64-linux-gnu -start-before greedy -stop-after virtregrewriter -o - -verify-machineinstrs=0 %s | FileCheck %s
 
 --- |
   ; ModuleID = '<stdin>'
@@ -64,7 +65,7 @@ body:             |
 
   bb.1:
     ; This DBG_VALUE will be discarded (use before def of %0).
-    DBG_VALUE debug-use %0, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %0, $noreg, !18, !DIExpression(), debug-location !25
     %0:gr64 = IMPLICIT_DEF
     %0:gr64 = IMPLICIT_DEF
     %0:gr64 = IMPLICIT_DEF
@@ -72,32 +73,32 @@ body:             |
 
   bb.2:
     ; This DBG_VALUE will be discarded (%1 is defined earlier, but it is not live in, so we do not know where %1 is stored).
-    DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %1, $noreg, !18, !DIExpression(), debug-location !25
     %1:gr64 = IMPLICIT_DEF
     %1:gr64 = IMPLICIT_DEF
     %1:gr64 = IMPLICIT_DEF
     %1:gr64 = IMPLICIT_DEF
     ; This DBG_VALUE is kept, even if %1 is dead, it was defined in the prev instruction,
     ; so the value should be available for as long as the register allocated to %1 is live.
-    DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %1, $noreg, !18, !DIExpression(), debug-location !25
 
   bb.3:
     %1:gr64 = IMPLICIT_DEF
-    DBG_VALUE 0, debug-use $noreg, !23, !DIExpression(), debug-location !25
+    DBG_VALUE 0, $noreg, !23, !DIExpression(), debug-location !25
     ; This DBG_VALUE is kept, even if %1 is dead, it was defined in the prev non-dbg instruction,
     ; so the value should be available for as long as the register allocated to %1 is live.
-    DBG_VALUE debug-use %1, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %1, $noreg, !18, !DIExpression(), debug-location !25
 
   bb.4:
     ; All DBG_VALUEs here should survive. %2 is livein as it was defined in bb.0, and it has use/def in the BTS64rr instruction.
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
     %2:gr64 = BTS64rr %2, 0, implicit-def $eflags
-    DBG_VALUE 0, debug-use $noreg, !23, !DIExpression(), debug-location !25
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE 0, $noreg, !23, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
     %2:gr64 = BTS64rr %2, 0, implicit-def $eflags
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
     %2:gr64 = BTS64rr %2, 0, implicit-def $eflags
-    DBG_VALUE debug-use %2, debug-use $noreg, !18, !DIExpression(), debug-location !25
+    DBG_VALUE %2, $noreg, !18, !DIExpression(), debug-location !25
 
   bb.5:
     RET 0, debug-location !32
@@ -106,29 +107,29 @@ body:             |
 # CHECK-LABEL: name: foobar
 
 # CHECK-LABEL: bb.1:
-# CHECK:        DBG_VALUE debug-use $noreg
+# CHECK:        DBG_VALUE $noreg
 
 # CHECK-LABEL: bb.2:
-# CHECK:        DBG_VALUE debug-use $noreg
+# CHECK:        DBG_VALUE $noreg
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
 # CHECK-NEXT:   dead renamable $rcx = IMPLICIT_DEF
-# CHECK-NEXT:   DBG_VALUE debug-use $rcx, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rcx, $noreg, !18, !DIExpression()
 
 # CHECK-LABEL: bb.3:
 # CHECK:        dead renamable $rcx = IMPLICIT_DEF
-# CHECK-NEXT:   DBG_VALUE 0, debug-use $noreg, !23, !DIExpression()
-# CHECK-NEXT:   DBG_VALUE debug-use $rcx, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE 0, $noreg, !23, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rcx, $noreg, !18, !DIExpression()
 
 # CHECK-LABEL: bb.4:
 # CHECK:        liveins: $rax
-# CHECK:        DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK:        DBG_VALUE $rax, $noreg, !18, !DIExpression()
 # CHECK-NEXT:   renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
-# CHECK-NEXT:   DBG_VALUE 0, debug-use $noreg, !23, !DIExpression()
-# CHECK-NEXT:   DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE 0, $noreg, !23, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rax, $noreg, !18, !DIExpression()
 # CHECK-NEXT:   renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
-# CHECK-NEXT:   DBG_VALUE debug-use $rax, debug-use $noreg, !18, !DIExpression()
+# CHECK-NEXT:   DBG_VALUE $rax, $noreg, !18, !DIExpression()
 # CHECK-NEXT:   dead renamable $rax = BTS64rr killed renamable $rax, 0, implicit-def $eflags
 
 # CHECK-LABEL: bb.5:
diff --git a/test/DebugInfo/X86/live-debug-vars-dse.mir b/test/DebugInfo/X86/live-debug-vars-dse.mir
index bf6c71fa0ff762ab9034c2c47900673232478358..3a82c9d377b6e5bd1b4ee349e1b7d9266e58c59e 100644
--- a/test/DebugInfo/X86/live-debug-vars-dse.mir
+++ b/test/DebugInfo/X86/live-debug-vars-dse.mir
@@ -134,7 +134,7 @@ body:             |
     $rcx = COPY %1, debug-location !15
     CALL64pcrel32 @escape, csr_win64, implicit $rsp, implicit $ssp, implicit $rcx, implicit-def $rsp, implicit-def $ssp, debug-location !15
     ADJCALLSTACKUP64 32, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp, debug-location !15
-    DBG_VALUE 1, debug-use _, !13, !DIExpression(), debug-location !16
+    DBG_VALUE 1, _, !13, !DIExpression(), debug-location !16
     MOV32mi $rip, 1, _, @global, _, 1, debug-location !17 :: (store 4 into @global)
     DBG_VALUE %stack.0.x.addr, 0, !13, !DIExpression(), debug-location !18
     MOV32mi %stack.0.x.addr, 1, _, 0, _, 2, debug-location !18 :: (store 4 into %ir.x.addr)
diff --git a/test/DebugInfo/X86/live-debug-vars-index.mir b/test/DebugInfo/X86/live-debug-vars-index.mir
index 1a38a101d642a36aa41c5e44b59e9d62f6ea5a71..c4ba495051776af66a0ce41551b7f2b01ecb6f3f 100644
--- a/test/DebugInfo/X86/live-debug-vars-index.mir
+++ b/test/DebugInfo/X86/live-debug-vars-index.mir
@@ -40,14 +40,14 @@ tracksRegLiveness: true
 body:             |
   bb.0:
 
-    DBG_VALUE debug-use $esi, debug-use $noreg, !13, !DIExpression(), debug-location !11
+    DBG_VALUE $esi, $noreg, !13, !DIExpression(), debug-location !11
     DBG_LABEL !8, debug-location !9
-    DBG_VALUE debug-use $edi, debug-use $noreg, !10, !DIExpression(), debug-location !11
+    DBG_VALUE $edi, $noreg, !10, !DIExpression(), debug-location !11
     RET 0, undef $eax, debug-location !12
 ...
 
 # CHECK-LABEL: name:            foo
 # CHECK: bb.0:
 # CHECK-DAG: DBG_LABEL
-# CHECK-DAG: DBG_VALUE debug-use $esi
-# CHECK-DAG: DBG_VALUE debug-use $edi
+# CHECK-DAG: DBG_VALUE $esi
+# CHECK-DAG: DBG_VALUE $edi
diff --git a/test/DebugInfo/X86/loclists-dwp.ll b/test/DebugInfo/X86/loclists-dwp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a5ce92270d6da0b6082993026490e759e20ee7e7
--- /dev/null
+++ b/test/DebugInfo/X86/loclists-dwp.ll
@@ -0,0 +1,62 @@
+; RUN: llc -split-dwarf-file=%t1.dwo -filetype=obj -o %t1.o < %s
+; RUN: llc -split-dwarf-file=%t2.dwo -filetype=obj -o %t2.o < %p/../Inputs/loclists-dwp-b.ll 
+; RUN: llvm-dwp %t1.o %t2.o -o %t.dwp
+; RUN: llvm-dwarfdump -v %t.dwp | FileCheck %s
+
+; Make sure that 2 location lists from different units within a dwp file are 
+; dumped correctly. The 2 location lists differ in the length of their address
+; ranges.
+; 
+; Generate both .ll files with clang -S -emit-llvm from the following sources:
+; a.cpp:
+; void y();
+; void a(int i) {
+;   y();
+;   asm("" : : : "rdi");
+; }
+;
+; b.cpp:
+; void b(int i) { asm("" : : : "rdi"); }
+
+; CHECK:      DW_AT_location [DW_FORM_sec_offset]   (0x00000000
+; CHECK-NEXT: Addr idx 0 (w/ length 6): DW_OP_reg5 RDI)
+
+; CHECK:      DW_AT_location [DW_FORM_sec_offset]   (0x00000000
+; CHECK-NEXT: Addr idx 0 (w/ length 0): DW_OP_reg5 RDI)
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define dso_local void @_Z1ai(i32 %i) local_unnamed_addr !dbg !7 {
+entry:
+  call void @llvm.dbg.value(metadata i32 %i, metadata !12, metadata !DIExpression()), !dbg !13
+  tail call void @_Z1yv(), !dbg !14
+  tail call void asm sideeffect "", "~{rdi},~{dirflag},~{fpsr},~{flags}"(), !dbg !15, !srcloc !16
+  ret void, !dbg !17
+}
+
+declare dso_local void @_Z1yv() local_unnamed_addr
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "a.cpp", directory: "/home/test/PRs/PR38990")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (https://git.llvm.org/git/clang.git/ 41055c6168135fe539801799e5c5636247cf0302) (https://git.llvm.org/git/llvm.git/ de0558be123ffbb5b5bd692c17dbd57a75fe684f)"}
+!7 = distinct !DISubprogram(name: "a", linkageName: "_Z1ai", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+!12 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+!13 = !DILocation(line: 2, column: 12, scope: !7)
+!14 = !DILocation(line: 3, column: 3, scope: !7)
+!15 = !DILocation(line: 4, column: 3, scope: !7)
+!16 = !{i32 41}
+!17 = !DILocation(line: 5, column: 1, scope: !7)
diff --git a/test/DebugInfo/X86/parameters.ll b/test/DebugInfo/X86/parameters.ll
index 7a5b852bde25d43134fdeb89bf5a07342c95ef5c..ed0048cc15e0393bdf64b301e216c31102789d92 100644
--- a/test/DebugInfo/X86/parameters.ll
+++ b/test/DebugInfo/X86/parameters.ll
@@ -1,7 +1,6 @@
 ; REQUIRES: object-emission
 ;
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj < %s > %t
-; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj %s -o - | llvm-dwarfdump -v -debug-info - | FileCheck %s
 
 ; Test case derived from compiling the following source with clang -g:
 ;
diff --git a/test/DebugInfo/X86/pieces-1.ll b/test/DebugInfo/X86/pieces-1.ll
index f91aec19f380f05771a02e53a9773958f6ede92d..02b45d11fdc92c791494bd957058739b204a6242 100644
--- a/test/DebugInfo/X86/pieces-1.ll
+++ b/test/DebugInfo/X86/pieces-1.ll
@@ -1,5 +1,4 @@
-; RUN: llc -O0 %s -filetype=obj -o %t.o
-; RUN: llvm-dwarfdump -debug-loc %t.o | FileCheck %s
+; RUN: llc -O0 %s -filetype=obj -o - | llvm-dwarfdump -debug-loc - | FileCheck %s
 ;
 ; rdar://problem/15928306
 ;
diff --git a/test/DebugInfo/X86/pr19307.ll b/test/DebugInfo/X86/pr19307.ll
deleted file mode 100644
index 90bbefaf3a7bd7d41f8dd9d5d808f43eb277a04c..0000000000000000000000000000000000000000
--- a/test/DebugInfo/X86/pr19307.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-; Generated from the source file pr19307.cc:
-; #include <string>
-; void parse_range(unsigned long long &offset, unsigned long long &limit,
-;                  std::string range) {
-;   if (range.compare(0, 6, "items=") != 0 || range[6] == '-')
-;     offset = 1;
-;   range.erase(0, 6);
-;   limit = 2;
-; }
-; with "clang++ -S -emit-llvm -O0 -g pr19307.cc"
-
-; Location of "range" string is spilled from %rdx to stack and is
-; addressed via %rbp.
-; CHECK: movq %rdx, {{[-0-9]+}}(%rbp)
-; CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]:
-; This location should be valid until the end of the function.
-
-; Verify that we have proper range in debug_loc section:
-; CHECK: .Ldebug_loc{{[0-9]+}}:
-; CHECK: DW_OP_breg1
-; CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
-; CHECK-NEXT: .quad .Lfunc_end0-.Lfunc_begin0
-; CHECK: DW_OP_breg6
-; CHECK: DW_OP_deref
-
-; ModuleID = 'pr19307.cc'
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
-%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
-
-@.str = private unnamed_addr constant [7 x i8] c"items=\00", align 1
-
-; Function Attrs: uwtable
-define void @_Z11parse_rangeRyS_Ss(i64* %offset, i64* %limit, %"class.std::basic_string"* %range) #0 !dbg !13 {
-entry:
-  %offset.addr = alloca i64*, align 8
-  %limit.addr = alloca i64*, align 8
-  store i64* %offset, i64** %offset.addr, align 8
-  call void @llvm.dbg.declare(metadata i64** %offset.addr, metadata !45, metadata !DIExpression()), !dbg !46
-  store i64* %limit, i64** %limit.addr, align 8
-  call void @llvm.dbg.declare(metadata i64** %limit.addr, metadata !47, metadata !DIExpression()), !dbg !46
-  call void @llvm.dbg.declare(metadata %"class.std::basic_string"* %range, metadata !48, metadata !DIExpression(DW_OP_deref)), !dbg !49
-  %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0)), !dbg !50
-  %cmp = icmp ne i32 %call, 0, !dbg !50
-  br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !50
-
-lor.lhs.false:                                    ; preds = %entry
-  %call1 = call i8* @_ZNSsixEm(%"class.std::basic_string"* %range, i64 6), !dbg !52
-  %0 = load i8, i8* %call1, !dbg !52
-  %conv = sext i8 %0 to i32, !dbg !52
-  %cmp2 = icmp eq i32 %conv, 45, !dbg !52
-  br i1 %cmp2, label %if.then, label %if.end, !dbg !52
-
-if.then:                                          ; preds = %lor.lhs.false, %entry
-  %1 = load i64*, i64** %offset.addr, align 8, !dbg !54
-  store i64 1, i64* %1, align 8, !dbg !54
-  br label %if.end, !dbg !54
-
-if.end:                                           ; preds = %if.then, %lor.lhs.false
-  %call3 = call %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"* %range, i64 0, i64 6), !dbg !55
-  %2 = load i64*, i64** %limit.addr, align 8, !dbg !56
-  store i64 2, i64* %2, align 8, !dbg !56
-  ret void, !dbg !57
-}
-
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
-declare i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"*, i64, i64, i8*) #2
-
-declare i8* @_ZNSsixEm(%"class.std::basic_string"*, i64) #2
-
-declare %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"*, i64, i64) #2
-
-attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!42, !43}
-!llvm.ident = !{!44}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 (209308)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !3, globals: !2, imports: !21)
-!1 = !DIFile(filename: "pr19307.cc", directory: "/llvm_cmake_gcc")
-!2 = !{}
-!3 = !{!4, !6, !8}
-!4 = !DICompositeType(tag: DW_TAG_structure_type, line: 83, flags: DIFlagFwdDecl, file: !5, identifier: "_ZTS11__mbstate_t")
-!5 = !DIFile(filename: "/usr/include/wchar.h", directory: "/llvm_cmake_gcc")
-!6 = !DICompositeType(tag: DW_TAG_structure_type, name: "lconv", line: 54, flags: DIFlagFwdDecl, file: !7, identifier: "_ZTS5lconv")
-!7 = !DIFile(filename: "/usr/include/locale.h", directory: "/llvm_cmake_gcc")
-!8 = !DICompositeType(tag: DW_TAG_class_type, name: "basic_string<char, std::char_traits<char>, std::allocator<char> >", line: 1134, flags: DIFlagFwdDecl, file: !9, scope: !10, identifier: "_ZTSSs")
-!9 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", directory: "/llvm_cmake_gcc")
-!10 = !DINamespace(name: "std", scope: null)
-!11 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", directory: "/llvm_cmake_gcc")
-!13 = distinct !DISubprogram(name: "parse_range", linkageName: "_Z11parse_rangeRyS_Ss", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 4, file: !1, scope: !14, type: !15, retainedNodes: !2)
-!14 = !DIFile(filename: "pr19307.cc", directory: "/llvm_cmake_gcc")
-!15 = !DISubroutineType(types: !16)
-!16 = !{null, !17, !17, !19}
-!17 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !18)
-!18 = !DIBasicType(tag: DW_TAG_base_type, name: "long long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
-!19 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", line: 65, file: !20, scope: !10, baseType: !8)
-!20 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc")
-!21 = !{!22, !26, !29, !33, !38, !41}
-!22 = !DIImportedEntity(tag: DW_TAG_imported_module, file: !1, line: 57, scope: !23, entity: !25)
-!23 = !DINamespace(name: "__gnu_debug", scope: null)
-!24 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/debug/debug.h", directory: "/llvm_cmake_gcc")
-!25 = !DINamespace(name: "__debug", scope: !10)
-!26 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 66, scope: !10, entity: !27)
-!27 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", line: 106, file: !5, baseType: !28)
-!28 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", line: 95, file: !5, baseType: !4)
-!29 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 141, scope: !10, entity: !30)
-!30 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", line: 141, file: !31, baseType: !32)
-!31 = !DIFile(filename: "/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc")
-!32 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
-!33 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 42, scope: !34, entity: !36)
-!34 = !DINamespace(name: "__gnu_cxx", scope: null)
-!35 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/cpp_type_traits.h", directory: "/llvm_cmake_gcc")
-!36 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", line: 155, file: !11, scope: !10, baseType: !37)
-!37 = !DIBasicType(tag: DW_TAG_base_type, name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
-!38 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 43, scope: !34, entity: !39)
-!39 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", line: 156, file: !11, scope: !10, baseType: !40)
-!40 = !DIBasicType(tag: DW_TAG_base_type, name: "long int", size: 64, align: 64, encoding: DW_ATE_signed)
-!41 = !DIImportedEntity(tag: DW_TAG_imported_declaration, file: !1, line: 55, scope: !10, entity: !6)
-!42 = !{i32 2, !"Dwarf Version", i32 4}
-!43 = !{i32 2, !"Debug Info Version", i32 3}
-!44 = !{!"clang version 3.5.0 (209308)"}
-!45 = !DILocalVariable(name: "offset", line: 3, arg: 1, scope: !13, file: !14, type: !17)
-!46 = !DILocation(line: 3, scope: !13)
-!47 = !DILocalVariable(name: "limit", line: 3, arg: 2, scope: !13, file: !14, type: !17)
-!48 = !DILocalVariable(name: "range", line: 4, arg: 3, scope: !13, file: !14, type: !19)
-!49 = !DILocation(line: 4, scope: !13)
-!50 = !DILocation(line: 5, scope: !51)
-!51 = distinct !DILexicalBlock(line: 5, column: 0, file: !1, scope: !13)
-!52 = !DILocation(line: 5, scope: !53)
-!53 = distinct !DILexicalBlock(line: 5, column: 0, file: !1, scope: !51)
-!54 = !DILocation(line: 6, scope: !51)
-!55 = !DILocation(line: 7, scope: !13)
-!56 = !DILocation(line: 8, scope: !13)
-!57 = !DILocation(line: 9, scope: !13)
-
diff --git a/test/DebugInfo/X86/pr19307.mir b/test/DebugInfo/X86/pr19307.mir
new file mode 100644
index 0000000000000000000000000000000000000000..b8380b703a95627ae533a9d900e9e45d979d50d0
--- /dev/null
+++ b/test/DebugInfo/X86/pr19307.mir
@@ -0,0 +1,224 @@
+# RUN: llc -o - %s -start-after=patchable-function -O0 | FileCheck %s
+
+# Generated from the source file pr19307.cc:
+# #include <string>
+# void parse_range(unsigned long long &offset, unsigned long long &limit,
+#                  std::string range) {
+#   if (range.compare(0, 6, "items=") != 0 || range[6] == '-')
+#     offset = 1;
+#   range.erase(0, 6);
+#   limit = 2;
+# }
+# with "clang++ -S -emit-llvm -O0 -g pr19307.cc"
+#
+# Location of "range" string is spilled from %rdx to stack and is
+# addressed via %rbp.
+# CHECK: movq %rdx, {{[-0-9]+}}(%rbp)
+# CHECK-NEXT: [[START_LABEL:.Ltmp[0-9]+]]:
+# This location should be valid until the end of the function.
+#
+# Verify that we have proper range in debug_loc section:
+# CHECK: .Ldebug_loc{{[0-9]+}}:
+# CHECK: DW_OP_breg1
+# CHECK:      .quad [[START_LABEL]]-.Lfunc_begin0
+# CHECK-NEXT: .quad .Lfunc_end0-.Lfunc_begin0
+# CHECK: DW_OP_breg6
+# CHECK: DW_OP_deref
+--- |
+  target triple = "x86_64-unknown-linux-gnu"
+  
+  %"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
+  %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+  
+  @.str = private unnamed_addr constant [7 x i8] c"items=\00", align 1
+  
+  ; Function Attrs: uwtable
+  define void @_Z11parse_rangeRyS_Ss(i64* %offset, i64* %limit, %"class.std::basic_string"* %range) #0 !dbg !34 {
+  entry:
+    %offset.addr = alloca i64*, align 8
+    %limit.addr = alloca i64*, align 8
+    store i64* %offset, i64** %offset.addr, align 8
+    call void @llvm.dbg.declare(metadata i64** %offset.addr, metadata !41, metadata !DIExpression()), !dbg !42
+    store i64* %limit, i64** %limit.addr, align 8
+    call void @llvm.dbg.declare(metadata i64** %limit.addr, metadata !43, metadata !DIExpression()), !dbg !42
+    call void @llvm.dbg.declare(metadata %"class.std::basic_string"* %range, metadata !44, metadata !DIExpression(DW_OP_deref)), !dbg !45
+    %call = call i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"* %range, i64 0, i64 6, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0)), !dbg !46
+    %cmp = icmp ne i32 %call, 0, !dbg !46
+    br i1 %cmp, label %if.then, label %lor.lhs.false, !dbg !46
+  
+  lor.lhs.false:                                    ; preds = %entry
+    %call1 = call i8* @_ZNSsixEm(%"class.std::basic_string"* %range, i64 6), !dbg !48
+    %0 = load i8, i8* %call1, !dbg !48
+    %conv = sext i8 %0 to i32, !dbg !48
+    %cmp2 = icmp eq i32 %conv, 45, !dbg !48
+    br i1 %cmp2, label %if.then, label %if.end, !dbg !48
+  
+  if.then:                                          ; preds = %lor.lhs.false, %entry
+    %1 = load i64*, i64** %offset.addr, align 8, !dbg !50
+    store i64 1, i64* %1, align 8, !dbg !50
+    br label %if.end, !dbg !50
+  
+  if.end:                                           ; preds = %if.then, %lor.lhs.false
+    %call3 = call %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"* %range, i64 0, i64 6), !dbg !51
+    %2 = load i64*, i64** %limit.addr, align 8, !dbg !52
+    store i64 2, i64* %2, align 8, !dbg !52
+    ret void, !dbg !53
+  }
+  
+  ; Function Attrs: nounwind readnone speculatable
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+  
+  declare i32 @_ZNKSs7compareEmmPKc(%"class.std::basic_string"*, i64, i64, i8*) #2
+  
+  declare i8* @_ZNSsixEm(%"class.std::basic_string"*, i64) #2
+  
+  declare %"class.std::basic_string"* @_ZNSs5eraseEmm(%"class.std::basic_string"*, i64, i64) #2
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #3
+  
+  attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind readnone speculatable }
+  attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #3 = { nounwind }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!31, !32}
+  !llvm.ident = !{!33}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5.0 (209308)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2, imports: !11)
+  !1 = !DIFile(filename: "pr19307.cc", directory: "/llvm_cmake_gcc")
+  !2 = !{}
+  !3 = !{!4, !6, !8}
+  !4 = !DICompositeType(tag: DW_TAG_structure_type, file: !5, line: 83, flags: DIFlagFwdDecl, identifier: "_ZTS11__mbstate_t")
+  !5 = !DIFile(filename: "/usr/include/wchar.h", directory: "/llvm_cmake_gcc")
+  !6 = !DICompositeType(tag: DW_TAG_structure_type, name: "lconv", file: !7, line: 54, flags: DIFlagFwdDecl, identifier: "_ZTS5lconv")
+  !7 = !DIFile(filename: "/usr/include/locale.h", directory: "/llvm_cmake_gcc")
+  !8 = !DICompositeType(tag: DW_TAG_class_type, name: "basic_string<char, std::char_traits<char>, std::allocator<char> >", scope: !10, file: !9, line: 1134, flags: DIFlagFwdDecl, identifier: "_ZTSSs")
+  !9 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/basic_string.tcc", directory: "/llvm_cmake_gcc")
+  !10 = !DINamespace(name: "std", scope: null)
+  !11 = !{!12, !15, !18, !22, !27, !30}
+  !12 = !DIImportedEntity(tag: DW_TAG_imported_module, scope: !13, entity: !14, file: !1, line: 57)
+  !13 = !DINamespace(name: "__gnu_debug", scope: null)
+  !14 = !DINamespace(name: "__debug", scope: !10)
+  !15 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !10, entity: !16, file: !1, line: 66)
+  !16 = !DIDerivedType(tag: DW_TAG_typedef, name: "mbstate_t", file: !5, line: 106, baseType: !17)
+  !17 = !DIDerivedType(tag: DW_TAG_typedef, name: "__mbstate_t", file: !5, line: 95, baseType: !4)
+  !18 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !10, entity: !19, file: !1, line: 141)
+  !19 = !DIDerivedType(tag: DW_TAG_typedef, name: "wint_t", file: !20, line: 141, baseType: !21)
+  !20 = !DIFile(filename: "/llvm_cmake_gcc/bin/../lib/clang/3.5.0/include/stddef.h", directory: "/llvm_cmake_gcc")
+  !21 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+  !22 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !23, entity: !24, file: !1, line: 42)
+  !23 = !DINamespace(name: "__gnu_cxx", scope: null)
+  !24 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", scope: !10, file: !25, line: 155, baseType: !26)
+  !25 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/x86_64-linux-gnu/bits/c++config.h", directory: "/llvm_cmake_gcc")
+  !26 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+  !27 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !23, entity: !28, file: !1, line: 43)
+  !28 = !DIDerivedType(tag: DW_TAG_typedef, name: "ptrdiff_t", scope: !10, file: !25, line: 156, baseType: !29)
+  !29 = !DIBasicType(name: "long int", size: 64, align: 64, encoding: DW_ATE_signed)
+  !30 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !10, entity: !6, file: !1, line: 55)
+  !31 = !{i32 2, !"Dwarf Version", i32 4}
+  !32 = !{i32 2, !"Debug Info Version", i32 3}
+  !33 = !{!"clang version 3.5.0 (209308)"}
+  !34 = distinct !DISubprogram(name: "parse_range", linkageName: "_Z11parse_rangeRyS_Ss", scope: !1, file: !1, line: 3, type: !35, isLocal: false, isDefinition: true, scopeLine: 4, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+  !35 = !DISubroutineType(types: !36)
+  !36 = !{null, !37, !37, !39}
+  !37 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !38)
+  !38 = !DIBasicType(name: "long long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned)
+  !39 = !DIDerivedType(tag: DW_TAG_typedef, name: "string", scope: !10, file: !40, line: 65, baseType: !8)
+  !40 = !DIFile(filename: "/usr/lib/gcc/x86_64-linux-gnu/4.6/../../../../include/c++/4.6/bits/stringfwd.h", directory: "/llvm_cmake_gcc")
+  !41 = !DILocalVariable(name: "offset", arg: 1, scope: !34, file: !1, line: 3, type: !37)
+  !42 = !DILocation(line: 3, scope: !34)
+  !43 = !DILocalVariable(name: "limit", arg: 2, scope: !34, file: !1, line: 3, type: !37)
+  !44 = !DILocalVariable(name: "range", arg: 3, scope: !34, file: !1, line: 4, type: !39)
+  !45 = !DILocation(line: 4, scope: !34)
+  !46 = !DILocation(line: 5, scope: !47)
+  !47 = distinct !DILexicalBlock(scope: !34, file: !1, line: 5)
+  !48 = !DILocation(line: 5, scope: !49)
+  !49 = distinct !DILexicalBlock(scope: !47, file: !1, line: 5)
+  !50 = !DILocation(line: 6, scope: !47)
+  !51 = !DILocation(line: 7, scope: !34)
+  !52 = !DILocation(line: 8, scope: !34)
+  !53 = !DILocation(line: 9, scope: !34)
+
+...
+---
+name:            _Z11parse_rangeRyS_Ss
+alignment:       4
+tracksRegLiveness: true
+liveins:         
+  - { reg: '$rdi' }
+  - { reg: '$rsi' }
+  - { reg: '$rdx' }
+frameInfo:       
+  stackSize:       40
+  offsetAdjustment: -32
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: 0 }
+stack:           
+  - { id: 0, name: offset.addr, offset: -24, size: 8, alignment: 8, stack-id: 0, 
+      debug-info-variable: '!41', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!42' }
+  - { id: 1, name: limit.addr, offset: -32, size: 8, alignment: 8, stack-id: 0, 
+      debug-info-variable: '!43', debug-info-expression: '!DIExpression()', 
+      debug-info-location: '!42' }
+  - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, stack-id: 0 }
+  - { id: 3, type: spill-slot, offset: -48, size: 8, alignment: 8, stack-id: 0 }
+body:             |
+  bb.0.entry:
+    liveins: $rdi, $rsi, $rdx
+  
+    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset $rbp, -16
+    $rbp = frame-setup MOV64rr $rsp
+    CFI_INSTRUCTION def_cfa_register $rbp
+    $rsp = frame-setup SUB64ri8 $rsp, 32, implicit-def dead $eflags
+    $eax = XOR32rr undef $eax, undef $eax, implicit-def $eflags, implicit-def $rax
+    MOV64mr $rbp, 1, $noreg, -8, $noreg, killed renamable $rdi :: (store 8 into %ir.offset.addr)
+    MOV64mr $rbp, 1, $noreg, -16, $noreg, killed renamable $rsi :: (store 8 into %ir.limit.addr)
+    DBG_VALUE renamable $rdx, 0, !44, !DIExpression(DW_OP_deref), debug-location !45
+    $rdi = MOV64rr $rdx, debug-location !46
+    $rsi = MOV64rr killed $rax, debug-location !46
+    $eax = MOV32ri 6, implicit-def $rax, debug-location !46
+    MOV64mr $rbp, 1, $noreg, -24, $noreg, killed $rdx :: (store 8 into %stack.2)
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    $rdx = MOV64rr killed $rax, debug-location !46
+    renamable $rcx = MOV64ri @.str, debug-location !46
+    CALL64pcrel32 @_ZNKSs7compareEmmPKc, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit killed $rdx, implicit killed $rcx, implicit-def $eax, debug-location !46
+    CMP32ri8 killed renamable $eax, 0, implicit-def $eflags, debug-location !46
+    JNE_1 %bb.2, implicit $eflags, debug-location !46
+  
+  bb.1.lor.lhs.false:
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    $rdi = MOV64rm $rbp, 1, $noreg, -24, $noreg :: (load 8 from %stack.2)
+    $esi = MOV32ri 6, implicit-def $rsi, debug-location !48
+    CALL64pcrel32 @_ZNSsixEm, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit-def $rax, debug-location !48
+    renamable $ecx = MOVSX32rm8 killed renamable $rax, 1, $noreg, 0, $noreg, debug-location !48 :: (load 1 from %ir.call1)
+    CMP32ri8 killed renamable $ecx, 45, implicit-def $eflags, debug-location !48
+    JNE_1 %bb.3, implicit $eflags, debug-location !48
+  
+  bb.2.if.then:
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    renamable $rax = MOV64rm $rbp, 1, $noreg, -8, $noreg, debug-location !50 :: (load 8 from %ir.offset.addr)
+    MOV64mi32 killed renamable $rax, 1, $noreg, 0, $noreg, 1, debug-location !50 :: (store 8 into %ir.1)
+  
+  bb.3.if.end:
+    DBG_VALUE $rbp, 0, !44, !DIExpression(DW_OP_constu, 24, DW_OP_minus, DW_OP_deref, DW_OP_deref), debug-location !45
+    $esi = XOR32rr undef $esi, undef $esi, implicit-def $eflags, implicit-def $rsi
+    $rdi = MOV64rm $rbp, 1, $noreg, -24, $noreg :: (load 8 from %stack.2)
+    $edx = MOV32ri 6, implicit-def $rdx, debug-location !51
+    CALL64pcrel32 @_ZNSs5eraseEmm, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit killed $rsi, implicit killed $rdx, implicit-def $rax, debug-location !51
+    renamable $rdx = MOV64rm $rbp, 1, $noreg, -16, $noreg, debug-location !52 :: (load 8 from %ir.limit.addr)
+    MOV64mi32 killed renamable $rdx, 1, $noreg, 0, $noreg, 2, debug-location !52 :: (store 8 into %ir.2)
+    MOV64mr $rbp, 1, $noreg, -32, $noreg, killed $rax :: (store 8 into %stack.3)
+    $rsp = frame-destroy ADD64ri8 $rsp, 32, implicit-def dead $eflags, debug-location !53
+    $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !53
+    CFI_INSTRUCTION def_cfa $rsp, 8, debug-location !53
+    RETQ debug-location !53
+
+...
diff --git a/test/DebugInfo/X86/pr34545.ll b/test/DebugInfo/X86/pr34545.ll
index 8d781157d92ad89401bca03457bbca722b9133d0..fe5d2a285f55ca3620e8ab19e04efcac34e724ba 100644
--- a/test/DebugInfo/X86/pr34545.ll
+++ b/test/DebugInfo/X86/pr34545.ll
@@ -1,13 +1,13 @@
 ; RUN: llc -O1 -filetype=asm -mtriple x86_64-unknown-linux-gnu -mcpu=x86-64 -o - %s -stop-after=livedebugvars | FileCheck %s
 
 ; CHECK: $eax = MOV32rm
-; CHECK: DBG_VALUE debug-use $eax
+; CHECK: DBG_VALUE $eax
 ; CHECK: $eax = SHL32rCL killed renamable $eax
-; CHECK: DBG_VALUE debug-use $eax
-; CHECK: DBG_VALUE debug-use $rsp, 0, !{{[0-9]+}}, !DIExpression(DW_OP_constu, 4, DW_OP_minus)
-; CHECK: DBG_VALUE debug-use $eax
+; CHECK: DBG_VALUE $eax
+; CHECK: DBG_VALUE $rsp, 0, !{{[0-9]+}}, !DIExpression(DW_OP_constu, 4, DW_OP_minus)
+; CHECK: DBG_VALUE $eax
 ; CHECK: $eax = SHL32rCL killed renamable $eax
-; CHECK: DBG_VALUE debug-use $eax
+; CHECK: DBG_VALUE $eax
 ; CHECK: RETQ $eax
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/DebugInfo/X86/range_reloc.ll b/test/DebugInfo/X86/range_reloc.ll
index d1479e232f696f0e3c0d033003a746937a7009bd..5c40cf31944eb15e6826f357431538304011ef37 100644
--- a/test/DebugInfo/X86/range_reloc.ll
+++ b/test/DebugInfo/X86/range_reloc.ll
@@ -16,8 +16,21 @@
 ; smaller (the growth of debug_ranges itself would be more significant).
 
 ; COMMON: {{^.Ldebug_ranges0}}
-; COMMON-NEXT:   .quad   .Lfunc_begin0
-; COMMON-NEXT:   .quad   .Lfunc_end0
+; NOBASE-NEXT:   .quad   .Lfunc_begin0
+; NOBASE-NEXT:   .quad   .Lfunc_end0
+; NOBASE-NEXT:   .quad   .Lfunc_begin1
+; NOBASE-NEXT:   .quad   .Lfunc_end1
+; NOBASE-NEXT:   .quad   .Lfunc_begin3
+; NOBASE-NEXT:   .quad   .Lfunc_end3
+; NOBASE-NEXT:   .quad   .Lfunc_begin4
+; NOBASE-NEXT:   .quad   .Lfunc_end4
+; NOBASE-NEXT:   .quad   .Lfunc_begin5
+; NOBASE-NEXT:   .quad   .Lfunc_end5
+
+; BASE-NEXT:   .quad   -1
+; BASE-NEXT:   .quad   .Lfunc_begin0
+; BASE-NEXT:   .quad   .Lfunc_begin0-.Lfunc_begin0
+; BASE-NEXT:   .quad   .Lfunc_end0-.Lfunc_begin0
 ; BASE-NEXT:   .quad   -1
 ; BASE-NEXT:   .quad   .Lfunc_begin1
 ; BASE-NEXT:   .quad   .Lfunc_begin1-.Lfunc_begin1
@@ -25,35 +38,33 @@
 ; BASE-NEXT:   .quad   .Lfunc_begin3-.Lfunc_begin1
 ; BASE-NEXT:   .quad   .Lfunc_end3-.Lfunc_begin1
 ; BASE-NEXT:   .quad   -1
-; BASE-NEXT:   .quad   0
-; NOBASE-NEXT:   .quad   .Lfunc_begin1
-; NOBASE-NEXT:   .quad   .Lfunc_end1
-; NOBASE-NEXT:   .quad   .Lfunc_begin3
-; NOBASE-NEXT:   .quad   .Lfunc_end3
-; COMMON-NEXT:   .quad   .Lfunc_begin4
-; COMMON-NEXT:   .quad   .Lfunc_end4
-; COMMON-NEXT:   .quad   .Lfunc_begin5
-; COMMON-NEXT:   .quad   .Lfunc_end5
+; BASE-NEXT:   .quad   .Lfunc_begin4
+; BASE-NEXT:   .quad   .Lfunc_begin4-.Lfunc_begin4
+; BASE-NEXT:   .quad   .Lfunc_end4-.Lfunc_begin4
+; BASE-NEXT:   .quad   -1
+; BASE-NEXT:   .quad   .Lfunc_begin5
+; BASE-NEXT:   .quad   .Lfunc_begin5-.Lfunc_begin5
+; BASE-NEXT:   .quad   .Lfunc_end5-.Lfunc_begin5
 ; COMMON-NEXT:   .quad   0
 ; COMMON-NEXT:   .quad   0
 
 ; DWARF5: {{^.Ldebug_ranges0}}
-; DWARF5-NEXT:                                      # DW_RLE_start_length
-; DWARF5-NEXT: .quad    .Lfunc_begin0               #   start
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 0                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end0-.Lfunc_begin0   #   length
-; DWARF5-NEXT:                                      # DW_RLE_base_address
-; DWARF5-NEXT: .quad    .Lfunc_begin1               #   base address
+; DWARF5-NEXT:                                      # DW_RLE_base_addressx
+; DWARF5-NEXT: .byte 1                              #   base address index
 ; DWARF5-NEXT:                                      # DW_RLE_offset_pair
 ; DWARF5-NEXT: .uleb128 .Lfunc_begin1-.Lfunc_begin1 #   starting offset
 ; DWARF5-NEXT: .uleb128 .Lfunc_end1-.Lfunc_begin1   #   ending offset
 ; DWARF5-NEXT:                                      # DW_RLE_offset_pair
 ; DWARF5-NEXT: .uleb128 .Lfunc_begin3-.Lfunc_begin1 #   starting offset
 ; DWARF5-NEXT: .uleb128 .Lfunc_end3-.Lfunc_begin1   #   ending offset
-; DWARF5-NEXT:                                      # DW_RLE_start_length
-; DWARF5-NEXT: .quad	   .Lfunc_begin4               #   start
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 3                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end4-.Lfunc_begin4   #   length
-; DWARF5-NEXT:                                      # DW_RLE_start_length
-; DWARF5-NEXT: .quad	   .Lfunc_begin5               #   start
+; DWARF5-NEXT:                                      # DW_RLE_startx_length
+; DWARF5-NEXT: .byte 4                              #   start index
 ; DWARF5-NEXT: .uleb128 .Lfunc_end5-.Lfunc_begin5   #   length
 ; DWARF5-NEXT:                                      # DW_RLE_end_of_list
 
diff --git a/test/DebugInfo/X86/rnglists_curanges.ll b/test/DebugInfo/X86/rnglists_curanges.ll
index aac0ef59eeeeb5d0cd5476f5b95560f86a7f31e7..05206a7b5b28d79c009df0c94145a9803e0dd975 100644
--- a/test/DebugInfo/X86/rnglists_curanges.ll
+++ b/test/DebugInfo/X86/rnglists_curanges.ll
@@ -16,8 +16,8 @@
 ; CHECK-NOT:  DW_TAG
 ; CHECK:      DW_AT_rnglists_base [DW_FORM_sec_offset]                   (0x0000000c)
 ; CHECK:      .debug_rnglists contents:
-; CHECK:      0x00000000: range list header: length = 0x0000001d, version = 0x0005,
-; CHECK-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+; CHECK:      0x00000000: range list header: length = 0x00000013, version = 0x0005,
+; CHECK-SAME: addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
 
 ; Function Attrs: noinline nounwind optnone uwtable
 define dso_local void @f1() section "text.foo" !dbg !7 {
diff --git a/test/DebugInfo/X86/sdag-combine.ll b/test/DebugInfo/X86/sdag-combine.ll
index 3023ce751c03680fd8405a6aa657dd0564c5f659..c95afdb0cf08aa2abd8ddbf67c8edf4dd66902b8 100644
--- a/test/DebugInfo/X86/sdag-combine.ll
+++ b/test/DebugInfo/X86/sdag-combine.ll
@@ -15,7 +15,7 @@ define swiftcc void @g() #0 !dbg !5 {
 entry:
   %0 = alloca %TSb, align 1
   %1 = call swiftcc i1 @f(), !dbg !7
-  ; CHECK: DBG_VALUE debug-use $rcx, debug-use $noreg, !8, !DIExpression(), debug-location !7
+  ; CHECK: DBG_VALUE $rcx, $noreg, !8, !DIExpression(), debug-location !7
   call void @llvm.dbg.value(metadata i1 %1, metadata !8, metadata !DIExpression()), !dbg !7
   %2 = getelementptr inbounds %TSb, %TSb* %0, i32 0, i32 0, !dbg !7
   store i1 %1, i1* %2, align 1, !dbg !7
diff --git a/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll b/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
index 45eff822474fd9d637ba60d5acf0b5cba5c9efe6..0266f6108e7a7c648891953bab07a0d3d0af6270 100644
--- a/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
+++ b/test/DebugInfo/X86/sdag-dangling-dbgvalue.ll
@@ -62,50 +62,50 @@ target triple = "x86_64-apple-macosx10.4.0"
 
 @S = global %struct.SS { i32 23, i32 -17 }, align 4, !dbg !0
 
-; Verify that the def comes before the debug-use for foo1.
+; Verify that the def comes before the for foo1.
 ; TODO: Currently dbg.value for bar1 is dropped(?), is that expected?
 define i32 @test1() local_unnamed_addr #0 !dbg !17 {
 ; CHECK-LABEL: bb.0.entry1
 ; CHECK-NEXT:    [[REG1:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG1]], debug-use $noreg, ![[FOO1]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG1]], $noreg, ![[FOO1]], !DIExpression()
 entry1:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !20, metadata !DIExpression()), !dbg !23
   call void @llvm.dbg.value(metadata %struct.SS* null, metadata !22, metadata !DIExpression()), !dbg !24
   ret i32 ptrtoint (%struct.SS* @S to i32), !dbg !25
 }
 
-; Verify that the def comes before the debug-use for foo2 and bar2.
+; Verify that the def comes before the for foo2 and bar2.
 define i32 @test2() local_unnamed_addr #0 !dbg !26 {
 ; CHECK-LABEL: bb.0.entry2
 ; CHECK-NEXT:    [[REG2:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG2]], debug-use $noreg, ![[FOO2]], !DIExpression()
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG2]], debug-use $noreg, ![[BAR2]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG2]], $noreg, ![[FOO2]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG2]], $noreg, ![[BAR2]], !DIExpression()
 entry2:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !28, metadata !DIExpression()), !dbg !30
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !29, metadata !DIExpression()), !dbg !31
   ret i32 add (i32 ptrtoint (%struct.SS* @S to i32), i32 ptrtoint (%struct.SS* @S to i32)), !dbg !32
 }
 
-; Verify that the def comes before the debug-use for foo3 and bar3.
+; Verify that the def comes before the for foo3 and bar3.
 define i32 @test3() local_unnamed_addr #0 !dbg !33 {
 ; CHECK-LABEL: bb.0.entry3
 ; CHECK-NEXT:    [[REG3:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG3]], debug-use $noreg, ![[BAR3]], !DIExpression()
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG3]], debug-use $noreg, ![[FOO3]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG3]], $noreg, ![[BAR3]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG3]], $noreg, ![[FOO3]], !DIExpression()
 entry3:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !36, metadata !DIExpression()), !dbg !38
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !35, metadata !DIExpression()), !dbg !37
   ret i32 add (i32 ptrtoint (%struct.SS* @S to i32), i32 ptrtoint (%struct.SS* @S to i32)), !dbg !39
 }
 
-; Verify that the def comes before the debug-use for bar4.
+; Verify that the def comes before the for bar4.
 ; TODO: Currently dbg.value for foo4 is dropped. It is set to null and not
 ;       used. Just like in test1 it can be discussed if there should be a
 ;       DBG_VALUE for foo4 here.
 define i32 @test4() local_unnamed_addr #0 !dbg !40 {
 ; CHECK-LABEL: bb.0.entry4
 ; CHECK-NEXT:    [[REG4:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG4]], debug-use $noreg, ![[BAR4]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG4]], $noreg, ![[BAR4]], !DIExpression()
 entry4:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !42, metadata !DIExpression()), !dbg !44
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !43, metadata !DIExpression()), !dbg !45
@@ -119,8 +119,8 @@ entry4:
 define i32 @test5() local_unnamed_addr #0 !dbg !47 {
 ; CHECK-LABEL: bb.0.entry5:
 ; CHECK-NEXT:    [[REG5:%[0-9]+]]:gr64 =
-; CHECK-NEXT:    DBG_VALUE debug-use [[REG5]], debug-use $noreg, ![[BAR5]], !DIExpression()
-; CHECK-NOT:     DBG_VALUE debug-use [[REG5]], debug-use $noreg, ![[FOO5]], !DIExpression()
+; CHECK-NEXT:    DBG_VALUE [[REG5]], $noreg, ![[BAR5]], !DIExpression()
+; CHECK-NOT:     DBG_VALUE [[REG5]], $noreg, ![[FOO5]], !DIExpression()
 ; CHECK:         RET
 entry5:
   call void @llvm.dbg.value(metadata %struct.SS* @S, metadata !49, metadata !DIExpression()), !dbg !51
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
index 1dc51f5524919afe445f4b14a977e66be8c0528a..116e05746f3eaef68e9c692184d32198a4448afc 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-1.ll
@@ -49,7 +49,7 @@ for.body.lr.ph:                                   ; preds = %entry
 for.cond.cleanup:                                 ; preds = %for.body, %entry
 ; CHECK-LABEL: bb.{{.*}}.for.cond.cleanup:
 ; CHECK:      [[REG1:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]]
+; CHECK-NEXT: DBG_VALUE [[REG1]]
   %x.0.lcssa = phi i32 [ 9, %entry ], [ %add, %for.body ]
   call void @llvm.dbg.value(metadata i32 %x.0.lcssa, metadata !15, metadata !DIExpression()), !dbg !26
   %2 = bitcast [80 x i32]* %arr to i8*, !dbg !37
@@ -63,9 +63,9 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; CHECK:      [[REG2:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG3:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG4:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]]
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]]
-; CHECK-NEXT: DBG_VALUE debug-use [[REG4]]
+; CHECK-NEXT: DBG_VALUE [[REG2]]
+; CHECK-NEXT: DBG_VALUE [[REG3]]
+; CHECK-NEXT: DBG_VALUE [[REG4]]
   %u.023 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i32 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
   %x.021 = phi i32 [ 9, %for.body.lr.ph ], [ %add, %for.body ]
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
index 7958dd878f88e2761a0df3238f796f048df1f3da..6c6a9597b5a65241c13b7c7bc9b60256416452f6 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-2.ll
@@ -28,7 +28,7 @@ for.body.lr.ph:                                   ; preds = %entry
 for.cond.cleanup:                                 ; preds = %for.body, %entry
 ; CHECK-LABEL: bb.{{.*}}.for.cond.cleanup:
 ; CHECK:      [[REG1:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]]
+; CHECK-NEXT: DBG_VALUE [[REG1]]
   %x.0.lcssa = phi i32 [ 9, %entry ], [ %add, %for.body ]
   call void @llvm.dbg.value(metadata i32 %x.0.lcssa, metadata !15, metadata !DIExpression()), !dbg !26
   %2 = bitcast [80 x i32]* %arr to i8*, !dbg !37
@@ -42,16 +42,16 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; CHECK:      [[REG2:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG3:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG4:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]], debug-use $noreg, !16
-; CHECK-NEXT: DBG_VALUE 555, debug-use $noreg, !17
+; CHECK-NEXT: DBG_VALUE [[REG3]], $noreg, !16
+; CHECK-NEXT: DBG_VALUE 555, $noreg, !17
 ; XXX: Shouldn't the following DBG_VALUE be placed after the add (ADD32rr).
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !17
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !17
 ; CHECK-NEXT: ADD32rr
 ; XXX: Shouldn't the following DBG_VALUE be placed after the mul (LEA etc).
-; CHECK-NEXT: DBG_VALUE 777, debug-use $noreg, !17
+; CHECK-NEXT: DBG_VALUE 777, $noreg, !17
 ; CHECK:      INC32r
 ; XXX: Shouldn't the following DBG_VALUE be placed after the icmp (the non-dead implicit def of $eflags)
-; CHECK:      DBG_VALUE debug-use [[REG4]]
+; CHECK:      DBG_VALUE [[REG4]]
 ; CHECK-NEXT: implicit-def $eflags
   %u.023 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i32 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
index 83fbee5a61299df2cd0648702f26b1ede8a7855c..4aa7243c9e9d891d88468d8c8aba954093669952 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-3.ll
@@ -65,12 +65,12 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
 ; CHECK-NEXT: [[REG5:%[0-9]+]]:gr32_nosp = PHI
 ; CHECK-NEXT: [[REG6:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG7:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]], debug-use $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG4]], debug-use $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG5]], debug-use $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG6]], debug-use $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG7]], debug-use $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG3]], $noreg, !19, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG4]], $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG5]], $noreg, !18, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG6]], $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG7]], $noreg, !17, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
   %u.023 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
   %y.022 = phi i64 [ 13, %for.body.lr.ph ], [ %mul, %for.body ]
   %x.021 = phi i64 [ 9, %for.body.lr.ph ], [ %add, %for.body ]
diff --git a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
index 6872c2c9d3067d2e27bdff2cf2bb32ef2bf5770c..23674ee32cc490bcb9eefa733d25b693e28b342d 100644
--- a/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
+++ b/test/DebugInfo/X86/sdag-dbgvalue-phi-use-4.ll
@@ -11,11 +11,11 @@
 ; CHECK:      [[REG1:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG2:%[0-9]+]]:gr32 = PHI
 ; CHECK-NEXT: [[REG3:%[0-9]+]]:gr32 = PHI
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]], debug-use $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG3]], debug-use $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 64, 16)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG1]], debug-use $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 10, 32)
-; CHECK-NEXT: DBG_VALUE debug-use [[REG2]], debug-use $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 42, 13)
+; CHECK-NEXT: DBG_VALUE [[REG1]], $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 0, 32)
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 32, 32)
+; CHECK-NEXT: DBG_VALUE [[REG3]], $noreg, !13, !DIExpression(DW_OP_LLVM_fragment, 64, 16)
+; CHECK-NEXT: DBG_VALUE [[REG1]], $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 10, 32)
+; CHECK-NEXT: DBG_VALUE [[REG2]], $noreg, !12, !DIExpression(DW_OP_LLVM_fragment, 42, 13)
 ; CHECK-NOT:  DBG_VALUE
 
 target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
diff --git a/test/DebugInfo/X86/sdag-legalize-multires.ll b/test/DebugInfo/X86/sdag-legalize-multires.ll
index 0ceace388e96d7a91e77c165ba1d82bc7865e05c..f0db358b5425385ec200059939bdd14a16cad65b 100644
--- a/test/DebugInfo/X86/sdag-legalize-multires.ll
+++ b/test/DebugInfo/X86/sdag-legalize-multires.ll
@@ -21,10 +21,10 @@ entry:
   %0 = call float @llvm.cos.f32(float 1.500000e+00), !dbg !13
   ; CHECK: $xmm1 = MOVAPSrr $xmm0
   call void @llvm.dbg.value(metadata float %0, metadata !15, metadata !DIExpression()), !dbg !13
-  ; CHECK: DBG_VALUE debug-use {{.*}}$xmm1, {{.*}}, ![[RSIN]], !DIExpression(),
+  ; CHECK: DBG_VALUE {{.*}}$xmm1, {{.*}}, ![[RSIN]], !DIExpression(),
   %1 = call float @llvm.sin.f32(float 1.500000e+00), !dbg !13
   call void @llvm.dbg.value(metadata float %1, metadata !11, metadata !DIExpression()), !dbg !13
-  ; CHECK: DBG_VALUE debug-use {{.*}}$xmm0, {{.*}}, ![[RCOS]], !DIExpression(),
+  ; CHECK: DBG_VALUE {{.*}}$xmm0, {{.*}}, ![[RCOS]], !DIExpression(),
   call void @g(float %0, float %1), !dbg !13
   ret void, !dbg !13
 }
diff --git a/test/DebugInfo/X86/sdag-salvage-add.ll b/test/DebugInfo/X86/sdag-salvage-add.ll
index f3f129e9bea02d5b77fa62c3a167120ac087cdc5..fda9b33d43b8b3c43168ec0aeb59bf9ef2a46b4b 100644
--- a/test/DebugInfo/X86/sdag-salvage-add.ll
+++ b/test/DebugInfo/X86/sdag-salvage-add.ll
@@ -24,9 +24,9 @@
 ;
 ; CHECK:   ![[S4:.*]] = !DILocalVariable(name: "s4", 
 ; CHECK:   ![[MYVAR:.*]] = !DILocalVariable(name: "myVar", 
-; CHECK:      DBG_VALUE debug-use $rax, debug-use $noreg, ![[MYVAR]],
+; CHECK:      DBG_VALUE $rax, $noreg, ![[MYVAR]],
 ; CHECK-SAME:           !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
-; CHECK-NEXT: DBG_VALUE debug-use $rax, debug-use $noreg, ![[S4]],
+; CHECK-NEXT: DBG_VALUE $rax, $noreg, ![[S4]],
 ; CHECK-SAME:           !DIExpression(DW_OP_plus_uconst, 4096, DW_OP_stack_value)
 ; CHECK-NEXT: $rdi = MOV64rm killed renamable $rax, 1, $noreg, 4096, $noreg,
 
diff --git a/test/DebugInfo/X86/sdag-split-arg.ll b/test/DebugInfo/X86/sdag-split-arg.ll
index 31cb678e8e12778ae14d3549460ed3b4217bd6a5..745c1c366af9bfa8c7e0e8a7496d02a4e8c1d8d0 100644
--- a/test/DebugInfo/X86/sdag-split-arg.ll
+++ b/test/DebugInfo/X86/sdag-split-arg.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -O0 -filetype=asm %s -o - | FileCheck %s
 ; Test large integral function arguments passed in multiple registers.
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 64 16] $ax
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 48 16] $r9w
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 32 16] $r10w
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 16 16] $r11w
-; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 0 16] $bx
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 64 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 48 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 32 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 16 16] ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
+; CHECK: DEBUG_VALUE: foo:bar <- [DW_OP_LLVM_fragment 0 16]  ${{([a-d]x)|(si)|(di)|(bp)|(r[0-9]+w)}}
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
diff --git a/test/DebugInfo/X86/sdagsplit-1.ll b/test/DebugInfo/X86/sdagsplit-1.ll
index a2e02d8ad8a19b59a810660ac9eb05a781035d0c..87eb3b10c32a886cbad651c65b55cdf2e985d6ef 100644
--- a/test/DebugInfo/X86/sdagsplit-1.ll
+++ b/test/DebugInfo/X86/sdagsplit-1.ll
@@ -13,8 +13,8 @@
 ;      return 0;
 ;    }
 ;
-; CHECK-DAG: DBG_VALUE debug-use ${{[a-z]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 32), debug-location !{{[0-9]+}}
-; CHECK-DAG: DBG_VALUE debug-use ${{[a-z]+}}, debug-use $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 32), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE ${{[a-z]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 0, 32), debug-location !{{[0-9]+}}
+; CHECK-DAG: DBG_VALUE ${{[a-z]+}}, $noreg, !{{[0-9]+}}, !DIExpression(DW_OP_LLVM_fragment, 32, 32), debug-location !{{[0-9]+}}
 
 ; ModuleID = 'sdagsplit-1.c'
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
diff --git a/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
new file mode 100644
index 0000000000000000000000000000000000000000..74e94643b9c082c568323eab7640df4ddbd37b67
--- /dev/null
+++ b/test/DebugInfo/X86/split-dwarf-v5-ranges.ll
@@ -0,0 +1,78 @@
+; RUN: llc -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \
+; RUN: 	    | llvm-dwarfdump -v -debug-info -debug-rnglists - | FileCheck %s
+
+; CHECK: .debug_info contents:
+; CHECK: .debug_info.dwo contents:
+; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x00000004
+; CHECK:          [0x0000000000000001, 0x000000000000000c) ".text"
+; CHECK:          [0x000000000000000e, 0x0000000000000013) ".text")
+
+; CHECK: .debug_rnglists.dwo contents:
+; CHECK: 0x00000000: range list header: length = 0x00000015, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001
+; CHECK: offsets: [
+; CHECK: 0x00000004 => 0x00000010
+; CHECK: ]
+; CHECK: ranges:
+; CHECK: 0x00000010: [DW_RLE_base_addressx]:  0x0000000000000000
+; CHECK: 0x00000012: [DW_RLE_offset_pair  ]:  0x0000000000000001, 0x000000000000000c => [0x0000000000000001, 0x000000000000000c)
+; CHECK: 0x00000015: [DW_RLE_offset_pair  ]:  0x000000000000000e, 0x0000000000000013 => [0x000000000000000e, 0x0000000000000013)
+; CHECK: 0x00000018: [DW_RLE_end_of_list  ]
+
+; Function Attrs: noinline optnone uwtable
+define dso_local void @_Z2f3v() !dbg !7 {
+entry:
+  %x = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata i32* %x, metadata !10, metadata !DIExpression()), !dbg !13
+  %call = call i32 @_Z2f2v(), !dbg !14
+  store i32 %call, i32* %x, align 4, !dbg !13
+  %0 = load i32, i32* %x, align 4, !dbg !13
+  %tobool = icmp ne i32 %0, 0, !dbg !13
+  br i1 %tobool, label %if.then, label %if.end, !dbg !15
+
+if.then:                                          ; preds = %entry
+  call void @_Z2f1v(), !dbg !16
+  br label %if.end, !dbg !18
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void, !dbg !19
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+declare dso_local i32 @_Z2f2v()
+
+declare dso_local void @_Z2f1v()
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local void @_Z2f4v() #3 section "x" !dbg !20 {
+entry:
+  ret void, !dbg !21
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 344806) (llvm/trunk 344835)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: GNU)
+!1 = !DIFile(filename: "ranges.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch", checksumkind: CSK_MD5, checksum: "a1e825b91fba21d696f05eb06d440aa3")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 8.0.0 (trunk 344806) (llvm/trunk 344835)"}
+!7 = distinct !DISubprogram(name: "f3", linkageName: "_Z2f3v", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "x", scope: !11, file: !1, line: 4, type: !12)
+!11 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 11)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DILocation(line: 4, column: 11, scope: !11)
+!14 = !DILocation(line: 4, column: 15, scope: !11)
+!15 = !DILocation(line: 4, column: 11, scope: !7)
+!16 = !DILocation(line: 5, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !11, file: !1, line: 4, column: 21)
+!18 = !DILocation(line: 6, column: 3, scope: !17)
+!19 = !DILocation(line: 7, column: 1, scope: !7)
+!20 = distinct !DISubprogram(name: "f4", linkageName: "_Z2f4v", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!21 = !DILocation(line: 8, column: 42, scope: !20)
diff --git a/test/DebugInfo/X86/v5-loc.ll b/test/DebugInfo/X86/v5-loc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..08789a518134223b9b905c4da9038e76c5f2fac0
--- /dev/null
+++ b/test/DebugInfo/X86/v5-loc.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s \
+; RUN: 	    | llvm-dwarfdump -v -debug-info - | FileCheck %s
+
+; CHECK: DW_AT_location [DW_FORM_exprloc] (DW_OP_addrx 0x0)
+
+%struct.foo = type { i32 }
+
+@f = dso_local global %struct.foo zeroinitializer, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "f", scope: !2, file: !3, line: 5, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 8.0.0 (trunk 344833) (llvm/trunk 344837)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5, nameTableKind: GNU)
+!3 = !DIFile(filename: "loc.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch", checksumkind: CSK_MD5, checksum: "e579a1a06fae14a4526216e905198a01")
+!4 = !{}
+!5 = !{!0}
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !3, line: 1, size: 32, flags: DIFlagTypePassByValue | DIFlagTrivial, elements: !7, identifier: "_ZTS3foo")
+!7 = !{!8}
+!8 = !DIDerivedType(tag: DW_TAG_member, name: "i", scope: !6, file: !3, line: 2, baseType: !9, size: 32)
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !{i32 2, !"Dwarf Version", i32 5}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{i32 1, !"wchar_size", i32 4}
+!13 = !{!"clang version 8.0.0 (trunk 344833) (llvm/trunk 344837)"}
diff --git a/test/DebugInfo/X86/vla.ll b/test/DebugInfo/X86/vla.ll
index 6713d86769d2bbb4ba022145cdce222168a816ab..7d4aff8470dfad3ec2975a81b0eda880aed48ce5 100644
--- a/test/DebugInfo/X86/vla.ll
+++ b/test/DebugInfo/X86/vla.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=asm %s -o - | FileCheck %s
 ; Ensure that we generate an indirect location for the variable length array a.
-; CHECK: ##DEBUG_VALUE: vla:a <- [DW_OP_deref] [$rcx+0]
-; CHECK: DW_OP_breg2
+; CHECK: ##DEBUG_VALUE: vla:a <- [DW_OP_deref] [{{\$r[a-z]+}}+0]
+; CHECK: DW_OP_breg{{[0-9]}}
 ; rdar://problem/13658587
 ;
 ; generated from:
diff --git a/test/DebugInfo/cross-cu-scope.ll b/test/DebugInfo/cross-cu-scope.ll
index dffd44885f06d5a204afeca1774d62c4c5a3e48a..7f663349e37bd466e8cf71eee829616fcbb81be9 100644
--- a/test/DebugInfo/cross-cu-scope.ll
+++ b/test/DebugInfo/cross-cu-scope.ll
@@ -1,5 +1,6 @@
 ; RUN: %llc_dwarf %s -filetype=obj -o %t
 ; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s
+; REQUIRES: default_triple
 
 ; Reduced test case from PR35212. Two DISubprogram belong to a different CU but
 ; share a scope. Both are declarations and end up in the scope's CU. We want to
diff --git a/test/DebugInfo/debuglineinfo-path.ll b/test/DebugInfo/debuglineinfo-path.ll
index d92e1facad1b9e35141ba7ce5baaf0194536600b..88dd4824798aac77698ef7880bd896d21b349727 100644
--- a/test/DebugInfo/debuglineinfo-path.ll
+++ b/test/DebugInfo/debuglineinfo-path.ll
@@ -1,5 +1,6 @@
 ; Make sure that absolute source dir is detected correctly regardless of the platform.
-; REQUIRES: object-emission
+; REQUIRES: object-emission, default_triple
+
 ; On powerpc llvm-nm describes win_func as a global variable, not a function. It breaks the test.
 ; It is not essential to DWARF path handling code we're testing here.
 ; UNSUPPORTED: powerpc
diff --git a/test/DebugInfo/dwarfdump-dump-gdbindex.test b/test/DebugInfo/dwarfdump-dump-gdbindex.test
index cd5cd132d5d9f4b2a5f9c514922c8465eca3865c..2ff13eb4cf0364b7fa742cc6116e126a17e6ba21 100644
--- a/test/DebugInfo/dwarfdump-dump-gdbindex.test
+++ b/test/DebugInfo/dwarfdump-dump-gdbindex.test
@@ -10,7 +10,7 @@ RUN: llvm-dwarfdump -gdb-index %p/Inputs/dwarfdump-gdbindex-v7.elf-x86-64 | File
 ; gcc version 5.3.1 20160413, GNU gold (GNU Binutils for Ubuntu 2.26) 1.11
 ; Info about gdb-index: https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html
 
-; CHECK-LABEL: .gnu_index contents:
+; CHECK-LABEL: .gdb_index contents:
 ; CHECK: Version = 7
 
 ; CHECK:      CU list offset = 0x18, has 2 entries:
diff --git a/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8d1f4b9cc5c3dd529ba485012d3b2c1b016e2659
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/Inputs/hidden-definitions.ll
@@ -0,0 +1,6 @@
+@bar = hidden global i32 0
+
+define hidden i32 @foo() {
+entry:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcLazy/hello.ll b/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll
similarity index 95%
rename from test/ExecutionEngine/OrcLazy/hello.ll
rename to test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll
index 86d9a9a4b31c64e182ff1f0b3a67457029910982..00b54fbf73fd759d2ac9711a2b508faa646cea3d 100644
--- a/test/ExecutionEngine/OrcLazy/hello.ll
+++ b/test/ExecutionEngine/OrcLazy/global-ctors-and-dtors.ll
@@ -1,5 +1,7 @@
 ; RUN: lli -jit-kind=orc-lazy -orc-lazy-debug=funcs-to-stdout %s | FileCheck %s
 ;
+; Test that global constructors and destructors are run.
+;
 ; CHECK: Hello
 ; CHECK: [ {{.*}}main{{.*}} ]
 ; CHECK: Goodbye
diff --git a/test/ExecutionEngine/OrcLazy/hidden-visibility.ll b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
new file mode 100644
index 0000000000000000000000000000000000000000..199fd644bffc51cc25d0e73285474ccc551c2a3b
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/hidden-visibility.ll
@@ -0,0 +1,17 @@
+; RUN: lli -jit-kind=orc-lazy -extra-module %p/Inputs/hidden-definitions.ll %s
+; RUN: not lli -jit-kind=orc-lazy -jd libFoo -extra-module %p/Inputs/hidden-definitions.ll %s
+;
+; Check that hidden symbols in another module are visible when the module is
+; added to the same JITDylib, and not visible if it is added to a different
+; JITDylib.
+
+@bar = external global i32
+declare i32 @foo()
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  %0 = call i32() @foo()
+  %1 = load i32, i32* @bar
+  %2 = add i32 %0, %1
+  ret i32 %2
+}
diff --git a/test/ExecutionEngine/OrcLazy/minimal.ll b/test/ExecutionEngine/OrcLazy/minimal.ll
new file mode 100644
index 0000000000000000000000000000000000000000..86087bc3878e0d0e6d7ef6ebb6be18921320ad25
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/minimal.ll
@@ -0,0 +1,8 @@
+; RUN: lli -jit-kind=orc-lazy %s
+;
+; Basic sanity check: A module with a single no-op main function runs.
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  ret i32 0
+}
diff --git a/test/ExecutionEngine/OrcLazy/single-function-call.ll b/test/ExecutionEngine/OrcLazy/single-function-call.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fba52026660da2b8b9188be257fbf9338ae0b48a
--- /dev/null
+++ b/test/ExecutionEngine/OrcLazy/single-function-call.ll
@@ -0,0 +1,15 @@
+; RUN: lli -jit-kind=orc-lazy %s
+;
+; Basic sanity check: We can make a call inside lazily JIT'd code.
+; Compared to minimal.ll, this demonstrates that we can call through a stub.
+
+define i32 @foo() {
+entry:
+  ret i32 0
+}
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) {
+entry:
+  %0 = call i32() @foo()
+  ret i32 %0
+}
diff --git a/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s b/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
index 8f7e204321864778693041b176f53cd498f51fc9..ac097c44e5fd8431238bbde4c0375e881d6dad17 100644
--- a/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
+++ b/test/ExecutionEngine/RuntimeDyld/X86/COFF_x86_64_IMGREL.s
@@ -1,6 +1,6 @@
 # RUN: rm -rf %t && mkdir -p %t
 # RUN: llvm-mc -triple=x86_64-pc-win32 -filetype=obj -o %t/COFF_x86_64_IMGREL.o %s
-# RUN: llvm-rtdyld -triple=x86_64-pc-win32 -verify -check=%s %t/COFF_x86_64_IMGREL.o
+# RUN: llvm-rtdyld -triple=x86_64-pc-win32 -verify -target-addr-start=40960000000000 -check=%s %t/COFF_x86_64_IMGREL.o
 .text
 	.def	 F;
 	.scl	2;
@@ -18,9 +18,9 @@
 	.align	16, 0x90
 
 F:                                      # @F
-# rtdyld-check: decode_operand(inst1, 3) = section_addr(COFF_x86_64_IMGREL.o, .text)+0
+# rtdyld-check: decode_operand(inst1, 3) = section_addr(COFF_x86_64_IMGREL.o, .text)+0-40960000000000
 inst1:
     mov %ebx, F@IMGREL
-# rtdyld-check: decode_operand(inst2, 3) = section_addr(COFF_x86_64_IMGREL.o, .rdata)+5
+# rtdyld-check: decode_operand(inst2, 3) = section_addr(COFF_x86_64_IMGREL.o, .rdata)+5-40960000000000
 inst2:
     mov %ebx, (__constdata@imgrel+5)
diff --git a/test/FileCheck/envvar-opts.txt b/test/FileCheck/envvar-opts.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b25ecb89a318123a62561ffc541127470437ba3f
--- /dev/null
+++ b/test/FileCheck/envvar-opts.txt
@@ -0,0 +1,15 @@
+; Create a case that produces a simple diagnostic.
+; RUN: echo foo > %t.in
+; CHECK: foo
+; CHECK: bar
+
+; RUN: env FILECHECK_OPTS= \
+; RUN: not FileCheck %s -input-file %t.in 2>&1 \
+; RUN: | FileCheck -check-prefix QUIET %s
+
+; RUN: env FILECHECK_OPTS=-v \
+; RUN: not FileCheck %s -input-file %t.in 2>&1 \
+; RUN: | FileCheck -check-prefix VERB %s
+
+; QUIET-NOT: remark: {{CHECK}}: expected string found in input
+; VERB:      remark: {{CHECK}}: expected string found in input
diff --git a/test/FileCheck/opt-color.txt b/test/FileCheck/opt-color.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9430114bf31e92f2762dac84fb9b19c68e294531
--- /dev/null
+++ b/test/FileCheck/opt-color.txt
@@ -0,0 +1,22 @@
+; Create a case that produces a simple diagnostic.
+; RUN: echo foo > %t.in
+; CHECK: bar
+
+; Run without and with -color.  In the former case, FileCheck should suppress
+; color in its diagnostics because stderr is a file.
+; RUN: not FileCheck %s < %t.in 2> %t.no-color
+; RUN: not FileCheck -color %s < %t.in 2> %t.color
+
+; Check whether color was produced.
+; RUN: FileCheck -check-prefix NO-COLOR %s < %t.no-color
+; RUN: FileCheck -check-prefix COLOR %s < %t.color
+
+; Make sure our NO-COLOR and COLOR patterns are sane: they don't match the
+; opposite cases.
+; RUN: not FileCheck -check-prefix COLOR %s < %t.no-color
+; RUN: not FileCheck -check-prefix NO-COLOR %s < %t.color
+
+; I don't know of a good way to check for ANSI color codes, so just make sure
+; some new characters show up where those codes should appear.
+; NO-COLOR: : error: CHECK: expected string not found in input
+; COLOR: : {{.+}}error: {{.+}}CHECK: expected string not found in input
diff --git a/test/Instrumentation/HWAddressSanitizer/basic.ll b/test/Instrumentation/HWAddressSanitizer/basic.ll
index e8010992945724b214b017eea77f9b0e3773fe47..8253016d97b4adff5d0c2f7b4c55822045f3a127 100644
--- a/test/Instrumentation/HWAddressSanitizer/basic.ll
+++ b/test/Instrumentation/HWAddressSanitizer/basic.ll
@@ -354,5 +354,6 @@ entry:
 
 ; CHECK:      define internal void @hwasan.module_ctor() {
 ; CHECK-NEXT:   call void @__hwasan_init()
+; CHECK-NEXT:   call void @__hwasan_init_frames(
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
diff --git a/test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll b/test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3fd4197d3bb78e7bc8a74f5cb7c42cf30ce4feae
--- /dev/null
+++ b/test/Instrumentation/HWAddressSanitizer/frame-descriptor.ll
@@ -0,0 +1,27 @@
+; Test frame descriptors
+;
+; RUN: opt < %s -hwasan -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-android"
+
+declare void @use32(i32*, i64*)
+
+define void @test_alloca() sanitize_hwaddress {
+entry:
+  %XYZ = alloca i32, align 4
+  %ABC = alloca i64, align 4
+  call void @use32(i32* nonnull %XYZ, i64 *nonnull %ABC)
+  ret void
+}
+
+; CHECK: @[[STR:[0-9]*]] = private unnamed_addr constant [15 x i8] c"4 XYZ; 8 ABC; \00", align 1
+; CHECK: private constant { void ()*, [15 x i8]* } { void ()* @test_alloca, [15 x i8]* @[[STR]] }, section "__hwasan_frames", comdat($test_alloca)
+
+; CHECK-LABEL: @test_alloca(
+; CHECK: ret void
+
+; CHECK-LABEL: @hwasan.module_ctor
+; CHECK: call void @__hwasan_init_frames(i8* @__start___hwasan_frames, i8* @__stop___hwasan_frames)
+; CHECK: ret void
+
diff --git a/test/Instrumentation/HWAddressSanitizer/with-calls.ll b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
index 768434c5b556e6056843d7960b18d625fee7eb6b..8d6068c343894f890c61d972bb7cfa64d7c09cbf 100644
--- a/test/Instrumentation/HWAddressSanitizer/with-calls.ll
+++ b/test/Instrumentation/HWAddressSanitizer/with-calls.ll
@@ -199,5 +199,6 @@ entry:
 
 ; CHECK:      define internal void @hwasan.module_ctor() {
 ; CHECK-NEXT:   call void @__hwasan_init()
+; CHECK-NEXT:   call void @__hwasan_init_frames(
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
diff --git a/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll b/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4dcca26fd0b1c54c75e340639d2976b2704fcc2f
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll
@@ -0,0 +1,236 @@
+; Test for handling of asm constraints in MSan instrumentation.
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S | FileCheck -check-prefixes=CHECK,CHECK-NONCONS %s
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S | FileCheck -check-prefixes=CHECK,CHECK-CONS %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.pair = type { i32, i32 }
+
+@id1 = common dso_local global i32 0, align 4
+@is1 = common dso_local global i32 0, align 4
+@id2 = common dso_local global i32 0, align 4
+@is2 = common dso_local global i32 0, align 4
+@id3 = common dso_local global i32 0, align 4
+@pair2 = common dso_local global %struct.pair zeroinitializer, align 4
+@pair1 = common dso_local global %struct.pair zeroinitializer, align 4
+@c2 = common dso_local global i8 0, align 1
+@c1 = common dso_local global i8 0, align 1
+@memcpy_d1 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+@memcpy_d2 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+@memcpy_s1 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+@memcpy_s2 = common dso_local global i8* (i8*, i8*, i32)* null, align 8
+
+; The functions below were generated from a C source that contains declarations like follows:
+;   void f1() {
+;     asm("" : "=r" (id1) : "r" (is1));
+;   }
+; with corresponding input/output constraints.
+; Note that the assembly statement is always empty, as MSan doesn't look at it anyway.
+
+; One input register, one output register:
+;   asm("" : "=r" (id1) : "r" (is1));
+define dso_local void @f_1i_1o_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = call i32 asm "", "=r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  store i32 %1, i32* @id1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_1i_1o_reg
+; CHECK: [[IS1_F1:%.*]] = load i32, i32* @is1, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call i32 asm "",{{.*}}(i32 [[IS1_F1]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+
+
+; Two input registers, two output registers:
+;   asm("" : "=r" (id1), "=r" (id2) : "r" (is1), "r"(is2));
+define dso_local void @f_2i_2o_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = load i32, i32* @is2, align 4
+  %2 = call { i32, i32 } asm "", "=r,=r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_reg
+; CHECK: [[IS1_F2:%.*]] = load i32, i32* @is1, align 4
+; CHECK: [[IS2_F2:%.*]] = load i32, i32* @is2, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[IS1_F2]], i32 [[IS2_F2]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+
+; Input same as output, used twice:
+;   asm("" : "=r" (id1), "=r" (id2) : "r" (id1), "r" (id2));
+define dso_local void @f_2i_2o_reuse2_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @id1, align 4
+  %1 = load i32, i32* @id2, align 4
+  %2 = call { i32, i32 } asm "", "=r,=r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_reuse2_reg
+; CHECK: [[ID1_F3:%.*]] = load i32, i32* @id1, align 4
+; CHECK: [[ID2_F3:%.*]] = load i32, i32* @id2, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F3]], i32 [[ID2_F3]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+
+
+; One of the input registers is also an output:
+;   asm("" : "=r" (id1), "=r" (id2) : "r" (id1), "r"(is1));
+define dso_local void @f_2i_2o_reuse1_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @id1, align 4
+  %1 = load i32, i32* @is1, align 4
+  %2 = call { i32, i32 } asm "", "=r,=r,r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1)
+  %asmresult = extractvalue { i32, i32 } %2, 0
+  %asmresult1 = extractvalue { i32, i32 } %2, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_reuse1_reg
+; CHECK: [[ID1_F4:%.*]] = load i32, i32* @id1, align 4
+; CHECK: [[IS1_F4:%.*]] = load i32, i32* @is1, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F4]], i32 [[IS1_F4]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+
+
+; One input register, three output registers:
+;   asm("" : "=r" (id1), "=r" (id2), "=r" (id3) : "r" (is1));
+define dso_local void @f_1i_3o_reg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = call { i32, i32, i32 } asm "", "=r,=r,=r,r,~{dirflag},~{fpsr},~{flags}"(i32 %0)
+  %asmresult = extractvalue { i32, i32, i32 } %1, 0
+  %asmresult1 = extractvalue { i32, i32, i32 } %1, 1
+  %asmresult2 = extractvalue { i32, i32, i32 } %1, 2
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id2, align 4
+  store i32 %asmresult2, i32* @id3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_1i_3o_reg
+; CHECK: [[IS1_F5:%.*]] = load i32, i32* @is1, align 4
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i32, i32, i32 } asm "",{{.*}}(i32 [[IS1_F5]])
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id1 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id2 to i64)
+; CHECK: store i32 0,{{.*}}ptrtoint (i32* @id3 to i64)
+
+
+; 2 input memory args, 2 output memory args:
+;  asm("" : "=m" (id1), "=m" (id2) : "m" (is1), "m"(is2))
+define dso_local void @f_2i_2o_mem() sanitize_memory {
+entry:
+  call void asm "", "=*m,=*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32* @id2, i32* @is1, i32* @is2)
+  ret void
+}
+
+; CHECK-LABEL: @f_2i_2o_mem
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is1{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is2{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id2{{.*}}, i64 4)
+; CHECK: call void asm "", "=*m,=*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32* @id2, i32* @is1, i32* @is2)
+
+
+; Same input and output passed as both memory and register:
+;  asm("" : "=r" (id1), "=m"(id1) : "r"(is1), "m"(is1));
+define dso_local void @f_1i_1o_memreg() sanitize_memory {
+entry:
+  %0 = load i32, i32* @is1, align 4
+  %1 = call i32 asm "", "=r,=*m,r,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32 %0, i32* @is1)
+  store i32 %1, i32* @id1, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_1i_1o_memreg
+; CHECK: [[IS1_F7:%.*]] = load i32, i32* @is1, align 4
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@is1{{.*}}, i64 4)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4)
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call i32 asm "", "=r,=*m,r,*m,~{dirflag},~{fpsr},~{flags}"(i32* @id1, i32 [[IS1_F7]], i32* @is1)
+
+
+; Three outputs, first and last returned via regs, second via mem:
+;  asm("" : "=r" (id1), "=m"(id2), "=r" (id3):);
+define dso_local void @f_3o_reg_mem_reg() sanitize_memory {
+entry:
+  %0 = call { i32, i32 } asm "", "=r,=*m,=r,~{dirflag},~{fpsr},~{flags}"(i32* @id2)
+  %asmresult = extractvalue { i32, i32 } %0, 0
+  %asmresult1 = extractvalue { i32, i32 } %0, 1
+  store i32 %asmresult, i32* @id1, align 4
+  store i32 %asmresult1, i32* @id3, align 4
+  ret void
+}
+
+; CHECK-LABEL: @f_3o_reg_mem_reg
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id2{{.*}}), i64 4)
+; CHECK: call { i32, i32 } asm "", "=r,=*m,=r,~{dirflag},~{fpsr},~{flags}"(i32* @id2)
+
+
+; Three inputs and three outputs of different types: a pair, a char, a function pointer.
+; Everything is meant to be passed in registers, but LLVM chooses to return the integer pair by pointer:
+;  asm("" : "=r" (pair2), "=r" (c2), "=r" (memcpy_d1) : "r"(pair1), "r"(c1), "r"(memcpy_s1));
+define dso_local void @f_3i_3o_complex_reg() sanitize_memory {
+entry:
+  %0 = load i64, i64* bitcast (%struct.pair* @pair1 to i64*), align 4
+  %1 = load i8, i8* @c1, align 1
+  %2 = load i8* (i8*, i8*, i32)*, i8* (i8*, i8*, i32)** @memcpy_s1, align 8
+  %3 = call { i8, i8* (i8*, i8*, i32)* } asm "", "=*r,=r,=r,r,r,r,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, i64 %0, i8 %1, i8* (i8*, i8*, i32)* %2)
+  %asmresult = extractvalue { i8, i8* (i8*, i8*, i32)* } %3, 0
+  %asmresult1 = extractvalue { i8, i8* (i8*, i8*, i32)* } %3, 1
+  store i8 %asmresult, i8* @c2, align 1
+  store i8* (i8*, i8*, i32)* %asmresult1, i8* (i8*, i8*, i32)** @memcpy_d1, align 8
+  ret void
+}
+
+; CHECK-LABEL: @f_3i_3o_complex_reg
+; CHECK: [[PAIR1_F9:%.*]] = load {{.*}} @pair1
+; CHECK: [[C1_F9:%.*]] = load {{.*}} @c1
+; CHECK: [[MEMCPY_S1_F9:%.*]] = load {{.*}} @memcpy_s1
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8)
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call { i8, i8* (i8*, i8*, i32)* } asm "", "=*r,=r,=r,r,r,r,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, {{.*}}[[PAIR1_F9]], i8 [[C1_F9]], {{.*}} [[MEMCPY_S1_F9]])
+
+; Three inputs and three outputs of different types: a pair, a char, a function pointer.
+; Everything is passed in memory:
+;  asm("" : "=m" (pair2), "=m" (c2), "=m" (memcpy_d1) : "m"(pair1), "m"(c1), "m"(memcpy_s1));
+define dso_local void @f_3i_3o_complex_mem() sanitize_memory {
+entry:
+  call void asm "", "=*m,=*m,=*m,*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, i8* @c2, i8* (i8*, i8*, i32)** @memcpy_d1, %struct.pair* @pair1, i8* @c1, i8* (i8*, i8*, i32)** @memcpy_s1)
+  ret void
+}
+
+; CHECK-LABEL: @f_3i_3o_complex_mem
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@pair1{{.*}}, i64 8)
+; CHECK-CONS: call void @__msan_instrument_asm_load(i8* @c1, i64 1)
+; CHECK-CONS: call void @__msan_instrument_asm_load({{.*}}@memcpy_s1{{.*}}, i64 8)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@c2{{.*}}, i64 1)
+; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@memcpy_d1{{.*}}, i64 8)
+; CHECK: call void asm "", "=*m,=*m,=*m,*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(%struct.pair* @pair2, i8* @c2, i8* (i8*, i8*, i32)** @memcpy_d1, %struct.pair* @pair1, i8* @c1, i8* (i8*, i8*, i32)** @memcpy_s1)
diff --git a/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll b/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
index 0b9f455f1d0ec8086a60eea9a1c5d671746dd8aa..7240e1086dae89274d36e09bb9653697e1ff560e 100644
--- a/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
+++ b/test/Instrumentation/MemorySanitizer/msan_x86_bts_asm.ll
@@ -13,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ;    unsigned long *addr = &value;
 ;    asm("btsq %2, %1; setc %0" : "=qm" (bit), "=m" (addr): "Ir" (nr));
 ;    if (bit)
-;      return 0
+;      return 0;
 ;    else
 ;      return 1;
 ;  }
@@ -52,25 +52,27 @@ if.else:                                          ; preds = %entry
   ret i32 1
 }
 
-; Start with the asm call
+; Hooks for inputs usually go before the assembly statement. But here we have none,
+; because %nr is passed by value. However we check %nr for being initialized.
+; CHECK-CONS: [[NRC:%.*]] = ptrtoint i64* %nr to i64
+
+; In the conservative mode, call the store hooks for %bit and %addr:
+; CHECK-CONS: call void @__msan_instrument_asm_store(i8* %bit, i64 1)
+; CHECK-CONS: [[ADDR8S:%.*]] = bitcast i64** %addr to i8*
+; CHECK-CONS: call void @__msan_instrument_asm_store(i8* [[ADDR8S]], i64 8)
+
+; Landing pad for the %nr check above.
+; CHECK-CONS: call void @__msan_warning_noreturn()
+
 ; CHECK: call void asm "btsq $2, $1; setc $0"
 
 ; Calculating the shadow offset of %bit.
 ; CHECK: [[PTR:%.*]] = ptrtoint {{.*}} %bit to i64
-; CHECK: [[SH_NUM:%.*]] = xor i64 [[PTR]], [[OFF:[0-9]*]]
+; CHECK: [[SH_NUM:%.*]] = xor i64 [[PTR]]
 ; CHECK: [[SHADOW:%.*]] = inttoptr i64 [[SH_NUM]] {{.*}}
 
-; In the conservative mode, unpoison the shadow.
-; CHECK-CONS: store i8 0, i8* [[SHADOW]]
-; Now calculate the shadow address again, because MSan does this for every
-; shadow access.
-; CHECK-CONS: [[PTR2:%.*]] = ptrtoint {{.*}} %bit to i64
-; CHECK-CONS: [[SH_NUM2:%.*]] = xor i64 [[PTR2]], [[OFF]]
-; CHECK-CONS: [[SHADOW2:%.*]] = inttoptr i64 [[SH_NUM2]] {{.*}}
-
 ; Now load the shadow value for the boolean.
-; CHECK-NONCONS: [[MSLD:%.*]] = load {{.*}} [[SHADOW]]
-; CHECK-CONS: [[MSLD:%.*]] = load {{.*}} [[SHADOW2]]
+; CHECK: [[MSLD:%.*]] = load {{.*}} [[SHADOW]]
 ; CHECK: [[MSPROP:%.*]] = trunc i8 [[MSLD]] to i1
 
 ; Is the shadow poisoned?
diff --git a/test/Instrumentation/SanitizerCoverage/coff-comdat.ll b/test/Instrumentation/SanitizerCoverage/coff-comdat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..61a9dcd92de2160090be7505fcc76e3ab0260016
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/coff-comdat.ll
@@ -0,0 +1,85 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -sanitizer-coverage-pc-table=1 -S | FileCheck %s
+
+; Make sure we use the right comdat groups for COFF to avoid relocations
+; against discarded sections. Internal linkage functions are also different from
+; ELF. We don't add a module unique identifier.
+
+; Test based on this source:
+; int baz(int);
+; static int __attribute__((noinline)) bar(int x) {
+;   if (x)
+;     return baz(x);
+;   return 0;
+; }
+; int foo(int x) {
+;   if (baz(0))
+;     x = bar(x);
+;   return x;
+; }
+
+; Both new comdats should no duplicates on COFF.
+
+; CHECK: $foo = comdat noduplicates
+; CHECK: $bar = comdat noduplicates
+
+; Tables for 'foo' should be in the 'foo' comdat.
+
+; CHECK: @__sancov_gen_{{.*}} = private global [1 x i8] zeroinitializer, section ".SCOV$CM", comdat($foo), align 1
+
+; CHECK: @__sancov_gen_{{.*}} = private constant [2 x i64*]
+; CHECK-SAME: [i64* bitcast (i32 (i32)* @foo to i64*), i64* inttoptr (i64 1 to i64*)],
+; CHECK-SAME: section ".SCOVP$M", comdat($foo), align 8
+
+; Tables for 'bar' should be in the 'bar' comdat.
+
+; CHECK: @__sancov_gen_{{.*}} = private global [1 x i8] zeroinitializer, section ".SCOV$CM", comdat($bar), align 1
+
+; CHECK: @__sancov_gen_{{.*}} = private constant [2 x i64*]
+; CHECK-SAME: [i64* bitcast (i32 (i32)* @bar to i64*), i64* inttoptr (i64 1 to i64*)],
+; CHECK-SAME: section ".SCOVP$M", comdat($bar), align 8
+
+; 'foo' and 'bar' should be in their new comdat groups.
+
+; CHECK: define dso_local i32 @foo(i32 %x){{.*}} comdat {
+; CHECK: define internal fastcc i32 @bar(i32 %x){{.*}} comdat {
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.14.26433"
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @foo(i32 %x) local_unnamed_addr #0 {
+entry:
+  %call = tail call i32 @baz(i32 0) #3
+  %tobool = icmp eq i32 %call, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call1 = tail call fastcc i32 @bar(i32 %x)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  %x.addr.0 = phi i32 [ %call1, %if.then ], [ %x, %entry ]
+  ret i32 %x.addr.0
+}
+
+declare dso_local i32 @baz(i32) local_unnamed_addr #1
+
+; Function Attrs: noinline nounwind uwtable
+define internal fastcc i32 @bar(i32 %x) unnamed_addr #2 {
+entry:
+  %tobool = icmp eq i32 %x, 0
+  br i1 %tobool, label %return, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 @baz(i32 %x) #3
+  br label %return
+
+return:                                           ; preds = %entry, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { "asdf" }
+attributes #2 = { noinline nounwind uwtable }
+attributes #3 = { nounwind }
diff --git a/test/LTO/X86/internalize.ll b/test/LTO/X86/internalize.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6b18aa7ded03713483af8af3801de7756d05b1e3
--- /dev/null
+++ b/test/LTO/X86/internalize.ll
@@ -0,0 +1,42 @@
+; RUN: opt %s -o %t1.bc
+
+; RUN: llvm-lto %t1.bc -o %t1.save.opt  --exported-symbol=_foo -save-merged-module -O0
+; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s --check-prefix=INTERNALIZE
+
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure internalization does not happen.
+; RUN: llvm-lto %t1.bc -enable-lto-internalization=false -o %t1.save.opt  \
+; RUN:                 --exported-symbol=_foo -save-merged-module -O0
+; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s --check-prefix=INTERNALIZE-OPTION-DISABLE
+
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \
+; RUN:     -r=%t1.bc,_foo,pxl \
+; RUN:     -r=%t1.bc,_bar,pl
+; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2
+
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure internalization does not happen in runRegularLTO().
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps -enable-lto-internalization=false \
+; RUN:     -r=%t1.bc,_foo,pxl \
+; RUN:     -r=%t1.bc,_bar,pl
+; RUN: llvm-dis < %t.o.0.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2-OPTION-DISABLE
+
+; INTERNALIZE: define void @foo
+; INTERNALIZE: define internal void @bar
+; INTERNALIZE-OPTION-DISABLE: define void @foo
+; INTERNALIZE-OPTION-DISABLE: define void @bar
+; INTERNALIZE2: define dso_local void @foo
+; INTERNALIZE2: define internal void @bar
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @foo
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @bar
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+define void @foo() {
+    call void @bar()
+    ret void
+}
+define void @bar() {
+    ret void
+}
diff --git a/test/LTO/X86/libcall-overridden-via-alias.ll b/test/LTO/X86/libcall-overridden-via-alias.ll
new file mode 100755
index 0000000000000000000000000000000000000000..04e1512f5b8ccf20b5053d4e9fb346905a2604d3
--- /dev/null
+++ b/test/LTO/X86/libcall-overridden-via-alias.ll
@@ -0,0 +1,69 @@
+; Given a library call that is represented as an llvm intrinsic call, but
+; later transformed to an actual call, if an overriding definition of that
+; library routine is provided indirectly via an alias, verify that LTO
+; does not eliminate the definition.  This is a test for PR38547.
+;
+; RUN: llvm-as -o %t1 %s
+; RUN: llvm-lto -exported-symbol=main -save-merged-module -filetype=asm -o %t2 %t1
+; RUN: llvm-dis -o - %t2.merged.bc | FileCheck --check-prefix=CHECK_IR %s
+;
+; Check that the call is represented as an llvm intrinsic in the IR after LTO:
+; CHECK_IR-LABEL: main
+; CHECK_IR: call float @llvm.log.f32
+;
+; Check that the IR contains the overriding definition of the library routine
+; in the IR after LTO:
+; CHECK_IR: define internal float @logf(float
+; CHECK_IR-NEXT:   [[TMP:%.*]] = fadd float
+; CHECK_IR-NEXT:   ret float [[TMP]]
+;
+; Check that the assembly code from LTO contains the call to the expected
+; library routine, and that the overriding definition of the library routine
+; is present:
+; RUN: FileCheck --check-prefix=CHECK_ASM %s < %t2
+; CHECK_ASM-LABEL: main:
+; CHECK_ASM: callq logf
+; CHECK_ASM-LABEL: logf:
+; CHECK_ASM-NEXT: add
+; CHECK_ASM-NEXT: ret
+
+; Produced from the following source-code:
+;
+;extern float logf(float);
+;// 'src' and 'dst' are 'volatile' to prohibit optimization.
+;volatile float src = 3.14f;
+;volatile float dst;
+;
+;int main() {
+;  dst = logf(src);
+;  return 0;
+;}
+;
+;extern float fname(float x);
+;float fname(float x) {
+;  return x + x;
+;}
+;
+;float logf(float x) __attribute__((alias("fname")));
+;
+target triple = "x86_64-unknown-linux-gnu"
+
+@src = global float 0x40091EB860000000, align 4
+@dst = common global float 0.000000e+00, align 4
+
+@logf = alias float (float), float (float)* @fname
+
+define i32 @main() local_unnamed_addr {
+entry:
+  %0 = load volatile float, float* @src, align 4
+  %1 = tail call float @llvm.log.f32(float %0)
+  store volatile float %1, float* @dst, align 4
+  ret i32 0
+}
+
+declare float @llvm.log.f32(float)
+
+define float @fname(float %x) {
+  %add = fadd float %x, %x
+  ret float %add
+}
diff --git a/test/MC/AArch64/CheckDataSymbol.s b/test/MC/AArch64/CheckDataSymbol.s
new file mode 100644
index 0000000000000000000000000000000000000000..ea3ed7b287328a0208401d43bf51474d48b3578a
--- /dev/null
+++ b/test/MC/AArch64/CheckDataSymbol.s
@@ -0,0 +1,15 @@
+# RUN: llvm-mc -filetype=obj -assemble \
+# RUN: -triple=aarch64- %s -o - \
+# RUN: | llvm-readobj -s -t - | FileCheck %s
+# CHECK:     Name: $d.1 ({{[1-9][0-9]+}})
+# CHECK-NEXT:     Value: 0x4
+# CHECK-NEXT:     Size: 0
+# CHECK-NEXT:     Binding: Local (0x0)
+# CHECK-NEXT:     Type: None (0x0)
+# CHECK-NEXT:     Other: 0
+# CHECK-NEXT:     Section: .text (0x2)
+# CHECK-NEXT:   }
+
+.text
+nop
+.zero 4
diff --git a/test/MC/AArch64/macro-hex-int.s b/test/MC/AArch64/macro-hex-int.s
new file mode 100644
index 0000000000000000000000000000000000000000..0d697bce53e06a6f8f3a6365fe5608235875acb6
--- /dev/null
+++ b/test/MC/AArch64/macro-hex-int.s
@@ -0,0 +1,8 @@
+// RUN: llvm-mc -triple aarch64-elf -filetype=obj %s -o - | llvm-objdump -d -r - | FileCheck %s
+
+.macro do_add sz
+        add     v0.\sz, v0.\sz, v0.\sz
+.endm
+
+do_add 8h
+// CHECK:  add     v0.8h, v0.8h, v0.8h
diff --git a/test/MC/AArch64/udf.s b/test/MC/AArch64/udf.s
new file mode 100644
index 0000000000000000000000000000000000000000..a6a345b1ff67f5bfcf3db104a48eb72c03160b55
--- /dev/null
+++ b/test/MC/AArch64/udf.s
@@ -0,0 +1,9 @@
+# RUN: llvm-mc -assemble -show-encoding -triple=aarch64- %s | FileCheck %s
+# CHECK:  .text
+# CHECK-NEXT: udf #0      // encoding: [0x00,0x00,0x00,0x00]
+# CHECK-NEXT: udf #513    // encoding: [0x01,0x02,0x00,0x00]
+# CHECK-NEXT: udf #65535  // encoding: [0xff,0xff,0x00,0x00]
+.text
+udf 0
+udf 513
+udf 65535
diff --git a/test/MC/AArch64/udf_not.s b/test/MC/AArch64/udf_not.s
new file mode 100644
index 0000000000000000000000000000000000000000..55b59fe47105c4693918e0c0efbcedca1377eef1
--- /dev/null
+++ b/test/MC/AArch64/udf_not.s
@@ -0,0 +1,7 @@
+# RUN: not llvm-mc -assemble -show-encoding -triple=aarch64- %s 2>&1 | FileCheck %s
+udf 65536
+udf -1
+udf -768
+# CHECK:{{.*}} immediate must be an integer in range [0, 65535].
+# CHECK:{{.*}} immediate must be an integer in range [0, 65535].
+# CHECK:{{.*}} immediate must be an integer in range [0, 65535].
diff --git a/test/MC/AMDGPU/mimg.s b/test/MC/AMDGPU/mimg.s
index 95bc4c306e7b61a523c36c59b5be760208a9e556..83835270a1d4afb0d26fe496e36d6801c9663cc6 100644
--- a/test/MC/AMDGPU/mimg.s
+++ b/test/MC/AMDGPU/mimg.s
@@ -157,6 +157,84 @@ image_load v[5:7], v[1:4], s[8:15] dmask:0xf tfe d16
 // GFX8_1:   image_load v[5:7], v[1:4], s[8:15] dmask:0xf tfe d16 ; encoding: [0x00,0x0f,0x01,0xf0,0x01,0x05,0x02,0x80]
 // GFX9:     image_load v[5:7], v[1:4], s[8:15] dmask:0xf tfe d16 ; encoding: [0x00,0x0f,0x01,0xf0,0x01,0x05,0x02,0x80]
 
+//===----------------------------------------------------------------------===//
+// Image Load/Store: a16
+//===----------------------------------------------------------------------===//
+
+image_load v5, v[1:2], s[8:15] unorm a16
+// GFX9:     image_load v5, v[1:2], s[8:15] unorm a16 ; encoding: [0x00,0x90,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16
+// GFX9:     image_load v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16 ; encoding: [0x00,0x93,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16
+// GFX9:     image_load v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16 ; encoding: [0x00,0x97,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16
+// GFX9:     image_load v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x00,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v5, v[1:2], s[8:15] unorm a16
+// GFX9:     image_store v5, v[1:2], s[8:15] unorm a16 ; encoding: [0x00,0x90,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16
+// GFX9:     image_store v[5:6], v[1:2], s[8:15] dmask:0x3 unorm a16 ; encoding: [0x00,0x93,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16
+// GFX9:     image_store v[5:7], v[1:2], s[8:15] dmask:0x7 unorm a16 ; encoding: [0x00,0x97,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16
+// GFX9:     image_store v[5:8], v[1:2], s[8:15] dmask:0xf unorm a16 ; encoding: [0x00,0x9f,0x20,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+/===----------------------------------------------------------------------===//
+// Image Load/Store: a16 & d16
+//===----------------------------------------------------------------------===//
+
+image_load v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16
+// GFX9:     image_load v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16 ; encoding: [0x00,0x93,0x00,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16
+// GFX9:     image_load v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16 ; encoding: [0x00,0x97,0x00,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_load v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16
+// GFX9:     image_load v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16 ; encoding: [0x00,0x9f,0x00,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16
+// GFX9:     image_store v5, v[1:2], s[8:15] dmask:0x3 unorm a16 d16 ; encoding: [0x00,0x93,0x20,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16
+// GFX9:     image_store v[5:6], v[1:2], s[8:15] dmask:0x7 unorm a16 d16 ; encoding: [0x00,0x97,0x20,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
+image_store v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16
+// GFX9:     image_store v[5:6], v[1:2], s[8:15] dmask:0xf unorm a16 d16 ; encoding: [0x00,0x9f,0x20,0xf0,0x01,0x05,0x02,0x80]
+// NOSICI:   error: a16 modifier is not supported on this GPU
+// NOVI:     error: a16 modifier is not supported on this GPU
+
 //===----------------------------------------------------------------------===//
 // Image Load/Store: PCK variants
 //===----------------------------------------------------------------------===//
@@ -193,6 +271,11 @@ image_load_mip_pck v5, v[1:4], s[8:15] dmask:0x1 d16
 // NOVI:   error: invalid operand for instruction
 // NOGFX9: error: invalid operand for instruction
 
+image_load_mip_pck v5, v[1:2], s[8:15] dmask:0x1 a16
+// GFX9:   image_load_mip_pck v5, v[1:2], s[8:15] dmask:0x1 a16 ; encoding: [0x00,0x81,0x10,0xf0,0x01,0x05,0x02,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI:   error: a16 modifier is not supported on this GPU
+
 image_store_mip_pck v252, v2, s[12:19] dmask:0x1 unorm
 // GCN: image_store_mip_pck v252, v2, s[12:19] dmask:0x1 unorm ; encoding: [0x00,0x11,0x2c,0xf0,0x02,0xfc,0x03,0x00]
 
@@ -216,6 +299,11 @@ image_store_mip_pck v252, v[2:5], s[12:19] dmask:0x1 d16
 // NOVI:   error: invalid operand for instruction
 // NOGFX9: error: invalid operand for instruction
 
+image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16
+// GFX9:   image_store_mip_pck v252, v[2:3], s[12:19] dmask:0x1 a16 ; encoding: [0x00,0x81,0x2c,0xf0,0x02,0xfc,0x03,0x00]
+// NOSICI: error: a16 modifier is not supported on this GPU
+// NOVI:   error: a16 modifier is not supported on this GPU
+
 //===----------------------------------------------------------------------===//
 // Image Sample
 //===----------------------------------------------------------------------===//
diff --git a/test/MC/Disassembler/AArch64/udf.txt b/test/MC/Disassembler/AArch64/udf.txt
new file mode 100644
index 0000000000000000000000000000000000000000..445803ebe710423b6ead56270f77372a67d64b4d
--- /dev/null
+++ b/test/MC/Disassembler/AArch64/udf.txt
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -arch aarch64 -disassemble -o - %s | FileCheck %s
+# RUN: llvm-mc -arch aarch64 -disassemble -o - %s | \
+# RUN: llvm-mc -assemble -filetype=obj -arch aarch64 -o - | \
+# RUN: llvm-objdump -r -d --triple=arm64- - | \
+# RUN: FileCheck %s -check-prefix=OBJ
+[0x00,0x00,0x00,0x00]
+[0x01,0x02,0x00,0x00]
+[0xff,0xff,0x00,0x00]
+
+# CHECK: udf #0
+# CHECK-NEXT: udf #513
+# CHECK-NEXT: udf #65535
+
+#OBJ:             0:	00 00 00 00 	udf	#0
+#OBJ-NEXT:        4:	01 02 00 00 	udf	#513
+#OBJ-NEXT:        8:	ff ff 00 00 	udf	#65535
diff --git a/test/MC/Disassembler/MSP430/lit.local.cfg b/test/MC/Disassembler/MSP430/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..b1cf1fbd21d7ee7ae521ccfd5f431cd2a7d200d6
--- /dev/null
+++ b/test/MC/Disassembler/MSP430/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'MSP430' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/MC/Disassembler/MSP430/msp430.txt b/test/MC/Disassembler/MSP430/msp430.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7d6ff576da8bea860f92640c67128b1f5eb7f7a
--- /dev/null
+++ b/test/MC/Disassembler/MSP430/msp430.txt
@@ -0,0 +1,27 @@
+# RUN: llvm-mc -disassemble %s -triple=msp430 | FileCheck %s
+0x0f 0x47                     # CHECK: mov r7, r15
+0x2f 0x48                     # CHECK: mov @r8, r15
+0x3f 0x48                     # CHECK: mov @r8+, r15
+0x0f 0x43                     # CHECK: clr r15
+0x08 0x57                     # CHECK: add r7, r8
+0x28 0x57                     # CHECK: add @r7, r8
+0x38 0x57                     # CHECK: add @r7+, r8
+0x87 0x12                     # CHECK: call r7
+0x00 0x47                     # CHECK: br r7
+0x39 0xb2                     # CHECK: bit #8, r9
+
+0xfe 0x3f                     # CHECK: jmp $-2
+0xfe 0x23                     # CHECK: jne $-2
+
+0x3f 0x40 0x2a 0x00           # CHECK: mov #42, r15
+0x1f 0x48 0x2a 0x00           # CHECK: mov 42(r8), r15
+0x1f 0x42 0x2a 0x00           # CHECK: mov &42, r15
+0x1f 0x40 0x2a 0x00           # CHECK: mov 42, r15
+0xb0 0x12 0x81 0x01           # CHECK: call #385
+0x97 0x12 0x06 0x00           # CHECK: call 6(r7)
+0xa7 0xb2 0x02 0x00           # CHECK: bit #34, 2(r7)
+0xa9 0x57 0x08 0x00           # CHECK: add @r7, 8(r9)
+0xb7 0xe7 0xfe 0xff           # CHECK: xor @r7+, -2(r7)
+
+0xbf 0x40 0x2a 0x00 0x0c 0x00 # CHECK: mov #42, 12(r15)
+0x9a 0xb9 0x10 0x00 0x08 0x00 # CHECK: bit 16(r9), 8(r10)
diff --git a/test/MC/Disassembler/RISCV/unknown-fence-field.txt b/test/MC/Disassembler/RISCV/unknown-fence-field.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5b20994dcb697df2e950c498097d8827786a2732
--- /dev/null
+++ b/test/MC/Disassembler/RISCV/unknown-fence-field.txt
@@ -0,0 +1,9 @@
+# RUN: llvm-mc -disassemble -triple=riscv32 < %s 2>&1 | FileCheck %s
+# RUN: llvm-mc -disassemble -triple=riscv64 < %s 2>&1 | FileCheck %s
+#
+# Test generated by a LLVM MC Disassembler Protocol Buffer Fuzzer
+# for the RISC-V assembly language.
+
+# This decodes as fence , iorw with invalid fence field as 0.
+[0x0f 0x00 0xf0 0x00]
+# CHECK: fence unknown, iorw
diff --git a/test/MC/Hexagon/elf-flags.s b/test/MC/Hexagon/elf-flags.s
index 0d2f007cb3da75d38f1f3e8583d38335d8cfbe7f..e5c4a8d93fe51cd6f2e2610243e93fb018fe1acd 100644
--- a/test/MC/Hexagon/elf-flags.s
+++ b/test/MC/Hexagon/elf-flags.s
@@ -1,10 +1,8 @@
-# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv4 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V4 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv5 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V5 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv55 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V55 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv60 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V60 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V62 %s
 
-# CHECK-V4: Flags: 0x3
 # CHECK-V5: Flags: 0x4
 # CHECK-V55: Flags: 0x5
 # CHECK-V60: Flags: 0x60
diff --git a/test/MC/MSP430/addrmode.s b/test/MC/MSP430/addrmode.s
new file mode 100644
index 0000000000000000000000000000000000000000..46051c00fed7a1367d7e6585c50059611778ddb4
--- /dev/null
+++ b/test/MC/MSP430/addrmode.s
@@ -0,0 +1,110 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+
+foo:
+  mov r8, r15
+  mov disp+2(r8), r15
+  mov disp+2, r15
+  mov &disp+2, r15
+  mov @r8, r15
+  mov @r8+, r15
+  mov #disp+2, r15
+
+; CHECK: mov r8, r15           ; encoding: [0x0f,0x48]
+; CHECK: mov disp+2(r8), r15   ; encoding: [0x1f,0x48,A,A]
+; CHECK: mov disp+2, r15       ; encoding: [0x1f,0x40,A,A]
+; CHECK: mov &disp+2, r15      ; encoding: [0x1f,0x42,A,A]
+; CHECK: mov @r8, r15          ; encoding: [0x2f,0x48]
+; CHECK: mov @r8+, r15         ; encoding: [0x3f,0x48]
+; CHECK: mov #disp+2, r15      ; encoding: [0x3f,0x40,A,A]
+
+  mov #42, r15
+  mov #42, 12(r15)
+  mov #42, &disp
+  mov disp, disp+2
+
+; CHECK: mov #42, r15          ; encoding: [0x3f,0x40,0x2a,0x00]
+; CHECK: mov #42, 12(r15)      ; encoding: [0xbf,0x40,0x2a,0x00,0x0c,0x00]
+; CHECK: mov #42, &disp        ; encoding: [0xb2,0x40,0x2a,0x00,A,A]
+; CHECK: mov disp, disp+2      ; encoding: [0x90,0x40,A,A,B,B]
+
+  add r7, r8
+  add 6(r7), r8
+  add &disp, r8
+  add disp, r8
+  add @r9, r8
+  add @r9+, r8
+  add #42, r8
+
+; CHECK: add r7, r8            ; encoding: [0x08,0x57]
+; CHECK: add 6(r7), r8         ; encoding: [0x18,0x57,0x06,0x00]
+; CHECK: add &disp, r8         ; encoding: [0x18,0x52,A,A]
+; CHECK: add disp, r8          ; encoding: [0x18,0x50,A,A]
+; CHECK: add @r9, r8           ; encoding: [0x28,0x59]
+; CHECK: add @r9+, r8          ; encoding: [0x38,0x59]
+; CHECK: add #42, r8           ; encoding: [0x38,0x50,0x2a,0x00]
+
+  add r7, 6(r5)
+  add 6(r7), 6(r5)
+  add &disp, 6(r5)
+  add disp, 6(r5)
+  add @r9, 6(r5)
+  add @r9+, 6(r5)
+  add #42, 6(r5)
+
+; CHECK: add r7, 6(r5)         ; encoding: [0x85,0x57,0x06,0x00]
+; CHECK: add 6(r7), 6(r5)      ; encoding: [0x95,0x57,0x06,0x00,0x06,0x00]
+; CHECK: add &disp, 6(r5)      ; encoding: [0x95,0x52,A,A,0x06,0x00]
+; CHECK: add disp, 6(r5)       ; encoding: [0x95,0x50,A,A,0x06,0x00]
+; CHECK: add @r9, 6(r5)        ; encoding: [0xa5,0x59,0x06,0x00]
+; CHECK: add @r9+, 6(r5)       ; encoding: [0xb5,0x59,0x06,0x00]
+; CHECK: add #42, 6(r5)        ; encoding: [0xb5,0x50,0x2a,0x00,0x06,0x00]
+
+  add r7, &disp
+  add 6(r7), &disp
+  add &disp, &disp
+  add disp, &disp
+  add @r9, &disp
+  add @r9+, &disp
+  add #42, &disp
+
+; CHECK: add r7, &disp         ; encoding: [0x82,0x57,A,A]
+; CHECK: add 6(r7), &disp      ; encoding: [0x92,0x57,0x06,0x00,A,A]
+; CHECK: add &disp, &disp      ; encoding: [0x92,0x52,A,A,B,B]
+; CHECK: add disp, &disp       ; encoding: [0x92,0x50,A,A,B,B]
+; CHECK: add @r9, &disp        ; encoding: [0xa2,0x59,A,A]
+; CHECK: add @r9+, &disp       ; encoding: [0xb2,0x59,A,A]
+; CHECK: add #42, &disp        ; encoding: [0xb2,0x50,0x2a,0x00,A,A]
+
+  add r7, disp
+  add 6(r7), disp
+  add &disp, disp
+  add disp, disp
+  add @r9, disp
+  add @r9+, disp
+  add #42, disp
+
+; CHECK: add r7, disp          ; encoding: [0x80,0x57,A,A]
+; CHECK: add 6(r7), disp       ; encoding: [0x90,0x57,0x06,0x00,A,A]
+; CHECK: add &disp, disp       ; encoding: [0x90,0x52,A,A,B,B]
+; CHECK: add disp, disp        ; encoding: [0x90,0x50,A,A,B,B]
+; CHECK: add @r9, disp         ; encoding: [0xa0,0x59,A,A]
+; CHECK: add @r9+, disp        ; encoding: [0xb0,0x59,A,A]
+; CHECK: add #42, disp         ; encoding: [0xb0,0x50,0x2a,0x00,A,A]
+
+  call r7
+  call 6(r7)
+  call disp+6(r7)
+  call &disp
+  call disp
+  call #disp
+
+; CHECK: call r7               ; encoding: [0x87,0x12]
+; CHECK: call 6(r7)            ; encoding: [0x97,0x12,0x06,0x00]
+; CHECK: call disp+6(r7)       ; encoding: [0x97,0x12,A,A]
+; CHECK: call &disp            ; encoding: [0x92,0x12,A,A]
+; CHECK: call disp             ; encoding: [0x90,0x12,A,A]
+; CHECK: call #disp            ; encoding: [0xb0,0x12,A,A]
+
+disp:
+  .word 0xcafe
+  .word 0xbabe
diff --git a/test/MC/MSP430/altreg.s b/test/MC/MSP430/altreg.s
new file mode 100644
index 0000000000000000000000000000000000000000..fe1e3a43772353871c41667402ddb09f01a98539
--- /dev/null
+++ b/test/MC/MSP430/altreg.s
@@ -0,0 +1,7 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+  mov pc, r0 ; CHECK: mov r0, r0
+  mov sp, r1 ; CHECK: mov r1, r1
+  mov sr, r2 ; CHECK: mov r2, r2
+  mov cg, r3 ; CHECK: mov r3, r3
+  mov fp, r4 ; CHECK: mov r4, r4
+  bic #8, SR ; CHECK: dint
diff --git a/test/MC/MSP430/const.s b/test/MC/MSP430/const.s
new file mode 100644
index 0000000000000000000000000000000000000000..f5cca109a50bebfa25db7fcea61c8165a658d606
--- /dev/null
+++ b/test/MC/MSP430/const.s
@@ -0,0 +1,10 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+  mov #4, r15 ; CHECK: mov #4, r15 ; encoding: [0x2f,0x42]
+  mov #8, r15 ; CHECK: mov #8, r15 ; encoding: [0x3f,0x42]
+  mov #0, r15 ; CHECK: clr r15     ; encoding: [0x0f,0x43]
+  mov #1, r15 ; CHECK: mov #1, r15 ; encoding: [0x1f,0x43]
+  mov #2, r15 ; CHECK: mov #2, r15 ; encoding: [0x2f,0x43]
+  mov #-1, r7 ; CHECK: mov #-1, r7 ; encoding: [0x37,0x43]
+
+  push #8     ; CHECK: push #8     ; encoding: [0x32,0x12]
+  push #42    ; CHECK: push #42    ; encoding: [0x30,0x12,0x2a,0x00]
diff --git a/test/MC/MSP430/invalid.s b/test/MC/MSP430/invalid.s
new file mode 100644
index 0000000000000000000000000000000000000000..2815b520dd5556f0921d2d3e2f3532d83bb96c51
--- /dev/null
+++ b/test/MC/MSP430/invalid.s
@@ -0,0 +1,19 @@
+; RUN: not llvm-mc -triple msp430 < %s 2>&1 | FileCheck %s
+foo:
+  ;; invalid operand count
+  mov    r7        ; CHECK: :[[@LINE]]:3: error: too few operands for instruction
+
+  ;; invalid destination addressing modes
+  mov    r7, @r15  ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+  mov    r7, @r15+ ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+  mov    r7, #0    ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+  mov    r7, #123  ; CHECK: :[[@LINE]]:14: error: invalid operand for instruction
+
+  ;; invalid byte instructions
+  swpb.b r7        ; CHECK: :[[@LINE]]:3: error: invalid instruction mnemonic
+  sxt.b  r7        ; CHECK: :[[@LINE]]:3: error: invalid instruction mnemonic
+  call.b r7        ; CHECK: :[[@LINE]]:3: error: invalid instruction mnemonic
+
+  ;; invalid conditional jump offsets
+  jmp    -513      ; CHECK: :[[@LINE]]:10: error: invalid jump offset
+  jmp    512       ; CHECK: :[[@LINE]]:10: error: invalid jump offset
diff --git a/test/MC/MSP430/lit.local.cfg b/test/MC/MSP430/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..b1cf1fbd21d7ee7ae521ccfd5f431cd2a7d200d6
--- /dev/null
+++ b/test/MC/MSP430/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'MSP430' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/MC/MSP430/opcode.s b/test/MC/MSP430/opcode.s
new file mode 100644
index 0000000000000000000000000000000000000000..14655fe091fa6a7929b1d045f2ea7f3c5414d299
--- /dev/null
+++ b/test/MC/MSP430/opcode.s
@@ -0,0 +1,163 @@
+; RUN: llvm-mc -triple msp430 -show-encoding %s \
+; RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+; RUN: llvm-mc -triple msp430 -filetype=obj %s \
+; RUN:     | llvm-objdump -d - | FileCheck -check-prefix=CHECK-INST %s
+
+  ;; IForm8 instructions
+  mov.b  r7, r8 ; CHECK-INST: mov.b  r7, r8
+                ; CHECK: encoding: [0x48,0x47]
+  add.b  r7, r8 ; CHECK-INST: add.b  r7, r8
+                ; CHECK: encoding: [0x48,0x57]
+  addc.b r7, r8 ; CHECK-INST: addc.b r7, r8
+                ; CHECK: encoding: [0x48,0x67]
+  subc.b r7, r8 ; CHECK-INST: subc.b r7, r8
+                ; CHECK: encoding: [0x48,0x77]
+  sub.b  r7, r8 ; CHECK-INST: sub.b  r7, r8
+                ; CHECK: encoding: [0x48,0x87]
+  cmp.b  r7, r8 ; CHECK-INST: cmp.b  r7, r8
+                ; CHECK: encoding: [0x48,0x97]
+  dadd.b r7, r8 ; CHECK-INST: dadd.b r7, r8
+                ; CHECK: encoding: [0x48,0xa7]
+  bit.b  r7, r8 ; CHECK-INST: bit.b  r7, r8
+                ; CHECK: encoding: [0x48,0xb7]
+  bic.b  r7, r8 ; CHECK-INST: bic.b  r7, r8
+                ; CHECK: encoding: [0x48,0xc7]
+  bis.b  r7, r8 ; CHECK-INST: bis.b  r7, r8
+                ; CHECK: encoding: [0x48,0xd7]
+  xor.b  r7, r8 ; CHECK-INST: xor.b  r7, r8
+                ; CHECK: encoding: [0x48,0xe7]
+  and.b  r7, r8 ; CHECK-INST: and.b  r7, r8
+                ; CHECK: encoding: [0x48,0xf7]
+
+  ;; IForm16 instructions
+  mov    r7, r8 ; CHECK-INST: mov    r7, r8
+                ; CHECK: encoding: [0x08,0x47]
+  add    r7, r8 ; CHECK-INST: add    r7, r8
+                ; CHECK: encoding: [0x08,0x57]
+  addc   r7, r8 ; CHECK-INST: addc   r7, r8
+                ; CHECK: encoding: [0x08,0x67]
+  subc   r7, r8 ; CHECK-INST: subc   r7, r8
+                ; CHECK: encoding: [0x08,0x77]
+  sub    r7, r8 ; CHECK-INST: sub    r7, r8
+                ; CHECK: encoding: [0x08,0x87]
+  cmp    r7, r8 ; CHECK-INST: cmp    r7, r8
+                ; CHECK: encoding: [0x08,0x97]
+  dadd   r7, r8 ; CHECK-INST: dadd   r7, r8
+                ; CHECK: encoding: [0x08,0xa7]
+  bit    r7, r8 ; CHECK-INST: bit    r7, r8
+                ; CHECK: encoding: [0x08,0xb7]
+  bic    r7, r8 ; CHECK-INST: bic    r7, r8
+                ; CHECK: encoding: [0x08,0xc7]
+  bis    r7, r8 ; CHECK-INST: bis    r7, r8
+                ; CHECK: encoding: [0x08,0xd7]
+  xor    r7, r8 ; CHECK-INST: xor    r7, r8
+                ; CHECK: encoding: [0x08,0xe7]
+  and    r7, r8 ; CHECK-INST: and    r7, r8
+                ; CHECK: encoding: [0x08,0xf7]
+
+  ;; IIForm8 instructions
+  rrc.b  r7     ; CHECK-INST: rrc.b  r7    
+                ; CHECK: encoding: [0x47,0x10]
+  rra.b  r7     ; CHECK-INST: rra.b  r7    
+                ; CHECK: encoding: [0x47,0x11]
+  push.b r7     ; CHECK-INST: push.b r7    
+                ; CHECK: encoding: [0x47,0x12]
+
+  ;; IIForm16 instructions
+  rrc    r7     ; CHECK-INST: rrc    r7    
+                ; CHECK: encoding: [0x07,0x10]
+  swpb   r7     ; CHECK-INST: swpb   r7    
+                ; CHECK: encoding: [0x87,0x10]
+  rra    r7     ; CHECK-INST: rra    r7    
+                ; CHECK: encoding: [0x07,0x11]
+  sxt    r7     ; CHECK-INST: sxt    r7    
+                ; CHECK: encoding: [0x87,0x11]
+  push   r7     ; CHECK-INST: push   r7    
+                ; CHECK: encoding: [0x07,0x12]
+  call   r7     ; CHECK-INST: call   r7    
+                ; CHECK: encoding: [0x87,0x12]
+  reti          ; CHECK-INST: reti         
+                ; CHECK: encoding: [0x00,0x13]
+
+  ;; CJForm instructions
+  jnz    -2     ; CHECK-INST: jne    $-2
+                ; CHECK: encoding: [0xfe,0x23]
+  jne    -2     ; CHECK-INST: jne    $-2
+                ; CHECK: encoding: [0xfe,0x23]
+  jeq    -2     ; CHECK-INST: jeq    $-2
+                ; CHECK: encoding: [0xfe,0x27]
+  jz     -2     ; CHECK-INST: jeq    $-2
+                ; CHECK: encoding: [0xfe,0x27]
+  jnc    -2     ; CHECK-INST: jlo    $-2
+                ; CHECK: encoding: [0xfe,0x2b]
+  jlo    -2     ; CHECK-INST: jlo    $-2
+                ; CHECK: encoding: [0xfe,0x2b]
+  jc     -2     ; CHECK-INST: jhs    $-2
+                ; CHECK: encoding: [0xfe,0x2f]
+  jhs    -2     ; CHECK-INST: jhs    $-2
+                ; CHECK: encoding: [0xfe,0x2f]
+  jn     -2     ; CHECK-INST: jn     $-2
+                ; CHECK: encoding: [0xfe,0x33]
+  jge    -2     ; CHECK-INST: jge    $-2
+                ; CHECK: encoding: [0xfe,0x37]
+  jl     -2     ; CHECK-INST: jl     $-2
+                ; CHECK: encoding: [0xfe,0x3b]
+  jmp    $-2    ; CHECK-INST: jmp    $-2
+                ; CHECK: encoding: [0xfe,0x3f]
+
+  ;; Emulated arithmetic instructions
+  adc    r7     ; CHECK-INST: adc    r7
+                ; CHECK: encoding: [0x07,0x63]
+  dadc   r7     ; CHECK-INST: dadc   r7
+                ; CHECK: encoding: [0x07,0xa3]
+  dec    r7     ; CHECK-INST: dec    r7
+                ; CHECK: encoding: [0x17,0x83]
+  decd   r7     ; CHECK-INST: decd   r7
+                ; CHECK: encoding: [0x27,0x83]
+  inc    r7     ; CHECK-INST: inc    r7
+                ; CHECK: encoding: [0x17,0x53]
+  incd   r7     ; CHECK-INST: incd   r7
+                ; CHECK: encoding: [0x27,0x53]
+  sbc    r7     ; CHECK-INST: sbc    r7
+                ; CHECK: encoding: [0x07,0x73]
+
+  ;; Emulated logical instructions
+  inv    r7     ; CHECK-INST: inv    r7
+                ; CHECK: encoding: [0x37,0xe3]
+  rla    r7     ; CHECK-INST: add    r7, r7
+                ; CHECK: encoding: [0x07,0x57]
+  rlc    r7     ; CHECK-INST: addc   r7, r7
+                ; CHECK: encoding: [0x07,0x67]
+
+  ;; Emulated program flow control instructions
+  br     r7     ; CHECK-INST: br     r7    
+                ; CHECK: encoding: [0x00,0x47]
+  dint          ; CHECK-INST: dint
+                ; CHECK: encoding: [0x32,0xc2]
+  eint          ; CHECK-INST: eint
+                ; CHECK: encoding: [0x32,0xd2]
+  nop           ; CHECK-INST: nop
+                ; CHECK: encoding: [0x03,0x43]
+  ret           ; CHECK-INST: ret          
+                ; CHECK: encoding: [0x30,0x41]
+
+  ;; Emulated data instruction
+  clr    r7     ; CHECK-INST: clr    r7
+                ; CHECK: encoding: [0x07,0x43]
+  clrc          ; CHECK-INST: clrc
+                ; CHECK: encoding: [0x12,0xc3]
+  clrn          ; CHECK-INST: clrn
+                ; CHECK: encoding: [0x22,0xc2]
+  clrz          ; CHECK-INST: clrz
+                ; CHECK: encoding: [0x22,0xc3]
+  pop    r7     ; CHECK-INST: pop    r7
+                ; CHECK: encoding: [0x37,0x41]
+  setc          ; CHECK-INST: setc
+                ; CHECK: encoding: [0x12,0xd3]
+  setn          ; CHECK-INST: setn
+                ; CHECK: encoding: [0x22,0xd2]
+  setz          ; CHECK-INST: setz
+                ; CHECK: encoding: [0x22,0xd3]
+  tst    r7     ; CHECK-INST: tst    r7
+                ; CHECK: encoding: [0x07,0x93]
diff --git a/test/MC/MSP430/reloc.s b/test/MC/MSP430/reloc.s
new file mode 100644
index 0000000000000000000000000000000000000000..42dd64a43c555d850874d39badcb2d885331818a
--- /dev/null
+++ b/test/MC/MSP430/reloc.s
@@ -0,0 +1,22 @@
+; RUN: llvm-mc -triple msp430 -show-encoding < %s | FileCheck %s
+
+         mov    disp+2(r8), r15
+; CHECK: mov    disp+2(r8), r15 ; encoding: [0x1f,0x48,A,A]
+; CHECK:                        ;   fixup A - offset: 2, value: disp+2, kind: fixup_16_byte
+
+         mov    disp+2, r15
+; CHECK: mov    disp+2, r15     ; encoding: [0x1f,0x40,A,A]
+; CHECK:                        ;   fixup A - offset: 2, value: disp+2, kind: fixup_16_pcrel_byte
+
+         mov    &disp+2, r15
+; CHECK: mov    &disp+2, r15    ; encoding: [0x1f,0x42,A,A]
+; CHECK:                        ;   fixup A - offset: 2, value: disp+2, kind: fixup_16
+
+         mov    disp, disp+2
+; CHECK: mov    disp, disp+2    ; encoding: [0x90,0x40,A,A,B,B]
+; CHECK:                        ;   fixup A - offset: 2, value: disp, kind: fixup_16_pcrel_byte
+; CHECK:                        ;   fixup B - offset: 4, value: disp+2, kind: fixup_16_pcrel_byte
+
+         jmp    foo
+; CHECK: jmp    foo             ; encoding: [A,0b001111AA]
+; CHECK:                        ;   fixup A - offset: 0, value: foo, kind: fixup_10_pcrel
diff --git a/test/MC/Mips/micromips32r6/valid.s b/test/MC/Mips/micromips32r6/valid.s
index 6b2aec9d5aaa78ecff0b537aaf062ef97ded7072..b6af2b951c77c781de548e5885d9383224dd61a3 100644
--- a/test/MC/Mips/micromips32r6/valid.s
+++ b/test/MC/Mips/micromips32r6/valid.s
@@ -169,6 +169,10 @@
   rdpgpr $3, $9            # CHECK: $3, $9              # encoding: [0x00,0x69,0xe1,0x7c]
   sdbbp                    # CHECK: sdbbp               # encoding: [0x00,0x00,0xdb,0x7c]
   sdbbp 34                 # CHECK: sdbbp 34            # encoding: [0x00,0x22,0xdb,0x7c]
+  sigrie                   # CHECK: sigrie              # encoding: [0x00,0x00,0x00,0x3f]
+                           # CHECK-NEXT:                # <MCInst #{{[0-9]+}} SIGRIE_MM
+  sigrie    257            # CHECK: sigrie 257          # encoding: [0x00,0x00,0x40,0x7f]
+                           # CHECK-NEXT:                # <MCInst #{{[0-9]+}} SIGRIE_MM
   xor $3, $4, $5           # CHECK: xor $3, $4, $5      # encoding: [0x00,0xa4,0x1b,0x10]
   xori $3, $4, 1234        # CHECK: xori $3, $4, 1234   # encoding: [0x70,0x64,0x04,0xd2]
   sw $5, 4($6)             # CHECK: sw $5, 4($6)        # encoding: [0xf8,0xa6,0x00,0x04]
diff --git a/test/MC/Mips/mips32r6/valid.s b/test/MC/Mips/mips32r6/valid.s
index e60b5fad3712f0e9b96f92ab1e772d618010a5f3..6c023d38573b4d38d56d135dad165f242d3859be 100644
--- a/test/MC/Mips/mips32r6/valid.s
+++ b/test/MC/Mips/mips32r6/valid.s
@@ -281,6 +281,10 @@ a:
         sdbbp     34             # CHECK: sdbbp 34               # encoding: [0x00,0x00,0x08,0x8e]
                                  # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SDBBP
                                  # CHECK-NOT:                    # <MCInst #{{[0-9]+}} SDBBP_MM
+        sigrie                   # CHECK: sigrie                 # encoding: [0x04,0x17,0x00,0x00]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
+        sigrie    257            # CHECK: sigrie 257             # encoding: [0x04,0x17,0x01,0x01]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
         sync                     # CHECK: sync                   # encoding: [0x00,0x00,0x00,0x0f]
                                  # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SYNC
         sync    1                # CHECK: sync 1                 # encoding: [0x00,0x00,0x00,0x4f]
diff --git a/test/MC/Mips/mips64r6/valid.s b/test/MC/Mips/mips64r6/valid.s
index c79077ba40009d66d73817e324607356bbdc5b28..c810a40e25289d4f8da1f85f34148aa2131e198b 100644
--- a/test/MC/Mips/mips64r6/valid.s
+++ b/test/MC/Mips/mips64r6/valid.s
@@ -242,6 +242,10 @@ a:
         sdbbp     34             # CHECK: sdbbp 34               # encoding: [0x00,0x00,0x08,0x8e]
                                  # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SDBBP
                                  # CHECK-NOT:                    # <MCInst #{{[0-9]+}} SDBBP_MM
+        sigrie                   # CHECK: sigrie                 # encoding: [0x04,0x17,0x00,0x00]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
+        sigrie    257            # CHECK: sigrie 257             # encoding: [0x04,0x17,0x01,0x01]
+                                 # CHECK-NEXT:                   # <MCInst #{{[0-9]+}} SIGRIE
         sdc2    $20,629($s2)     # CHECK: sdc2 $20, 629($18)     # encoding: [0x49,0xf4,0x92,0x75]
         sel.d   $f0,$f1,$f2      # CHECK: sel.d $f0, $f1, $f2 # encoding: [0x46,0x22,0x08,0x10]
         sel.s   $f0,$f1,$f2      # CHECK: sel.s $f0, $f1, $f2 # encoding: [0x46,0x02,0x08,0x10]
diff --git a/test/MC/RISCV/rv32i-invalid.s b/test/MC/RISCV/rv32i-invalid.s
index 92b9b4ad34f39f05e1f7e53a72bcb0c2bf925199..f856bf1f93442369e0e85cc7425e43f9c2e7f7a9 100644
--- a/test/MC/RISCV/rv32i-invalid.s
+++ b/test/MC/RISCV/rv32i-invalid.s
@@ -6,6 +6,7 @@ fence iorw, iore # CHECK: :[[@LINE]]:13: error: operand must be formed of letter
 fence wr, wr # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw'
 fence rw, rr # CHECK: :[[@LINE]]:11: error: operand must be formed of letters selected in-order from 'iorw'
 fence 1, rw # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw'
+fence unknown, unknown # CHECK: :[[@LINE]]:7: error: operand must be formed of letters selected in-order from 'iorw'
 
 ## uimm5
 slli a0, a0, 32 # CHECK: :[[@LINE]]:14: error: immediate must be an integer in the range [0, 31]
diff --git a/test/MC/SystemZ/asm-match.s b/test/MC/SystemZ/asm-match.s
new file mode 100644
index 0000000000000000000000000000000000000000..843d3ae6cac07c6ac7f5a8e8848a2073a3ee278a
--- /dev/null
+++ b/test/MC/SystemZ/asm-match.s
@@ -0,0 +1,81 @@
+// REQUIRES: asserts
+// RUN: llvm-mc -triple s390x-linux-gnu -debug-only=asm-matcher %s 2>&1 | FileCheck %s
+//
+// Check that debug output prints the operands correctly.
+
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'sllg'
+// CHECK: Trying to match opcode SLLG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r3): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 2 (Reg:r0): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr32Disp20 against actual operand at index 3 (Mem:3): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'llill'
+// CHECK: Trying to match opcode LLILL
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r0): match success using generic matcher
+// CHECK: Matching formal operand class MCK_U16Imm against actual operand at index 2 (Imm:0): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'lgr'
+// CHECK: Trying to match opcode LGR
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r1): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 2 (Reg:r0): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'lg'
+// CHECK: Trying to match opcode LG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r1): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDXAddr64Disp20 against actual operand at index 2 (Mem:16(r2)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'lg'
+// CHECK: Trying to match opcode LG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r1): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDXAddr64Disp20 against actual operand at index 2 (Mem:16(r2,r3)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'stmg'
+// CHECK: Trying to match opcode STMG
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r13): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 2 (Reg:r15): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr64Disp20 against actual operand at index 3 (Mem:104(r15)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'mvc'
+// CHECK: Trying to match opcode MVC
+// CHECK: Matching formal operand class MCK_BDLAddr64Disp12Len8 against actual operand at index 1 (Mem:184(8,r15)): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr64Disp12 against actual operand at index 2 (Mem:8(r2)): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'mvck'
+// CHECK: Trying to match opcode MVCK
+// CHECK: Matching formal operand class MCK_BDRAddr64Disp12 against actual operand at index 1 (Mem:0(r0,r1)): match success using generic matcher
+// CHECK: Matching formal operand class MCK_BDAddr64Disp12 against actual operand at index 2 (Mem:4095(r15)): match success using generic matcher
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 3 (Reg:r2): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'j'
+// CHECK: Trying to match opcode J
+// CHECK: Matching formal operand class MCK_PCRel16 against actual operand at index 1 (Imm:.Ltmp0+2): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 2: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: AsmMatcher: found 1 encodings with mnemonic 'brasl'
+// CHECK: Trying to match opcode BRASL
+// CHECK: Matching formal operand class MCK_GR64 against actual operand at index 1 (Reg:r14): match success using generic matcher
+// CHECK: Matching formal operand class MCK_PCRelTLS32 against actual operand at index 2 (ImmTLS:fun): match success using generic matcher
+// CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode
+// CHECK: .text
+// CHECK: sllg	%r3, %r0, 3
+// CHECK: llill	%r0, 0
+// CHECK: lgr	%r1, %r0
+// CHECK: lg	%r1, 16(%r2)
+// CHECK: lg	%r1, 16(%r2,%r3)
+// CHECK: stmg	%r13, %r15, 104(%r15)
+// CHECK: mvc	184(8,%r15), 8(%r2)
+// CHECK: mvck	0(%r0,%r1), 4095(%r15), %r2
+// CHECK: .Ltmp0:
+// CHECK: j	.Ltmp0+2
+// CHECK: brasl	%r14, fun
+	
+        sllg    %r3, %r0, 3
+        llill	%r0, 0
+        lgr	%r1, %r0
+        lg      %r1, 16(%r2)
+        lg      %r1, 16(%r2,%r3)
+        stmg    %r13, %r15, 104(%r15)
+        mvc     184(8,%r15), 8(%r2)
+        mvck    0(%r0,%r1), 4095(%r15), %r2
+.Ltmp0:
+        j	.Ltmp0+2
+        brasl   %r14, fun
diff --git a/test/MC/WebAssembly/basic-assembly.s b/test/MC/WebAssembly/basic-assembly.s
index cc60143639fe910420847fb9059f896ac92130fa..c2b316c9243829ec587319af507e47e5b1e2d1e0 100644
--- a/test/MC/WebAssembly/basic-assembly.s
+++ b/test/MC/WebAssembly/basic-assembly.s
@@ -1,4 +1,6 @@
-# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint < %s | FileCheck %s
+# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s
+# this one is just here to see if it converts to .o without errors, but doesn't check any output:
+# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s
 
     .text
     .type    test0,@function
@@ -46,10 +48,19 @@ test0:
     # TODO: enable once instruction has been added.
     #i32x4.trunc_s/f32x4:sat
     i32.trunc_s/f32
+    try
+.LBB0_3:
+    i32.catch   0
+.LBB0_4:
+    catch_all
+.LBB0_5:
+    end_try
     #i32.trunc_s:sat/f32
-    get_global	__stack_pointer@GLOBAL
+    get_global  __stack_pointer@GLOBAL
     end_function
-
+.Lfunc_end0:
+	.size	test0, .Lfunc_end0-test0
+    .globaltype	__stack_pointer, i32
 
 # CHECK:           .text
 # CHECK-LABEL: test0:
@@ -88,5 +99,14 @@ test0:
 # CHECK-NEXT:      get_local   5
 # CHECK-NEXT:      f32x4.add
 # CHECK-NEXT:      i32.trunc_s/f32
-# CHECK-NEXT:      get_global	__stack_pointer@GLOBAL
+# CHECK-NEXT:      try
+# CHECK-NEXT:  .LBB0_3:
+# CHECK-NEXT:      i32.catch   0
+# CHECK-NEXT:  .LBB0_4:
+# CHECK-NEXT:      catch_all
+# CHECK-NEXT:  .LBB0_5:
+# CHECK-NEXT:      end_try
+# CHECK-NEXT:      get_global  __stack_pointer@GLOBAL
 # CHECK-NEXT:      end_function
+
+# CHECK:           .globaltype	__stack_pointer, i32
diff --git a/test/MC/WebAssembly/simd-encodings.s b/test/MC/WebAssembly/simd-encodings.s
index 02d07674c16161c3491a9768da41bbef3cb10fa7..8cd4bc9cd342b1ef2b92f0fe01f7288c4c6afb1b 100644
--- a/test/MC/WebAssembly/simd-encodings.s
+++ b/test/MC/WebAssembly/simd-encodings.s
@@ -382,6 +382,18 @@
     # CHECK: f64x2.abs # encoding: [0xfd,0x80]
     f64x2.abs
 
+    # CHECK: f32x4.min # encoding: [0xfd,0x81]
+    f32x4.min
+
+    # CHECK: f64x2.min # encoding: [0xfd,0x82]
+    f64x2.min
+
+    # CHECK: f32x4.max # encoding: [0xfd,0x83]
+    f32x4.max
+
+    # CHECK: f64x2.max # encoding: [0xfd,0x84]
+    f64x2.max
+
     # CHECK: f32x4.add # encoding: [0xfd,0x85]
     f32x4.add
 
diff --git a/test/MC/X86/cfi-open-within-another-crash.s b/test/MC/X86/cfi-open-within-another-crash.s
new file mode 100644
index 0000000000000000000000000000000000000000..81627f4459c0541558162e7009dcea17748361e0
--- /dev/null
+++ b/test/MC/X86/cfi-open-within-another-crash.s
@@ -0,0 +1,18 @@
+# Test for D51695 ensuring there is no crash when two .cfi_startproc are opened
+# without the first one being closed.
+
+# RUN: not llvm-mc %s -filetype=obj -triple=x86_64-unknown-linux -o /dev/null 2>&1 | FileCheck %s
+
+.text
+.globl proc_one
+proc_one:
+ .cfi_startproc
+ 
+.text
+.globl proc_two
+proc_two:
+ .cfi_startproc
+ 
+ .cfi_endproc
+
+# CHECK: error: starting new .cfi frame before finishing the previous one
diff --git a/test/MC/X86/cfi-scope-errors.s b/test/MC/X86/cfi-scope-errors.s
index a61f817f741e883bdc1d5ce53280ca7dfa4ff918..a7d6a8a157a5eb29cd5a7dd21ccf7401b36c9e95 100644
--- a/test/MC/X86/cfi-scope-errors.s
+++ b/test/MC/X86/cfi-scope-errors.s
@@ -1,6 +1,5 @@
-# RUN: not llvm-mc %s -triple x86_64-linux -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error:
-
-# FIXME: Push source locations into diagnostics.
+# RUN: not llvm-mc %s -triple x86_64-linux -o /dev/null 2>&1 | FileCheck %s
+# RUN: not llvm-mc %s -triple x86_64-linux -filetype=obj -o /dev/null 2>&1 | FileCheck %s
 
 .text
 .cfi_def_cfa rsp, 8
@@ -9,8 +8,16 @@
 .cfi_startproc
 nop
 
+# TODO(kristina): As Reid suggested, this now supports source locations as a side effect
+# of another patch aimed at fixing the crash that would occur here, however the other
+# ones do not unfortunately. Will address it in a further patch propogating SMLoc down to
+# other CFI directives at which point more LINE checks can be added to ensure proper source
+# location reporting.
+
+# This tests source location correctness as well as the error and it not crashing.
+# CHECK: [[@LINE+2]]:1: error: starting new .cfi frame before finishing the previous one
 .cfi_startproc
-# CHECK: error: starting new .cfi frame before finishing the previous one
+
 nop
 .cfi_endproc
 
diff --git a/test/MC/X86/intel-syntax-encoding.s b/test/MC/X86/intel-syntax-encoding.s
index aedd74447d658dea700e8aed18b4e98f8c7d3aa1..cf1b403e967e9b6bab366b0dcfb970c0b5de4be2 100644
--- a/test/MC/X86/intel-syntax-encoding.s
+++ b/test/MC/X86/intel-syntax-encoding.s
@@ -64,6 +64,11 @@
 pushf
 popf
 
+// CHECK: encoding: [0x66,0x9c]
+// CHECK: encoding: [0x66,0x9d]
+pushfw
+popfw
+
 LBB0_3:
 // CHECK: encoding: [0xeb,A]
 	jmp	LBB0_3
diff --git a/test/MC/X86/intel-syntax-hex.s b/test/MC/X86/intel-syntax-hex.s
index b3a19fbaa34501ef74cc3db0dd0add822147f4be..cb73ca9f5017b3f13f1c0cd4038d81fc4217ba35 100644
--- a/test/MC/X86/intel-syntax-hex.s
+++ b/test/MC/X86/intel-syntax-hex.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -masm-integers -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
 // rdar://12470373
 
 // Checks to make sure we parse the hexadecimal suffix properly.
diff --git a/test/MC/X86/pr27884.s b/test/MC/X86/pr27884.s
index edd4e8d34a9b21193a53118bd2b8a6f7dda76a5a..d78c35c8fc023f73cf0f12aa22ac4974e2555b63 100644
--- a/test/MC/X86/pr27884.s
+++ b/test/MC/X86/pr27884.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown %s -masm-integers=1
 
 .intel_syntax
 add rbx, 0B0h
diff --git a/test/Object/AMDGPU/elf-header-flags-mach.yaml b/test/Object/AMDGPU/elf-header-flags-mach.yaml
index c3800d2ff2741cb388fb78d5dcb472a223d20178..7a594843c208aa1fe62de59b8e5d49ea7ce35f9a 100644
--- a/test/Object/AMDGPU/elf-header-flags-mach.yaml
+++ b/test/Object/AMDGPU/elf-header-flags-mach.yaml
@@ -91,6 +91,9 @@
 # RUN: yaml2obj -docnum=31 %s > %t.o.31
 # RUN: llvm-readobj -s -file-headers %t.o.31 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX906 %s
 # RUN: obj2yaml %t.o.31 | FileCheck --check-prefixes=YAML-GFX906 %s
+# RUN: yaml2obj -docnum=32 %s > %t.o.32
+# RUN: llvm-readobj -s -file-headers %t.o.32 | FileCheck --check-prefixes=ELF-ALL,ELF-GFX909 %s
+# RUN: obj2yaml %t.o.32 | FileCheck --check-prefixes=YAML-GFX909 %s
 
 
 # ELF-ALL:     Flags [
@@ -125,6 +128,7 @@
 # ELF-GFX902:    EF_AMDGPU_MACH_AMDGCN_GFX902 (0x2D)
 # ELF-GFX904:    EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E)
 # ELF-GFX906:    EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
+# ELF-GFX909:    EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31)
 # ELF-ALL:     ]
 
 # YAML-R600:    Flags: [ EF_AMDGPU_MACH_R600_R600 ]
@@ -158,6 +162,7 @@
 # YAML-GFX902:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX902 ]
 # YAML-GFX904:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX904 ]
 # YAML-GFX906:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX906 ]
+# YAML-GFX909:  Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX909 ]
 
 # Doc1
 --- !ELF
@@ -499,3 +504,14 @@ FileHeader:
   Machine: EM_AMDGPU
   Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX906 ]
 ...
+
+# Doc32
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX909 ]
+...
diff --git a/test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml b/test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..78b2913be04edc730c4a58d67c78ac2aa525f9a9
--- /dev/null
+++ b/test/Object/AMDGPU/elf-header-flags-sram-ecc.yaml
@@ -0,0 +1,61 @@
+# RUN: yaml2obj -docnum=1 %s > %t.o.1
+# RUN: llvm-readobj -s -file-headers %t.o.1 | FileCheck --check-prefixes=ELF-ALL,ELF-SRAM-ECC-NONE %s
+# RUN: obj2yaml %t.o.1 | FileCheck --check-prefixes=YAML-SRAM-ECC-NONE %s
+# RUN: yaml2obj -docnum=2 %s > %t.o.2
+# RUN: llvm-readobj -s -file-headers %t.o.2 | FileCheck --check-prefixes=ELF-ALL,ELF-SRAM-ECC-GFX900 %s
+# RUN: obj2yaml %t.o.2 | FileCheck --check-prefixes=YAML-SRAM-ECC-GFX900 %s
+# RUN: yaml2obj -docnum=3 %s > %t.o.3
+# RUN: llvm-readobj -s -file-headers %t.o.3 | FileCheck --check-prefixes=ELF-ALL,ELF-SRAM-ECC-XNACK-GFX900 %s
+# RUN: obj2yaml %t.o.3 | FileCheck --check-prefixes=YAML-SRAM-ECC-XNACK-GFX900 %s
+
+# ELF-SRAM-ECC-NONE:      Flags [
+# ELF-SRAM-ECC-NONE-NEXT:   EF_AMDGPU_SRAM_ECC (0x200)
+# ELF-SRAM-ECC-NONE-NEXT: ]
+
+# ELF-SRAM-ECC-GFX900:      Flags [
+# ELF-SRAM-ECC-GFX900-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C)
+# ELF-SRAM-ECC-GFX900-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+# ELF-SRAM-ECC-GFX900-NEXT: ]
+
+# ELF-SRAM-ECC-XNACK-GFX900:      Flags [
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT:   EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C)
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT:   EF_AMDGPU_SRAM_ECC           (0x200)
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT:   EF_AMDGPU_XNACK              (0x100)
+# ELF-SRAM-ECC-XNACK-GFX900-NEXT: ]
+
+# YAML-SRAM-ECC-NONE:         Flags: [ EF_AMDGPU_MACH_NONE, EF_AMDGPU_SRAM_ECC ]
+# YAML-SRAM-ECC-GFX900:       Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_SRAM_ECC ]
+# YAML-SRAM-ECC-XNACK-GFX900: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_XNACK, EF_AMDGPU_SRAM_ECC ]
+
+# Doc1
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_SRAM_ECC ]
+...
+
+# Doc2
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_SRAM_ECC ]
+...
+
+# Doc3
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  OSABI:   ELFOSABI_NONE
+  Type:    ET_REL
+  Machine: EM_AMDGPU
+  Flags:   [ EF_AMDGPU_MACH_AMDGCN_GFX900, EF_AMDGPU_XNACK, EF_AMDGPU_SRAM_ECC ]
+...
diff --git a/test/Object/Inputs/trivial-object-test.wasm b/test/Object/Inputs/trivial-object-test.wasm
index 2aa042d54dc5bf56ae1abfd89144ab15ef5b832e..8652d67f69222ab57610df1b13d3972e031460a1 100644
Binary files a/test/Object/Inputs/trivial-object-test.wasm and b/test/Object/Inputs/trivial-object-test.wasm differ
diff --git a/test/Object/archive-GNU64-write.test b/test/Object/archive-GNU64-write.test
deleted file mode 100644
index 0bfb7c80d05acb3de6956085e596bdce3ebdfd6a..0000000000000000000000000000000000000000
--- a/test/Object/archive-GNU64-write.test
+++ /dev/null
@@ -1,40 +0,0 @@
-# REQUIRES: llvm-64-bits
-# REQUIRES: system-linux
-# REQUIRES: shell
-
-# RUN: yaml2obj %s > %t
-# RUN: dd if=%t of=%t bs=1 count=0 seek=1M
-# RUN: rm -f %t.lib
-# RUN: cp %t %t2
-# RUN: SYM64_THRESHOLD=19 llvm-ar cr %t.lib %t %t2 %p/Inputs/trivial-object-test.elf-x86-64
-# RUN: llvm-nm --print-armap %t.lib | FileCheck %s
-# RUN: grep SYM64 %t.lib
-
-# Delete temp files. They are too large.
-# RUN: rm -f %t %t2 %t.lib
-
-!ELF
-FileHeader:
-  Class:           ELFCLASS64
-  Data:            ELFDATA2LSB
-  Type:            ET_EXEC
-  Machine:         EM_X86_64
-Sections:
-  - Name:            .data
-    Type:            SHT_PROGBITS
-    Flags:           [ SHF_ALLOC ]
-    AddressAlign:    0x0000000000000001
-    Content:         "00"
-    Size:            32
-
-# CHECK:      Archive map
-# CHECK-NEXT: main in trivial-object-test.elf-x86-64
-
-# CHECK:    archive-GNU64-write.test.tmp:
-
-# CHECK:    archive-GNU64-write.test.tmp2:
-
-# CHECK:    trivial-object-test.elf-x86-64:
-# CHECK-NEXT:                     U SomeOtherFunction
-# CHECK-NEXT:    0000000000000000 T main
-# CHECK-NEXT:                     U puts
diff --git a/test/Object/archive-format.test b/test/Object/archive-format.test
index 219fc7f894a7f6c0ad5c336ce45afe1b0d14ffe8..b1ae411161bc1bdc346f9e69c6d7c114d8bcaf39 100644
--- a/test/Object/archive-format.test
+++ b/test/Object/archive-format.test
@@ -38,7 +38,7 @@ BSD-SAME: #1/16           0           0     0     644     20        `
 BSD-NEXT: 0123456789abcdefzed.
 
 RUN: rm -f %t.a
-RUN: llvm-ar --format=darwin rc %t.a 0123456789abcde 0123456789abcdef
+RUN: llvm-ar --format=darwin rcS %t.a 0123456789abcde 0123456789abcdef
 RUN: cat %t.a | FileCheck -strict-whitespace --check-prefix=DARWIN %s
 
 DARWIN:      !<arch>
diff --git a/test/Object/archive-symtab.test b/test/Object/archive-symtab.test
index 297970725bdef9f10a5ec3fc9d254066173ae29b..96f48139ddd315f90f1cd5ebf258d745fc22bc0b 100644
--- a/test/Object/archive-symtab.test
+++ b/test/Object/archive-symtab.test
@@ -2,6 +2,11 @@ RUN: rm -f %t.a
 RUN: llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
 RUN: llvm-nm -M %t.a | FileCheck %s
 
+RUN: rm -f %t.a
+RUN: env SYM64_THRESHOLD=1 llvm-ar rcsU %t.a %p/Inputs/trivial-object-test.elf-x86-64 %p/Inputs/trivial-object-test2.elf-x86-64
+RUN: llvm-nm -M %t.a | FileCheck %s
+RUXX: grep SYM64 %t.a
+
 CHECK: Archive map
 CHECK-NEXT: main in trivial-object-test.elf-x86-64
 CHECK-NEXT: foo in trivial-object-test2.elf-x86-64
@@ -82,6 +87,11 @@ RUN: rm -f %t.a
 RUN: llvm-ar --format=bsd rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64
 RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s
 
+RUN: rm -f %t.a
+RUN: env SYM64_THRESHOLD=1 llvm-ar --format=darwin rcsU %t.a %p/Inputs/trivial-object-test.macho-x86-64 %p/Inputs/trivial-object-test2.macho-x86-64
+RUN: llvm-nm -M %t.a | FileCheck --check-prefix=MACHO %s
+RUN: grep '__\.SYMDEF_64' %t.a
+
 MACHO: Archive map
 MACHO-NEXT: _main in trivial-object-test.macho-x86-64
 MACHO-NEXT: _foo in trivial-object-test2.macho-x86-64
@@ -138,3 +148,21 @@ RUN: llvm-ar --format=gnu rcsD %t.a %p/Inputs/trivial-object-test.macho-x86-64
 RUN: FileCheck --check-prefix=GNU-SYMTAB-ALIGN %s < %t.a
 GNU-SYMTAB-ALIGN: !<arch>
 GNU-SYMTAB-ALIGN-NEXT: /               0           0     0     0       14        `
+
+
+** Test the behavior of an empty archive:
+
+No symbol table emitted for GNU archives
+RUN: rm -f %t.a
+RUN: llvm-ar rcs --format=gnu %t.a
+RUN: not grep -q '/               ' %t.a
+
+No symbol table for BSD archives
+RUN: rm -f %t.a
+RUN: llvm-ar rcs --format=bsd %t.a
+RUN: not grep -q '__\.SYMDEF' %t.a
+
+And we do emit a symbol table for DARWIN archives
+RUN: rm -f %t.a
+RUN: llvm-ar rcs --format=darwin %t.a
+RUN: grep -q '__\.SYMDEF' %t.a
diff --git a/test/Object/nm-trivial-object.test b/test/Object/nm-trivial-object.test
index 621a1779166aa33698592364bf25d8c86bb931fa..ca9c2782611afb23bed15692029f38a765cdc2e5 100644
--- a/test/Object/nm-trivial-object.test
+++ b/test/Object/nm-trivial-object.test
@@ -61,6 +61,7 @@ COFF32-NEXT:          U _puts
 
 
 WASM:      00000000 d .L.str
+WASM-NEXT: 00000003 t .LSomeOtherFunction_bitcast
 WASM-NEXT:          U SomeOtherFunction
 WASM-NEXT: 00000002 T main
 WASM-NEXT:          U puts
diff --git a/test/Object/obj2yaml.test b/test/Object/obj2yaml.test
index d9540d4422f14ed5ff1ec41f7745bdc4492a8b22..46a0d77363647c85f7c15de4b83f9910dbdb1378 100644
--- a/test/Object/obj2yaml.test
+++ b/test/Object/obj2yaml.test
@@ -667,8 +667,10 @@ WASM-NEXT:         Size:            13
 WASM-NEXT:       - Index:           2
 WASM:              Name:            puts
 WASM:            - Index:           3
-WASM:              Name:            SomeOtherFunction
+WASM:              Name:            .LSomeOtherFunction_bitcast
 WASM:            - Index:           4
+WASM:              Name:            SomeOtherFunction
+WASM:            - Index:           5
 WASM:              Name:            var
 WASM:          SegmentInfo:
 WASM-NEXT:       - Index:           0
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index 24fd4a32efa55266af62259734c53ac3dfe6a255..3a1793e3daf6e4266af5ee4b4751ac8cdd5ea67b 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -62,6 +62,7 @@ ELF-MIPSEL: R_MIPS_CALL16 SomeOtherFunction
 WASM:      CODE
 WASM-NEXT: R_WEBASSEMBLY_MEMORY_ADDR_SLEB .L.str
 WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB puts
+WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB .LSomeOtherFunction_bitcast
 WASM-NEXT: R_WEBASSEMBLY_FUNCTION_INDEX_LEB SomeOtherFunction
 
 ELF-complex-x86-64: .text
diff --git a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
index adc95b950104dab4e401839583c627c5186600e7..c6a45cd36eabb5ab8728f0de6cd24f9259a35c6c 100644
--- a/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-BigEndian.yaml
@@ -376,8 +376,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
index 1d6da66a073983d56a8dfc0ba29606c9d8f3fc37..1e136e67be13dcfa67cd9586f9ac4bcbaad22308 100644
--- a/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
+++ b/test/ObjectYAML/MachO/DWARF-LittleEndian.yaml
@@ -365,8 +365,8 @@ DWARF:
 #CHECK: DWARF:           
 #CHECK:   debug_str:       
 #CHECK:     - 'clang version 4.0.0 (trunk 290181) (llvm/trunk 290209)'
-#CHECK:     - ../compiler-rt/lib/builtins/absvdi2.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '../compiler-rt/lib/builtins/absvdi2.c'
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - int
 #CHECK:     - di_int
 #CHECK:     - long long int
diff --git a/test/ObjectYAML/MachO/DWARF-debug_str.yaml b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
index 417a755642b811f7fb8dc53b39030eeeb580fb02..84c5e22d25544459fe00652f744b15fe2d820cd1 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_str.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_str.yaml
@@ -257,7 +257,7 @@ DWARF:
 #CHECK:     - ''
 #CHECK:     - 'clang version 4.0.0 (trunk 288677) (llvm/trunk 288676)'
 #CHECK:     - hello_world.c
-#CHECK:     - /Users/cbieneman/dev/open-source/llvm-build-rel
+#CHECK:     - '/Users/cbieneman/dev/open-source/llvm-build-rel'
 #CHECK:     - main
 #CHECK:     - argc
 #CHECK:     - argv
diff --git a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
index 9184e3c5143f7a597fa943ad47150cd40b13f280..5fc6afa536e859a4634ae9e422481bdddfd98acf 100644
--- a/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
+++ b/test/ObjectYAML/MachO/dylib_dylinker_command.yaml
@@ -40,7 +40,7 @@ LoadCommands:
 #CHECK:   - cmd:             LC_LOAD_DYLINKER
 #CHECK:     cmdsize:         32
 #CHECK:     name:            12
-#CHECK:     PayloadString:   /usr/lib/dyld
+#CHECK:     PayloadString:   '/usr/lib/dyld'
 #CHECK:     ZeroPadBytes:    7
 #CHECK:   - cmd:             LC_LOAD_DYLIB
 #CHECK:     cmdsize:         48
@@ -58,5 +58,5 @@ LoadCommands:
 #CHECK:       timestamp:       2
 #CHECK:       current_version: 80349697
 #CHECK:       compatibility_version: 65536
-#CHECK:     PayloadString:   /usr/lib/libSystem.B.dylib
+#CHECK:     PayloadString:   '/usr/lib/libSystem.B.dylib'
 #CHECK:     ZeroPadBytes:    6
diff --git a/test/ObjectYAML/wasm/import_memory_shared.yaml b/test/ObjectYAML/wasm/import_memory_shared.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..849bdc5314d1b1b7270fb5f67ca3e50027272d5c
--- /dev/null
+++ b/test/ObjectYAML/wasm/import_memory_shared.yaml
@@ -0,0 +1,36 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - Index:           0
+        ReturnType:      I32
+        ParamTypes:
+          - I32
+  - Type:            IMPORT
+    Imports:
+      - Module:          foo
+        Field:           imported_memory
+        Kind:            MEMORY
+        Memory:
+          Flags:           [ HAS_MAX, IS_SHARED ]
+          Initial:         0x00000010
+          Maximum:         0x00000011
+
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            IMPORT
+# CHECK:     Imports:         
+# CHECK:       - Module:          foo
+# CHECK:         Field:           imported_memory
+# CHECK:         Kind:            MEMORY
+# CHECK:         Memory:
+# CHECK:           Flags:           [ HAS_MAX, IS_SHARED ]
+# CHECK:           Initial:         0x00000010
+# CHECK:           Maximum:         0x00000011
+# CHECK: ...
diff --git a/test/Other/new-pm-thinlto-defaults.ll b/test/Other/new-pm-thinlto-defaults.ll
index 001e3eeeb968dc0dc4230ead1cfb97e143daf9e7..c68aa1d05aa3024f637c3a7efe41d52669ceb075 100644
--- a/test/Other/new-pm-thinlto-defaults.ll
+++ b/test/Other/new-pm-thinlto-defaults.ll
@@ -26,6 +26,10 @@
 ; RUN: opt -disable-verify -debug-pass-manager -new-pm-debug-info-for-profiling \
 ; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2
+; Enabling the hot-cold-split pass should not affect the ThinLTO pre-link
+; RUN: opt -disable-verify -debug-pass-manager \
+; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -hot-cold-split -S  %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O2
 ;
 ; Postlink pipelines:
 ; RUN: opt -disable-verify -debug-pass-manager \
diff --git a/test/Other/opt-hot-cold-split.ll b/test/Other/opt-hot-cold-split.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a3fbdeffb2fd4db8ce6d7649733bd017d828c0d9
--- /dev/null
+++ b/test/Other/opt-hot-cold-split.ll
@@ -0,0 +1,292 @@
+; RUN: opt -mtriple=x86_64-- -Os -hotcoldsplit -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Type-Based Alias Analysis
+; CHECK-NEXT: Scoped NoAlias Alias Analysis
+; CHECK-NEXT: Assumption Cache Tracker
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Module Verifier
+; CHECK-NEXT:     Instrument function entry/exit with calls to e.g. mcount() (pre inlining)
+; CHECK-NEXT:     Simplify the CFG
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     SROA
+; CHECK-NEXT:     Early CSE
+; CHECK-NEXT:     Lower 'expect' Intrinsics
+; CHECK-NEXT:  Pass Arguments:
+; CHECK-NEXT:  Target Library Information
+; CHECK-NEXT:  Target Transform Information
+;              Target Pass Configuration
+; CHECK:       Type-Based Alias Analysis
+; CHECK-NEXT:  Scoped NoAlias Alias Analysis
+; CHECK-NEXT:  Assumption Cache Tracker
+; CHECK-NEXT:  Profile summary info
+; CHECK-NEXT:    ModulePass Manager
+; CHECK-NEXT:      Force set function attributes
+; CHECK-NEXT:      Infer set function attributes
+; CHECK-NEXT:      Interprocedural Sparse Conditional Constant Propagation
+; CHECK-NEXT:        Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     Called Value Propagation
+; CHECK-NEXT:     Global Variable Optimizer
+; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Promote Memory to Register
+; CHECK-NEXT:     Dead Argument Elimination
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Simplify the CFG
+; CHECK-NEXT:     CallGraph Construction
+; CHECK-NEXT:     Globals Alias Analysis
+; CHECK-NEXT:     Call Graph SCC Pass Manager
+; CHECK-NEXT:       Remove unused exception handling info
+; CHECK-NEXT:       Function Integration/Inlining
+; CHECK-NEXT:       Deduce function attributes
+; CHECK-NEXT:       FunctionPass Manager
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         SROA
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Memory SSA
+; CHECK-NEXT:         Early CSE w/ MemorySSA
+; CHECK-NEXT:         Speculatively execute instructions if target has divergent branches
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Lazy Value Information Analysis
+; CHECK-NEXT:         Jump Threading
+; CHECK-NEXT:         Value Propagation
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Tail Call Elimination
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Reassociate expressions
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Canonicalize natural loops
+; CHECK-NEXT:         LCSSA Verifier
+; CHECK-NEXT:         Loop-Closed SSA Form Pass
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Scalar Evolution Analysis
+; CHECK-NEXT:         Loop Pass Manager
+; CHECK-NEXT:           Rotate Loops
+; CHECK-NEXT:           Loop Invariant Code Motion
+; CHECK-NEXT:           Unswitch loops
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:         Canonicalize natural loops
+; CHECK-NEXT:         LCSSA Verifier
+; CHECK-NEXT:         Loop-Closed SSA Form Pass
+; CHECK-NEXT:         Scalar Evolution Analysis
+; CHECK-NEXT:         Loop Pass Manager
+; CHECK-NEXT:           Induction Variable Simplification
+; CHECK-NEXT:           Recognize loop idioms
+; CHECK-NEXT:           Delete dead loops
+; CHECK-NEXT:           Unroll loops
+; CHECK-NEXT:         MergedLoadStoreMotion
+; CHECK-NEXT:         Phi Values Analysis
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Memory Dependence Analysis
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Global Value Numbering
+; CHECK-NEXT:         Phi Values Analysis
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Memory Dependence Analysis
+; CHECK-NEXT:         MemCpy Optimization
+; CHECK-NEXT:         Sparse Conditional Constant Propagation
+; CHECK-NEXT:         Demanded bits analysis
+; CHECK-NEXT:         Bit-Tracking Dead Code Elimination
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:         Lazy Value Information Analysis
+; CHECK-NEXT:         Jump Threading
+; CHECK-NEXT:         Value Propagation
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Phi Values Analysis
+; CHECK-NEXT:         Memory Dependence Analysis
+; CHECK-NEXT:         Dead Store Elimination
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Canonicalize natural loops
+; CHECK-NEXT:         LCSSA Verifier
+; CHECK-NEXT:         Loop-Closed SSA Form Pass
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Scalar Evolution Analysis
+; CHECK-NEXT:         Loop Pass Manager
+; CHECK-NEXT:           Loop Invariant Code Motion
+; CHECK-NEXT:         Post-Dominator Tree Construction
+; CHECK-NEXT:         Aggressive Dead Code Elimination
+; CHECK-NEXT:         Simplify the CFG
+; CHECK-NEXT:         Dominator Tree Construction
+; CHECK-NEXT:         Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
+; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Lazy Branch Probability Analysis
+; CHECK-NEXT:         Lazy Block Frequency Analysis
+; CHECK-NEXT:         Optimization Remark Emitter
+; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:     A No-Op Barrier Pass
+; CHECK-NEXT:     Eliminate Available Externally Globals
+; CHECK-NEXT:     CallGraph Construction
+; CHECK-NEXT:     Deduce function attributes in RPO
+; CHECK-NEXT:     Global Variable Optimizer
+; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     Dead Global Elimination
+; CHECK-NEXT:     CallGraph Construction
+; CHECK-NEXT:     Globals Alias Analysis
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Float to int
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Rotate Loops
+; CHECK-NEXT:       Loop Access Analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Loop Distribution
+; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Block Frequency Analysis
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Loop Access Analysis
+; CHECK-NEXT:       Demanded bits analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Loop Vectorization
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Loop Access Analysis
+; CHECK-NEXT:       Loop Load Elimination
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Simplify the CFG
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Demanded bits analysis
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       SLP Vectorizer
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Unroll loops
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Combine redundant instructions
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Loop Invariant Code Motion
+; CHECK-NEXT:       Alignment from assumptions
+; CHECK-NEXT:     Strip Unused Function Prototypes
+; CHECK-NEXT:     Dead Global Elimination
+; CHECK-NEXT:     Merge Duplicate Global Constants
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Block Frequency Analysis
+; CHECK-NEXT:       Canonicalize natural loops
+; CHECK-NEXT:       LCSSA Verifier
+; CHECK-NEXT:       Loop-Closed SSA Form Pass
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:       Function Alias Analysis Results
+; CHECK-NEXT:       Scalar Evolution Analysis
+; CHECK-NEXT:       Branch Probability Analysis
+; CHECK-NEXT:       Block Frequency Analysis
+; CHECK-NEXT:       Loop Pass Manager
+; CHECK-NEXT:         Loop Sink
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Remove redundant instructions
+; CHECK-NEXT:       Hoist/decompose integer division and remainder
+; CHECK-NEXT:       Simplify the CFG
+; CHECK-NEXT:     Hot Cold Splitting
+; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
+; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Module Verifier
+; CHECK-NEXT:     Bitcode Writer
+; CHECK-NEXT: Pass Arguments:  -domtree
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT: Pass Arguments:  -targetlibinfo -domtree -loops -branch-prob -block-freq
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Natural Loop Information
+; CHECK-NEXT:     Branch Probability Analysis
+; CHECK-NEXT:     Block Frequency Analysis
+; CHECK-NEXT: Pass Arguments:  -targetlibinfo -domtree -loops -branch-prob -block-freq
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Natural Loop Information
+; CHECK-NEXT:     Branch Probability Analysis
+; CHECK-NEXT:     Block Frequency Analysis
+; CHECK-NEXT: Pass Arguments:  -targetlibinfo -domtree -loops -branch-prob -block-freq
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT:   FunctionPass Manager
+; CHECK-NEXT:     Dominator Tree Construction
+; CHECK-NEXT:     Natural Loop Information
+; CHECK-NEXT:     Branch Probability Analysis
+; CHECK-NEXT:     Block Frequency Analysis
diff --git a/test/Other/pass-pipeline-parsing.ll b/test/Other/pass-pipeline-parsing.ll
index b303318c79636445d7e08a8e96290dcfd1c1e759..2e8bc7c87302563549eae2099a8f2b6a47d2c85d 100644
--- a/test/Other/pass-pipeline-parsing.ll
+++ b/test/Other/pass-pipeline-parsing.ll
@@ -54,52 +54,52 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED1
-; CHECK-UNBALANCED1: unable to parse pass pipeline description
+; CHECK-UNBALANCED1: invalid pipeline 'no-op-module)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED2
-; CHECK-UNBALANCED2: unable to parse pass pipeline description
+; CHECK-UNBALANCED2: invalid pipeline 'module(no-op-module))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='module(no-op-module' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED3
-; CHECK-UNBALANCED3: unable to parse pass pipeline description
+; CHECK-UNBALANCED3: invalid pipeline 'module(no-op-module'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED4
-; CHECK-UNBALANCED4: unable to parse pass pipeline description
+; CHECK-UNBALANCED4: invalid pipeline 'no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED5
-; CHECK-UNBALANCED5: unable to parse pass pipeline description
+; CHECK-UNBALANCED5: invalid pipeline 'function(no-op-function))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)))' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED6
-; CHECK-UNBALANCED6: unable to parse pass pipeline description
+; CHECK-UNBALANCED6: invalid pipeline 'function(function(no-op-function)))'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED7
-; CHECK-UNBALANCED7: unable to parse pass pipeline description
+; CHECK-UNBALANCED7: invalid pipeline 'function(no-op-function'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED8
-; CHECK-UNBALANCED8: unable to parse pass pipeline description
+; CHECK-UNBALANCED8: invalid pipeline 'function(function(no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED9
-; CHECK-UNBALANCED9: unable to parse pass pipeline description
+; CHECK-UNBALANCED9: invalid pipeline 'no-op-module,)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function,)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-UNBALANCED10
-; CHECK-UNBALANCED10: unable to parse pass pipeline description
+; CHECK-UNBALANCED10: invalid pipeline 'no-op-function,)'
 
 ; RUN: opt -disable-output -debug-pass-manager \
 ; RUN:     -passes=no-op-cgscc,no-op-cgscc %s 2>&1 \
@@ -176,37 +176,86 @@
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function(no-op-function)function(no-op-function)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-MISSING-COMMA1
-; CHECK-MISSING-COMMA1: unable to parse pass pipeline description
+; CHECK-MISSING-COMMA1: invalid pipeline 'function(no-op-function)function(no-op-function)'
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-INNER-PIPELINE
-; CHECK-EMPTY-INNER-PIPELINE: unable to parse pass pipeline description
+; CHECK-EMPTY-INNER-PIPELINE: unknown function pass ''
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-module(no-op-module,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-MODULE-PASS
-; CHECK-PIPELINE-ON-MODULE-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-MODULE-PASS: invalid use of 'no-op-module' pass as module pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-cgscc(no-op-cgscc,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-CGSCC-PASS
-; CHECK-PIPELINE-ON-CGSCC-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-CGSCC-PASS: invalid use of 'no-op-cgscc' pass as cgscc pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function(no-op-function,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-FUNCTION-PASS
-; CHECK-PIPELINE-ON-FUNCTION-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-FUNCTION-PASS: invalid use of 'no-op-function' pass as function pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-loop(no-op-loop,whatever)' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-PIPELINE-ON-LOOP-PASS
-; CHECK-PIPELINE-ON-LOOP-PASS: unable to parse pass pipeline description
+; CHECK-PIPELINE-ON-LOOP-PASS: invalid use of 'no-op-loop' pass as loop pipeline
 
 ; RUN: not opt -disable-output -debug-pass-manager \
 ; RUN:     -passes='no-op-function()' %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-EMPTY-PIPELINE-ON-PASS
-; CHECK-EMPTY-PIPELINE-ON-PASS: unable to parse pass pipeline description
+; CHECK-EMPTY-PIPELINE-ON-PASS: invalid use of 'no-op-function' pass as function pipeline
+
+; RUN: not opt -passes='no-op-module,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-MODULE
+; CHECK-UNKNOWN-MODULE: unknown module pass 'bad'
+
+; RUN: not opt -passes='no-op-loop,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-LOOP
+; CHECK-UNKNOWN-LOOP: unknown loop pass 'bad'
+
+; RUN: not opt -passes='no-op-cgscc,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-CGSCC
+; CHECK-UNKNOWN-CGSCC: unknown cgscc pass 'bad'
+
+; RUN: not opt -passes='no-op-function,bad' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='function(bad,pipeline,text)' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='module(no-op-module,function(bad,pipeline,text))' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='no-op-module,function(bad,pipeline,text)' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; RUN: not opt -passes='module(cgscc(function(bad,pipeline,text)))' \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=CHECK-UNKNOWN-FUNCTION
+; CHECK-UNKNOWN-FUNCTION: unknown function pass 'bad'
+
+; RUN: not opt -aa-pipeline=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=AA-PIPELINE-ERR
+; AA-PIPELINE-ERR: unknown alias analysis name 'bad'
+; RUN: opt -passes-ep-peephole=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PEEPHOLE-ERR
+; PASSES-EP-PEEPHOLE-ERR: Could not parse -passes-ep-peephole pipeline: unknown function pass 'bad'
+; RUN: opt -passes-ep-late-loop-optimizations=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LATELOOPOPT-ERR
+; PASSES-EP-LATELOOPOPT-ERR: Could not parse -passes-ep-late-loop-optimizations pipeline: unknown loop pass 'bad'
+; RUN: opt -passes-ep-loop-optimizer-end=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-LOOPOPTEND-ERR
+; PASSES-EP-LOOPOPTEND-ERR: Could not parse -passes-ep-loop-optimizer-end pipeline: unknown loop pass 'bad'
+; RUN: opt -passes-ep-scalar-optimizer-late=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-SCALAROPTLATE-ERR
+; PASSES-EP-SCALAROPTLATE-ERR: Could not parse -passes-ep-scalar-optimizer-late pipeline: unknown function pass 'bad'
+; RUN: opt -passes-ep-cgscc-optimizer-late=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-CGSCCOPTLATE-ERR
+; PASSES-EP-CGSCCOPTLATE-ERR: Could not parse -passes-ep-cgscc-optimizer-late pipeline: unknown cgscc pass 'bad'
+; RUN: opt -passes-ep-vectorizer-start=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-VECTORIZERSTART-ERR
+; PASSES-EP-VECTORIZERSTART-ERR: Could not parse -passes-ep-vectorizer-start pipeline: unknown function pass 'bad'
+; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR
+; PASSES-EP-PIPELINESTART-ERR: Could not parse -passes-ep-pipeline-start pipeline: unknown pass name 'bad'
 
 define void @f() {
 entry:
diff --git a/test/Other/print-debug-counter.ll b/test/Other/print-debug-counter.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3647f39026dcf0ed7ac8181dd9699aeaa361c0e2
--- /dev/null
+++ b/test/Other/print-debug-counter.ll
@@ -0,0 +1,32 @@
+; REQUIRES: asserts
+
+; RUN: opt -S -debug-counter=early-cse-skip=1,early-cse-count=1 -early-cse \
+; RUN:        -debug-counter=newgvn-vn-skip=1,newgvn-vn-count=2 -newgvn \
+; RUN:        -instcombine -print-debug-counter < %s 2>&1 | FileCheck %s
+;; Test debug counter prints correct info in right order.
+; CHECK-LABEL: Counters and values:
+; CHECK:       early-cse
+; CHECK-SAME:  {4,1,1}
+; CHECK:       instcombine-visit
+; CHECK-SAME:  {12,0,-1}
+; CHECK:       newgvn-vn
+; CHECK-SAME:  {9,1,2}
+define i32 @f1(i32 %a, i32 %b) {
+bb:
+  %add1 = add i32 %a, %b
+  %add2 = add i32 %a, %b
+  %add3 = add i32 %a, %b
+  %add4 = add i32 %a, %b
+  %ret1 = add i32 %add1, %add2
+  %ret2 = add i32 %add3, %add4
+  %ret = add i32 %ret1, %ret2
+  ret i32 %ret
+}
+
+define i32 @f2(i32 %a, i32 %b) {
+bb:
+  %add1 = add i32 %a, %b
+  %add2 = add i32 %a, %b
+  %ret = add i32 %add1, %add2
+  ret i32 %ret
+}
diff --git a/test/Other/scc-pass-printer.ll b/test/Other/scc-pass-printer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9d86bf039636a1da179aead92e09f9bf569fbed2
--- /dev/null
+++ b/test/Other/scc-pass-printer.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -inline -print-after-all | FileCheck %s -check-prefix=INL
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -passes=inline -print-after-all | FileCheck %s -check-prefix=INL
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -inline -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD
+; RUN: opt < %s 2>&1 -disable-output \
+; RUN: 	   -passes=inline -print-after-all -print-module-scope | FileCheck %s -check-prefix=INL-MOD
+
+; INL: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}}
+; INL: define void @bar()
+; INL-NEXT:  call void @foo()
+; INL: define void @foo()
+; INL-NEXT:   call void @bar()
+; INL: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}}
+; INL: define void @tester()
+; INL-NEXT:  call void @foo()
+; INL: IR Dump After
+
+; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .bar, foo}}
+; INL-MOD: define void @tester()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: define void @foo()
+; INL-MOD-NEXT:   call void @bar()
+; INL-MOD: define void @bar()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: IR Dump After {{Function Integration/Inlining|InlinerPass .*scc: .tester}}
+; INL-MOD: define void @tester()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: define void @foo()
+; INL-MOD-NEXT:   call void @bar()
+; INL-MOD: define void @bar()
+; INL-MOD-NEXT:  call void @foo()
+; INL-MOD: IR Dump After
+
+define void @tester() noinline {
+  call void @foo()
+  ret void
+}
+
+define void @foo() noinline {
+  call void @bar()
+  ret void
+}
+
+define void @bar() noinline {
+  call void @foo()
+  ret void
+}
diff --git a/test/Other/size-remarks.ll b/test/Other/size-remarks.ll
index 34cb1202bb91d147d248ecc173b4b5ed290ac7aa..1e96dd02207867054eec905996b87a829ca8087a 100644
--- a/test/Other/size-remarks.ll
+++ b/test/Other/size-remarks.ll
@@ -32,7 +32,7 @@
 ; CGSCC-NEXT: Name:            IRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:  - Pass:            Function Integration/Inlining
+; CGSCC-NEXT:  - Pass:            'Function Integration/Inlining'
 ; CGSCC-NEXT:  - String:          ': IR instruction count changed from '
 ; CGSCC-NEXT:  - IRInstrsBefore:  '[[ORIG]]'
 ; CGSCC-NEXT:  - String:          ' to '
@@ -44,7 +44,7 @@
 ; CGSCC-NEXT: Name:            FunctionIRSizeChange
 ; CGSCC-NEXT: Function:
 ; CGSCC-NEXT: Args:
-; CGSCC-NEXT:   - Pass:            Function Integration/Inlining
+; CGSCC-NEXT:   - Pass:            'Function Integration/Inlining'
 ; CGSCC-NEXT:   - String:          ': Function: '
 ; CGSCC-NEXT:   - Function:        bar
 ; CGSCC-NEXT:   - String:          ': IR instruction count changed from '
diff --git a/test/ThinLTO/X86/Inputs/alias_internal.ll b/test/ThinLTO/X86/Inputs/alias_internal.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e55e40b1d052675dd5118f1d99e70e73dc039d9a
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/alias_internal.ll
@@ -0,0 +1,8 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i32 @f(i8*) unnamed_addr {
+    ret i32 42
+}
+
+@a2 = weak alias i32 (i8*), i32 (i8*)* @f
diff --git a/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll b/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fe1fa70ee831270b06b302d7abfa7dafa8dda3ad
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/globals-import-blockaddr.ll
@@ -0,0 +1,12 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@label_addr = internal constant [1 x i8*] [i8* blockaddress(@foo, %lb)], align 8
+
+; Function Attrs: noinline norecurse nounwind optnone uwtable
+define dso_local [1 x i8*]* @foo() {
+  br label %lb
+
+lb:
+  ret [1 x i8*]* @label_addr
+}
diff --git a/test/ThinLTO/X86/alias_internal.ll b/test/ThinLTO/X86/alias_internal.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d6433f6981dafdd0a625ebe8fd3318953741178c
--- /dev/null
+++ b/test/ThinLTO/X86/alias_internal.ll
@@ -0,0 +1,21 @@
+; Test to make sure dot dumper can correctly handle aliases to multiple
+; different internal aliasees with the same name.
+
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/alias_internal.ll -o %t2.bc
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN:   -r %t1.bc,a1,plx \
+; RUN:   -r %t2.bc,a2,plx
+
+; RUN: cat %t.out.index.dot | FileCheck %s
+; CHECK-DAG: M0_12511626713252727690 -> M0_{{.*}} // alias
+; CHECK-DAG: M1_8129049334585965161 -> M1_{{.*}} // alias
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal i32 @f(i8*) unnamed_addr {
+    ret i32 42
+}
+
+@a1 = weak alias i32 (i8*), i32 (i8*)* @f
diff --git a/test/ThinLTO/X86/cfi-devirt.ll b/test/ThinLTO/X86/cfi-devirt.ll
index 134da52857aae736cbd2387c1acf9619e6e98066..7ade794d498b6e726863e3cec738f0e49fa39599 100644
--- a/test/ThinLTO/X86/cfi-devirt.ll
+++ b/test/ThinLTO/X86/cfi-devirt.ll
@@ -5,7 +5,9 @@
 ; RUN: opt -thinlto-bc -o %t.o %s
 
 ; Legacy PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,test,px \
 ; RUN:   -r=%t.o,_ZN1A1nEi,p \
@@ -22,7 +24,9 @@
 ; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
 
 ; New PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -use-new-pm -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,test,px \
 ; RUN:   -r=%t.o,_ZN1A1nEi,p \
diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index ed52222f43a8e3396ec14f801bbbb51899490e39..76dc09e8a9e87a2cbe7b8d1046ecf8125019451e 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -8,10 +8,10 @@
 ; RUN: llvm-lto -exported-symbol=_main -thinlto-action=promote %t1.bc -thinlto-index=%t.index.bc -o - | llvm-lto -exported-symbol=_main -thinlto-action=internalize -thinlto-index %t.index.bc -thinlto-module-id=%t1.bc - -o - | llvm-dis -o - | FileCheck %s
 ; RUN: llvm-lto -exported-symbol=_main -thinlto-action=promote %t2.bc -thinlto-index=%t.index.bc -o - | llvm-lto -exported-symbol=_main -thinlto-action=internalize -thinlto-index %t.index.bc -thinlto-module-id=%t2.bc - -o - | llvm-dis -o - | FileCheck %s --check-prefix=CHECK2
 
-; RUN: llvm-lto -exported-symbol=_main -thinlto-action=run %t1.bc %t2.bc
+; RUN: llvm-lto -exported-symbol=_main -thinlto-action=run -stats %t1.bc %t2.bc 2>&1 | FileCheck %s --check-prefix=STATS
 ; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s --check-prefix=CHECK-NM
 
-; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps -stats \
 ; RUN:   -r %t1.bc,_main,plx \
 ; RUN:   -r %t1.bc,_bar,pl \
 ; RUN:   -r %t1.bc,_dead_func,pl \
@@ -25,7 +25,7 @@
 ; RUN:   -r %t2.bc,_dead_func,l \
 ; RUN:   -r %t2.bc,_another_dead_func,pl \
 ; RUN:   -thinlto-threads=1 \
-; RUN:	 -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=DEBUG
+; RUN:	 -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=DEBUG --check-prefix=STATS
 ; RUN: llvm-dis < %t.out.1.3.import.bc | FileCheck %s --check-prefix=LTO2
 ; RUN: llvm-dis < %t.out.2.3.import.bc | FileCheck %s --check-prefix=LTO2-CHECK2
 ; RUN: llvm-nm %t.out.1 | FileCheck %s --check-prefix=CHECK2-NM
@@ -89,6 +89,8 @@
 ; DEBUG-DAG: Initialize import for 15611644523426561710 (boo)
 ; DEBUG-DAG: Ignores Dead GUID: 2384416018110111308 (another_dead_func)
 
+; STATS: 3 function-import  - Number of dead stripped symbols in index
+
 ; Next test the case where Inputs/deadstrip.ll does not get a module index,
 ; which will cause it to be handled by regular LTO in the new LTO API.
 ; In that case there are uses of @dead_func in the regular LTO partition
diff --git a/test/ThinLTO/X86/devirt-after-icp.ll b/test/ThinLTO/X86/devirt-after-icp.ll
index b711e260c1ef4bc87c7d61ed94269df2537f2573..987221787e2f8a4a7aa22c555c3af66b43cdb0c9 100644
--- a/test/ThinLTO/X86/devirt-after-icp.ll
+++ b/test/ThinLTO/X86/devirt-after-icp.ll
@@ -45,7 +45,9 @@
 ; RUN: opt -thinlto-bc -o %t.o %s
 
 ; Legacy PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,_Z3bazP1A,px \
 ; RUN:   -r=%t.o,_ZN1A3fooEv, \
@@ -63,7 +65,9 @@
 ; RUN: llvm-dis %t3.1.4.opt.bc -o - | FileCheck %s --check-prefix=CHECK-IR
 
 ; New PM
+; FIXME: Fix machine verifier issues and remove -verify-machineinstrs=0. PR39436.
 ; RUN: llvm-lto2 run %t.o -save-temps -use-new-pm -pass-remarks=. \
+; RUN:   -verify-machineinstrs=0 \
 ; RUN:   -o %t3 \
 ; RUN:   -r=%t.o,_Z3bazP1A,px \
 ; RUN:   -r=%t.o,_ZN1A3fooEv, \
diff --git a/test/ThinLTO/X86/dot-dumper-full-lto.ll b/test/ThinLTO/X86/dot-dumper-full-lto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6d4d1efa9a32ea7bb059dc4dc95389604a18912c
--- /dev/null
+++ b/test/ThinLTO/X86/dot-dumper-full-lto.ll
@@ -0,0 +1,28 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/dot-dumper.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t1.bc %t2.bc -o %t3 \
+; RUN:  -r=%t1.bc,main,px \
+; RUN:  -r=%t1.bc,A, \
+; RUN:  -r=%t2.bc,foo,p \
+; RUN:  -r=%t2.bc,bar,p \
+; RUN:  -r=%t2.bc,A,p \
+; RUN:  -r=%t2.bc,B,p
+; RUN: cat %t3.index.dot | FileCheck %s
+
+; CHECK: subgraph cluster_4294967295
+; CHECK:   M4294967295_[[ID:[0-9]+]]{{.*}}main
+; CHECK: // Cross-module edges:
+; CHECK:  M4294967295_[[ID]] -> M0_{{[0-9]+}}{{.*}}// ref
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = external global i32
+
+define i32 @main() {
+  %v = load i32, i32* @A
+  ret i32 %v
+}
+
+!0 = !{i32 1, !"ThinLTO", i32 0}
+!llvm.module.flags = !{ !0 }
diff --git a/test/ThinLTO/X86/dot-dumper.ll b/test/ThinLTO/X86/dot-dumper.ll
index 25cd0ed617f963aef330e4efaa8411533053ccb5..72175a1ea4d09151cedae850378dba72f76d85e3 100644
--- a/test/ThinLTO/X86/dot-dumper.ll
+++ b/test/ThinLTO/X86/dot-dumper.ll
@@ -34,7 +34,7 @@
 ; CLUSTER1:         // Module: {{.*}}2.bc
 ; CLUSTER1-NEXT:    subgraph cluster_1 {
 ; CLUSTER1-DAG:       M1_[[A:[0-9]+]] [{{.*}}A|extern{{.*}}]; // variable
-; CLUSTER1-DAG:       M1_[[FOO:[0-9]+]] [{{.*}}foo|extern{{.*}}]; // function, not eligible to import
+; CLUSTER1-DAG:       M1_[[FOO:[0-9]+]] [{{.*}}foo|extern{{.*}} ffl: 00001{{.*}}]; // function
 ; CLUSTER1-DAG:       M1_[[B:[0-9]+]] [{{.*}}B|extern{{.*}}]; // variable
 ; CLUSTER1-DAG:       M1_[[BAR:[0-9]+]] [{{.*}}bar|extern{{.*}}]; // function, dead
 ; CLUSTER1-NEXT:      // Edges:
diff --git a/test/ThinLTO/X86/globals-import-blockaddr.ll b/test/ThinLTO/X86/globals-import-blockaddr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d4ed674030a70eb760ebc315e0943880bf134da0
--- /dev/null
+++ b/test/ThinLTO/X86/globals-import-blockaddr.ll
@@ -0,0 +1,18 @@
+; RUN: opt -module-summary %s -o %t1.bc
+; RUN: opt -module-summary %p/Inputs/globals-import-blockaddr.ll -o %t2.bc
+; RUN: llvm-lto2 run -save-temps %t1.bc -r=%t1.bc,foo,l -r=%t1.bc,main,pl %t2.bc -r=%t2.bc,foo,pl -o %t3
+; RUN: llvm-dis %t3.1.3.import.bc -o - | FileCheck %s
+
+; Verify that we haven't imported GV containing blockaddress
+; CHECK: @label_addr.llvm.0 = external hidden constant
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare dso_local [1 x i8*]* @foo();
+
+define dso_local i32 @main() {
+  %p = call [1 x i8*]* @foo()
+  %v = ptrtoint [1 x i8*]* %p to i32
+  ret i32 %v
+}
diff --git a/test/ThinLTO/X86/internalize.ll b/test/ThinLTO/X86/internalize.ll
index 433cfe40894910e3dda31790750188cafb482316..70b28469b4865709007b8639c3ed1f0179149217 100644
--- a/test/ThinLTO/X86/internalize.ll
+++ b/test/ThinLTO/X86/internalize.ll
@@ -3,12 +3,27 @@
 ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=REGULAR
 ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o -  --exported-symbol=foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE
 
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure indices are not marked as internallinkage and therefore
+; internalization does not happen.
+; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc \
+; RUN:          -enable-lto-internalization=false --exported-symbol=foo
+; RUN: llvm-dis < %t1.bc.thinlto.internalized.bc | FileCheck %s --check-prefix=INTERNALIZE-OPTION-DISABLE
+
 ; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,_foo,pxl \
 ; RUN:     -r=%t1.bc,_bar,pl \
 ; RUN:     -r=%t1.bc,_linkonce_func,pl
 ; RUN: llvm-dis < %t.o.1.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2
 
+; Test the enable-lto-internalization option by setting it to false.
+; This makes sure indices are not marked as internallinkage and therefore
+; internalization does not happen.
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps -enable-lto-internalization=false \
+; RUN:     -r=%t1.bc,_foo,pxl \
+; RUN:     -r=%t1.bc,_bar,pl \
+; RUN:     -r=%t1.bc,_linkonce_func,pl
+; RUN: llvm-dis < %t.o.1.2.internalize.bc | FileCheck  %s --check-prefix=INTERNALIZE2-OPTION-DISABLE
 
 ; REGULAR: define void @foo
 ; REGULAR: define void @bar
@@ -16,9 +31,15 @@
 ; INTERNALIZE: define void @foo
 ; INTERNALIZE: define internal void @bar
 ; INTERNALIZE: define internal void @linkonce_func()
+; INTERNALIZE-OPTION-DISABLE: define void @foo
+; INTERNALIZE-OPTION-DISABLE: define void @bar
+; INTERNALIZE-OPTION-DISABLE: define linkonce void @linkonce_func()
 ; INTERNALIZE2: define dso_local void @foo
 ; INTERNALIZE2: define internal void @bar
 ; INTERNALIZE2: define internal void @linkonce_func()
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @foo
+; INTERNALIZE2-OPTION-DISABLE: define dso_local void @bar
+; INTERNALIZE2-OPTION-DISABLE: define weak dso_local void @linkonce_func()
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/Transforms/BlockExtractor/extract-blocks.ll b/test/Transforms/BlockExtractor/extract-blocks.ll
index 47e5fc30849a8d190ac84496a4d8363c72ac9806..e720953a1e7a619b4af3e94781d52218215e7867 100644
--- a/test/Transforms/BlockExtractor/extract-blocks.ll
+++ b/test/Transforms/BlockExtractor/extract-blocks.ll
@@ -4,11 +4,11 @@
 ; RUN: opt -S -extract-blocks -extract-blocks-file=%t -extract-blocks-erase-funcs %s | FileCheck %s --check-prefix=CHECK-ERASE
 
 ; CHECK-NO-ERASE: @foo(
-; CHECK-NO-ERASE: @foo_bb9(
-; CHECK-NO-ERASE: @foo_bb20(
+; CHECK-NO-ERASE: @foo.bb9(
+; CHECK-NO-ERASE: @foo.bb20(
 ; CHECK-ERASE: declare i32 @foo(
-; CHECK-ERASE: @foo_bb9(
-; CHECK-ERASE: @foo_bb20(
+; CHECK-ERASE: @foo.bb9(
+; CHECK-ERASE: @foo.bb20(
 define i32 @foo(i32 %arg, i32 %arg1) {
 bb:
   %tmp5 = icmp sgt i32 %arg, 0
diff --git a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
index 8313cfac04ee83048c13f9dc7cdb7f5aa497d6b7..55c44e1e832794a7b5238334d0aaff910a4a2cb6 100644
--- a/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
+++ b/test/Transforms/CodeExtractor/ExtractedFnEntryCount.ll
@@ -22,7 +22,7 @@ entry:
   ret i32 %val
 }
 
-; CHECK: @inlinedFunc.1_if.then(i1 %cond) !prof [[COUNT1:![0-9]+]]
+; CHECK: @inlinedFunc.1.if.then(i1 %cond) !prof [[COUNT1:![0-9]+]]
 
 
 !llvm.module.flags = !{!0}
diff --git a/test/Transforms/CodeExtractor/PartialInlineAnd.ll b/test/Transforms/CodeExtractor/PartialInlineAnd.ll
index d32d834d2df3b5171fe69fd393105c54eae33b01..6d555b740e5e9d7c79a18467b5126fd263746771 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAnd.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAnd.ll
@@ -41,11 +41,11 @@ bb:
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.1_
+; CHECK: call void @bar.1.
 ; LIMIT-LABEL: @dummy_caller
 ; LIMIT: br i1
 ; LIMIT-NOT: br
-; LIMIT: call void @bar.1_
+; LIMIT: call void @bar.1.
   %tmp = tail call i32 @bar(i32 %arg)
   ret i32 %tmp
 }
diff --git a/test/Transforms/CodeExtractor/PartialInlineAndOr.ll b/test/Transforms/CodeExtractor/PartialInlineAndOr.ll
index 485e06ce1023421e30c35bc2227784ddcd33eaea..9da9ed4437e1b39338c8df2c922af00a55d72bf6 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAndOr.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAndOr.ll
@@ -49,7 +49,7 @@ bb:
 ; CHECK: br i1
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.1_
+; CHECK: call void @bar.1.
 ; LIMIT-LABEL: @dummy_caller
 ; LIMIT-NOT: br i1
 ; LIMIT: call i32 @bar
diff --git a/test/Transforms/CodeExtractor/PartialInlineAttributes.ll b/test/Transforms/CodeExtractor/PartialInlineAttributes.ll
index 40170846392f790dc364a26825ae6714fb6d896d..18c934bc6a1ae782615630bcfbe8834005fcfc71 100644
--- a/test/Transforms/CodeExtractor/PartialInlineAttributes.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineAttributes.ll
@@ -55,9 +55,9 @@ if.end:
   ret i32 %add
 }
 ; CHECK-LABEL: @caller
-; CHECK: call void @callee_most.2_if.then(i32 %v
+; CHECK: call void @callee_most.2.if.then(i32 %v
 ; CHECK: call i32 @callee_noinline(i32 %v)
-; CHECK: call void @callee_writeonly.1_if.then(i32 %v
+; CHECK: call void @callee_writeonly.1.if.then(i32 %v
 define i32 @caller(i32 %v) {
 entry:
   %c1 = call i32 @callee_most(i32 %v)
@@ -66,8 +66,8 @@ entry:
   ret i32 %c3
 }
 
-; CHECK: define internal void @callee_writeonly.1_if.then(i32 %v, i32* %sub.out) { 
-; CHECK: define internal void @callee_most.2_if.then(i32 %v, i32* %sub.out)  [[FN_ATTRS:#[0-9]+]]
+; CHECK: define internal void @callee_writeonly.1.if.then(i32 %v, i32* %sub.out) { 
+; CHECK: define internal void @callee_most.2.if.then(i32 %v, i32* %sub.out)  [[FN_ATTRS:#[0-9]+]]
 
 ; attributes to preserve
 attributes #0 = {
diff --git a/test/Transforms/CodeExtractor/PartialInlineDebug.ll b/test/Transforms/CodeExtractor/PartialInlineDebug.ll
index 5d9e64dc277ce2bdbbdaa77b9a26a9aeb6dd0c88..c0bc66db0ee65a94d96e7179f3ae1791d3ae2be5 100644
--- a/test/Transforms/CodeExtractor/PartialInlineDebug.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineDebug.ll
@@ -23,7 +23,7 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: @caller
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee.2_if.then(i32 %v, i32* %mul.loc.i), !dbg ![[DBG2:[0-9]+]]
+; CHECK-NEXT: call void @callee.2.if.then(i32 %v, i32* %mul.loc.i), !dbg ![[DBG2:[0-9]+]]
 define i32 @caller(i32 %v) !dbg !8 {
 entry:
   %call = call i32 @callee(i32 %v), !dbg !14
@@ -53,17 +53,17 @@ if.end:
 
 ; CHECK-LABEL: @caller2
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee2.1_if.then(i32 %v, i32* %sub.loc.i), !dbg ![[DBG4:[0-9]+]]
+; CHECK-NEXT: call void @callee2.1.if.then(i32 %v, i32* %sub.loc.i), !dbg ![[DBG4:[0-9]+]]
 define i32 @caller2(i32 %v) !dbg !21 {
 entry:
   %call = call i32 @callee2(i32 %v), !dbg !22
   ret i32 %call
 }
 
-; CHECK-LABEL: define internal void @callee2.1_if.then
+; CHECK-LABEL: define internal void @callee2.1.if.then
 ; CHECK: br label %if.then, !dbg ![[DBG5:[0-9]+]]
 
-; CHECK-LABEL: define internal void @callee.2_if.then
+; CHECK-LABEL: define internal void @callee.2.if.then
 ; CHECK: br label %if.then, !dbg ![[DBG6:[0-9]+]]
 
 ; CHECK: ![[DBG1]] = !DILocation(line: 10, column: 7,
diff --git a/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
index 3a7a9752e5074ffefb344c88ae28ac9e145ab063..0efc8299dab04593ad9f4aa41f404f5c5d9c4a3b 100644
--- a/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll
@@ -17,9 +17,9 @@ define internal i32 @Caller1(i1 %cond, i32* align 2 %align.val) !prof !3{
 entry:
 ; CHECK-LABEL: @Caller1
 ; CHECK: br
-; CHECK: call void @Func.1_ 
+; CHECK: call void @Func.1.
 ; CHECK: br
-; CHECK: call void @Func.1_ 
+; CHECK: call void @Func.1.
   %val = call i32 @Func(i1 %cond, i32* %align.val)
   %val2 = call i32 @Func(i1 %cond, i32* %align.val)
   ret i32 %val
@@ -29,7 +29,7 @@ define internal i32 @Caller2(i1 %cond, i32* align 2 %align.val) !prof !2{
 entry:
 ; CHECK-LABEL: @Caller2
 ; CHECK: br
-; CHECK: call void @Func.1_ 
+; CHECK: call void @Func.1.
   %val = call i32 @Func(i1 %cond, i32* %align.val)
   ret i32 %val
 }
diff --git a/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll b/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll
index bba7ad05facc45038deaa8b2b82dc24406ada03b..bc6f780c5a815bb72c4b46d3fd64010c1328c2ef 100644
--- a/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineInvokeProducesOutVal.ll
@@ -26,14 +26,14 @@ bb5:                                              ; preds = %bb4, %bb1, %bb
 ; CHECK-LABEL: bb:
 ; CHECK-NEXT:  [[CALL26LOC:%.*]] = alloca i8*
 ; CHECK-LABEL: codeRepl.i:
-; CHECK-NEXT:   call void @bar.1_bb1(i8** [[CALL26LOC]])
+; CHECK-NEXT:   call void @bar.1.bb1(i8** [[CALL26LOC]])
 define i8* @dummy_caller(i32 %arg) {
 bb:
   %tmp = tail call i8* @bar(i32 %arg)
   ret i8* %tmp
 }
 
-; CHECK-LABEL: define internal void @bar.1_bb1
+; CHECK-LABEL: define internal void @bar.1.bb1
 ; CHECK-LABEL: bb1:
 ; CHECK-NEXT:    %call26 = invoke i8* @invoke_callee()
 ; CHECK-NEXT:            to label %cont unwind label %lpad
diff --git a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
index e8a4d1281a237c085d7de5de17bacdde85e3e1b0..1e1a1b062d440c15c132119b80945531eee37689 100644
--- a/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineLiveAcross.ll
@@ -36,7 +36,7 @@ declare void @foo(...) local_unnamed_addr #1
 define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: codeRepl.i:
-; CHECK:  call void @test.1_bb2()
+; CHECK:  call void @test.1.bb2()
 ; CHECK-NOT: load
 ; CHECK  br
 
@@ -45,7 +45,7 @@ bb:
   ret i32 %tmp
 }
 
-; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK-LABEL: define internal void @test.1.bb2()
 ; CHECK: .exitStub:
 ; CHECK-NOT:  store i32 %tmp7, i32* %tmp7.out
 ; CHECK: ret
diff --git a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
index a48ff4b1b8f99e6c49730b8cf2be97e0c523fda3..d41492f8ffd45a5a36d1e3fd357f07b68a05d229 100644
--- a/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineNoLiveOut.ll
@@ -39,7 +39,7 @@ declare void @foo(...) local_unnamed_addr #0
 define i32 @dummy_caller(i32 %arg) local_unnamed_addr #0 {
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: codeRepl.i:
-; CHECK:  call void @test.1_bb2()
+; CHECK:  call void @test.1.bb2()
 ; CHECK-NOT: load
 ; CHECK  br
 bb:
@@ -47,7 +47,7 @@ bb:
   ret i32 %tmp
 }
 
-; CHECK-LABEL: define internal void @test.1_bb2()
+; CHECK-LABEL: define internal void @test.1.bb2()
 ; CHECK: .exitStub:
 ; CHECK-NOT:  store i32 %tmp7, i32* %tmp7.out
 ; CHECK: ret
diff --git a/test/Transforms/CodeExtractor/PartialInlineOr.ll b/test/Transforms/CodeExtractor/PartialInlineOr.ll
index 758945c7ade5ed98480ebb672f5f48c5fc9670a2..cbf7a47de9bfbd64ac51e9ac6f665073d5b2ba4d 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOr.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOr.ll
@@ -41,7 +41,7 @@ bb:
 ; CHECK-LABEL: @dummy_caller
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.2_
+; CHECK: call void @bar.2.
 ; LIMIT-LABEL: @dummy_caller
 ; LIMIT-NOT: br
 ; LIMIT: call i32 @bar(
@@ -84,7 +84,7 @@ bb5:                                              ; preds = %bb4, %bb1
 define i32 @dummy_caller2(i32 %arg) local_unnamed_addr #0 {
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call {{.*}} @bar_multi_ret.1_
+; CHECK: call {{.*}} @bar_multi_ret.1.
   %tmp = tail call i32 @bar_multi_ret(i32 %arg)
   ret i32 %tmp
 }
diff --git a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
index fb6d1c335361df6fa02290dba2dcacd0f35f937f..09d0e2503ea2f61e9e30c02143f92fa146c49fc0 100644
--- a/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineOrAnd.ll
@@ -52,12 +52,12 @@ bb:
 ; CHECK: br i1
 ; CHECK: br i1
 ; CHECK: br i1
-; CHECK: call void @bar.1_
+; CHECK: call void @bar.1.
 ; LIMIT3-LABEL: @dummy_caller
 ; LIMIT3: br i1
 ; LIMIT3: br i1
 ; LIMIT3-NOT: br i1
-; LIMIT3: call void @bar.1_
+; LIMIT3: call void @bar.1.
 ; LIMIT2-LABEL: @dummy_caller
 ; LIMIT2-NOT: br i1
 ; LIMIT2: call i32 @bar(
diff --git a/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll b/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll
index a51bdd01df5a19956a9ca72e63ce43a7ce97cd9f..5d187abb68aa01ebb2615cb1c314e3f077ae6129 100644
--- a/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll
+++ b/test/Transforms/CodeExtractor/PartialInlinePGOMultiRegion.ll
@@ -109,9 +109,9 @@ define signext i32 @foo(i32 signext %value, i32 signext %ub) #0 !prof !30 {
 ; CHECK-LABEL: @foo
 ; CHECK-NOT: call signext i32 @bar
 ; CHECK: codeRepl1.i:
-; CHECK: call void @bar.1_if.then
+; CHECK: call void @bar.1.if.then
 ; CHECK: codeRepl.i:
-; CHECK: call void @bar.1_if.then2
+; CHECK: call void @bar.1.if.then2
 entry:
   %value.addr = alloca i32, align 4
   %ub.addr = alloca i32, align 4
@@ -123,11 +123,11 @@ entry:
   ret i32 %call
 }
 
-; CHECK-LABEL: define internal void @bar.1_if.then2
+; CHECK-LABEL: define internal void @bar.1.if.then2
 ; CHECK: .exitStub:
 ; CHECK: ret void
 
-; CHECK-LABEL: define internal void @bar.1_if.then
+; CHECK-LABEL: define internal void @bar.1.if.then
 ; CHECK: .exitStub:
 ; CHECK: ret void
 
diff --git a/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll b/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll
index 27c858f3de60e48cd216d687cfc9cb453dc1dd19..4aa706243153fb9662ff3794d20711e5d2929b1d 100644
--- a/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll
+++ b/test/Transforms/CodeExtractor/PartialInlinePGORegion.ll
@@ -66,7 +66,7 @@ define signext i32 @foo(i32 signext %value, i32 signext %ub) #0 !prof !30 {
 ; CHECK-LABEL: @foo
 ; CHECK: codeRepl.i:
 ; CHECK-NOT: call signext i32 @bar
-; CHECK: call void @bar.1_if.then
+; CHECK: call void @bar.1.if.then
 entry:
   %value.addr = alloca i32, align 4
   %ub.addr = alloca i32, align 4
@@ -78,7 +78,7 @@ entry:
   ret i32 %call
 }
 
-; CHECK-LABEL: define internal void @bar.1_if.then
+; CHECK-LABEL: define internal void @bar.1.if.then
 ; CHECK: .exitStub:
 ; CHECK: ret void
 
diff --git a/test/Transforms/CodeExtractor/PartialInlineVarArg.ll b/test/Transforms/CodeExtractor/PartialInlineVarArg.ll
index bf6db27c959ab375dabd512e3fe15197a44979f5..8582f5e18f8472845c853211f2a6eaa77a2ac214 100644
--- a/test/Transforms/CodeExtractor/PartialInlineVarArg.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineVarArg.ll
@@ -36,7 +36,7 @@ bb:
 }
 ; CHECK-LABEL: @caller1
 ; CHECK: codeRepl.i:
-; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3_bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg)
+; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3.bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg)
 
 define i32 @caller2(i32 %arg, float %arg2) {
 bb:
@@ -46,7 +46,7 @@ bb:
 
 ; CHECK-LABEL: @caller2
 ; CHECK: codeRepl.i:
-; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3_bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg, i32 10, float %arg2)
+; CHECK-NEXT:  call void (i32, i8**, i32, ...) @vararg.3.bb1(i32 %stat1.i, i8** %vargs.i, i32 %arg, i32 10, float %arg2)
 
 ; Test case to check that we do not extract a vararg function, if va_end is in
 ; a block that is not outlined.
@@ -104,4 +104,4 @@ entry:
 
 ; CHECK-LABEL: @caller_with_signext
 ; CHECK: codeRepl.i:
-; CHECK-NEXT:  call void (i32*, ...) @vararg2.1_cond.end(i32* %foo, i32 signext 8)
+; CHECK-NEXT:  call void (i32*, ...) @vararg2.1.cond.end(i32* %foo, i32 signext 8)
diff --git a/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll b/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll
index 1a3d3ee4401f8fef1084cf7492c49ad87c471bad..02f695d3662605fef61b25e897a26090c4d95704 100644
--- a/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll
+++ b/test/Transforms/CodeExtractor/PartialInlineVarArgsDebug.ll
@@ -19,14 +19,14 @@ if.end:                                           ; preds = %if.then, %entry
 
 ; CHECK-LABEL: @caller
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void (i32, i32*, ...) @callee.1_if.then(i32 %v, i32* %mul.loc.i, i32 99), !dbg ![[DBG2:[0-9]+]]
+; CHECK-NEXT: call void (i32, i32*, ...) @callee.1.if.then(i32 %v, i32* %mul.loc.i, i32 99), !dbg ![[DBG2:[0-9]+]]
 define i32 @caller(i32 %v) !dbg !8 {
 entry:
   %call = call i32 (i32, ...) @callee(i32 %v, i32 99), !dbg !14
   ret i32 %call, !dbg !15
 }
 
-; CHECK-LABEL: define internal void @callee.1_if.then
+; CHECK-LABEL: define internal void @callee.1.if.then
 ; CHECK: br label %if.then, !dbg ![[DBG3:[0-9]+]]
 
 ; CHECK: ![[DBG1]] = !DILocation(line: 10, column: 7,
diff --git a/test/Transforms/CodeExtractor/SingleCondition.ll b/test/Transforms/CodeExtractor/SingleCondition.ll
index 4110cd95b7ee86a3ed0c382a4769cec254e97b81..334364484eefa1f32e58f9a771ac859f8df8cf7c 100644
--- a/test/Transforms/CodeExtractor/SingleCondition.ll
+++ b/test/Transforms/CodeExtractor/SingleCondition.ll
@@ -16,7 +16,7 @@ define internal i32 @dummyCaller(i1 %cond, i32* align 2 %align.val) {
 entry:
 ; CHECK-LABEL: @dummyCaller
 ; CHECK: br
-; CHECK: call void @inlinedFunc.1_ 
+; CHECK: call void @inlinedFunc.1.
   %val = call i32 @inlinedFunc(i1 %cond, i32* %align.val)
   ret i32 %val
 }
diff --git a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
index 0f8a71907d859b62fab4d2204b3f0986196a42f2..e6a5113261e906880691cea9827767c632c5f09a 100644
--- a/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
+++ b/test/Transforms/CodeExtractor/X86/InheritTargetAttributes.ll
@@ -36,5 +36,5 @@ entry:
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+sse4.1" }
 
-; CHECK: define {{.*}} @inlinedFunc.1_if.then{{.*}} [[COUNT1:#[0-9]+]]
+; CHECK: define {{.*}} @inlinedFunc.1.if.then{{.*}} [[COUNT1:#[0-9]+]]
 ; CHECK: [[COUNT1]] = { {{.*}} "target-cpu"="x86-64" "target-features"="+sse4.1" }
diff --git a/test/Transforms/CodeExtractor/cost.ll b/test/Transforms/CodeExtractor/cost.ll
index 4ac5acee019adcd4c1c6a2dfee6d20206092d36d..841b42b7c354a9e5c392560bd89508fba06bdd87 100644
--- a/test/Transforms/CodeExtractor/cost.ll
+++ b/test/Transforms/CodeExtractor/cost.ll
@@ -47,14 +47,14 @@ declare i32 @foo(i32* %arg)
 define i32 @dummy_caller(i32* %arg) local_unnamed_addr {
 ; CHECK-LABEL: @dummy_caller
   %tmp = call i32 @outline_region_notlikely(i32* %arg)
-; CHECK:  call void @outline_region_notlikely.2_bb1
+; CHECK:  call void @outline_region_notlikely.2.bb1
   %tmp2 = tail call i32 @outline_region_likely(i32* %arg)
 ; CHECK: %tmp2 = tail call i32 @outline_region_likely(i32* %arg)
   ret i32 %tmp
 
 }
 
-; CHECK-LABEL: define internal void @outline_region_notlikely.2_bb1(i32* %arg) {
+; CHECK-LABEL: define internal void @outline_region_notlikely.2.bb1(i32* %arg) {
 ; CHECK-NEXT: newFuncRoot:
 
 !llvm.module.flags = !{!0}
diff --git a/test/Transforms/CodeExtractor/cost_meta.ll b/test/Transforms/CodeExtractor/cost_meta.ll
index 2e4467a8d0c95eb8c7b42872b29f79111cd3eb30..ca1690a4c9f52f3279b9643232648eb979f60089 100644
--- a/test/Transforms/CodeExtractor/cost_meta.ll
+++ b/test/Transforms/CodeExtractor/cost_meta.ll
@@ -28,7 +28,7 @@ define i32 @dummy_caller(i32* %arg) local_unnamed_addr {
  }
 
 
-; CHECK-LABEL: define internal void @outline_region_notlikely.1_bb1(i32* %arg) {
+; CHECK-LABEL: define internal void @outline_region_notlikely.1.bb1(i32* %arg) {
 ; CHECK-NEXT: newFuncRoot:
 
 declare i32 @foo(i32 * %arg)
diff --git a/test/Transforms/CodeExtractor/inline_eh.ll b/test/Transforms/CodeExtractor/inline_eh.ll
index 4e0aa7a0d72cb068523db324b2d3f20e7281fe87..a69e0c30bb93b42b2ebba4a1510109c5f56ac372 100644
--- a/test/Transforms/CodeExtractor/inline_eh.ll
+++ b/test/Transforms/CodeExtractor/inline_eh.ll
@@ -42,11 +42,11 @@ entry:
 ; CHECK: entry:
 ; CHECK-NEXT: br i1
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee.1_{{.*}}()
+; CHECK-NEXT: call void @callee.1.{{.*}}()
   call void @callee(i1 %cond)
   ret void
 }
 
-; CHECK-LABEL: define {{.*}} @callee.1_{{.*}}() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+; CHECK-LABEL: define {{.*}} @callee.1.{{.*}}() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
 ; CHECK: invoke void @bar()
 ; CHECK: landingpad
diff --git a/test/Transforms/CodeExtractor/inline_eh_1.ll b/test/Transforms/CodeExtractor/inline_eh_1.ll
index 31e35839644f8d1bad32acf05ef06a37fa292cd7..b01abb6c1e8970c8dba65e551237e9043cf3801c 100644
--- a/test/Transforms/CodeExtractor/inline_eh_1.ll
+++ b/test/Transforms/CodeExtractor/inline_eh_1.ll
@@ -42,12 +42,12 @@ entry:
 ; CHECK: entry:
 ; CHECK-NEXT: br i1
 ; CHECK: codeRepl.i:
-; CHECK-NEXT: call void @callee.1_{{.*}}()
+; CHECK-NEXT: call void @callee.1.{{.*}}()
   call void @callee(i1 %cond)
   ret void
 }
 
-; CHECK-LABEL: define {{.*}} @callee.1_{{.*}}() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
+; CHECK-LABEL: define {{.*}} @callee.1.{{.*}}() personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 ; CHECK: invoke void @bar()
 ; CHECK: cleanuppad
 ; CHECK-NEXT: cleanupret
diff --git a/test/Transforms/CodeExtractor/live_shrink.ll b/test/Transforms/CodeExtractor/live_shrink.ll
index c25ed2b622cdca85e868ab44569b78b2c7e9e1de..780ab480c4bafd565aca033038daf23d01f4a8ac 100644
--- a/test/Transforms/CodeExtractor/live_shrink.ll
+++ b/test/Transforms/CodeExtractor/live_shrink.ll
@@ -41,13 +41,13 @@ bb:
 ; CHECK-NOT: llvm.lifetime
 ; CHECK: br i1
 ; CHECK: codeRepl.i:
-; CHECK: call void @_Z3foov.1_
+; CHECK: call void @_Z3foov.1.
 
   tail call void @_Z3foov()
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: newFuncRoot:
 ; CHECK-NEXT:  %tmp = alloca %class.A
 ; CHECK-NEXT:  %tmp1 = bitcast %class.A* %tmp to i8*
diff --git a/test/Transforms/CodeExtractor/live_shrink_gep.ll b/test/Transforms/CodeExtractor/live_shrink_gep.ll
index ac6aa4fbda43b0d30dc3dceb6a14551b35e3017f..aed86f84b66c8414a225bce259e655de95acac3a 100644
--- a/test/Transforms/CodeExtractor/live_shrink_gep.ll
+++ b/test/Transforms/CodeExtractor/live_shrink_gep.ll
@@ -42,12 +42,12 @@ bb:
 ; CHECK-NOT: llvm.lifetime
 ; CHECK: br i1
 ; CHECK: codeRepl.i:
-; CHECK: call void @_Z3foov.1_
+; CHECK: call void @_Z3foov.1.
   tail call void @_Z3foov()
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: newFuncRoot:
 ; CHECK-NEXT:  %tmp = alloca %class.A
 ; CHECK-NEXT:  %tmp1 = getelementptr
diff --git a/test/Transforms/CodeExtractor/live_shrink_hoist.ll b/test/Transforms/CodeExtractor/live_shrink_hoist.ll
index 1f57146c941889dbff14db0c0613cd86ebc3b466..13dab8d6b83a9278e53ff32e7179814219da57ac 100644
--- a/test/Transforms/CodeExtractor/live_shrink_hoist.ll
+++ b/test/Transforms/CodeExtractor/live_shrink_hoist.ll
@@ -50,7 +50,7 @@ bb:
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: bb9:
 ; CHECK: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %tmp1)
 ; CHECK:  br label %.exitStub
diff --git a/test/Transforms/CodeExtractor/live_shrink_multiple.ll b/test/Transforms/CodeExtractor/live_shrink_multiple.ll
index 8d9045c7267b1a153e3ab78d097978878c8c62cd..9350ca2ef9c8737394a4aebdf7f3c9453fb0bf62 100644
--- a/test/Transforms/CodeExtractor/live_shrink_multiple.ll
+++ b/test/Transforms/CodeExtractor/live_shrink_multiple.ll
@@ -42,7 +42,7 @@ bb:
   ret void
 }
 
-; CHECK-LABEL: define internal void @_Z3foov.1_
+; CHECK-LABEL: define internal void @_Z3foov.1.
 ; CHECK: newFuncRoot:
 ; CHECK-NEXT:  alloca 
 ; CHECK-NEXT:  bitcast 
diff --git a/test/Transforms/CodeExtractor/unreachable-block.ll b/test/Transforms/CodeExtractor/unreachable-block.ll
index 09f41f6bd2fbaf59d807959a6d8688e3baaa2074..7ce65f529a6bb5b47f0f4c085dd6daf8300fabb5 100644
--- a/test/Transforms/CodeExtractor/unreachable-block.ll
+++ b/test/Transforms/CodeExtractor/unreachable-block.ll
@@ -1,12 +1,12 @@
 ; RUN: opt -S -partial-inliner %s | FileCheck %s
 
 ; CHECK-LABEL: define void @dipsy(
-; CHECK-NEXT:   call void @tinkywinky.1_ontrue()
+; CHECK-NEXT:   call void @tinkywinky.1.ontrue()
 ; CHECK-NEXT:   call void @patatuccio()
 ; CHECK-NEXT:   ret void
 ; CHECK-NEXT: }
 
-; CHECK-LABEL: define internal void @tinkywinky.1_ontrue() {
+; CHECK-LABEL: define internal void @tinkywinky.1.ontrue() {
 ; CHECK-NEXT: newFuncRoot:
 ; CHECK-NEXT:   br label %ontrue
 ; CHECK: onfalse{{.*}}:
diff --git a/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
index a5878e00e034f7304df6c1eabb95d2e87ecae5d1..5cc00b75962e1ee4e6a06466f724eaad2b048f8a 100644
--- a/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -145,3 +145,31 @@ while_body:
 while_end:
   ret void
 }
+
+declare i8* @llvm.strip.invariant.group.p0i8(i8*)
+
+define void @test_invariant_group(i32) {
+; CHECK-LABEL: test_invariant_group
+  br i1 undef, label %8, label %7
+
+; <label>:2:                                      ; preds = %8, %2
+  br i1 undef, label %2, label %7
+
+; <label>:3:                                      ; preds = %8
+  %4 = getelementptr inbounds i8, i8* %9, i32 40000
+  %5 = bitcast i8* %4 to i64*
+  br i1 undef, label %7, label %6
+
+; <label>:6:                                      ; preds = %3
+  store i64 1, i64* %5, align 8
+  br label %7
+
+; <label>:7:                                      ; preds = %6, %3, %2, %1
+  ret void
+
+; <label>:8:                                      ; preds = %1
+  %9 = call i8* @llvm.strip.invariant.group.p0i8(i8* nonnull undef)
+  %10 = icmp eq i32 %0, 0
+  br i1 %10, label %3, label %2
+}
+
diff --git a/test/Transforms/ConstProp/calls-math-finite.ll b/test/Transforms/ConstProp/calls-math-finite.ll
index 93741612fc5ba4e254a2748bb1cb72ad118e8ac3..d13b798bde21fe299aeed0dc68855e9511ac50da 100644
--- a/test/Transforms/ConstProp/calls-math-finite.ll
+++ b/test/Transforms/ConstProp/calls-math-finite.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -constprop -S | FileCheck %s
+; RUN: opt < %s -constprop -S -mtriple=unknown-unknown-linux-musl | FileCheck -check-prefix=MUSL %s
 
 ; Test to verify constant folding can occur when math routines are mapped
 ; to the __<func>_finite versions of functions due to __FINITE_MATH_ONLY__
@@ -57,6 +58,48 @@ define void @T() {
 ; CHECK-NEXT:    store float 0x40240926E0000000, float* [[SLOTF]]
 ; CHECK-NEXT:    ret void
 ;
+; MUSL-LABEL: @T(
+; MUSL-NEXT:    [[SLOT:%.*]] = alloca double
+; MUSL-NEXT:    [[SLOTF:%.*]] = alloca float
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+; MUSL-NEXT:    call
+; MUSL-NEXT:    store
+
   %slot = alloca double
   %slotf = alloca float
 
diff --git a/test/Transforms/ConstantHoisting/X86/bad-cases.ll b/test/Transforms/ConstantHoisting/X86/bad-cases.ll
new file mode 100644
index 0000000000000000000000000000000000000000..00890942096233313223cf23ce3609da83860c1f
--- /dev/null
+++ b/test/Transforms/ConstantHoisting/X86/bad-cases.ll
@@ -0,0 +1,47 @@
+; RUN: opt -consthoist -S < %s | FileCheck %s
+target triple = "x86_64--"
+
+; We don't want to convert constant divides because the benefit from converting
+; them to a mul in the backend is larget than constant materialization savings.
+define void @signed_const_division(i64 %in1, i64 %in2, i64* %addr) {
+; CHECK-LABEL: @signed_const_division
+; CHECK: %res1 = sdiv i64 %l1, 4294967296
+; CHECK: %res2 = srem i64 %l2, 4294967296
+entry:
+  br label %loop
+
+loop:
+  %l1 = phi i64 [%res1, %loop], [%in1, %entry]
+  %l2 = phi i64 [%res2, %loop], [%in2, %entry]
+  %res1 = sdiv i64 %l1, 4294967296
+  store volatile i64 %res1, i64* %addr
+  %res2 = srem i64 %l2, 4294967296
+  store volatile i64 %res2, i64* %addr
+  %again = icmp eq i64 %res1, %res2
+  br i1 %again, label %loop, label %end
+
+end:
+  ret void
+}
+
+define void @unsigned_const_division(i64 %in1, i64 %in2, i64* %addr) {
+; CHECK-LABEL: @unsigned_const_division
+; CHECK: %res1 = udiv i64 %l1, 4294967296
+; CHECK: %res2 = urem i64 %l2, 4294967296
+
+entry:
+  br label %loop
+
+loop:
+  %l1 = phi i64 [%res1, %loop], [%in1, %entry]
+  %l2 = phi i64 [%res2, %loop], [%in2, %entry]
+  %res1 = udiv i64 %l1, 4294967296
+  store volatile i64 %res1, i64* %addr
+  %res2 = urem i64 %l2, 4294967296
+  store volatile i64 %res2, i64* %addr
+  %again = icmp eq i64 %res1, %res2
+  br i1 %again, label %loop, label %end
+
+end:
+  ret void
+}
diff --git a/test/Transforms/FunctionImport/Inputs/import_stats.ll b/test/Transforms/FunctionImport/Inputs/import_stats.ll
new file mode 100644
index 0000000000000000000000000000000000000000..818fbf20d6ff698448d3d592f18b2ea6643a652c
--- /dev/null
+++ b/test/Transforms/FunctionImport/Inputs/import_stats.ll
@@ -0,0 +1,16 @@
+; ModuleID = 'import_stats2.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@globalvar = global i32 1, align 4
+
+define void @hot() {
+  store i32 0, i32* @globalvar, align 4
+  ret void
+}
+define void @critical() {
+  ret void
+}
+define void @none() {
+  ret void
+}
diff --git a/test/Transforms/FunctionImport/import_stats.ll b/test/Transforms/FunctionImport/import_stats.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2cb415d1e960b9a88f27932c74dc50983dfc35a3
--- /dev/null
+++ b/test/Transforms/FunctionImport/import_stats.ll
@@ -0,0 +1,71 @@
+; Test to check thin link importing stats
+
+; -stats requires asserts
+; REQUIRES: asserts
+
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: opt -module-summary %p/Inputs/import_stats.ll -o %t2.bc
+
+; Test thin link stats with both new and old LTO
+; RUN: llvm-lto -thinlto-action=run -stats %t.bc %t2.bc \
+; RUN:		2>&1 | FileCheck %s --check-prefix=THINLINKSTATS
+; RUN: llvm-lto2 run -stats -o %t3 %t.bc %t2.bc \
+; RUN:          -r %t.bc,hot_function,plx \
+; RUN:          -r %t.bc,hot, \
+; RUN:          -r %t.bc,critical, \
+; RUN:          -r %t.bc,none, \
+; RUN:          -r %t2.bc,hot,plx \
+; RUN:          -r %t2.bc,critical,plx \
+; RUN:          -r %t2.bc,none,plx \
+; RUN:          -r %t2.bc,globalvar,plx \
+; RUN:          2>&1 | FileCheck %s --check-prefix=THINLINKSTATS
+
+; THINLINKSTATS-DAG: 1 function-import   - Number of global variables thin link decided to import
+; THINLINKSTATS-DAG: 1 function-import  - Number of critical functions thin link decided to import
+; THINLINKSTATS-DAG: 3 function-import  - Number of functions thin link decided to import
+; THINLINKSTATS-DAG: 1 function-import  - Number of hot functions thin link decided to import
+
+; ModuleID = 'import_stats.ll'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This function has a high profile count, so entry block is hot.
+define void @hot_function(i1 %a) !prof !20 {
+entry:
+  call void @hot()
+  call void @critical()
+  br i1 %a, label %None1, label %None2, !prof !42
+None1:          ; half goes here
+  call void @none()
+  br label %exit
+None2:          ; half goes here
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @hot()
+declare void @none()
+declare void @critical()
+
+!42 = !{!"branch_weights", i32 1, i32 1}
+
+!llvm.module.flags = !{!1}
+!20 = !{!"function_entry_count", i64 100, i64 696010031887058302}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 300}
+!5 = !{!"MaxCount", i64 100}
+!6 = !{!"MaxInternalCount", i64 100}
+!7 = !{!"MaxFunctionCount", i64 100}
+!8 = !{!"NumCounts", i64 4}
+!9 = !{!"NumFunctions", i64 1}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 4}
diff --git a/test/Transforms/GVN/opt-remarks.ll b/test/Transforms/GVN/opt-remarks.ll
index 6919528bb832f0f56de278685310e12a8ebff062..120ff36f20482aeb36365cf2353cc17c1e1d93d9 100644
--- a/test/Transforms/GVN/opt-remarks.ll
+++ b/test/Transforms/GVN/opt-remarks.ll
@@ -49,7 +49,7 @@
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            gvn
 ; YAML-NEXT: Name:            LoadClobbered
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 3 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 3 }
 ; YAML-NEXT: Function:        may_alias
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'load of type '
@@ -57,10 +57,10 @@
 ; YAML-NEXT:   - String:          ' not eliminated'
 ; YAML-NEXT:   - String:          ' in favor of '
 ; YAML-NEXT:   - OtherAccess:     load
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 13 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 13 }
 ; YAML-NEXT:   - String:          ' because it is clobbered by '
 ; YAML-NEXT:   - ClobberedBy:     store
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 2, Column: 10 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 2, Column: 10 }
 ; YAML-NEXT: ...
 
 define i32 @arg(i32* %p, i32 %i) {
diff --git a/test/Transforms/GVN/range.ll b/test/Transforms/GVN/range.ll
index 39acc0c351570711d5dcda8af4332d516d144ab2..fd5fa56b617dac3a61302c836649bbccf2519490 100644
--- a/test/Transforms/GVN/range.ll
+++ b/test/Transforms/GVN/range.ll
@@ -2,7 +2,7 @@
 
 define i32 @test1(i32* %p) {
 ; CHECK-LABEL: @test1(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range !0
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !0
@@ -12,8 +12,7 @@ define i32 @test1(i32* %p) {
 
 define i32 @test2(i32* %p) {
 ; CHECK-LABEL: @test2(i32* %p)
-; CHECK: %a = load i32, i32* %p
-; CHECK-NOT: range
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p
@@ -23,7 +22,7 @@ define i32 @test2(i32* %p) {
 
 define i32 @test3(i32* %p) {
 ; CHECK-LABEL: @test3(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[DISJOINT_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !1
@@ -33,7 +32,7 @@ define i32 @test3(i32* %p) {
 
 define i32 @test4(i32* %p) {
 ; CHECK-LABEL: @test4(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !2
@@ -43,7 +42,7 @@ define i32 @test4(i32* %p) {
 
 define i32 @test5(i32* %p) {
 ; CHECK-LABEL: @test5(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_SIGNED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE3:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !3
   %b = load i32, i32* %p, !range !4
@@ -53,7 +52,7 @@ define i32 @test5(i32* %p) {
 
 define i32 @test6(i32* %p) {
 ; CHECK-LABEL: @test6(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST6:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE5:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !5
   %b = load i32, i32* %p, !range !6
@@ -63,7 +62,7 @@ define i32 @test6(i32* %p) {
 
 define i32 @test7(i32* %p) {
 ; CHECK-LABEL: @test7(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST7:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE7:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !7
   %b = load i32, i32* %p, !range !8
@@ -73,7 +72,7 @@ define i32 @test7(i32* %p) {
 
 define i32 @test8(i32* %p) {
 ; CHECK-LABEL: @test8(i32* %p)
-; CHECK: %a = load i32, i32* %p
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE9:[0-9]+]]
 ; CHECK-NOT: range
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !9
@@ -82,11 +81,11 @@ define i32 @test8(i32* %p) {
   ret i32 %c
 }
 
-; CHECK: ![[DISJOINT_RANGE]] = !{i32 0, i32 2, i32 3, i32 5}
-; CHECK: ![[MERGED_RANGE]] = !{i32 0, i32 5}
-; CHECK: ![[MERGED_SIGNED_RANGE]] = !{i32 -5, i32 -2, i32 1, i32 5}
-; CHECK: ![[MERGED_TEST6]] = !{i32 10, i32 1}
-; CHECK: ![[MERGED_TEST7]] = !{i32 3, i32 4, i32 5, i32 2}
+; CHECK: ![[RANGE0]] = !{i32 0, i32 2}
+; CHECK: ![[RANGE3]] = !{i32 -5, i32 -2}
+; CHECK: ![[RANGE5]] = !{i32 10, i32 1}
+; CHECK: ![[RANGE7]] = !{i32 1, i32 2, i32 3, i32 4}
+; CHECK: ![[RANGE9]] = !{i32 1, i32 5}
 
 !0 = !{i32 0, i32 2}
 !1 = !{i32 3, i32 5}
diff --git a/test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll b/test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll
new file mode 100644
index 0000000000000000000000000000000000000000..71019128bb1ba662e65ae8dd50d81cd308d069d8
--- /dev/null
+++ b/test/Transforms/GlobalOpt/shrink-global-to-bool-check-debug.ll
@@ -0,0 +1,22 @@
+;RUN: opt -S -debugify -globalopt -f %s | FileCheck %s
+
+@foo = internal global i32 0, align 4
+
+define dso_local i32 @bar() {
+entry:
+  store i32 5, i32* @foo, align 4
+  %0 = load i32, i32* @foo, align 4
+  ret i32 %0
+}
+
+;CHECK:      @bar
+;CHECK-NEXT: entry:
+;CHECK-NEXT:   store i1 true, i1* @foo, !dbg ![[DbgLocStore:[0-9]+]]
+;CHECK-NEXT:   %.b = load i1, i1* @foo, !dbg ![[DbgLocLoadSel:[0-9]+]]
+;CHECK-NEXT:   %0 = select i1 %.b, i32 5, i32 0, !dbg ![[DbgLocLoadSel]]
+;CHECK-NEXT:   call void @llvm.dbg.value({{.*}}), !dbg ![[DbgLocLoadSel]]
+;CHECK-NEXT:   ret i32 %0, !dbg ![[DbgLocRet:[0-9]+]]
+
+;CHECK: ![[DbgLocStore]] = !DILocation(line: 1,
+;CHECK: ![[DbgLocLoadSel]] = !DILocation(line: 2,
+;CHECK: ![[DbgLocRet]] = !DILocation(line: 3,
diff --git a/test/Transforms/HotColdSplit/X86/lit.local.cfg b/test/Transforms/HotColdSplit/X86/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..e71f3cc4c41e708fa77255cb5ac82eb42037e31c
--- /dev/null
+++ b/test/Transforms/HotColdSplit/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'X86' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/HotColdSplit/X86/outline-expensive.ll b/test/Transforms/HotColdSplit/X86/outline-expensive.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5b0cceae2af321e0b35fdaab7d255e2d84fc631f
--- /dev/null
+++ b/test/Transforms/HotColdSplit/X86/outline-expensive.ll
@@ -0,0 +1,25 @@
+; The magic number 6 comes from (1 * TCC_Expensive) + (1 * CostOfCallX86).
+; RUN: opt -hotcoldsplit -min-outlining-thresh=6 -S < %s | FileCheck %s
+
+; Test that we outline even though there are only two cold instructions. TTI
+; should determine that they are expensive in terms of code size.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: @fun
+; CHECK: call void @fun.cold.1
+define void @fun(i32 %x) {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  %y = sdiv i32 %x, 111
+  call void @sink(i32 %y)
+  ret void
+}
+
+declare void @sink(i32 %x) cold
diff --git a/test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll b/test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll
new file mode 100644
index 0000000000000000000000000000000000000000..878db4863807591dbbb409f20f53b502415f6cde
--- /dev/null
+++ b/test/Transforms/HotColdSplit/delete-use-without-def-dbg-val.ll
@@ -0,0 +1,53 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK-NOT: call {{.*}}llvm.dbg.value
+
+; CHECK-LABEL: define {{.*}}@foo.cold
+; CHECK-NOT: call {{.*}}llvm.dbg.value
+
+define void @foo() !dbg !6 {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  ; We expect this block to be outlined. That kills the definition of %var.
+  %var = add i32 0, 0, !dbg !11
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  br label %cleanup
+
+cleanup:
+  ; This dbg.value should be deleted after outlining, otherwise the verifier
+  ; complains about function-local metadata being used outside of a function.
+  call void @llvm.dbg.value(metadata i32 %var, metadata !9, metadata !DIExpression()), !dbg !11
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+declare void @sink() cold
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!3, !4}
+!llvm.module.flags = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 7}
+!4 = !{i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocation(line: 1, column: 1, scope: !6)
diff --git a/test/Transforms/HotColdSplit/do-not-split.ll b/test/Transforms/HotColdSplit/do-not-split.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d5a8c44cc04f2b06a9eb00672e7d74fb47d81d50
--- /dev/null
+++ b/test/Transforms/HotColdSplit/do-not-split.ll
@@ -0,0 +1,105 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; Check that these functions are not split. Outlined functions are called from a
+; basic block named codeRepl.
+
+; The cold region is too small to split.
+; CHECK-LABEL: @foo
+; CHECK-NOT: foo.cold.1
+define void @foo() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+; The cold region is still too small to split.
+; CHECK-LABEL: @bar
+; CHECK-NOT: bar.cold.1
+define void @bar() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @sink()
+  call void @sink()
+  ret void
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+; Make sure we don't try to outline the entire function.
+; CHECK-LABEL: @fun
+; CHECK-NOT: fun.cold.1
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.end:                                           ; preds = %entry
+  ret void
+}
+
+; Don't outline infinite loops.
+; CHECK-LABEL: @infinite_loop
+; CHECK-NOT: infinite_loop.cold.1
+define void @infinite_loop() {
+entry:
+  br label %loop
+
+loop:
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  br label %loop
+}
+
+; Don't count debug intrinsics towards the outlining threshold.
+; CHECK-LABEL: @dont_count_debug_intrinsics
+; CHECK-NOT: dont_count_debug_intrinsics.cold.1
+define void @dont_count_debug_intrinsics(i32 %arg1) !dbg !6 {
+entry:
+  %var = add i32 0, 0, !dbg !11
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret void
+
+if.end:                                           ; preds = %entry
+  call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
+  call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
+  call void @sink()
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+declare void @sink() cold
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!3, !4}
+!llvm.module.flags = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 7}
+!4 = !{i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "dont_count_debug_intrinsics", linkageName: "dont_count_debug_intrinsics", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocation(line: 1, column: 1, scope: !6)
diff --git a/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll b/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..17001f95468602954ee04fc92f55d280e0858b10
--- /dev/null
+++ b/test/Transforms/HotColdSplit/duplicate-phi-preds-crash.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+declare void @sideeffect(i64)
+
+declare i8* @realloc(i8* %ptr, i64 %size)
+
+declare void @free(i8* %ptr)
+
+declare void @sink() cold
+
+; CHECK-LABEL: define {{.*}}@realloc2(
+; CHECK: call {{.*}}@sideeffect(
+; CHECK: call {{.*}}@realloc(
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call {{.*}}@realloc2.cold.1(i64 %size, i8* %ptr)
+; CHECK-LABEL: cleanup:
+; CHECK-NEXT: phi i8* [ null, %if.then ], [ null, %codeRepl ], [ %call, %if.end ]
+define i8* @realloc2(i8* %ptr, i64 %size) {
+entry:
+  %0 = add i64 %size, -1
+  %1 = icmp ugt i64 %0, 184549375
+  br i1 %1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  call void @sideeffect(i64 %size)
+  br label %cleanup
+
+if.end:                                           ; preds = %entry
+  %call = call i8* @realloc(i8* %ptr, i64 %size)
+  %tobool1 = icmp eq i8* %call, null
+  br i1 %tobool1, label %if.then2, label %cleanup
+
+if.then2:                                         ; preds = %if.end
+  call void @sideeffect(i64 %size)
+  call void @sink()
+  %tobool3 = icmp eq i8* %ptr, null
+  br i1 %tobool3, label %cleanup, label %if.then4
+
+if.then4:                                         ; preds = %if.then2
+  call void @free(i8* %ptr)
+  br label %cleanup
+
+cleanup:                                          ; preds = %if.end, %if.then4, %if.then2, %if.then
+  %retval.0 = phi i8* [ null, %if.then ], [ null, %if.then2 ], [ null, %if.then4 ], [ %call, %if.end ]
+  ret i8* %retval.0
+}
+
+; CHECK-LABEL: define {{.*}}@realloc2.cold.1(
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+; CHECK: call {{.*}}@free
diff --git a/test/Transforms/HotColdSplit/eh-typeid-for.ll b/test/Transforms/HotColdSplit/eh-typeid-for.ll
new file mode 100644
index 0000000000000000000000000000000000000000..75f9e672332e21eef0fece6f7efffe3df16507e8
--- /dev/null
+++ b/test/Transforms/HotColdSplit/eh-typeid-for.ll
@@ -0,0 +1,26 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+; Do not outline calls to @llvm.eh.typeid.for. See llvm.org/PR39545.
+
+@_ZTIi = external constant i8*
+
+; CHECK-LABEL: @fun
+; CHECK-NOT: call {{.*}}@fun.cold.1
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  %t = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
+}
+
+declare void @sink() cold
+
+declare i32 @llvm.eh.typeid.for(i8*)
diff --git a/test/Transforms/HotColdSplit/minsize.ll b/test/Transforms/HotColdSplit/minsize.ll
new file mode 100644
index 0000000000000000000000000000000000000000..69cd0979b94596c5458c85e713a693373bc27654
--- /dev/null
+++ b/test/Transforms/HotColdSplit/minsize.ll
@@ -0,0 +1,25 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: @fun
+; CHECK: call void @fun.cold.1
+define void @fun() {
+entry:
+  br i1 undef, label %if.then, label %if.else
+
+if.then:
+  ret void
+
+if.else:
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
+}
+
+declare void @sink() cold
+
+; CHECK: define {{.*}} @fun.cold.1{{.*}}#[[outlined_func_attr:[0-9]+]]
+; CHECK: attributes #[[outlined_func_attr]] = { {{.*}}minsize
diff --git a/test/Transforms/HotColdSplit/multiple-exits.ll b/test/Transforms/HotColdSplit/multiple-exits.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2e7cf84f72e49131c3a7891f849911d51a3c15ef
--- /dev/null
+++ b/test/Transforms/HotColdSplit/multiple-exits.ll
@@ -0,0 +1,73 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+;   if (cond) { //< Start outlining here.
+;     sink();
+;     if (cond > 10)
+;       goto exit1;
+;     else
+;       goto exit2;
+;   }
+; exit1:
+;   sideeffect(1);
+;   return;
+; exit2:
+;   sideeffect(2);
+;   return;
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %exit1, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK: [[targetBlock:%.*]] = call i1 @foo.cold.1(
+; CHECK-NEXT: br i1 [[targetBlock]], label %exit1, label %[[return:.*]]
+; CHECK-LABEL: exit1:
+; CHECK: call {{.*}}@sideeffect(i32 1)
+; CHECK: [[return]]:
+; CHECK-NEXT: ret void
+define void @foo(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %exit1, label %if.then
+
+if.then:                                          ; preds = %entry
+  tail call void (...) @sink()
+  %cmp = icmp sgt i32 %cond, 10
+  br i1 %cmp, label %exit1, label %exit2
+
+exit1:                                            ; preds = %entry, %if.then
+  call void @sideeffect(i32 1)
+  br label %return
+
+exit2:                                            ; preds = %if.then
+  call void @sideeffect(i32 2)
+  br label %return
+
+return:                                           ; preds = %exit2, %exit1
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1(
+; TODO: Eliminate this unnecessary unconditional branch.
+; CHECK: br
+; CHECK: [[exit1Stub:.*]]:
+; CHECK-NEXT: ret i1 true
+; CHECK: [[returnStub:.*]]:
+; CHECK-NEXT: ret i1 false
+; CHECK: call {{.*}}@sink
+; CHECK-NEXT: [[cmp:%.*]] = icmp
+; CHECK-NEXT: br i1 [[cmp]], label %[[exit1Stub]], label %exit2
+; CHECK-LABEL: exit2:
+; CHECK-NEXT: call {{.*}}@sideeffect(i32 2)
+; CHECK-NEXT: br label %[[returnStub]]
+
+declare void @sink(...) cold
+
+declare void @sideeffect(i32)
diff --git a/test/Transforms/HotColdSplit/outline-if-then-else.ll b/test/Transforms/HotColdSplit/outline-if-then-else.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bbde7651e28121ac1da59bdf2c61fc372398dcbb
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-if-then-else.ll
@@ -0,0 +1,64 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+;   if (cond) { //< Start outlining here.
+;     if (cond > 10)
+;       sideeffect(0);
+;     else
+;       sideeffect(1);
+;     sink();
+;   }
+;   sideeffect(2);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %codeRepl, label %if.end2
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end2:
+; CHECK: call void @sideeffect(i32 2)
+define void @foo(i32 %cond) {
+entry:
+  %cond.addr = alloca i32
+  store i32 %cond, i32* %cond.addr
+  %0 = load i32, i32* %cond.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end2
+
+if.then:                                          ; preds = %entry
+  %1 = load i32, i32* %cond.addr
+  %cmp = icmp sgt i32 %1, 10
+  br i1 %cmp, label %if.then1, label %if.else
+
+if.then1:                                         ; preds = %if.then
+  call void @sideeffect(i32 0)
+  br label %if.end
+
+if.else:                                          ; preds = %if.then
+  call void @sideeffect(i32 1)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then1
+  call void (...) @sink()
+  ret void
+
+if.end2:                                          ; preds = %entry
+  call void @sideeffect(i32 2)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sideeffect
+; CHECK: call {{.*}}@sink
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) cold
diff --git a/test/Transforms/HotColdSplit/outline-while-loop.ll b/test/Transforms/HotColdSplit/outline-while-loop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2a132bda7f00a29f5a9ea39d7c5785b1eb6107d3
--- /dev/null
+++ b/test/Transforms/HotColdSplit/outline-while-loop.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -hotcoldsplit < %s | FileCheck %s
+
+; Source:
+;
+; extern void sideeffect(int);
+; extern void __attribute__((cold)) sink();
+; void foo(int cond) {
+;   if (cond) { //< Start outlining here.
+;     while (cond > 10) {
+;       --cond;
+;       sideeffect(0);
+;     }
+;     sink();
+;   }
+;   sideeffect(1);
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo(
+; CHECK: br i1 {{.*}}, label %if.end, label %codeRepl
+; CHECK-LABEL: codeRepl:
+; CHECK-NEXT: call void @foo.cold.1
+; CHECK-LABEL: if.end:
+; CHECK: call void @sideeffect(i32 1)
+define void @foo(i32 %cond) {
+entry:
+  %tobool = icmp eq i32 %cond, 0
+  br i1 %tobool, label %if.end, label %while.cond.preheader
+
+while.cond.preheader:                             ; preds = %entry
+  %cmp3 = icmp sgt i32 %cond, 10
+  br i1 %cmp3, label %while.body.preheader, label %while.end
+
+while.body.preheader:                             ; preds = %while.cond.preheader
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %cond.addr.04 = phi i32 [ %dec, %while.body ], [ %cond, %while.body.preheader ]
+  %dec = add nsw i32 %cond.addr.04, -1
+  tail call void @sideeffect(i32 0) #3
+  %cmp = icmp sgt i32 %dec, 10
+  br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %while.cond.preheader
+  tail call void (...) @sink()
+  ret void
+
+if.end:                                           ; preds = %entry
+  tail call void @sideeffect(i32 1)
+  ret void
+}
+
+; CHECK-LABEL: define {{.*}}@foo.cold.1
+; CHECK: phi i32
+; CHECK-NEXT: add nsw i32
+; CHECK-NEXT: call {{.*}}@sideeffect
+; CHECK-NEXT: icmp
+; CHECK-NEXT: br
+
+declare void @sideeffect(i32)
+
+declare void @sink(...) cold
diff --git a/test/Transforms/HotColdSplit/split-cold-1.ll b/test/Transforms/HotColdSplit/split-cold-1.ll
deleted file mode 100644
index 60ec234ab83b275d41d8edcd33ed2965e1dd4c2d..0000000000000000000000000000000000000000
--- a/test/Transforms/HotColdSplit/split-cold-1.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
-; RUN: opt -passes=hotcoldsplit -S < %s | FileCheck %s
-
-; Outlined function is called from a basic block named codeRepl
-; CHECK: codeRepl:
-; CHECK-NEXT: call void @foo
-define void @foo() {
-entry:
-  br i1 undef, label %if.then, label %if.end
-
-if.then:                                          ; preds = %entry
-  unreachable
-
-if.end:                                           ; preds = %entry
-  br label %if.then12
-
-if.then12:                                        ; preds = %if.end
-  br label %cleanup40
-
-cleanup40:                                        ; preds = %if.then12
-  br label %return
-
-return:                                           ; preds = %cleanup40
-  ret void
-}
diff --git a/test/Transforms/HotColdSplit/split-cold-2.ll b/test/Transforms/HotColdSplit/split-cold-2.ll
index 101bc11cba9cd3dbd5cc1395c1e7ca7c20f3e173..ac7d856608cfb33e67e6f1a7e328606e3e6e760d 100644
--- a/test/Transforms/HotColdSplit/split-cold-2.ll
+++ b/test/Transforms/HotColdSplit/split-cold-2.ll
@@ -1,15 +1,20 @@
-; RUN: opt -hotcoldsplit -S < %s
-; RUN: opt -passes=hotcoldsplit -S < %s
+; RUN: opt -hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=hotcoldsplit -pass-remarks=hotcoldsplit -S < %s 2>&1 | FileCheck %s
 
 ; Make sure this compiles. This test used to fail with an invalid phi node: the
 ; two predecessors were outlined and the SSA representation was invalid.
 
+; CHECK: remark: <unknown>:0:0: fun split cold code into fun.cold.1
+; CHECK-LABEL: @fun
+; CHECK: codeRepl:
+; CHECK-NEXT: call void @fun.cold.1
+
 define void @fun() {
 entry:
   br i1 undef, label %if.then, label %if.else
 
 if.then:
-  unreachable
+  ret void
 
 if.else:
   br label %if.then4
diff --git a/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..becfaf8e63d3f728c6a54c0f6035b382a1f98eca
--- /dev/null
+++ b/test/Transforms/HotColdSplit/split-out-dbg-val-of-arg.ll
@@ -0,0 +1,44 @@
+; RUN: opt -hotcoldsplit -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+
+; CHECK-LABEL: define {{.*}}@foo.cold
+; CHECK-NOT: llvm.dbg.value
+
+define void @foo(i32 %arg1) !dbg !6 {
+entry:
+  %var = add i32 0, 0, !dbg !11
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret void
+
+if.end:                                           ; preds = %entry
+  call void @llvm.dbg.value(metadata i32 %arg1, metadata !9, metadata !DIExpression()), !dbg !11
+  call void @sink()
+  call void @sink()
+  call void @sink()
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+declare void @sink() cold
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!3, !4}
+!llvm.module.flags = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 7}
+!4 = !{i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: null, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !6, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty32", size: 32, encoding: DW_ATE_unsigned)
+!11 = !DILocation(line: 1, column: 1, scope: !6)
diff --git a/test/Transforms/IndVarSimplify/constant_result.ll b/test/Transforms/IndVarSimplify/constant_result.ll
new file mode 100644
index 0000000000000000000000000000000000000000..749c4af07aea7a05d3c2376b107d91344b527aa7
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/constant_result.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+@Y = global [400 x i16] zeroinitializer, align 1
+
+define i16 @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [400 x i16], [400 x i16]* @Y, i16 0, i16 [[I]]
+; CHECK-NEXT:    store i16 0, i16* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i16 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i16 [[INC]], 400
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i16 400
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i = phi i16 [ 0, %entry ], [ %inc, %for.body ]
+
+  %arrayidx = getelementptr inbounds [400 x i16], [400 x i16]* @Y, i16 0, i16 %i
+  store i16 0, i16* %arrayidx, align 1
+  %inc = add nuw nsw i16 %i, 1
+  %cmp = icmp ult i16 %inc, 400
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  %inc.lcssa = phi i16 [ %inc, %for.body ]
+  ret i16 %inc.lcssa
+}
diff --git a/test/Transforms/IndVarSimplify/dont-recompute.ll b/test/Transforms/IndVarSimplify/dont-recompute.ll
index 713a55154ba8834341ec90ed16e5e226e13f9652..c87cd6596c623b0316ed203db81a665f4ebaa76a 100644
--- a/test/Transforms/IndVarSimplify/dont-recompute.ll
+++ b/test/Transforms/IndVarSimplify/dont-recompute.ll
@@ -97,3 +97,29 @@ for.end:                                          ; preds = %for.body
   tail call void @func(i32 %add)
   ret void
 }
+
+; CHECK-LABEL: @test4(
+define void @test4(i32 %m) nounwind uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %a.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %add = add i32 %a.05, %m
+; CHECK: tail call void @func(i32 %add)
+  tail call void @func(i32 %add)
+  %inc = add nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 186
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+; CHECK: for.end:
+; CHECK-NOT: mul i32 %m, 186
+; CHECK:%add.lcssa = phi i32 [ %add, %for.body ]
+; CHECK-NEXT: %soft_use = add i32 %add.lcssa, 123
+; CHECK-NEXT: tail call void @func(i32 %soft_use)
+  %soft_use = add i32 %add, 123
+  tail call void @func(i32 %soft_use)
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/drop-exact.ll b/test/Transforms/IndVarSimplify/drop-exact.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ab5b2b5a859aba25ebd1e6f7b83c60c58fa1432c
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/drop-exact.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; We make a transform by getting rid of add nsw i32 %tmp17, -1; make sure that
+; we drop "exact" flag on lshr as we do it.
+define void @drop_exact(i32* %p, i64* %p1) {
+; CHECK-LABEL: @drop_exact(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    ret void
+; CHECK:       bb12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP13]], -1
+; CHECK-NEXT:    [[TMP16:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP17]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = urem i32 [[TMP19]], 250
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr i32 [[TMP17]], 1
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT:    store i64 [[TMP26]], i64* [[P1:%.*]], align 4
+; CHECK-NEXT:    [[TMP42]] = add nuw nsw i32 [[TMP14]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]]
+;
+bb:
+  br label %bb12
+
+bb7:                                              ; preds = %bb12
+  ret void
+
+bb12:                                             ; preds = %bb12, %bb
+  %tmp13 = phi i32 [ -47436, %bb ], [ %tmp15, %bb12 ]
+  %tmp14 = phi i32 [ 0, %bb ], [ %tmp42, %bb12 ]
+  %tmp15 = add i32 %tmp13, -1
+  %tmp16 = shl i32 %tmp15, 1
+  %tmp17 = sub i32 42831, %tmp16
+  %tmp19 = lshr i32 %tmp17, 1
+  %tmp20 = urem i32 %tmp19, 250
+  %tmp21 = add nsw i32 %tmp17, -1
+  %tmp22 = lshr exact i32 %tmp21, 1
+  store i32 %tmp22, i32* %p, align 4
+  %tmp26 = zext i32 %tmp20 to i64
+  store i64 %tmp26, i64* %p1, align 4
+  %tmp42 = add nuw nsw i32 %tmp14, 1
+  %tmp43 = icmp ugt i32 %tmp14, 717
+  br i1 %tmp43, label %bb7, label %bb12
+}
+
+; Throw away add nsw i32 %tmp17, 0, do not drop exact flag.
+define void @dont_drop_exact(i32* %p, i64* %p1) {
+; CHECK-LABEL: @dont_drop_exact(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB12:%.*]]
+; CHECK:       bb7:
+; CHECK-NEXT:    ret void
+; CHECK:       bb12:
+; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ]
+; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP13]], -1
+; CHECK-NEXT:    [[TMP16:%.*]] = shl i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]]
+; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP17]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = urem i32 [[TMP19]], 250
+; CHECK-NEXT:    [[TMP22:%.*]] = lshr exact i32 [[TMP17]], 1
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT:    store i64 [[TMP26]], i64* [[P1:%.*]], align 4
+; CHECK-NEXT:    [[TMP42]] = add nuw nsw i32 [[TMP14]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]]
+;
+bb:
+  br label %bb12
+
+bb7:                                              ; preds = %bb12
+  ret void
+
+bb12:                                             ; preds = %bb12, %bb
+  %tmp13 = phi i32 [ -47436, %bb ], [ %tmp15, %bb12 ]
+  %tmp14 = phi i32 [ 0, %bb ], [ %tmp42, %bb12 ]
+  %tmp15 = add i32 %tmp13, -1
+  %tmp16 = shl i32 %tmp15, 1
+  %tmp17 = sub i32 42831, %tmp16
+  %tmp19 = lshr i32 %tmp17, 1
+  %tmp20 = urem i32 %tmp19, 250
+  %tmp21 = add nsw i32 %tmp17, 0
+  %tmp22 = lshr exact i32 %tmp21, 1
+  store i32 %tmp22, i32* %p, align 4
+  %tmp26 = zext i32 %tmp20 to i64
+  store i64 %tmp26, i64* %p1, align 4
+  %tmp42 = add nuw nsw i32 %tmp14, 1
+  %tmp43 = icmp ugt i32 %tmp14, 717
+  br i1 %tmp43, label %bb7, label %bb12
+}
diff --git a/test/Transforms/InferFunctionAttrs/annotate.ll b/test/Transforms/InferFunctionAttrs/annotate.ll
index 37dfe41cfcb17dbe2deb3c1791c5b588e7442490..161873be56e2b2e05b8e53dcb56d53d758915eda 100644
--- a/test/Transforms/InferFunctionAttrs/annotate.ll
+++ b/test/Transforms/InferFunctionAttrs/annotate.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -mtriple=x86_64-- -inferattrs -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-- -passes=inferattrs -S | FileCheck %s
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-DARWIN %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LINUX %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -inferattrs -S | FileCheck -check-prefix=CHECK -check-prefix=CHECK-LINUX %s
 ; RUN: opt < %s -mtriple=nvptx -inferattrs -S | FileCheck -check-prefix=CHECK-NVPTX %s
 
 ; operator new routines
diff --git a/test/Transforms/Inline/ARM/loop-add.ll b/test/Transforms/Inline/ARM/loop-add.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a4717bc95b78d63f31c63f1bf1137664cdc815c6
--- /dev/null
+++ b/test/Transforms/Inline/ARM/loop-add.ll
@@ -0,0 +1,95 @@
+; RUN: opt -inline %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; CHECK-LABEL: void @doCalls
+define void @doCalls(i8* nocapture %p1, i8* nocapture %p2, i32 %n) #0 {
+entry:
+  %div = lshr i32 %n, 1
+; CHECK: call void @LoopCall
+  tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div) #0
+
+  %div2 = lshr i32 %n, 2
+; CHECK: call void @LoopCall
+  tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div2) #0
+
+; CHECK-NOT: call void @LoopCall
+  tail call void @LoopCall(i8* %p2, i8* %p1, i32 0) #0
+
+; CHECK-NOT: call void @LoopCall_internal
+  tail call void @LoopCall_internal(i8* %p1, i8* %p2, i32 %div2) #0
+
+  %div3 = lshr i32 %n, 4
+; CHECK-NOT: call void @SimpleCall
+  tail call void @SimpleCall(i8* %p2, i8* %p1, i32 %div3) #0
+  ret void
+}
+
+; CHECK-LABEL: define void @LoopCall
+define void @LoopCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  %c = icmp ne i32 %num, 0
+  br i1 %c, label %while.cond, label %while.end
+
+while.cond:                                       ; preds = %while.body, %entry
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; CHECK-LABEL-NOT: define void @LoopCall_internal
+define internal void @LoopCall_internal(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  %c = icmp ne i32 %num, 0
+  br i1 %c, label %while.cond, label %while.end
+
+while.cond:                                       ; preds = %while.body, %entry
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %1 = trunc i32 %num.addr.0 to i8
+  %conv1 = add i8 %0, %1
+  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %conv1, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; CHECK-LABEL: define void @SimpleCall
+define void @SimpleCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i8, i8* %source, i32 %num
+  %0 = load i8, i8* %arrayidx, align 1
+  %1 = xor i8 %0, 127
+  %arrayidx2 = getelementptr inbounds i8, i8* %dest, i32 %num
+  store i8 %1, i8* %arrayidx2, align 1
+  ret void
+}
+
+attributes #0 = { minsize optsize }
+
diff --git a/test/Transforms/Inline/ARM/loop-memcpy.ll b/test/Transforms/Inline/ARM/loop-memcpy.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3b3625c602796fa7cf51f38a4be6cf616874f253
--- /dev/null
+++ b/test/Transforms/Inline/ARM/loop-memcpy.ll
@@ -0,0 +1,87 @@
+; RUN: opt -inline %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; CHECK-LABEL: define void @matcpy
+define void @matcpy(i8* %dest, i8* %source, i32 %num) #0 {
+entry:
+  %0 = ptrtoint i8* %dest to i32
+  %1 = ptrtoint i8* %source to i32
+  %2 = xor i32 %0, %1
+  %3 = and i32 %2, 3
+  %cmp = icmp eq i32 %3, 0
+  br i1 %cmp, label %if.then, label %if.else20
+
+if.then:                                          ; preds = %entry
+  %sub = sub i32 0, %0
+  %and2 = and i32 %sub, 3
+  %add = or i32 %and2, 4
+  %cmp3 = icmp ugt i32 %add, %num
+  br i1 %cmp3, label %if.else, label %if.then4
+
+if.then4:                                         ; preds = %if.then
+  %sub5 = sub i32 %num, %and2
+  %shr = and i32 %sub5, -4
+  %sub7 = sub i32 %sub5, %shr
+  %tobool = icmp eq i32 %and2, 0
+  br i1 %tobool, label %if.end, label %if.then8
+
+if.then8:                                         ; preds = %if.then4
+; CHECK: call fastcc void @memcpy
+  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %and2) #0
+  %add.ptr = getelementptr inbounds i8, i8* %dest, i32 %and2
+  %add.ptr9 = getelementptr inbounds i8, i8* %source, i32 %and2
+  br label %if.end
+
+if.end:                                           ; preds = %if.then4, %if.then8
+  %p_dest.0 = phi i8* [ %add.ptr, %if.then8 ], [ %dest, %if.then4 ]
+  %p_source.0 = phi i8* [ %add.ptr9, %if.then8 ], [ %source, %if.then4 ]
+  %tobool14 = icmp eq i32 %sub7, 0
+  br i1 %tobool14, label %if.end22, label %if.then15
+
+if.then15:                                        ; preds = %if.end
+  %add.ptr13 = getelementptr inbounds i8, i8* %p_source.0, i32 %shr
+  %add.ptr11 = getelementptr inbounds i8, i8* %p_dest.0, i32 %shr
+; CHECK: call fastcc void @memcpy
+  call fastcc void @memcpy(i8* %add.ptr11, i8* %add.ptr13, i32 %sub7) #0
+  br label %if.end22
+
+if.else:                                          ; preds = %if.then
+  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0
+  br label %if.end22
+
+if.else20:                                        ; preds = %entry
+  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0
+  br label %if.end22
+
+if.end22:                                         ; preds = %if.then15, %if.end, %if.else, %if.else20
+  ret void
+}
+
+; CHECK-LABEL: define internal void @memcpy
+define internal void @memcpy(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
+  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %while.body ]
+  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i32 %num.addr.0, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
+  %0 = load i8, i8* %p_source.0, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
+  store i8 %0, i8* %p_dest.0, align 1
+  %dec = add i32 %num.addr.0, -1
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+attributes #0 = { minsize optsize }
+
diff --git a/test/Transforms/Inline/ARM/loop-noinline.ll b/test/Transforms/Inline/ARM/loop-noinline.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8438d16b03e85769096fc4e85919eb2cbabe4640
--- /dev/null
+++ b/test/Transforms/Inline/ARM/loop-noinline.ll
@@ -0,0 +1,49 @@
+; RUN: opt -inline %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; Check we don't inline loops at -Oz. They tend to be larger than we
+; expect.
+
+; CHECK: define i8* @H
+@digits = constant [16 x i8] c"0123456789ABCDEF", align 1
+define i8* @H(i8* %p, i32 %val, i32 %num) #0 {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %do.body ]
+  %val.addr.0 = phi i32 [ %val, %entry ], [ %shl, %do.body ]
+  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %do.body ]
+  %shr = lshr i32 %val.addr.0, 28
+  %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* @digits, i32 0, i32 %shr
+  %0 = load i8, i8* %arrayidx, align 1
+  %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i32 1
+  store i8 %0, i8* %p.addr.0, align 1
+  %shl = shl i32 %val.addr.0, 4
+  %dec = add i32 %num.addr.0, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  %scevgep = getelementptr i8, i8* %p, i32 %num
+  ret i8* %scevgep
+}
+
+define nonnull i8* @call1(i8* %p, i32 %val, i32 %num) #0 {
+entry:
+; CHECK: tail call i8* @H
+  %call = tail call i8* @H(i8* %p, i32 %val, i32 %num) #0
+  ret i8* %call
+}
+
+define nonnull i8* @call2(i8* %p, i32 %val) #0 {
+entry:
+; CHECK: tail call i8* @H
+  %call = tail call i8* @H(i8* %p, i32 %val, i32 32) #0
+  ret i8* %call
+}
+
+attributes #0 = { minsize optsize }
+
diff --git a/test/Transforms/Inline/cgscc-cycle.ll b/test/Transforms/Inline/cgscc-cycle.ll
index 69874c3ef2f9ace93b105ca7a54d4590d9488669..bc3bdc99fff2d488587ec90c3bfd794cfed0cc55 100644
--- a/test/Transforms/Inline/cgscc-cycle.ll
+++ b/test/Transforms/Inline/cgscc-cycle.ll
@@ -5,7 +5,7 @@
 ; some out-of-band way to prevent infinitely re-inlining and re-transforming the
 ; code.
 ;
-; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -S | FileCheck %s
+; RUN: opt < %s -passes='cgscc(inline,function(sroa,instcombine))' -inline-threshold=50 -S | FileCheck %s
 
 
 ; The `test1_*` collection of functions form a directly cycling pattern.
@@ -123,3 +123,110 @@ bb2:
 
   ret void
 }
+
+; Another infinite inlining case. The initial callgraph is like following:
+;
+;         test3_a <---> test3_b
+;             |         ^
+;             v         |
+;         test3_c <---> test3_d
+;
+; For all the call edges in the call graph, only test3_c and test3_d can be
+; inlined into test3_a, and no other call edge can be inlined.
+;
+; After test3_c is inlined into test3_a, the original call edge test3_a->test3_c
+; will be removed, a new call edge will be added and the call graph becomes:
+;
+;            test3_a <---> test3_b
+;                  \      ^
+;                   v    /
+;     test3_c <---> test3_d
+; But test3_a, test3_b, test3_c and test3_d still belong to the same SCC.
+;
+; Then after test3_a->test3_d is inlined, when test3_a->test3_d is converted to
+; a ref edge, the original SCC will be split into two: {test3_c, test3_d} and
+; {test3_a, test3_b}, immediately after the newly added ref edge
+; test3_a->test3_c will be converted to a call edge, and the two SCCs will be
+; merged into the original one again. During this cycle, the original SCC will
+; be added into UR.CWorklist again and this creates an infinite loop.
+
+@a = global i64 0
+@b = global i64 0
+
+define void @test3_c(i32 %i) {
+entry:
+  %cmp = icmp eq i32 %i, 5
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i64 @random()
+  %t0 = load i64, i64* @a
+  %add = add nsw i64 %t0, %call
+  store i64 %add, i64* @a
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  tail call void @test3_d(i32 %i)
+  %t6 = load i64, i64* @a
+  %add85 = add nsw i64 %t6, 1
+  store i64 %add85, i64* @a
+  ret void
+}
+
+declare i64 @random()
+
+define void @test3_d(i32 %i) {
+entry:
+  %cmp = icmp eq i32 %i, 5
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %call = tail call i64 @random()
+  %t0 = load i64, i64* @a
+  %add = add nsw i64 %t0, %call
+  store i64 %add, i64* @a
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  tail call void @test3_c(i32 %i)
+  tail call void @test3_b()
+  %t6 = load i64, i64* @a
+  %add79 = add nsw i64 %t6, 3
+  store i64 %add79, i64* @a
+  ret void
+}
+
+; Function Attrs: noinline
+define void @test3_b() #0 {
+entry:
+  tail call void @test3_a()
+  %t0 = load i64, i64* @a
+  %add = add nsw i64 %t0, 2
+  store i64 %add, i64* @a
+  ret void
+}
+
+; Check test3_c is inlined into test3_a once and only once.
+; CHECK-LABEL: @test3_a(
+; CHECK: tail call void @test3_b()
+; CHECK-NEXT: tail call void @test3_d(i32 5)
+; CHECK-NEXT: %[[LD1:.*]] = load i64, i64* @a
+; CHECK-NEXT: %[[ADD1:.*]] = add nsw i64 %[[LD1]], 1
+; CHECK-NEXT: store i64 %[[ADD1]], i64* @a
+; CHECK-NEXT: %[[LD2:.*]] = load i64, i64* @b
+; CHECK-NEXT: %[[ADD2:.*]] = add nsw i64 %[[LD2]], 5
+; CHECK-NEXT: store i64 %[[ADD2]], i64* @b
+; CHECK-NEXT: ret void
+
+; Function Attrs: noinline
+define void @test3_a() #0 {
+entry:
+  tail call void @test3_b()
+  tail call void @test3_c(i32 5)
+  %t0 = load i64, i64* @b
+  %add = add nsw i64 %t0, 5
+  store i64 %add, i64* @b
+  ret void
+}
+
+attributes #0 = { noinline }
diff --git a/test/Transforms/Inline/monster_scc.ll b/test/Transforms/Inline/monster_scc.ll
index 0f8f1f21c8b56a5dc94f18d2a7082ab8e6618d15..b32a2aed331e009d86f721bf7713cfbfa1d7753a 100644
--- a/test/Transforms/Inline/monster_scc.ll
+++ b/test/Transforms/Inline/monster_scc.ll
@@ -154,11 +154,7 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi3EEvPbS0_(
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
 ; NEW-NOT: call
@@ -198,19 +194,11 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1gi(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi3EEvPbS0_(
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi3EEvPbS0_(
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
 ; NEW-NOT: call
@@ -260,7 +248,7 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi4EEvPbS0_(
 ; NEW-NOT: call
 define void @_Z1fILb0ELi2EEvPbS0_(i8* %B, i8* %E) {
 entry:
@@ -304,21 +292,13 @@ if.end3:
 ; NEW-NOT: call
 ; NEW: call void @_Z1gi(
 ; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1gi(
+; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
+; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi4EEvPbS0_(
 ; NEW-NOT: call
 define void @_Z1fILb1ELi2EEvPbS0_(i8* %B, i8* %E) {
 entry:
@@ -433,15 +413,7 @@ entry:
 ; NEW-NOT: call
 ; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
 ; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1gi(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
-; NEW-NOT: call
-; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
+; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
 ; NEW-NOT: call
 define void @_Z1fILb1ELi4EEvPbS0_(i8* %B, i8* %E) {
 entry:
diff --git a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
index 0ac76354a2b7066308bff2eaa8e826db46e56970..8692abfaf197080d4b47b211341793715b1b43bd 100644
--- a/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-passed-yaml.ll
@@ -22,15 +22,15 @@
 ; YAML:      --- !Passed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            Inlined
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 10 }
 ; YAML-NEXT: Function:        bar
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 1, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 1, Column: 0 }
 ; YAML-NEXT:   - String: ' inlined into '
 ; YAML-NEXT:   - Caller: bar
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 3, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 3, Column: 0 }
 ; YAML-NEXT:   - String: ' with '
 ; YAML-NEXT:   - String: '(cost='
 ; YAML-NEXT:   - Cost: '{{[0-9\-]+}}'
diff --git a/test/Transforms/Inline/optimization-remarks-yaml.ll b/test/Transforms/Inline/optimization-remarks-yaml.ll
index cb366dbbdd3248cb3fd93a4ade00e2bce4be0162..10a93f5cd79a452a1bfb63cbfa7aedcd35ea01c3 100644
--- a/test/Transforms/Inline/optimization-remarks-yaml.ll
+++ b/test/Transforms/Inline/optimization-remarks-yaml.ll
@@ -52,27 +52,27 @@
 ; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 10 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 10 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: foo
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Missed
 ; YAML-NEXT: Pass:            inline
 ; YAML-NEXT: Name:            NoDefinition
-; YAML-NEXT: DebugLoc:        { File: /tmp/s.c, Line: 5, Column: 18 }
+; YAML-NEXT: DebugLoc:        { File: '/tmp/s.c', Line: 5, Column: 18 }
 ; YAML-NEXT: Function:        baz
 ; YAML-NEXT: Hotness:         30
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - Callee: bar
 ; YAML-NEXT:   - String: ' will not be inlined into '
 ; YAML-NEXT:   - Caller: baz
-; YAML-NEXT:     DebugLoc:        { File: /tmp/s.c, Line: 4, Column: 0 }
+; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 4, Column: 0 }
 ; YAML-NEXT:   - String: ' because its definition is unavailable'
 ; YAML-NEXT: ...
 
diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll
index 3266fa6e4434b887912bfb99bb300a0bd172e0ed..efc088637c47fec699bfd1eddda6a168432918d7 100644
--- a/test/Transforms/InstCombine/apint-shift.ll
+++ b/test/Transforms/InstCombine/apint-shift.ll
@@ -526,3 +526,22 @@ define i40 @test26(i40 %A) {
   %D = shl i40 %C, 1
   ret i40 %D
 }
+
+; OSS-Fuzz #9880
+; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=9880
+define i177 @ossfuzz_9880(i177 %X) {
+; CHECK-LABEL: @ossfuzz_9880(
+; CHECK-NEXT:    ret i177 1
+;
+  %A = alloca i177
+  %L1 = load i177, i177* %A
+  %B = or i177 0, -1
+  %B5 = udiv i177 %L1, %B
+  %B4 = add i177 %B5, %B
+  %B2 = add i177 %B, %B4
+  %B6 = mul i177 %B5, %B2
+  %B20 = shl i177 %L1, %B6
+  %B14 = sub i177 %B20, %B5
+  %B1 = udiv i177 %B14, %B6
+  ret i177 %B1
+}
diff --git a/test/Transforms/InstCombine/fcmp-select.ll b/test/Transforms/InstCombine/fcmp-select.ll
index e04ab3e8923095dcc04809eceebde72a42c3a6cb..7fc59bbcb7d3d1089dc6f306d561fc7ef0c4d459 100644
--- a/test/Transforms/InstCombine/fcmp-select.ll
+++ b/test/Transforms/InstCombine/fcmp-select.ll
@@ -1,53 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
+declare void @use(i1)
+
+; X == 42.0 ? X : 42.0 --> 42.0
+
+define double @oeq(double %x) {
+; CHECK-LABEL: @oeq(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret double 4.200000e+01
+;
+  %cmp = fcmp oeq double %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, double %x, double 42.0
+  ret double %cond
+}
+
+; X == 42.0 ? 42.0 : X --> X
+
+define float @oeq_swapped(float %x) {
+; CHECK-LABEL: @oeq_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret float [[X]]
+;
+  %cmp = fcmp oeq float %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, float 42.0, float %x
+  ret float %cond
+}
+
 ; x != y ? x : y -> x if it's the right kind of != and at least
 ; one of x and y is not negative zero.
 
-; CHECK: f0
-; CHECK: ret double %x
-define double @f0(double %x) nounwind readnone {
-entry:
-  %cmp = fcmp une double %x, -1.0
-  %cond = select i1 %cmp, double %x, double -1.0
+; X != 42.0 ? X : 42.0 --> X
+
+define double @une(double %x) {
+; CHECK-LABEL: @une(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret double [[X]]
+;
+  %cmp = fcmp une double %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, double %x, double 42.0
   ret double %cond
 }
-; CHECK: f1
-; CHECK: ret double -1.000000e+00
-define double @f1(double %x) nounwind readnone {
-entry:
-  %cmp = fcmp une double %x, -1.0
-  %cond = select i1 %cmp, double -1.0, double %x
+
+; X != 42.0 ? 42.0 : X --> 42.0
+
+define double @une_swapped(double %x) {
+; CHECK-LABEL: @une_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    ret double 4.200000e+01
+;
+  %cmp = fcmp une double %x, 42.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
+  %cond = select i1 %cmp, double 42.0, double %x
   ret double %cond
 }
-; CHECK: f2
-; CHECK: ret double %cond
-define double @f2(double %x, double %y) nounwind readnone {
-entry:
+
+define double @une_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double [[Y]]
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp une double %x, %y
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double %x, double %y
   ret double %cond
 }
-; CHECK: f3
-; CHECK: ret double %cond
-define double @f3(double %x, double %y) nounwind readnone {
-entry:
+
+define double @une_swapped_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_swapped_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[Y]], double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp une double %x, %y
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double %y, double %x
   ret double %cond
 }
-; CHECK: f4
-; CHECK: ret double %cond
-define double @f4(double %x) nounwind readnone {
-entry:
+
+define double @one(double %x) {
+; CHECK-LABEL: @one(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double -1.000000e+00
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp one double %x, -1.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double %x, double -1.0
   ret double %cond
 }
-; CHECK: f5
-; CHECK: ret double %cond
-define double @f5(double %x) nounwind readnone {
-entry:
+
+define double @one_swapped(double %x) {
+; CHECK-LABEL: @one_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    call void @use(i1 [[CMP]])
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double -1.000000e+00, double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
   %cmp = fcmp one double %x, -1.0
+  call void @use(i1 %cmp)      ; extra use to thwart predicate canonicalization
   %cond = select i1 %cmp, double -1.0, double %x
   ret double %cond
 }
+
diff --git a/test/Transforms/InstCombine/fcmp-special.ll b/test/Transforms/InstCombine/fcmp-special.ll
index 8d131b3c2a66bc466e96cc1a2c2530e15f61d4ab..490dab5f24d3c4376739edc5f36edf43e427c1df 100644
--- a/test/Transforms/InstCombine/fcmp-special.ll
+++ b/test/Transforms/InstCombine/fcmp-special.ll
@@ -161,6 +161,7 @@ define <2 x i1> @ord_vec_with_undef(<2 x double> %x) {
   %f = fcmp ord <2 x double> %x, <double 0.0, double undef>
   ret <2 x i1> %f
 }
+
 ; TODO: This could be handled in InstSimplify.
 
 define i1 @nnan_ops_to_fcmp_ord(float %x, float %y) {
@@ -185,3 +186,59 @@ define i1 @nnan_ops_to_fcmp_uno(float %x, float %y) {
   ret i1 %cmp
 }
 
+; TODO: For any predicate/type/FMF, comparison to -0.0 is the same as comparison to +0.0.
+
+define i1 @negative_zero_oeq(float %x) {
+; CHECK-LABEL: @negative_zero_oeq(
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %r = fcmp oeq float %x, -0.0
+  ret i1 %r
+}
+
+define i1 @negative_zero_oge(double %x) {
+; CHECK-LABEL: @negative_zero_oge(
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan oge double [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %r = fcmp nnan oge double %x, -0.0
+  ret i1 %r
+}
+
+define i1 @negative_zero_uge(half %x) {
+; CHECK-LABEL: @negative_zero_uge(
+; CHECK-NEXT:    [[R:%.*]] = fcmp fast uge half [[X:%.*]], 0xH0000
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %r = fcmp fast uge half %x, -0.0
+  ret i1 %r
+}
+
+define <2 x i1> @negative_zero_olt_vec(<2 x float> %x) {
+; CHECK-LABEL: @negative_zero_olt_vec(
+; CHECK-NEXT:    [[R:%.*]] = fcmp reassoc ninf olt <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %r = fcmp reassoc ninf olt <2 x float> %x, <float -0.0, float -0.0>
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @negative_zero_une_vec_undef(<2 x double> %x) {
+; CHECK-LABEL: @negative_zero_une_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = fcmp nnan une <2 x double> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %r = fcmp nnan une <2 x double> %x, <double -0.0, double undef>
+  ret <2 x i1> %r
+}
+
+define <2 x i1> @negative_zero_ule_vec_mixed(<2 x float> %x) {
+; CHECK-LABEL: @negative_zero_ule_vec_mixed(
+; CHECK-NEXT:    [[R:%.*]] = fcmp ule <2 x float> [[X:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %r = fcmp ule <2 x float> %x, <float 0.0, float -0.0>
+  ret <2 x i1> %r
+}
+
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index ff47496abe9b906a0a3d995735bf13270eb283c3..be7aedc7c60fd151b4e781392088f1b71243bb4b 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -1,31 +1,43 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
-declare double @llvm.fabs.f64(double) readnone
+declare half @llvm.fabs.f16(half)
+declare double @llvm.fabs.f64(double)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
 
-define i1 @test1(float %x, float %y) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+define i1 @fpext_fpext(float %x, float %y) {
+; CHECK-LABEL: @fpext_fpext(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ogt float [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %ext1 = fpext float %x to double
   %ext2 = fpext float %y to double
-  %cmp = fcmp ogt double %ext1, %ext2
+  %cmp = fcmp nnan ogt double %ext1, %ext2
   ret i1 %cmp
 }
 
-define i1 @test2(float %a) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[A:%.*]], 1.000000e+00
+define i1 @fpext_constant(float %a) {
+; CHECK-LABEL: @fpext_constant(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf ogt float [[A:%.*]], 1.000000e+00
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %ext = fpext float %a to double
-  %cmp = fcmp ogt double %ext, 1.000000e+00
+  %cmp = fcmp ninf ogt double %ext, 1.000000e+00
   ret i1 %cmp
 }
 
-define i1 @test3(float %a) {
-; CHECK-LABEL: @test3(
+define <2 x i1> @fpext_constant_vec_splat(<2 x half> %a) {
+; CHECK-LABEL: @fpext_constant_vec_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ole <2 x half> [[A:%.*]], <half 0xH5140, half 0xH5140>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %ext = fpext <2 x half> %a to <2 x double>
+  %cmp = fcmp nnan ole <2 x double> %ext, <double 42.0, double 42.0>
+  ret <2 x i1> %cmp
+}
+
+define i1 @fpext_constant_lossy(float %a) {
+; CHECK-LABEL: @fpext_constant_lossy(
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[A:%.*]] to double
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x3FF0000000000001
 ; CHECK-NEXT:    ret i1 [[CMP]]
@@ -35,8 +47,8 @@ define i1 @test3(float %a) {
   ret i1 %cmp
 }
 
-define i1 @test4(float %a) {
-; CHECK-LABEL: @test4(
+define i1 @fpext_constant_denorm(float %a) {
+; CHECK-LABEL: @fpext_constant_denorm(
 ; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[A:%.*]] to double
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x36A0000000000000
 ; CHECK-NEXT:    ret i1 [[CMP]]
@@ -76,25 +88,49 @@ define <2 x i1> @fneg_constant_swap_pred_vec_undef(<2 x float> %x) {
   ret <2 x i1> %cmp
 }
 
+; The new fcmp should have the same FMF as the original.
+
+define i1 @fneg_fmf(float %x) {
+; CHECK-LABEL: @fneg_fmf(
+; CHECK-NEXT:    [[R:%.*]] = fcmp fast oeq float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %n = fsub fast float -0.0, %x
+  %r = fcmp fast oeq float %n, 42.0
+  ret i1 %r
+}
+
+; The new fcmp should have the same FMF as the original, vector edition.
+
+define <2 x i1> @fcmp_fneg_fmf_vec(<2 x float> %x) {
+; CHECK-LABEL: @fcmp_fneg_fmf_vec(
+; CHECK-NEXT:    [[R:%.*]] = fcmp reassoc nnan ule <2 x float> [[X:%.*]], <float -4.200000e+01, float 1.900000e+01>
+; CHECK-NEXT:    ret <2 x i1> [[R]]
+;
+  %n = fsub nsz <2 x float> zeroinitializer, %x
+  %r = fcmp nnan reassoc uge <2 x float> %n, <float 42.0, float -19.0>
+  ret <2 x i1> %r
+}
+
 define i1 @fneg_fneg_swap_pred(float %x, float %y) {
 ; CHECK-LABEL: @fneg_fneg_swap_pred(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp nnan ogt float [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %neg1 = fsub float -0.0, %x
   %neg2 = fsub float -0.0, %y
-  %cmp = fcmp olt float %neg1, %neg2
+  %cmp = fcmp nnan olt float %neg1, %neg2
   ret i1 %cmp
 }
 
 define <2 x i1> @fneg_fneg_swap_pred_vec(<2 x float> %x, <2 x float> %y) {
 ; CHECK-LABEL: @fneg_fneg_swap_pred_vec(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf ogt <2 x float> [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %neg1 = fsub <2 x float> <float -0.0, float -0.0>, %x
   %neg2 = fsub <2 x float> <float -0.0, float -0.0>, %y
-  %cmp = fcmp olt <2 x float> %neg1, %neg2
+  %cmp = fcmp ninf olt <2 x float> %neg1, %neg2
   ret <2 x i1> %cmp
 }
 
@@ -133,194 +169,151 @@ define float @test8(float %x) {
 ; Float comparison to zero shouldn't cast to double.
 }
 
-declare double @fabs(double) readnone
-
-define i32 @test9(double %a) {
-; CHECK-LABEL: @test9(
-; CHECK-NEXT:    ret i32 0
-;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp olt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
-}
-
-define i32 @test9_intrinsic(double %a) {
-; CHECK-LABEL: @test9_intrinsic(
-; CHECK-NEXT:    ret i32 0
+define i1 @fabs_uge(double %a) {
+; CHECK-LABEL: @fabs_uge(
+; CHECK-NEXT:    ret i1 true
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp olt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp uge double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test10(double %a) {
-; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_olt(half %a) {
+; CHECK-LABEL: @fabs_olt(
+; CHECK-NEXT:    ret i1 false
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp ole double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call half @llvm.fabs.f16(half %a)
+  %cmp = fcmp olt half %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test10_intrinsic(double %a) {
-; CHECK-LABEL: @test10_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ole(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ole(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf oeq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ole double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp ninf ole <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
-define i32 @test11(double %a) {
-; CHECK-LABEL: @test11(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ule(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ule(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf arcp ueq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp ogt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp ninf arcp ule <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
-define i32 @test11_intrinsic(double %a) {
-; CHECK-LABEL: @test11_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_ogt(double %a) {
+; CHECK-LABEL: @fabs_ogt(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc one double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ogt double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp reassoc ogt double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test12(double %a) {
-; CHECK-LABEL: @test12(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_ugt(double %a) {
+; CHECK-LABEL: @fabs_ugt(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc ninf une double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp oge double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp ninf reassoc ugt double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test12_intrinsic(double %a) {
-; CHECK-LABEL: @test12_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_oge(double %a) {
+; CHECK-LABEL: @fabs_oge(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp afn ord double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp oge double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp afn oge double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test13(double %a) {
-; CHECK-LABEL: @test13(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_ult(double %a) {
+; CHECK-LABEL: @fabs_ult(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc arcp uno double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp une double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp reassoc arcp ult double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test13_intrinsic(double %a) {
-; CHECK-LABEL: @test13_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ult_nnan(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ult_nnan(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp une double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp nnan reassoc arcp ult <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
-define i32 @test14(double %a) {
-; CHECK-LABEL: @test14(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_une(half %a) {
+; CHECK-LABEL: @fabs_une(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf une half [[A:%.*]], 0xH0000
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp oeq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call half @llvm.fabs.f16(half %a)
+  %cmp = fcmp ninf une half %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test14_intrinsic(double %a) {
-; CHECK-LABEL: @test14_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_oeq(double %a) {
+; CHECK-LABEL: @fabs_oeq(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp reassoc ninf oeq double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp oeq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp ninf reassoc oeq double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test15(double %a) {
-; CHECK-LABEL: @test15(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define i1 @fabs_one(double %a) {
+; CHECK-LABEL: @fabs_one(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp fast one double [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp one double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call double @llvm.fabs.f64(double %a)
+  %cmp = fcmp fast one double %call, 0.0
+  ret i1 %cmp
 }
 
-define i32 @test15_intrinsic(double %a) {
-; CHECK-LABEL: @test15_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ueq(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ueq(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp ueq <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp one double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp arcp ueq <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
-define i32 @test16(double %a) {
-; CHECK-LABEL: @test16(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_ord(<2 x float> %a) {
+; CHECK-LABEL: @fabs_ord(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp ord <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @fabs(double %a)
-  %cmp = fcmp ueq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp arcp ord <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
-define i32 @test16_intrinsic(double %a) {
-; CHECK-LABEL: @test16_intrinsic(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double [[A:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
-; CHECK-NEXT:    ret i32 [[CONV]]
+define <2 x i1> @fabs_uno(<2 x float> %a) {
+; CHECK-LABEL: @fabs_uno(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp arcp uno <2 x float> [[A:%.*]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
-  %call = tail call double @llvm.fabs.f64(double %a)
-  %cmp = fcmp ueq double %call, 0.000000e+00
-  %conv = zext i1 %cmp to i32
-  ret i32 %conv
+  %call = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %cmp = fcmp arcp uno <2 x float> %call, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
 ; Don't crash.
@@ -460,7 +453,7 @@ define i1 @test26_recipX_unorderd(float %X) {
 ; Fold <-1.0, -1.0> / X > <-0.0, -0.0>
 define <2 x i1> @test27_recipX_gt_vecsplat(<2 x float> %X) {
 ; CHECK-LABEL: @test27_recipX_gt_vecsplat(
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf olt <2 x float> [[X:%.*]], <float -0.000000e+00, float -0.000000e+00>
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ninf olt <2 x float> [[X:%.*]], zeroinitializer
 ; CHECK-NEXT:    ret <2 x i1> [[CMP]]
 ;
   %div = fdiv ninf <2 x float> <float -1.0, float -1.0>, %X
diff --git a/test/Transforms/InstCombine/fpcast.ll b/test/Transforms/InstCombine/fpcast.ll
index 4cada061a8f4ca6b182896736865447943f33589..bfc1de4ff6ddd576b6aa960096be177e0a8df958 100644
--- a/test/Transforms/InstCombine/fpcast.ll
+++ b/test/Transforms/InstCombine/fpcast.ll
@@ -29,8 +29,8 @@ define half @test3(float %a) {
   ret half %c
 }
 
-define half @test4(float %a) {
-; CHECK-LABEL: @test4(
+define half @fneg_fptrunc(float %a) {
+; CHECK-LABEL: @fneg_fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
 ; CHECK-NEXT:    [[C:%.*]] = fsub half 0xH8000, [[TMP1]]
 ; CHECK-NEXT:    ret half [[C]]
@@ -40,6 +40,17 @@ define half @test4(float %a) {
   ret half %c
 }
 
+define <2 x half> @fneg_fptrunc_vec_undef(<2 x float> %a) {
+; CHECK-LABEL: @fneg_fptrunc_vec_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half>
+; CHECK-NEXT:    [[C:%.*]] = fsub <2 x half> <half 0xH8000, half 0xH8000>, [[TMP1]]
+; CHECK-NEXT:    ret <2 x half> [[C]]
+;
+  %b = fsub <2 x float> <float -0.0, float undef>, %a
+  %c = fptrunc <2 x float> %b to <2 x half>
+  ret <2 x half> %c
+}
+
 define half @test4-fast(float %a) {
 ; CHECK-LABEL: @test4-fast(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index fb25c2342798c634b9bff9a13c73be5555b1703a..2de9c66d4639e16a30c55d006a42016cf4b1358b 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -3,7 +3,7 @@
 
 define <1 x i8> @test1(<8 x i8> %in) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <8 x i8> %in, <8 x i8> undef, <1 x i32> <i32 5>
+; CHECK-NEXT:    [[VEC:%.*]] = shufflevector <8 x i8> [[IN:%.*]], <8 x i8> undef, <1 x i32> <i32 5>
 ; CHECK-NEXT:    ret <1 x i8> [[VEC]]
 ;
   %val = extractelement <8 x i8> %in, i32 5
@@ -13,7 +13,7 @@ define <1 x i8> @test1(<8 x i8> %in) {
 
 define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[VEC_3:%.*]] = shufflevector <8 x i16> %in2, <8 x i16> %in, <4 x i32> <i32 11, i32 9, i32 0, i32 10>
+; CHECK-NEXT:    [[VEC_3:%.*]] = shufflevector <8 x i16> [[IN2:%.*]], <8 x i16> [[IN:%.*]], <4 x i32> <i32 11, i32 9, i32 0, i32 10>
 ; CHECK-NEXT:    ret <4 x i16> [[VEC_3]]
 ;
   %elt0 = extractelement <8 x i16> %in, i32 3
@@ -31,8 +31,8 @@ define <4 x i16> @test2(<8 x i16> %in, <8 x i16> %in2) {
 
 define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: @test_vcopyq_lane_p64(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i64> %b, <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
-; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i64> %a, <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i64> [[B:%.*]], <1 x i64> undef, <2 x i32> <i32 0, i32 undef>
+; CHECK-NEXT:    [[RES:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    ret <2 x i64> [[RES]]
 ;
   %elt = extractelement <1 x i64> %b, i32 0
@@ -44,8 +44,8 @@ define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) {
 
 define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
 ; CHECK-LABEL: @widen_extract2(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT:    [[I2:%.*]] = shufflevector <4 x float> %ins, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[EXT:%.*]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I2:%.*]] = shufflevector <4 x float> [[INS:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 5>
 ; CHECK-NEXT:    ret <4 x float> [[I2]]
 ;
   %e1 = extractelement <2 x float> %ext, i32 0
@@ -57,8 +57,8 @@ define <4 x float> @widen_extract2(<4 x float> %ins, <2 x float> %ext) {
 
 define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
 ; CHECK-LABEL: @widen_extract3(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> %ext, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-; CHECK-NEXT:    [[I3:%.*]] = shufflevector <4 x float> %ins, <4 x float> [[TMP1]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x float> [[EXT:%.*]], <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+; CHECK-NEXT:    [[I3:%.*]] = shufflevector <4 x float> [[INS:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 6, i32 5, i32 4, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[I3]]
 ;
   %e1 = extractelement <3 x float> %ext, i32 0
@@ -72,8 +72,8 @@ define <4 x float> @widen_extract3(<4 x float> %ins, <3 x float> %ext) {
 
 define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
 ; CHECK-LABEL: @widen_extract4(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> %ext, <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[I1:%.*]] = shufflevector <8 x float> %ins, <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x float> [[EXT:%.*]], <2 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[I1:%.*]] = shufflevector <8 x float> [[INS:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x float> [[I1]]
 ;
   %e1 = extractelement <2 x float> %ext, i32 0
@@ -86,7 +86,7 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
 
 define <8 x i16> @pr26015(<4 x i16> %t0) {
 ; CHECK-LABEL: @pr26015(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[T0:%.*]], <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 10, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT:    ret <8 x i16> [[T5]]
 ;
@@ -103,10 +103,10 @@ define <8 x i16> @pr26015(<4 x i16> %t0) {
 
 define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {
 ; CHECK-LABEL: @pr25999(
-; CHECK-NEXT:    [[T1:%.*]] = extractelement <4 x i16> %t0, i32 2
-; CHECK-NEXT:    br i1 %b, label %if, label %end
+; CHECK-NEXT:    [[T1:%.*]] = extractelement <4 x i16> [[T0:%.*]], i32 2
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[IF:%.*]], label [[END:%.*]]
 ; CHECK:       if:
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[T0]], <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[T3:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, i16 [[T1]], i32 3
 ; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT:    ret <8 x i16> [[T5]]
@@ -137,13 +137,13 @@ end:
 define <4 x double> @pr25999_phis1(i1 %c, <2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @pr25999_phis1(
 ; CHECK-NEXT:  bb1:
-; CHECK-NEXT:    br i1 %c, label %bb2, label %bb3
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> %a)
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ %a, %bb1 ], [ [[R]], %bb2 ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[A]], [[BB1:%.*]] ], [ [[R]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ [[B:%.*]], [[BB1]] ], [ zeroinitializer, [[BB2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
 ; CHECK-NEXT:    ret <4 x double> [[TMP4]]
@@ -168,13 +168,13 @@ declare <2 x double> @dummy(<2 x double>)
 define <4 x double> @pr25999_phis2(i1 %c, <2 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: @pr25999_phis2(
 ; CHECK-NEXT:  bb1:
-; CHECK-NEXT:    br i1 %c, label %bb2, label %bb3
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> %a)
-; CHECK-NEXT:    br label %bb3
+; CHECK-NEXT:    [[R:%.*]] = call <2 x double> @dummy(<2 x double> [[A:%.*]])
+; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ %a, %bb1 ], [ [[R]], %bb2 ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ %b, %bb1 ], [ zeroinitializer, %bb2 ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x double> [ [[A]], [[BB1:%.*]] ], [ [[R]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x double> [ [[B:%.*]], [[BB1]] ], [ zeroinitializer, [[BB2]] ]
 ; CHECK-NEXT:    [[D:%.*]] = fadd <2 x double> [[TMP1]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x double> [[D]], <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP0]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
@@ -202,15 +202,15 @@ bb3:
 define double @pr26354(<2 x double>* %tmp, i1 %B) {
 ; CHECK-LABEL: @pr26354(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* %tmp, align 16
+; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* [[TMP:%.*]], align 16
 ; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[LD]], i32 0
-; CHECK-NEXT:    br i1 %B, label %if, label %end
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[IF:%.*]], label [[END:%.*]]
 ; CHECK:       if:
 ; CHECK-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[LD]], i32 1
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double undef>, double [[E2]], i32 3
-; CHECK-NEXT:    br label %end
+; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PH:%.*]] = phi <4 x double> [ undef, %entry ], [ [[I1]], %if ]
+; CHECK-NEXT:    [[PH:%.*]] = phi <4 x double> [ undef, [[ENTRY:%.*]] ], [ [[I1]], [[IF]] ]
 ; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x double> [[PH]], i32 1
 ; CHECK-NEXT:    [[MU:%.*]] = fmul double [[E1]], [[E3]]
 ; CHECK-NEXT:    ret double [[MU]]
@@ -239,11 +239,11 @@ end:
 define <4 x float> @PR30923(<2 x float> %x) {
 ; CHECK-LABEL: @PR30923(
 ; CHECK-NEXT:  bb1:
-; CHECK-NEXT:    [[EXT1:%.*]] = extractelement <2 x float> %x, i32 1
+; CHECK-NEXT:    [[EXT1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
 ; CHECK-NEXT:    store float [[EXT1]], float* undef, align 4
-; CHECK-NEXT:    br label %bb2
+; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[EXT2:%.*]] = extractelement <2 x float> %x, i32 0
+; CHECK-NEXT:    [[EXT2:%.*]] = extractelement <2 x float> [[X]], i32 0
 ; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, float [[EXT2]], i32 2
 ; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float [[EXT1]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[INS2]]
@@ -283,3 +283,145 @@ entry:
   %ret = select i1 %e, <4 x i32> %b, <4 x i32> zeroinitializer
   ret <4 x i32> %ret
 }
+
+; PR34724: https://bugs.llvm.org/show_bug.cgi?id=34724
+
+define <4 x float> @collectShuffleElts(<2 x float> %x, float %y) {
+; CHECK-LABEL: @collectShuffleElts(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x float> [[X]], i32 1
+; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 1
+; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[X1]], i32 2
+; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[Y:%.*]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[V3]]
+;
+  %x0 = extractelement <2 x float> %x, i32 0
+  %x1 = extractelement <2 x float> %x, i32 1
+  %v1 = insertelement <4 x float> undef, float %x0, i32 1
+  %v2 = insertelement <4 x float> %v1, float %x1, i32 2
+  %v3 = insertelement <4 x float> %v2, float %y, i32 3
+  ret <4 x float> %v3
+}
+
+; Simplest case - insert scalar into undef, then shuffle that value in place into another vector.
+
+define <4 x float> @insert_shuffle(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_shuffle(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> undef, float %x, i32 0
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; Insert scalar into some element of a dummy vector, then move it to a different element in another vector.
+
+define <4 x float> @insert_shuffle_translate(float %x, <4 x float> %y) {
+; CHECK-LABEL: @insert_shuffle_translate(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> undef, float %x, i32 0
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; The vector operand of the insert is irrelevant.
+
+define <4 x float> @insert_not_undef_shuffle_translate(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 2
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 3
+  %r = shufflevector <4 x float> %xv, <4 x float> %y, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
+  ret <4 x float> %r
+}
+
+; The insert may be the 2nd operand of the shuffle. The shuffle mask can include undef elements.
+
+define <4 x float> @insert_not_undef_shuffle_translate_commute(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute(
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> <i32 0, i32 6, i32 2, i32 undef>
+  ret <4 x float> %r
+}
+
+; Both shuffle operands may be inserts - choose the correct side.
+
+define <4 x float> @insert_insert_shuffle_translate(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate(
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[XV2]], float [[X1:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 4, i32 0, i32 6, i32 7>
+  ret <4 x float> %r
+}
+
+; Both shuffle operands may be inserts - choose the correct side.
+
+define <4 x float> @insert_insert_shuffle_translate_commute(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate_commute(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[XV1]], float [[X2:%.*]], i32 1
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+  ret <4 x float> %r
+}
+
+; Negative test - this only works if the shuffle is choosing exactly 1 element from 1 of the inputs.
+; TODO: But this could be a special-case because we're inserting into the same base vector.
+
+define <4 x float> @insert_insert_shuffle_translate_wrong_mask(float %x1, float %x2, <4 x float> %q) {
+; CHECK-LABEL: @insert_insert_shuffle_translate_wrong_mask(
+; CHECK-NEXT:    [[XV1:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X1:%.*]], i32 0
+; CHECK-NEXT:    [[XV2:%.*]] = insertelement <4 x float> [[Q]], float [[X2:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[XV1]], <4 x float> [[XV2]], <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv1 = insertelement <4 x float> %q, float %x1, i32 0
+  %xv2 = insertelement <4 x float> %q, float %x2, i32 2
+  %r = shufflevector <4 x float> %xv1, <4 x float> %xv2, <4 x i32> <i32 0, i32 6, i32 2, i32 7>
+  ret <4 x float> %r
+}
+
+; The insert may have other uses.
+
+declare void @use(<4 x float>)
+
+define <4 x float> @insert_not_undef_shuffle_translate_commute_uses(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_uses(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> [[Q:%.*]], float [[X:%.*]], i32 2
+; CHECK-NEXT:    call void @use(<4 x float> [[XV]])
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[X]], i32 0
+; CHECK-NEXT:    ret <4 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  call void @use(<4 x float> %xv)
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <4 x i32> <i32 6, i32 undef, i32 2, i32 3>
+  ret <4 x float> %r
+}
+
+; Negative test - size-changing shuffle.
+
+define <5 x float> @insert_not_undef_shuffle_translate_commute_lengthen(float %x, <4 x float> %y, <4 x float> %q) {
+; CHECK-LABEL: @insert_not_undef_shuffle_translate_commute_lengthen(
+; CHECK-NEXT:    [[XV:%.*]] = insertelement <4 x float> undef, float [[X:%.*]], i32 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[XV]], <5 x i32> <i32 0, i32 6, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <5 x float> [[R]]
+;
+  %xv = insertelement <4 x float> %q, float %x, i32 2
+  %r = shufflevector <4 x float> %y, <4 x float> %xv, <5 x i32> <i32 0, i32 6, i32 2, i32 undef, i32 undef>
+  ret <5 x float> %r
+}
+
diff --git a/test/Transforms/InstCombine/load-combine-metadata.ll b/test/Transforms/InstCombine/load-combine-metadata.ll
index b7f42e7a0e768dff01a983abb8c557830c24bca0..536f1bb75f63a5fb4d2927ab77cb38fc908d8ba2 100644
--- a/test/Transforms/InstCombine/load-combine-metadata.ll
+++ b/test/Transforms/InstCombine/load-combine-metadata.ll
@@ -17,7 +17,7 @@ define void @test_load_load_combine_metadata(i32*, i32*, i32*) {
   ret void
 }
 
-; CHECK: ![[RANGE]] = !{i32 0, i32 5, i32 7, i32 9}
+; CHECK: ![[RANGE]] = !{i32 0, i32 5}
 !0 = !{ i32 0, i32 5 }
 !1 = !{ i32 7, i32 9 }
 !2 = !{!2}
diff --git a/test/Transforms/InstCombine/logical-select.ll b/test/Transforms/InstCombine/logical-select.ll
index dd95cc02751a43224142ac609ad6f614e8b7f2a2..999e4512723a2f7357235b251b86cc111995bfea 100644
--- a/test/Transforms/InstCombine/logical-select.ll
+++ b/test/Transforms/InstCombine/logical-select.ll
@@ -531,3 +531,107 @@ define <4 x i32> @vec_sel_xor_multi_use(<4 x i32> %a, <4 x i32> %b, <4 x i1> %c)
   ret <4 x i32> %add
 }
 
+; The 'ashr' guarantees that we have a bitmask, so this is select with truncated condition.
+
+define i32 @allSignBits(i32 %cond, i32 %tval, i32 %fval) {
+; CHECK-LABEL: @allSignBits(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt i32 [[COND:%.*]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[TVAL:%.*]], i32 [[FVAL:%.*]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %bitmask = ashr i32 %cond, 31
+  %not_bitmask = xor i32 %bitmask, -1
+  %a1 = and i32 %tval, %bitmask
+  %a2 = and i32 %not_bitmask, %fval
+  %sel = or i32 %a1, %a2
+  ret i32 %sel
+}
+
+define <4 x i8> @allSignBits_vec(<4 x i8> %cond, <4 x i8> %tval, <4 x i8> %fval) {
+; CHECK-LABEL: @allSignBits_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i8> [[COND:%.*]], <i8 -1, i8 -1, i8 -1, i8 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[FVAL:%.*]], <4 x i8> [[TVAL:%.*]]
+; CHECK-NEXT:    ret <4 x i8> [[TMP2]]
+;
+  %bitmask = ashr <4 x i8> %cond, <i8 7, i8 7, i8 7, i8 7>
+  %not_bitmask = xor <4 x i8> %bitmask, <i8 -1, i8 -1, i8 -1, i8 -1>
+  %a1 = and <4 x i8> %tval, %bitmask
+  %a2 = and <4 x i8> %fval, %not_bitmask
+  %sel = or <4 x i8> %a2, %a1
+  ret <4 x i8> %sel
+}
+
+; Negative test - make sure that bitcasts from FP do not cause a crash.
+
+define <2 x i64> @fp_bitcast(<4 x i1> %cmp, <2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @fp_bitcast(
+; CHECK-NEXT:    [[SIA:%.*]] = fptosi <2 x double> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[SIB:%.*]] = fptosi <2 x double> [[B:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[AND1:%.*]] = and <2 x i64> [[SIA]], [[BC1]]
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+; CHECK-NEXT:    [[AND2:%.*]] = and <2 x i64> [[SIB]], [[BC2]]
+; CHECK-NEXT:    [[OR:%.*]] = or <2 x i64> [[AND2]], [[AND1]]
+; CHECK-NEXT:    ret <2 x i64> [[OR]]
+;
+  %sia = fptosi <2 x double> %a to <2 x i64>
+  %sib = fptosi <2 x double> %b to <2 x i64>
+  %bc1 = bitcast <2 x double> %a to <2 x i64>
+  %and1 = and <2 x i64> %sia, %bc1
+  %bc2 = bitcast <2 x double> %b to <2 x i64>
+  %and2 = and <2 x i64> %sib, %bc2
+  %or = or <2 x i64> %and2, %and1
+  ret <2 x i64> %or
+}
+
+define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; CHECK-LABEL: @computesignbits_through_shuffles(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ole <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <4 x i32> [[SEXT]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF_OR1:%.*]] = or <4 x i32> [[S1]], [[S2]]
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <4 x i32> [[SHUF_OR1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+; CHECK-NEXT:    [[SHUF_OR2:%.*]] = or <4 x i32> [[S3]], [[S4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[SHUF_OR2]] to <4 x i1>
+; CHECK-NEXT:    [[DOTV:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[Z:%.*]], <4 x float> [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[DOTV]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %cmp = fcmp ole <4 x float> %x, %y
+  %sext = sext <4 x i1> %cmp to <4 x i32>
+  %s1 = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %s2 = shufflevector <4 x i32> %sext, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %shuf_or1 = or <4 x i32> %s1, %s2
+  %s3 = shufflevector <4 x i32> %shuf_or1, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+  %s4 = shufflevector <4 x i32> %shuf_or1, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
+  %shuf_or2 = or <4 x i32> %s3, %s4
+  %not_or2 = xor <4 x i32> %shuf_or2, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %xbc = bitcast <4 x float> %x to <4 x i32>
+  %zbc = bitcast <4 x float> %z to <4 x i32>
+  %and1 = and <4 x i32> %not_or2, %xbc
+  %and2 = and <4 x i32> %shuf_or2, %zbc
+  %sel = or <4 x i32> %and1, %and2
+  ret <4 x i32> %sel
+}
+
+define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) {
+; CHECK-LABEL: @computesignbits_through_two_input_shuffle(
+; CHECK-NEXT:    [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %sext1 = sext <4 x i1> %cond1 to <4 x i32>
+  %sext2 = sext <4 x i1> %cond2 to <4 x i32>
+  %cond = shufflevector <4 x i32> %sext1, <4 x i32> %sext2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %notcond = xor <4 x i32> %cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %notcond, %x
+  %and2 = and <4 x i32> %cond, %y
+  %sel = or <4 x i32> %and1, %and2
+  ret <4 x i32> %sel
+}
+
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index e66151025b52f503532fc3cbb215fc5d2c57aa59..7e7b6d9aee567a4e27c60d5c8b5c135162dd1d08 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -257,3 +257,32 @@ define void @test11() {
   call void @_ZdlPv(i8* %call)
   ret void
 }
+
+;; Check that the optimization that moves a call to free in its predecessor
+;; block (see test6) also happens when noop casts are involved.
+; CHECK-LABEL: @test12(
+define void @test12(i32* %foo) minsize {
+; CHECK:  %tobool = icmp eq i32* %foo, null
+;; Everything before the call to free should have been moved as well.
+; CHECK-NEXT:   %bitcast = bitcast i32* %foo to i8*
+;; Call to free moved
+; CHECK-NEXT: tail call void @free(i8* %bitcast)
+; CHECK-NEXT: br i1 %tobool, label %if.end, label %if.then
+; CHECK: if.then:
+;; Block is now empty and may be simplified by simplifycfg
+; CHECK-NEXT:   br label %if.end
+; CHECK: if.end:
+; CHECK-NEXT:  ret void
+entry:
+  %tobool = icmp eq i32* %foo, null
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %bitcast = bitcast i32* %foo to i8*
+  tail call void @free(i8* %bitcast)
+  br label %if.end
+
+if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
diff --git a/test/Transforms/InstCombine/maximum.ll b/test/Transforms/InstCombine/maximum.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bd97a3794d45b6b35cd8decc741fa44f9a45a065
--- /dev/null
+++ b/test/Transforms/InstCombine/maximum.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare float @llvm.maximum.f32(float, float)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+
+declare double @llvm.maximum.f64(double, double)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+define float @constant_fold_maximum_f32() {
+; CHECK-LABEL: @constant_fold_maximum_f32(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 1.0, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_inv() {
+; CHECK-LABEL: @constant_fold_maximum_f32_inv(
+; CHECK-NEXT:    ret float 2.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 2.0, float 1.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_nan0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_nan0(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.maximum.f32(float 0x7FF8000000000000, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_nan1() {
+; CHECK-LABEL: @constant_fold_maximum_f32_nan1(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.maximum.f32(float 2.0, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_nan_nan() {
+; CHECK-LABEL: @constant_fold_maximum_f32_nan_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.maximum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_p0_p0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_p0_p0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_p0_n0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_p0_n0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float 0.0, float -0.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_n0_p0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_n0_p0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float -0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_maximum_f32_n0_n0() {
+; CHECK-LABEL: @constant_fold_maximum_f32_n0_n0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.maximum.f32(float -0.0, float -0.0)
+  ret float %x
+}
+
+define <4 x float> @constant_fold_maximum_v4f32() {
+; CHECK-LABEL: @constant_fold_maximum_v4f32(
+; CHECK-NEXT:    ret <4 x float> <float 2.000000e+00, float 8.000000e+00, float 1.000000e+01, float 9.000000e+00>
+;
+  %x = call <4 x float> @llvm.maximum.v4f32(<4 x float> <float 1.0, float 8.0, float 3.0, float 9.0>, <4 x float> <float 2.0, float 2.0, float 10.0, float 5.0>)
+  ret <4 x float> %x
+}
+
+define double @constant_fold_maximum_f64() {
+; CHECK-LABEL: @constant_fold_maximum_f64(
+; CHECK-NEXT:    ret double 2.000000e+00
+;
+  %x = call double @llvm.maximum.f64(double 1.0, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_maximum_f64_nan0() {
+; CHECK-LABEL: @constant_fold_maximum_f64_nan0(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.maximum.f64(double 0x7FF8000000000000, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_maximum_f64_nan1() {
+; CHECK-LABEL: @constant_fold_maximum_f64_nan1(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.maximum.f64(double 2.0, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define double @constant_fold_maximum_f64_nan_nan() {
+; CHECK-LABEL: @constant_fold_maximum_f64_nan_nan(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.maximum.f64(double 0x7FF8000000000000, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define float @canonicalize_constant_maximum_f32(float %x) {
+; CHECK-LABEL: @canonicalize_constant_maximum_f32(
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 1.000000e+00)
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %y = call float @llvm.maximum.f32(float 1.0, float %x)
+  ret float %y
+}
+
+define float @maximum_f32_nan_val(float %x) {
+; CHECK-LABEL: @maximum_f32_nan_val(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.maximum.f32(float 0x7FF8000000000000, float %x)
+  ret float %y
+}
+
+define float @maximum_f32_val_nan(float %x) {
+; CHECK-LABEL: @maximum_f32_val_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.maximum.f32(float %x, float 0x7FF8000000000000)
+  ret float %y
+}
+
+define float @maximum_f32_1_maximum_val_p0(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_val_p0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float %x, float 0.0)
+  %z = call float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maximum_f32_1_maximum_p0_val_fast(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_p0_val_fast(
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maximum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maximum_f32_1_maximum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maximum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maximum_f32_p0_maximum_val_n0(float %x) {
+; CHECK-LABEL: @maximum_f32_p0_maximum_val_n0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float %x, float -0.0)
+  %z = call float @llvm.maximum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @maximum_f32_1_maximum_p0_val(float %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_p0_val(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maximum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maximum.f32(float 0.0, float %x)
+  %z = call float @llvm.maximum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @maximum_f32_1_maximum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @maximum_f32_1_maximum_val_p0_val_v2f32(
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.maximum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
+define float @maximum4(float %x, float %y, float %z, float %w) {
+; CHECK-LABEL: @maximum4(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[W:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[B]])
+; CHECK-NEXT:    ret float [[C]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %z, float %w)
+  %c = call float @llvm.maximum.f32(float %a, float %b)
+  ret float %c
+}
+
+; PR37404 - https://bugs.llvm.org/show_bug.cgi?id=37404
+
+define <2 x float> @neg_neg(<2 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %negx = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %negy = fsub <2 x float> <float -0.0, float -0.0>, %y
+  %r = call <2 x float> @llvm.maximum.v2f32(<2 x float> %negx, <2 x float> %negy)
+  ret <2 x float> %r
+}
+
+; FMF is not required, but it should be propagated from the intrinsic (not the fnegs).
+
+define float @neg_neg_vec_fmf(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_vec_fmf(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub fast float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub arcp float -0.0, %x
+  %negy = fsub afn float -0.0, %y
+  %r = call fast float @llvm.maximum.f32(float %negx, float %negy)
+  ret float %r
+}
+
+; 1 extra use of an intermediate value should still allow the fold,
+; but 2 would require more instructions than we started with.
+
+declare void @use(float)
+define float @neg_neg_extra_use_x(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub float -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.minimum.f32(float [[X]], float [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(float [[NEGX]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub float -0.0, %x
+  %negy = fsub float -0.0, %y
+  %r = call float @llvm.maximum.f32(float %negx, float %negy)
+  call void @use(float %negx)
+  ret float %r
+}
+
+define float @neg_neg_extra_use_y(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_extra_use_y(
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub float -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = fsub float -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(float [[NEGY]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub float -0.0, %x
+  %negy = fsub float -0.0, %y
+  %r = call float @llvm.maximum.f32(float %negx, float %negy)
+  call void @use(float %negy)
+  ret float %r
+}
+
+define float @neg_neg_extra_use_x_and_y(float %x, float %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x_and_y(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub float -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub float -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.maximum.f32(float [[NEGX]], float [[NEGY]])
+; CHECK-NEXT:    call void @use(float [[NEGX]])
+; CHECK-NEXT:    call void @use(float [[NEGY]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %negx = fsub float -0.0, %x
+  %negy = fsub float -0.0, %y
+  %r = call float @llvm.maximum.f32(float %negx, float %negy)
+  call void @use(float %negx)
+  call void @use(float %negy)
+  ret float %r
+}
diff --git a/test/Transforms/InstCombine/maxnum.ll b/test/Transforms/InstCombine/maxnum.ll
index e3630ba3ea560de26e63ce478b7c9ff1302cea17..d81158c066f1b21218422c7f048b9e219f93afff 100644
--- a/test/Transforms/InstCombine/maxnum.ll
+++ b/test/Transforms/InstCombine/maxnum.ll
@@ -145,6 +145,60 @@ define float @maxnum_f32_val_nan(float %x) {
   ret float %y
 }
 
+define float @maxnum_f32_1_maxnum_val_p0(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_val_p0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float %x, float 0.0)
+  %z = call float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maxnum_f32_1_maxnum_p0_val_fast(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val_fast(
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.maxnum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maxnum_f32_1_maxnum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.maxnum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @maxnum_f32_p0_maxnum_val_n0(float %x) {
+; CHECK-LABEL: @maxnum_f32_p0_maxnum_val_n0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float %x, float -0.0)
+  %z = call float @llvm.maxnum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @maxnum_f32_1_maxnum_p0_val(float %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_p0_val(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.maxnum.f32(float %x, float 1.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.maxnum.f32(float 0.0, float %x)
+  %z = call float @llvm.maxnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @maxnum_f32_1_maxnum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @maxnum_f32_1_maxnum_val_p0_val_v2f32(
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> <float 1.000000e+00, float 1.000000e+00>)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
 define float @maxnum4(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: @maxnum4(
 ; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/test/Transforms/InstCombine/minimum.ll b/test/Transforms/InstCombine/minimum.ll
new file mode 100644
index 0000000000000000000000000000000000000000..32aae6417eba434e24bae66aa581fe09e12fb1cc
--- /dev/null
+++ b/test/Transforms/InstCombine/minimum.ll
@@ -0,0 +1,317 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+declare float @llvm.minimum.f32(float, float)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+
+declare double @llvm.minimum.f64(double, double)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+
+declare float @llvm.maximum.f32(float, float)
+
+define float @constant_fold_minimum_f32() {
+; CHECK-LABEL: @constant_fold_minimum_f32(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 1.0, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_inv() {
+; CHECK-LABEL: @constant_fold_minimum_f32_inv(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 2.0, float 1.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_nan0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_nan0(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.minimum.f32(float 0x7FF8000000000000, float 2.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_nan1() {
+; CHECK-LABEL: @constant_fold_minimum_f32_nan1(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.minimum.f32(float 2.0, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_nan_nan() {
+; CHECK-LABEL: @constant_fold_minimum_f32_nan_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %x = call float @llvm.minimum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_p0_p0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_p0_p0(
+; CHECK-NEXT:    ret float 0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_p0_n0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_p0_n0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float 0.0, float -0.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_n0_p0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_n0_p0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float -0.0, float 0.0)
+  ret float %x
+}
+
+define float @constant_fold_minimum_f32_n0_n0() {
+; CHECK-LABEL: @constant_fold_minimum_f32_n0_n0(
+; CHECK-NEXT:    ret float -0.000000e+00
+;
+  %x = call float @llvm.minimum.f32(float -0.0, float -0.0)
+  ret float %x
+}
+
+define <4 x float> @constant_fold_minimum_v4f32() {
+; CHECK-LABEL: @constant_fold_minimum_v4f32(
+; CHECK-NEXT:    ret <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 5.000000e+00>
+;
+  %x = call <4 x float> @llvm.minimum.v4f32(<4 x float> <float 1.0, float 8.0, float 3.0, float 9.0>, <4 x float> <float 2.0, float 2.0, float 10.0, float 5.0>)
+  ret <4 x float> %x
+}
+
+define double @constant_fold_minimum_f64() {
+; CHECK-LABEL: @constant_fold_minimum_f64(
+; CHECK-NEXT:    ret double 1.000000e+00
+;
+  %x = call double @llvm.minimum.f64(double 1.0, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_minimum_f64_nan0() {
+; CHECK-LABEL: @constant_fold_minimum_f64_nan0(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.minimum.f64(double 0x7FF8000000000000, double 2.0)
+  ret double %x
+}
+
+define double @constant_fold_minimum_f64_nan1() {
+; CHECK-LABEL: @constant_fold_minimum_f64_nan1(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.minimum.f64(double 2.0, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define double @constant_fold_minimum_f64_nan_nan() {
+; CHECK-LABEL: @constant_fold_minimum_f64_nan_nan(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %x = call double @llvm.minimum.f64(double 0x7FF8000000000000, double 0x7FF8000000000000)
+  ret double %x
+}
+
+define float @canonicalize_constant_minimum_f32(float %x) {
+; CHECK-LABEL: @canonicalize_constant_minimum_f32(
+; CHECK-NEXT:    [[Y:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 1.000000e+00)
+; CHECK-NEXT:    ret float [[Y]]
+;
+  %y = call float @llvm.minimum.f32(float 1.0, float %x)
+  ret float %y
+}
+
+define float @minimum_f32_nan_val(float %x) {
+; CHECK-LABEL: @minimum_f32_nan_val(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.minimum.f32(float 0x7FF8000000000000, float %x)
+  ret float %y
+}
+
+define float @minimum_f32_val_nan(float %x) {
+; CHECK-LABEL: @minimum_f32_val_nan(
+; CHECK-NEXT:    ret float 0x7FF8000000000000
+;
+  %y = call float @llvm.minimum.f32(float %x, float 0x7FF8000000000000)
+  ret float %y
+}
+
+define float @minimum_f32_1_minimum_val_p0(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_val_p0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float %x, float 0.0)
+  %z = call float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minimum_f32_1_minimum_p0_val_fast(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_p0_val_fast(
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minimum_f32_1_minimum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minimum_f32_p0_minimum_val_n0(float %x) {
+; CHECK-LABEL: @minimum_f32_p0_minimum_val_n0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float %x, float -0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float %x, float -0.0)
+  %z = call float @llvm.minimum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @minimum_f32_1_minimum_p0_val(float %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_p0_val(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minimum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minimum.f32(float 0.0, float %x)
+  %z = call float @llvm.minimum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @minimum_f32_1_minimum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @minimum_f32_1_minimum_val_p0_val_v2f32(
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.minimum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
+define float @minimum4(float %x, float %y, float %z, float %w) {
+; CHECK-LABEL: @minimum4(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[W:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[B]])
+; CHECK-NEXT:    ret float [[C]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %z, float %w)
+  %c = call float @llvm.minimum.f32(float %a, float %b)
+  ret float %c
+}
+
+define float @minimum_x_maximum_x_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[X]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %x, float %a)
+  ret float %b
+}
+
+define float @maximum_x_minimum_x_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[X]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %x, float %a)
+  ret float %b
+}
+
+; PR37405 - https://bugs.llvm.org/show_bug.cgi?id=37405
+
+define double @neg_neg(double %x, double %y) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.maximum.f64(double [[X:%.*]], double [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  ret double %r
+}
+
+; FMF is not required, but it should be propagated from the intrinsic (not the fnegs).
+; Also, make sure this works with vectors.
+
+define <2 x double> @neg_neg_vec_fmf(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @neg_neg_vec_fmf(
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan ninf <2 x double> @llvm.maximum.v2f64(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub nnan ninf <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    ret <2 x double> [[R]]
+;
+  %negx = fsub reassoc <2 x double> <double -0.0, double -0.0>, %x
+  %negy = fsub fast <2 x double> <double -0.0, double -0.0>, %y
+  %r = call nnan ninf <2 x double> @llvm.minimum.v2f64(<2 x double> %negx, <2 x double> %negy)
+  ret <2 x double> %r
+}
+
+; 1 extra use of an intermediate value should still allow the fold,
+; but 2 would require more instructions than we started with.
+
+declare void @use(double)
+define double @neg_neg_extra_use_x(double %x, double %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub double -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.maximum.f64(double [[X]], double [[Y:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(double [[NEGX]])
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  call void @use(double %negx)
+  ret double %r
+}
+
+define double @neg_neg_extra_use_y(double %x, double %y) {
+; CHECK-LABEL: @neg_neg_extra_use_y(
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub double -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @llvm.maximum.f64(double [[X:%.*]], double [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = fsub double -0.000000e+00, [[TMP1]]
+; CHECK-NEXT:    call void @use(double [[NEGY]])
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  call void @use(double %negy)
+  ret double %r
+}
+
+define double @neg_neg_extra_use_x_and_y(double %x, double %y) {
+; CHECK-LABEL: @neg_neg_extra_use_x_and_y(
+; CHECK-NEXT:    [[NEGX:%.*]] = fsub double -0.000000e+00, [[X:%.*]]
+; CHECK-NEXT:    [[NEGY:%.*]] = fsub double -0.000000e+00, [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = call double @llvm.minimum.f64(double [[NEGX]], double [[NEGY]])
+; CHECK-NEXT:    call void @use(double [[NEGX]])
+; CHECK-NEXT:    call void @use(double [[NEGY]])
+; CHECK-NEXT:    ret double [[R]]
+;
+  %negx = fsub double -0.0, %x
+  %negy = fsub double -0.0, %y
+  %r = call double @llvm.minimum.f64(double %negx, double %negy)
+  call void @use(double %negx)
+  call void @use(double %negy)
+  ret double %r
+}
diff --git a/test/Transforms/InstCombine/minmax-demandbits.ll b/test/Transforms/InstCombine/minmax-demandbits.ll
new file mode 100644
index 0000000000000000000000000000000000000000..29a569663d21b80d071bd2f47a833208adf91ea8
--- /dev/null
+++ b/test/Transforms/InstCombine/minmax-demandbits.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+
+define i32 @and_umax_less(i32 %A) {
+; CHECK-LABEL: @and_umax_less(
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[A:%.*]], -32
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 31, %A
+  %l1 = select i1 %l0, i32 31, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @and_umax_muchless(i32 %A) {
+; CHECK-LABEL: @and_umax_muchless(
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[A:%.*]], -32
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 12, %A
+  %l1 = select i1 %l0, i32 12, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @and_umax_more(i32 %A) {
+; CHECK-LABEL: @and_umax_more(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[A:%.*]], 32
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[TMP1]], i32 [[A]], i32 32
+; CHECK-NEXT:    [[X:%.*]] = and i32 [[L1]], -32
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 32, %A
+  %l1 = select i1 %l0, i32 32, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @shr_umax(i32 %A) {
+; CHECK-LABEL: @shr_umax(
+; CHECK-NEXT:    [[X:%.*]] = lshr i32 [[A:%.*]], 4
+; CHECK-NEXT:    ret i32 [[X]]
+;
+  %l0 = icmp ugt i32 15, %A
+  %l1 = select i1 %l0, i32 15, i32 %A
+  %x = lshr i32 %l1, 4
+  ret i32 %x
+}
+
+; Various constants for C2 & umax(A, C1)
+
+define i8 @t_0_1(i8 %A) {
+; CHECK-LABEL: @t_0_1(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 1
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 0
+  %l1 = select i1 %l2, i8 %A, i8 0
+  %x = and i8 %l1, 1
+  ret i8 %x
+}
+
+define i8 @t_0_10(i8 %A) {
+; CHECK-LABEL: @t_0_10(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 10
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 0
+  %l1 = select i1 %l2, i8 %A, i8 0
+  %x = and i8 %l1, 10
+  ret i8 %x
+}
+
+define i8 @t_1_10(i8 %A) {
+; CHECK-LABEL: @t_1_10(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 10
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 1
+  %l1 = select i1 %l2, i8 %A, i8 1
+  %x = and i8 %l1, 10
+  ret i8 %x
+}
+
+define i8 @t_2_4(i8 %A) {
+; CHECK-LABEL: @t_2_4(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], 4
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 2
+  %l1 = select i1 %l2, i8 %A, i8 2
+  %x = and i8 %l1, 4
+  ret i8 %x
+}
+
+define i8 @t_2_192(i8 %A) {
+; CHECK-LABEL: @t_2_192(
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[A:%.*]], -64
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 2
+  %l1 = select i1 %l2, i8 %A, i8 2
+  %x = and i8 %l1, -64
+  ret i8 %x
+}
+
+define i8 @t_2_63_or(i8 %A) {
+; CHECK-LABEL: @t_2_63_or(
+; CHECK-NEXT:    [[X:%.*]] = or i8 [[A:%.*]], 63
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 2
+  %l1 = select i1 %l2, i8 %A, i8 2
+  %x = or i8 %l1, 63
+  ret i8 %x
+}
+
+define i8 @f_1_1(i8 %A) {
+; CHECK-LABEL: @f_1_1(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 1
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 1
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 1
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 1
+  %l1 = select i1 %l2, i8 %A, i8 1
+  %x = and i8 %l1, 1
+  ret i8 %x
+}
+
+define i8 @f_32_32(i8 %A) {
+; CHECK-LABEL: @f_32_32(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 32
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 32
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], -32
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 32
+  %l1 = select i1 %l2, i8 %A, i8 32
+  %x = and i8 %l1, -32
+  ret i8 %x
+}
+
+define i8 @f_191_192(i8 %A) {
+; CHECK-LABEL: @f_191_192(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], -65
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 -65
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], -64
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 191
+  %l1 = select i1 %l2, i8 %A, i8 191
+  %x = and i8 %l1, 192
+  ret i8 %x
+}
+
+define i8 @f_10_1(i8 %A) {
+; CHECK-LABEL: @f_10_1(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ugt i8 [[A:%.*]], 10
+; CHECK-NEXT:    [[L1:%.*]] = select i1 [[L2]], i8 [[A]], i8 10
+; CHECK-NEXT:    [[X:%.*]] = and i8 [[L1]], 1
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %l2 = icmp ugt i8 %A, 10
+  %l1 = select i1 %l2, i8 %A, i8 10
+  %x = and i8 %l1, 1
+  ret i8 %x
+}
+
+define i32 @and_umin(i32 %A) {
+; CHECK-LABEL: @and_umin(
+; CHECK-NEXT:    ret i32 0
+;
+  %l0 = icmp ult i32 15, %A
+  %l1 = select i1 %l0, i32 15, i32 %A
+  %x = and i32 %l1, -32
+  ret i32 %x
+}
+
+define i32 @or_umin(i32 %A) {
+; CHECK-LABEL: @or_umin(
+; CHECK-NEXT:    ret i32 31
+;
+  %l0 = icmp ult i32 15, %A
+  %l1 = select i1 %l0, i32 15, i32 %A
+  %x = or i32 %l1, 31
+  ret i32 %x
+}
+
+define i8 @or_min_31_30(i8 %A) {
+; CHECK-LABEL: @or_min_31_30(
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A:%.*]], 31
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %cmp = icmp ult i8 %A, -30
+  %min = select i1 %cmp, i8 %A, i8 -30
+  %r = or i8 %min, 31
+  ret i8 %r
+}
+
+define i8 @and_min_7_7(i8 %A) {
+; CHECK-LABEL: @and_min_7_7(
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A:%.*]], -8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %l2 = icmp ult i8 %A, -7
+  %min = select i1 %l2, i8 %A, i8 -7
+  %r = and i8 %min, -8
+  ret i8 %r
+}
+
+define i8 @and_min_7_8(i8 %A) {
+; CHECK-LABEL: @and_min_7_8(
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A:%.*]], -8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %l2 = icmp ult i8 %A, -8
+  %min = select i1 %l2, i8 %A, i8 -8
+  %r = and i8 %min, -8
+  ret i8 %r
+}
+
+define i8 @and_min_7_9(i8 %A) {
+; CHECK-LABEL: @and_min_7_9(
+; CHECK-NEXT:    [[L2:%.*]] = icmp ult i8 [[A:%.*]], -9
+; CHECK-NEXT:    [[MIN:%.*]] = select i1 [[L2]], i8 [[A]], i8 -9
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[MIN]], -8
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %l2 = icmp ult i8 %A, -9
+  %min = select i1 %l2, i8 %A, i8 -9
+  %r = and i8 %min, -8
+  ret i8 %r
+}
+
diff --git a/test/Transforms/InstCombine/minmax-fp.ll b/test/Transforms/InstCombine/minmax-fp.ll
index b94bce2dbb8081d11e03150103007f0313768b79..11418156a4858451cd71af78750ca498bd458a09 100644
--- a/test/Transforms/InstCombine/minmax-fp.ll
+++ b/test/Transforms/InstCombine/minmax-fp.ll
@@ -4,8 +4,8 @@
 ; This is the canonical form for a type-changing min/max.
 define double @t1(float %a) {
 ; CHECK-LABEL: @t1(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
 ;
@@ -18,8 +18,8 @@ define double @t1(float %a) {
 ; Check this is converted into canonical form, as above.
 define double @t2(float %a) {
 ; CHECK-LABEL: @t2(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
 ;
@@ -32,8 +32,8 @@ define double @t2(float %a) {
 ; Same again, with trunc.
 define float @t4(double %a) {
 ; CHECK-LABEL: @t4(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge double %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], double 5.000000e+00, double %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge double [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], double 5.000000e+00, double [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc double [[TMP1]] to float
 ; CHECK-NEXT:    ret float [[TMP2]]
 ;
@@ -46,8 +46,8 @@ define float @t4(double %a) {
 ; different values, should not be converted.
 define double @t5(float %a) {
 ; CHECK-LABEL: @t5(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float %a to double
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[A]] to double
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double 5.001000e+00
 ; CHECK-NEXT:    ret double [[TMP3]]
 ;
@@ -57,13 +57,15 @@ define double @t5(float %a) {
   ret double %3
 }
 
-; Signed zero, should not be converted
+; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = -0)."
+; So the compare constant may be treated as +0.0, and we sink the fpext.
+
 define double @t6(float %a) {
 ; CHECK-LABEL: @t6(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, -0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float %a to double
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double 0.000000e+00
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %1 = fcmp ult float %a, -0.0
   %2 = fpext float %a to double
@@ -71,13 +73,15 @@ define double @t6(float %a) {
   ret double %3
 }
 
-; Signed zero, should not be converted
+; From IEEE754: "Comparisons shall ignore the sign of zero (so +0 = -0)."
+; So the compare constant may be treated as -0.0, and we sink the fpext.
+
 define double @t7(float %a) {
 ; CHECK-LABEL: @t7(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fpext float %a to double
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], double [[TMP2]], double -0.000000e+00
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float -0.000000e+00, float [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext float [[TMP1]] to double
+; CHECK-NEXT:    ret double [[TMP2]]
 ;
   %1 = fcmp ult float %a, 0.0
   %2 = fpext float %a to double
@@ -85,10 +89,40 @@ define double @t7(float %a) {
   ret double %3
 }
 
+; min(min(x, 0.0), 0.0) --> min(x, 0.0)
+
+define float @fmin_fmin_zero_mismatch(float %x) {
+; CHECK-LABEL: @fmin_fmin_zero_mismatch(
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp olt float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[MIN2:%.*]] = select i1 [[TMP1]], float [[X]], float 0.000000e+00
+; CHECK-NEXT:    ret float [[MIN2]]
+;
+  %cmp1 = fcmp olt float %x, -0.0
+  %min1 = select i1 %cmp1, float %x, float 0.0
+  %cmp2 = fcmp olt float %min1, 0.0
+  %min2 = select i1 %cmp2, float %min1, float 0.0
+  ret float %min2
+}
+
+; max(max(x, -0.0), -0.0) --> max(x, -0.0)
+
+define float @fmax_fmax_zero_mismatch(float %x) {
+; CHECK-LABEL: @fmax_fmax_zero_mismatch(
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[MAX11:%.*]] = select i1 [[TMP1]], float [[X]], float -0.000000e+00
+; CHECK-NEXT:    ret float [[MAX11]]
+;
+  %cmp1 = fcmp ogt float %x, 0.0
+  %max1 = select i1 %cmp1, float %x, float -0.0
+  %cmp2 = fcmp ogt float 0.0, %max1
+  %max2 = select i1 %cmp2, float -0.0, float %max1
+  ret float %max2
+}
+
 define i64 @t8(float %a) {
 ; CHECK-LABEL: @t8(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 5.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 5.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 5.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptoui float [[TMP1]] to i64
 ; CHECK-NEXT:    ret i64 [[TMP2]]
 ;
@@ -100,8 +134,8 @@ define i64 @t8(float %a) {
 
 define i8 @t9(float %a) {
 ; CHECK-LABEL: @t9(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
@@ -114,8 +148,8 @@ define i8 @t9(float %a) {
   ; Either operand could be NaN, but fast modifier applied.
 define i8 @t11(float %a, float %b) {
 ; CHECK-LABEL: @t11(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp fast oge float %b, %a
-; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float %a, float %b
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp fast oge float [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float [[A]], float [[B]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi float [[DOTV]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
@@ -129,8 +163,8 @@ define i8 @t11(float %a, float %b) {
 ; Either operand could be NaN, but nnan modifier applied.
 define i8 @t12(float %a, float %b) {
 ; CHECK-LABEL: @t12(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nnan oge float %b, %a
-; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float %a, float %b
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nnan oge float [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[DOTV:%.*]] = select i1 [[DOTINV]], float [[A]], float [[B]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptosi float [[DOTV]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
@@ -144,8 +178,8 @@ define i8 @t12(float %a, float %b) {
 ; Float and int values do not match.
 define i8 @t13(float %a) {
 ; CHECK-LABEL: @t13(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float %a, 1.500000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float %a to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ult float [[A:%.*]], 1.500000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[A]] to i8
 ; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP1]], i8 [[TMP2]], i8 1
 ; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
@@ -158,8 +192,8 @@ define i8 @t13(float %a) {
 ; %a could be -0.0, but it doesn't matter because the conversion to int is the same for 0.0 or -0.0.
 define i8 @t14(float %a) {
 ; CHECK-LABEL: @t14(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
@@ -171,8 +205,8 @@ define i8 @t14(float %a) {
 
 define i8 @t14_commute(float %a) {
 ; CHECK-LABEL: @t14_commute(
-; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], float %a, float 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], float [[A]], float 0.000000e+00
 ; CHECK-NEXT:    [[TMP3:%.*]] = fptosi float [[TMP2]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP3]]
 ;
@@ -184,8 +218,8 @@ define i8 @t14_commute(float %a) {
 
 define i8 @t15(float %a) {
 ; CHECK-LABEL: @t15(
-; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nsz oge float %a, 0.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float %a
+; CHECK-NEXT:    [[DOTINV:%.*]] = fcmp nsz oge float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[DOTINV]], float 0.000000e+00, float [[A]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
@@ -197,8 +231,8 @@ define i8 @t15(float %a) {
 
 define double @t16(i32 %x) {
 ; CHECK-LABEL: @t16(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 %x, 0
-; CHECK-NEXT:    [[CST:%.*]] = sitofp i32 %x to double
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[CST:%.*]] = sitofp i32 [[X]] to double
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], double [[CST]], double 5.000000e-01
 ; CHECK-NEXT:    ret double [[SEL]]
 ;
@@ -210,8 +244,8 @@ define double @t16(i32 %x) {
 
 define double @t17(i32 %x) {
 ; CHECK-LABEL: @t17(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 %x, 2
-; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[TMP1]], i32 %x, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[SEL1:%.*]] = select i1 [[TMP1]], i32 [[X]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = sitofp i32 [[SEL1]] to double
 ; CHECK-NEXT:    ret double [[TMP2]]
 ;
diff --git a/test/Transforms/InstCombine/minnum.ll b/test/Transforms/InstCombine/minnum.ll
index a5236d2e50f550a2b81ae04e280ca4dc32819562..73b4f0c9251a65b24f860574ae2bffa056bde567 100644
--- a/test/Transforms/InstCombine/minnum.ll
+++ b/test/Transforms/InstCombine/minnum.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 declare float @llvm.minnum.f32(float, float)
-declare float @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
 
 declare double @llvm.minnum.f64(double, double)
@@ -147,6 +147,60 @@ define float @minnum_f32_val_nan(float %x) {
   ret float %y
 }
 
+define float @minnum_f32_1_minnum_val_p0(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_val_p0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float %x, float 0.0)
+  %z = call float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minnum_f32_1_minnum_p0_val_fast(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_p0_val_fast(
+; CHECK-NEXT: [[RES:%.*]] = call fast float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float 0.0, float %x)
+  %z = call fast float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minnum_f32_1_minnum_p0_val_nnan_ninf(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_p0_val_nnan_ninf(
+; CHECK-NEXT: [[RES:%.*]] = call nnan ninf float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float 0.0, float %x)
+  %z = call nnan ninf float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define float @minnum_f32_p0_minnum_val_n0(float %x) {
+; CHECK-LABEL: @minnum_f32_p0_minnum_val_n0(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float %x, float -0.0)
+  %z = call float @llvm.minnum.f32(float %y, float 0.0)
+  ret float %z
+}
+
+define float @minnum_f32_1_minnum_p0_val(float %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_p0_val(
+; CHECK-NEXT: [[RES:%.*]] = call float @llvm.minnum.f32(float %x, float 0.000000e+00)
+; CHECK-NEXT: ret float [[RES]]
+  %y = call float @llvm.minnum.f32(float 0.0, float %x)
+  %z = call float @llvm.minnum.f32(float %y, float 1.0)
+  ret float %z
+}
+
+define <2 x float> @minnum_f32_1_minnum_val_p0_val_v2f32(<2 x float> %x) {
+; CHECK-LABEL: @minnum_f32_1_minnum_val_p0_val_v2f32(
+; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+; CHECK-NEXT: ret <2 x float> [[RES]]
+  %y = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> zeroinitializer)
+  %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %y, <2 x float><float 1.0, float 1.0>)
+  ret <2 x float> %z
+}
+
 define float @minnum4(float %x, float %y, float %z, float %w) {
 ; CHECK-LABEL: @minnum4(
 ; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]])
diff --git a/test/Transforms/InstCombine/narrow-switch.ll b/test/Transforms/InstCombine/narrow-switch.ll
index 474bd820c8f8e8e7dc539e615413657a701cebdc..a8fa3e528db2cfbda869df155f8e9d418d2e6530 100644
--- a/test/Transforms/InstCombine/narrow-switch.ll
+++ b/test/Transforms/InstCombine/narrow-switch.ll
@@ -3,9 +3,6 @@
 ; RUN: opt < %s -instcombine -S -data-layout=n32    | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32
 ; RUN: opt < %s -instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64
 
-; In all cases, the data-layout is irrelevant. We should shrink as much as possible in InstCombine
-; and allow the backend to expand as much as needed to ensure optimal codegen for any target.
-
 define i32 @positive1(i64 %a) {
 ; ALL-LABEL: @positive1(
 ; ALL:         switch i32
@@ -102,13 +99,19 @@ return:
 ; Make sure to avoid assertion crashes and use the type before
 ; truncation to generate the sub constant expressions that leads
 ; to the recomputed condition.
+; We allow to truncate from i64 to i59 if in 32-bit mode,
+; because both are illegal.
 
 define void @trunc64to59(i64 %a) {
 ; ALL-LABEL: @trunc64to59(
-; ALL:         switch i59
-; ALL-NEXT:    i59 0, label %sw.bb1
-; ALL-NEXT:    i59 18717182647723699, label %sw.bb2
-; ALL-NEXT:    ]
+; ALL-CHECK32:         switch i59
+; ALL-CHECK32-NEXT:    i59 0, label %sw.bb1
+; ALL-CHECK32-NEXT:    i59 18717182647723699, label %sw.bb2
+; ALL-CHECK32-NEXT:    ]
+; ALL-CHECK64:         switch i64
+; ALL-CHECK64-NEXT:    i64 0, label %sw.bb1
+; ALL-CHECK64-NEXT:    i64 18717182647723699, label %sw.bb2
+; ALL-CHECK64-NEXT:    ]
 ;
 entry:
   %tmp0 = and i64 %a, 15
@@ -206,3 +209,54 @@ return:                                           ; preds = %sw.epilog, %sw.bb2,
   ret i32 %rval
 }
 
+; https://llvm.org/bugs/show_bug.cgi?id=29009
+
+@a = global i32 0, align 4
+@njob = global i32 0, align 4
+
+declare i32 @goo()
+
+; Make sure we do not shrink to illegal types (i3 in this case)
+; if original type is legal (i32 in this case)
+
+define void @PR29009() {
+; ALL-LABEL: @PR29009(
+; ALL:         switch i32
+; ALL-NEXT:    i32 0, label
+; ALL-NEXT:    i32 3, label
+; ALL-NEXT:    ]
+;
+  br label %1
+
+; <label>:1:                                      ; preds = %10, %0
+  %2 = load volatile i32, i32* @njob, align 4
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %4, label %11
+
+; <label>:4:                                      ; preds = %1
+  %5 = call i32 @goo()
+  %6 = and i32 %5, 7
+  switch i32 %6, label %7 [
+    i32 0, label %8
+    i32 3, label %9
+  ]
+
+; <label>:7:                                      ; preds = %4
+  store i32 6, i32* @a, align 4
+  br label %10
+
+; <label>:8:                                      ; preds = %4
+  store i32 1, i32* @a, align 4
+  br label %10
+
+; <label>:9:                                      ; preds = %4
+  store i32 2, i32* @a, align 4
+  br label %10
+
+; <label>:10:                                     ; preds = %13, %12, %11, %10, %9, %8, %7
+  br label %1
+
+; <label>:11:                                     ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/InstCombine/no_sink_instruction.ll b/test/Transforms/InstCombine/no_sink_instruction.ll
new file mode 100644
index 0000000000000000000000000000000000000000..caeba16fa2e8cde8e8c462b3f08542c1ecae0747
--- /dev/null
+++ b/test/Transforms/InstCombine/no_sink_instruction.ll
@@ -0,0 +1,19 @@
+; RUN: opt -instcombine -instcombine-code-sinking=0 -S < %s | FileCheck %s
+
+define i32 @test(i1 %C, i32 %A, i32 %B) {
+; CHECK-LABEL: @test(
+; CHECK: sdiv i32
+; CHECK-NEXT: add i32
+entry:
+        %tmp.2 = sdiv i32 %A, %B                ; <i32> [#uses=1]
+        %tmp.9 = add i32 %B, %A         ; <i32> [#uses=1]
+        br i1 %C, label %then, label %endif
+
+then:           ; preds = %entry
+; CHECK: ret i32
+        ret i32 %tmp.9
+
+endif:          ; preds = %entry
+; CHECK: ret i32
+        ret i32 %tmp.2
+}
diff --git a/test/Transforms/InstCombine/nsw.ll b/test/Transforms/InstCombine/nsw.ll
index 0bed76717ce9db13a4d0427b51c6366cad4226a1..8cb6421268fbda691a5b7579ecc5ebad17245f38 100644
--- a/test/Transforms/InstCombine/nsw.ll
+++ b/test/Transforms/InstCombine/nsw.ll
@@ -1,83 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-; CHECK-LABEL: @sub1(
-; CHECK: %y = sub i32 0, %x
-; CHECK: %z = sdiv i32 %y, 337
-; CHECK: ret i32 %z
 define i32 @sub1(i32 %x) {
+; CHECK-LABEL: @sub1(
+; CHECK-NEXT:    [[Y:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[Z:%.*]] = sdiv i32 [[Y]], 337
+; CHECK-NEXT:    ret i32 [[Z]]
+;
   %y = sub i32 0, %x
   %z = sdiv i32 %y, 337
   ret i32 %z
 }
 
-; CHECK-LABEL: @sub2(
-; CHECK: %z = sdiv i32 %x, -337
-; CHECK: ret i32 %z
 define i32 @sub2(i32 %x) {
+; CHECK-LABEL: @sub2(
+; CHECK-NEXT:    [[Z:%.*]] = sdiv i32 [[X:%.*]], -337
+; CHECK-NEXT:    ret i32 [[Z]]
+;
   %y = sub nsw i32 0, %x
   %z = sdiv i32 %y, 337
   ret i32 %z
 }
 
+define i1 @shl_icmp(i64 %X) {
 ; CHECK-LABEL: @shl_icmp(
-; CHECK: %B = icmp eq i64 %X, 0
-; CHECK: ret i1 %B
-define i1 @shl_icmp(i64 %X) nounwind {
+; CHECK-NEXT:    [[B:%.*]] = icmp eq i64 [[X:%.*]], 0
+; CHECK-NEXT:    ret i1 [[B]]
+;
   %A = shl nuw i64 %X, 2   ; X/4
   %B = icmp eq i64 %A, 0
   ret i1 %B
 }
 
+define i64 @shl1(i64 %X, i64* %P) {
 ; CHECK-LABEL: @shl1(
-; CHECK: %B = shl nuw nsw i64 %A, 8
-; CHECK: ret i64 %B
-define i64 @shl1(i64 %X, i64* %P) nounwind {
+; CHECK-NEXT:    [[A:%.*]] = and i64 [[X:%.*]], 312
+; CHECK-NEXT:    store i64 [[A]], i64* [[P:%.*]], align 4
+; CHECK-NEXT:    [[B:%.*]] = shl nuw nsw i64 [[A]], 8
+; CHECK-NEXT:    ret i64 [[B]]
+;
   %A = and i64 %X, 312
   store i64 %A, i64* %P  ; multiple uses of A.
   %B = shl i64 %A, 8
   ret i64 %B
 }
 
+define i32 @preserve1(i32 %x) {
 ; CHECK-LABEL: @preserve1(
-; CHECK: add nsw i32 %x, 5
-define i32 @preserve1(i32 %x) nounwind {
+; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[X:%.*]], 5
+; CHECK-NEXT:    ret i32 [[ADD3]]
+;
   %add = add nsw i32 %x, 2
   %add3 = add nsw i32 %add, 3
   ret i32 %add3
 }
 
+define i8 @nopreserve1(i8 %x) {
 ; CHECK-LABEL: @nopreserve1(
-; CHECK: add i8 %x, -126
-define i8 @nopreserve1(i8 %x) nounwind {
+; CHECK-NEXT:    [[ADD3:%.*]] = add i8 [[X:%.*]], -126
+; CHECK-NEXT:    ret i8 [[ADD3]]
+;
   %add = add nsw i8 %x, 127
   %add3 = add nsw i8 %add, 3
   ret i8 %add3
 }
 
+define i8 @nopreserve2(i8 %x) {
 ; CHECK-LABEL: @nopreserve2(
-; CHECK: add i8 %x, 3
-define i8 @nopreserve2(i8 %x) nounwind {
+; CHECK-NEXT:    [[ADD3:%.*]] = add i8 [[X:%.*]], 3
+; CHECK-NEXT:    ret i8 [[ADD3]]
+;
   %add = add i8 %x, 1
   %add3 = add nsw i8 %add, 2
   ret i8 %add3
 }
 
+define i8 @nopreserve3(i8 %A, i8 %B) {
 ; CHECK-LABEL: @nopreserve3(
-; CHECK: add i8 %A, %B
-; CHECK: add i8
-define i8 @nopreserve3(i8 %A, i8 %B) nounwind {
+; CHECK-NEXT:    [[Y:%.*]] = add i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[Y]], 20
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
   %x = add i8 %A, 10
   %y = add i8 %B, 10
   %add = add nsw i8 %x, %y
   ret i8 %add
 }
 
+define i8 @nopreserve4(i8 %A, i8 %B) {
 ; CHECK-LABEL: @nopreserve4(
-; CHECK: add i8 %A, %B
-; CHECK: add i8
-define i8 @nopreserve4(i8 %A, i8 %B) nounwind {
+; CHECK-NEXT:    [[Y:%.*]] = add i8 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[Y]], 20
+; CHECK-NEXT:    ret i8 [[ADD]]
+;
   %x = add nsw i8 %A, 10
   %y = add nsw i8 %B, 10
   %add = add nsw i8 %x, %y
   ret i8 %add
 }
+
+; TODO: computeKnownBits() should look through a shufflevector.
+
+define <3 x i32> @shl_nuw_nsw_shuffle_splat_vec(<2 x i8> %x) {
+; CHECK-LABEL: @shl_nuw_nsw_shuffle_splat_vec(
+; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[T3:%.*]] = shl nsw <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t2 = zext <2 x i8> %x to <2 x i32>
+  %shuf = shufflevector <2 x i32> %t2, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
+  %t3 = shl <3 x i32> %shuf, <i32 17, i32 17, i32 17>
+  ret <3 x i32> %t3
+}
+
+; Negative test - if the shuffle mask contains an undef, we bail out to
+; avoid propagating information that may not be used consistently by callers.
+
+define <3 x i32> @shl_nuw_nsw_shuffle_undef_elt_splat_vec(<2 x i8> %x) {
+; CHECK-LABEL: @shl_nuw_nsw_shuffle_undef_elt_splat_vec(
+; CHECK-NEXT:    [[T2:%.*]] = zext <2 x i8> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[T2]], <2 x i32> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+; CHECK-NEXT:    [[T3:%.*]] = shl <3 x i32> [[SHUF]], <i32 17, i32 17, i32 17>
+; CHECK-NEXT:    ret <3 x i32> [[T3]]
+;
+  %t2 = zext <2 x i8> %x to <2 x i32>
+  %shuf = shufflevector <2 x i32> %t2, <2 x i32> undef, <3 x i32> <i32 1, i32 undef, i32 0>
+  %t3 = shl <3 x i32> %shuf, <i32 17, i32 17, i32 17>
+  ret <3 x i32> %t3
+}
+
diff --git a/test/Transforms/InstCombine/operand-complexity.ll b/test/Transforms/InstCombine/operand-complexity.ll
new file mode 100644
index 0000000000000000000000000000000000000000..20abe7b48f9fd6cb4b54e917201b4fdc7c48e3c7
--- /dev/null
+++ b/test/Transforms/InstCombine/operand-complexity.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; 'Negate' is considered less complex than a normal binop, so the mul should have the binop as the first operand.
+
+define i8 @neg(i8 %x) {
+; CHECK-LABEL: @neg(
+; CHECK-NEXT:    [[BO:%.*]] = udiv i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i8 0, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = mul i8 [[BO]], [[NEGX]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %bo = udiv i8 %x, 42
+  %negx = sub i8 0, %x
+  %r = mul i8 %negx, %bo
+  ret i8 %r
+}
+
+define <2 x i8> @neg_vec(<2 x i8> %x) {
+; CHECK-LABEL: @neg_vec(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NEGX:%.*]] = sub <2 x i8> zeroinitializer, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NEGX]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %negx = sub <2 x i8> <i8 0, i8 0>, %x
+  %r = mul <2 x i8> %negx, %bo
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @neg_vec_undef(<2 x i8> %x) {
+; CHECK-LABEL: @neg_vec_undef(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NEGX:%.*]] = sub <2 x i8> <i8 0, i8 undef>, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NEGX]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %negx = sub <2 x i8> <i8 0, i8 undef>, %x
+  %r = mul <2 x i8> %negx, %bo
+  ret <2 x i8> %r
+}
+
+; 'Not' is considered less complex than a normal binop, so the mul should have the binop as the first operand.
+
+define i8 @not(i8 %x) {
+; CHECK-LABEL: @not(
+; CHECK-NEXT:    [[BO:%.*]] = udiv i8 [[X:%.*]], 42
+; CHECK-NEXT:    [[NOTX:%.*]] = xor i8 [[X]], -1
+; CHECK-NEXT:    [[R:%.*]] = mul i8 [[BO]], [[NOTX]]
+; CHECK-NEXT:    ret i8 [[R]]
+;
+  %bo = udiv i8 %x, 42
+  %notx = xor i8 -1, %x
+  %r = mul i8 %notx, %bo
+  ret i8 %r
+}
+
+define <2 x i8> @not_vec(<2 x i8> %x) {
+; CHECK-LABEL: @not_vec(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NOTX:%.*]] = xor <2 x i8> [[X]], <i8 -1, i8 -1>
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %notx = xor <2 x i8> <i8 -1, i8 -1>, %x
+  %r = mul <2 x i8> %notx, %bo
+  ret <2 x i8> %r
+}
+
+define <2 x i8> @not_vec_undef(<2 x i8> %x) {
+; CHECK-LABEL: @not_vec_undef(
+; CHECK-NEXT:    [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], <i8 42, i8 -42>
+; CHECK-NEXT:    [[NOTX:%.*]] = xor <2 x i8> [[X]], <i8 -1, i8 undef>
+; CHECK-NEXT:    [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]]
+; CHECK-NEXT:    ret <2 x i8> [[R]]
+;
+  %bo = udiv <2 x i8> %x, <i8 42, i8 -42>
+  %notx = xor <2 x i8> <i8 -1, i8 undef>, %x
+  %r = mul <2 x i8> %notx, %bo
+  ret <2 x i8> %r
+}
+
+; 'Fneg' is considered less complex than a normal binop, so the fmul should have the binop as the first operand.
+; Extra uses are required to ensure that the fneg is not canonicalized after the fmul.
+
+declare void @use(float)
+declare void @use_vec(<2 x float>)
+
+define float @fneg(float %x) {
+; CHECK-LABEL: @fneg(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv float [[X:%.*]], 4.200000e+01
+; CHECK-NEXT:    [[FNEGX:%.*]] = fsub float -0.000000e+00, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = fmul float [[BO]], [[FNEGX]]
+; CHECK-NEXT:    call void @use(float [[FNEGX]])
+; CHECK-NEXT:    ret float [[R]]
+;
+  %bo = fdiv float %x, 42.0
+  %fnegx = fsub float -0.0, %x
+  %r = fmul float %fnegx, %bo
+  call void @use(float %fnegx)
+  ret float %r
+}
+
+define <2 x float> @fneg_vec(<2 x float> %x) {
+; CHECK-LABEL: @fneg_vec(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    [[FNEGX:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]]
+; CHECK-NEXT:    call void @use_vec(<2 x float> [[FNEGX]])
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %bo = fdiv <2 x float> %x, <float 42.0, float -42.0>
+  %fnegx = fsub <2 x float> <float -0.0, float -0.0>, %x
+  %r = fmul <2 x float> %fnegx, %bo
+  call void @use_vec(<2 x float> %fnegx)
+  ret <2 x float> %r
+}
+
+define <2 x float> @fneg_vec_undef(<2 x float> %x) {
+; CHECK-LABEL: @fneg_vec_undef(
+; CHECK-NEXT:    [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], <float 4.200000e+01, float -4.200000e+01>
+; CHECK-NEXT:    [[FNEGX:%.*]] = fsub <2 x float> <float -0.000000e+00, float undef>, [[X]]
+; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]]
+; CHECK-NEXT:    call void @use_vec(<2 x float> [[FNEGX]])
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %bo = fdiv <2 x float> %x, <float 42.0, float -42.0>
+  %fnegx = fsub <2 x float> <float -0.0, float undef>, %x
+  %r = fmul <2 x float> %fnegx, %bo
+  call void @use_vec(<2 x float> %fnegx)
+  ret <2 x float> %r
+}
+
diff --git a/test/Transforms/InstCombine/pr27343.ll b/test/Transforms/InstCombine/pr27343.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5a9267b16af0a89c163d8add07e82f5f8b405184
--- /dev/null
+++ b/test/Transforms/InstCombine/pr27343.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -instcombine | FileCheck %s
+
+define i32 @__isnan(float %x) alwaysinline nounwind optsize {
+; CHECK-LABEL: @__isnan(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCAST:%.*]] = bitcast float [[X:%.*]] to i32
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[DOTCAST]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[SHL]], -16777216
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %x.addr = alloca float, align 4
+  store float %x, float* %x.addr, align 4
+  %0 = load float, float* %x.addr, align 4
+  %1 = bitcast float %0 to i32
+  %shl = shl i32 %1, 1
+  %cmp = icmp ugt i32 %shl, -16777216
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+define i1 @icmp_shl7(i32 %x) {
+; CHECK-LABEL: @icmp_shl7(
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[SHL]], 4608
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %shl = shl i32 %x, 7
+  %cmp = icmp slt i32 %shl, 4608
+  ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/pr39177.ll b/test/Transforms/InstCombine/pr39177.ll
new file mode 100644
index 0000000000000000000000000000000000000000..35c5ce0d3f621983d69300f41fb08bdcf6990cae
--- /dev/null
+++ b/test/Transforms/InstCombine/pr39177.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] }
+%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
+
+@stderr = external global %struct._IO_FILE*, align 8
+@.str = private constant [8 x i8] c"crash!\0A\00", align 1
+
+@fwrite = alias i64 (i8*, i64, i64, %struct._IO_FILE*), i64 (i8*, i64, i64, %struct._IO_FILE*)* @__fwrite_alias
+
+define i64 @__fwrite_alias(i8* %ptr, i64 %size, i64 %n, %struct._IO_FILE* %s) {
+; CHECK-LABEL: @__fwrite_alias(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i64 0
+;
+entry:
+  %ptr.addr = alloca i8*, align 8
+  %size.addr = alloca i64, align 8
+  %n.addr = alloca i64, align 8
+  %s.addr = alloca %struct._IO_FILE*, align 8
+  store i8* %ptr, i8** %ptr.addr, align 8
+  store i64 %size, i64* %size.addr, align 8
+  store i64 %n, i64* %n.addr, align 8
+  store %struct._IO_FILE* %s, %struct._IO_FILE** %s.addr, align 8
+  ret i64 0
+}
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @__fwrite_alias(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i64 7, i64 1, %struct._IO_FILE* [[TMP0]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  %0 = load %struct._IO_FILE*, %struct._IO_FILE** @stderr, align 8
+  %call = call i32 (%struct._IO_FILE*, i8*, ...) @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i32 0, i32 0))
+  ret void
+}
+
+declare i32 @fprintf(%struct._IO_FILE*, i8*, ...)
diff --git a/test/Transforms/InstCombine/select-binop-cmp.ll b/test/Transforms/InstCombine/select-binop-cmp.ll
index 5609643235d2ef73543fc531a5c3ee9008d65a9f..a473acd73049369f33b74479250e430f3d0e1766 100644
--- a/test/Transforms/InstCombine/select-binop-cmp.ll
+++ b/test/Transforms/InstCombine/select-binop-cmp.ll
@@ -142,7 +142,7 @@ define i32 @select_xor_inv_icmp2(i32 %x, i32 %y, i32 %z) {
 
 define float @select_fadd_fcmp(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -152,9 +152,23 @@ define float @select_fadd_fcmp(float %x, float %y, float %z) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_poszero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, 0.0
+  %B = fadd nsz float %z, %x
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fadd_fcmp_2(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_2(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -166,9 +180,25 @@ define float @select_fadd_fcmp_2(float %x, float %y, float %v) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_2_poszero(float %x, float %y, float %v) {
+; CHECK-LABEL: @select_fadd_fcmp_2_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, 0.0
+  %z = fadd float %v, 0.0 ; cannot produce -0.0
+  %B = fadd float %z, %x
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 define float @select_fadd_fcmp_3(float %x, float %y) {
 ; CHECK-LABEL: @select_fadd_fcmp_3(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float 6.000000e+00
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -178,9 +208,23 @@ define float @select_fadd_fcmp_3(float %x, float %y) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_3_poszero(float %x, float %y) {
+; CHECK-LABEL: @select_fadd_fcmp_3_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float 6.000000e+00
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, 0.0
+  %B = fadd float 6.0, %x
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 define float @select_fadd_fcmp_4(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_4(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -190,9 +234,23 @@ define float @select_fadd_fcmp_4(float %x, float %y, float %z) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_4_poszero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_4_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp une float %x, 0.0
+  %B = fadd nsz float %z, %x
+  %C = select i1 %A, float %y, float %B
+  ret float %C
+}
+
 define float @select_fadd_fcmp_5(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_5(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -204,9 +262,25 @@ define float @select_fadd_fcmp_5(float %x, float %y, float %v) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_5_poszero(float %x, float %y, float %v) {
+; CHECK-LABEL: @select_fadd_fcmp_5_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, 0.0
+  %z = fadd float %v, 0.0 ; cannot produce -0.0
+  %B = fadd float %z, %x
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fadd_fcmp_6(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_6(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float 6.000000e+00, float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -216,6 +290,20 @@ define float @select_fadd_fcmp_6(float %x, float %y, float %z) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fadd_fcmp_6_poszero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fadd_fcmp_6_poszero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float 6.000000e+00, float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, 0.0
+  %B = fadd float %x, 6.0
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fmul_fcmp(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fmul_fcmp(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
@@ -240,6 +328,20 @@ define float @select_fsub_fcmp(float %x, float %y, float %z) {
   ret float %C
 }
 
+; This is logically equivalent to the previous test - fcmp ignores the sign of 0.0.
+
+define float @select_fsub_fcmp_negzero(float %x, float %y, float %z) {
+; CHECK-LABEL: @select_fsub_fcmp_negzero(
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
+; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Z:%.*]], float [[Y:%.*]]
+; CHECK-NEXT:    ret float [[C]]
+;
+  %A = fcmp oeq float %x, -0.0
+  %B = fsub nsz float %z, %x
+  %C = select i1 %A, float %B, float %y
+  ret float %C
+}
+
 define float @select_fdiv_fcmp(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fdiv_fcmp(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
@@ -573,7 +675,7 @@ define float @select_fadd_fcmp_bad_3(float %x, float %y, float %z, float %k) {
 ; Invalid order of operands of select
 define float @select_fadd_fcmp_bad_4(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_4(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -587,7 +689,7 @@ define float @select_fadd_fcmp_bad_4(float %x, float %y, float %z) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_5(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_5(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -601,7 +703,7 @@ define float @select_fadd_fcmp_bad_5(float %x, float %y, float %z) {
 ; Invalid order of operands of select
 define float @select_fadd_fcmp_bad_6(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_6(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -615,7 +717,7 @@ define float @select_fadd_fcmp_bad_6(float %x, float %y, float %z) {
 ; Do not transform if we have signed zeros and if Z is possibly negative zero
 define float @select_fadd_fcmp_bad_7(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_7(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -629,7 +731,7 @@ define float @select_fadd_fcmp_bad_7(float %x, float %y, float %z) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_8(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_8(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], -1.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
@@ -645,7 +747,7 @@ define float @select_fadd_fcmp_bad_8(float %x, float %y, float %v) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_9(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_9(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -659,7 +761,7 @@ define float @select_fadd_fcmp_bad_9(float %x, float %y, float %z) {
 ; Invalid comparison type
 define float @select_fadd_fcmp_bad_10(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_10(
-; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp one float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
@@ -675,7 +777,7 @@ define float @select_fadd_fcmp_bad_10(float %x, float %y, float %v) {
 ; Do not transform if Z is possibly negative zero
 define float @select_fadd_fcmp_bad_11(float %x, float %y, float %v) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_11(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[Z:%.*]] = fadd float [[V:%.*]], -1.000000e+00
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[Z]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -690,7 +792,7 @@ define float @select_fadd_fcmp_bad_11(float %x, float %y, float %v) {
 ; Do not transform if we have signed zeros and if Z is possibly negative zero
 define float @select_fadd_fcmp_bad_12(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_12(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -704,7 +806,7 @@ define float @select_fadd_fcmp_bad_12(float %x, float %y, float %z) {
 ; Invalid order of operands of select
 define float @select_fadd_fcmp_bad_13(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fadd_fcmp_bad_13(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], -0.000000e+00
+; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
 ; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[X]], [[Z:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
 ; CHECK-NEXT:    ret float [[C]]
@@ -729,34 +831,6 @@ define float @select_fadd_fcmp_bad_14(float %x, float %y, float %z) {
   ret float %C
 }
 
-; Invalid identity constant for FP op
-define float @select_fadd_fcmp_bad_15(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fadd_fcmp_bad_15(
-; CHECK-NEXT:    [[A:%.*]] = fcmp une float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[Y:%.*]], float [[B]]
-; CHECK-NEXT:    ret float [[C]]
-;
-  %A = fcmp une float %x, 0.0
-  %B = fadd nsz float %z, %x
-  %C = select i1 %A, float %y, float %B
-  ret float %C
-}
-
-; Invalid identity constant for FP op
-define float @select_fadd_fcmp_bad_16(float %x, float %y, float %z) {
-; CHECK-LABEL: @select_fadd_fcmp_bad_16(
-; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fadd nsz float [[Z:%.*]], [[X]]
-; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
-; CHECK-NEXT:    ret float [[C]]
-;
-  %A = fcmp oeq float %x, 0.0
-  %B = fadd nsz float %z, %x
-  %C = select i1 %A, float %B, float %y
-  ret float %C
-}
-
 define float @select_fmul_fcmp_bad(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fmul_fcmp_bad(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 3.000000e+00
@@ -835,15 +909,18 @@ define float @select_fdiv_fcmp_bad_2(float %x, float %y, float %z) {
   ret float %C
 }
 
+; The transform is not valid when x = -0.0 and z = -0.0
+; (optimized code would return -0.0, but this returns +0.0).
+
 define float @select_fsub_fcmp_bad(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_bad(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 0.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fsub float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fsub float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 0.0
-  %B = fsub float %x, %z
+  %B = fsub float %z, %x
   %C = select i1 %A, float %B, float %y
   ret float %C
 }
@@ -851,12 +928,12 @@ define float @select_fsub_fcmp_bad(float %x, float %y, float %z) {
 define float @select_fsub_fcmp_bad_2(float %x, float %y, float %z) {
 ; CHECK-LABEL: @select_fsub_fcmp_bad_2(
 ; CHECK-NEXT:    [[A:%.*]] = fcmp oeq float [[X:%.*]], 1.000000e+00
-; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[X]], [[Z:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = fsub nsz float [[Z:%.*]], [[X]]
 ; CHECK-NEXT:    [[C:%.*]] = select i1 [[A]], float [[B]], float [[Y:%.*]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
   %A = fcmp oeq float %x, 1.0
-  %B = fsub nsz float %x, %z
+  %B = fsub nsz float %z, %x
   %C = select i1 %A, float %B, float %y
   ret float %C
 }
diff --git a/test/Transforms/InstCombine/select_meta.ll b/test/Transforms/InstCombine/select_meta.ll
index 816504b296c5215730f1dc549496b4dc41f6200b..67dd246c040822f485c55cbc920dafc71ab66cd2 100644
--- a/test/Transforms/InstCombine/select_meta.ll
+++ b/test/Transforms/InstCombine/select_meta.ll
@@ -298,6 +298,44 @@ define i32 @umax2(i32 %x) {
   ret i32 %sel
 }
 
+; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+
+define i32 @not_cond(i1 %c, i32 %tv, i32 %fv) {
+; CHECK-LABEL: @not_cond(
+; CHECK-NEXT:    [[R:%.*]] = select i1 [[C:%.*]], i32 [[FV:%.*]], i32 [[TV:%.*]], !prof ![[$MD3]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %notc = xor i1 %c, true
+  %r = select i1 %notc, i32 %tv, i32 %fv, !prof !1
+  ret i32 %r
+}
+
+; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+
+define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
+; CHECK-LABEL: @not_cond_vec(
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof ![[$MD3]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %notc = xor <2 x i1> %c, <i1 true, i1 true>
+  %r = select <2 x i1> %notc, <2 x i32> %tv, <2 x i32> %fv, !prof !1
+  ret <2 x i32> %r
+}
+
+; Should match vector 'not' with undef element.
+; The condition is inverted, and the select ops are swapped. The metadata should be swapped.
+
+define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) {
+; CHECK-LABEL: @not_cond_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof ![[$MD3]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %notc = xor <2 x i1> %c, <i1 undef, i1 true>
+  %r = select <2 x i1> %notc, <2 x i32> %tv, <2 x i32> %fv, !prof !1
+  ret <2 x i32> %r
+}
+
+
 !1 = !{!"branch_weights", i32 2, i32 10}
 !2 = !{!"branch_weights", i32 3, i32 10}
 
diff --git a/test/Transforms/InstCombine/simplify-libcalls-erased.ll b/test/Transforms/InstCombine/simplify-libcalls-erased.ll
new file mode 100644
index 0000000000000000000000000000000000000000..19cfcf8eba9c7adb4026adc0bea43b99b9f1dd2c
--- /dev/null
+++ b/test/Transforms/InstCombine/simplify-libcalls-erased.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -instcombine | FileCheck %s
+
+target triple = "x86_64"
+
+define double @pow_exp(double %x, double %y) {
+; CHECK-LABEL: @pow_exp(
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[EXP:%.*]] = call fast double @llvm.exp.f64(double [[MUL]])
+; CHECK-NEXT:    ret double [[EXP]]
+;
+  %A = alloca i1
+  %call = call fast double @exp(double %x) #1
+  %pow = call fast double @llvm.pow.f64(double %call, double %y)
+  %C1 = fcmp ule double %call, %pow
+  store i1 %C1, i1* %A
+  ret double %pow
+}
+
+declare double @exp(double)
+
+declare double @llvm.pow.f64(double, double) #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 8a568602f2bc79457db3c5d0705fa460492562cd..dd9fadf20232ec96b0c0bd45509992838ac70f56 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -1,43 +1,175 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
 target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 
-; Optimize subtracts.
+define i32 @sub_constant(i32 %x) {
+; CHECK-LABEL: @sub_constant(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[X:%.*]], -42
+; CHECK-NEXT:    ret i32 [[R]]
 ;
-; RUN: opt < %s -instcombine -S | FileCheck %s
+  %r = sub i32 %x, 42
+  ret i32 %r
+}
+
+@g = global i32 0
 
-define i32 @test1(i32 %A) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    ret i32 0
+define i32 @sub_constant_expression(i32 %x) {
+; CHECK-LABEL: @sub_constant_expression(
+; CHECK-NEXT:    [[R:%.*]] = sub i32 [[X:%.*]], ptrtoint (i32* @g to i32)
+; CHECK-NEXT:    ret i32 [[R]]
 ;
-  %B = sub i32 %A, %A
-  ret i32 %B
+  %r = sub i32 %x, ptrtoint (i32* @g to i32)
+  ret i32 %r
 }
 
-define i32 @test2(i32 %A) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    ret i32 [[A:%.*]]
+define <2 x i32> @sub_constant_vec(<2 x i32> %x) {
+; CHECK-LABEL: @sub_constant_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[X:%.*]], <i32 -42, i32 12>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
-  %B = sub i32 %A, 0
-  ret i32 %B
+  %r = sub <2 x i32> %x, <i32 42, i32 -12>
+  ret <2 x i32> %r
 }
 
-define i32 @test3(i32 %A) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    ret i32 [[A:%.*]]
+define <3 x i33> @sub_constant_vec_weird_type(<3 x i33> %x) {
+; CHECK-LABEL: @sub_constant_vec_weird_type(
+; CHECK-NEXT:    [[R:%.*]] = add <3 x i33> [[X:%.*]], <i33 42, i33 -42, i33 12>
+; CHECK-NEXT:    ret <3 x i33> [[R]]
 ;
-  %B = sub i32 0, %A
-  %C = sub i32 0, %B
-  ret i32 %C
+  %r = sub <3 x i33> %x, <i33 -42, i33 42, i33 -12>
+  ret <3 x i33> %r
 }
 
-define i32 @test4(i32 %A, i32 %x) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]]
-; CHECK-NEXT:    ret i32 [[C]]
+define <4 x i32> @sub_constant_expression_vec(<4 x i32> %x) {
+; CHECK-LABEL: @sub_constant_expression_vec(
+; CHECK-NEXT:    [[R:%.*]] = sub <4 x i32> [[X:%.*]], bitcast (i128 ptrtoint (i32* @g to i128) to <4 x i32>)
+; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
-  %B = sub i32 0, %A
-  %C = sub i32 %x, %B
-  ret i32 %C
+  %r = sub <4 x i32> %x, bitcast (i128 ptrtoint (i32* @g to i128) to <4 x i32>)
+  ret <4 x i32> %r
+}
+
+define i32 @neg_sub(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_sub(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub i32 0, %x
+  %r = sub i32 %y, %neg
+  ret i32 %r
+}
+
+define i32 @neg_nsw_sub(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_nsw_sub(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub nsw i32 0, %x
+  %r = sub i32 %y, %neg
+  ret i32 %r
+}
+
+define i32 @neg_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_sub_nsw(
+; CHECK-NEXT:    [[R:%.*]] = add i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub i32 0, %x
+  %r = sub nsw i32 %y, %neg
+  ret i32 %r
+}
+
+define i32 @neg_nsw_sub_nsw(i32 %x, i32 %y) {
+; CHECK-LABEL: @neg_nsw_sub_nsw(
+; CHECK-NEXT:    [[R:%.*]] = add nsw i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %neg = sub nsw i32 0, %x
+  %r = sub nsw i32 %y, %neg
+  ret i32 %r
+}
+
+define <2 x i32> @neg_sub_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> zeroinitializer, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_nsw_sub_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> zeroinitializer, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_nsw_vec(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> zeroinitializer, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_nsw_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_nsw_vec(
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> zeroinitializer, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> <i32 0, i32 undef>, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_nsw_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> <i32 undef, i32 0>, %x
+  %r = sub <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+define <2 x i32> @neg_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_sub_nsw_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub <2 x i32> <i32 undef, i32 0>, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
+}
+
+; This should not drop 'nsw'.
+
+define <2 x i32> @neg_nsw_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @neg_nsw_sub_nsw_vec_undef(
+; CHECK-NEXT:    [[R:%.*]] = add nsw <2 x i32> [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %neg = sub nsw <2 x i32> <i32 0, i32 undef>, %x
+  %r = sub nsw <2 x i32> %y, %neg
+  ret <2 x i32> %r
 }
 
 ; (~X) - (~Y) --> Y - X
@@ -499,7 +631,7 @@ define <2 x i32> @test27commutedvecmixed(<2 x i32> %x, <2 x i32> %y) {
 
 define i32 @test28(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @test28(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[Z:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[Y:%.*]], [[Z:%.*]]
 ; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
@@ -626,16 +758,6 @@ define i32 @test38(i32 %A) {
   ret i32 %sub
 }
 
-define i32 @test39(i32 %A, i32 %x) {
-; CHECK-LABEL: @test39(
-; CHECK-NEXT:    [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]]
-; CHECK-NEXT:    ret i32 [[C]]
-;
-  %B = sub i32 0, %A
-  %C = sub nsw i32 %x, %B
-  ret i32 %C
-}
-
 define i16 @test40(i16 %a, i16 %b) {
 ; CHECK-LABEL: @test40(
 ; CHECK-NEXT:    [[ASHR:%.*]] = ashr i16 [[A:%.*]], 1
@@ -921,7 +1043,8 @@ define i32 @test56(i32 %A, i32 %B) {
 ;
   %X = add i32 %A, %B
   %Y = sub i32 %A, %X
-  ret i32 %Y                                                                                                                                                                                                                                             }
+  ret i32 %Y
+}
 
 define i32 @test57(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test57(
@@ -930,22 +1053,22 @@ define i32 @test57(i32 %A, i32 %B) {
 ;
   %X = add i32 %B, %A
   %Y = sub i32 %A, %X
-  ret i32 %Y                                                                                                                                                                                                                                             }
+  ret i32 %Y
+}
 
 @dummy_global1 = external global i8*
 @dummy_global2 = external global i8*
 
 define i64 @test58([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
-; CHECK-LABEL: @test58(
-; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[J:%.*]], 4200
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[I:%.*]], 4200
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP2:%.*]] [[TMP1:%.*]]
-; CHECK-NEXT:    ret i64 [[TMP3]]
-;
 ; Note the reassociate pass and another instcombine pass will further optimize this to
 ; "%sub = i64 %i, %j, ret i64 %sub"
-;
 ; gep1 and gep2 have only one use
+; CHECK-LABEL: @test58(
+; CHECK-NEXT:    [[GEP2_OFFS:%.*]] = add i64 [[J:%.*]], 4200
+; CHECK-NEXT:    [[GEP1_OFFS:%.*]] = add i64 [[I:%.*]], 4200
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[GEP1_OFFS]], [[GEP2_OFFS]]
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
   %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
   %gep2 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %j
   %cast1 = ptrtoint i8* %gep1 to i64
@@ -956,11 +1079,11 @@ define i64 @test58([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 
 define i64 @test59([100 x [100 x i8]]* %foo, i64 %i) {
 ; CHECK-LABEL: @test59(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO:%.*]], i64 0, i64 42, i64 [[I:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO]], i64 0, i64 42, i64 0
 ; CHECK-NEXT:    store i8* [[GEP1]], i8** @dummy_global1, align 8
 ; CHECK-NEXT:    store i8* [[GEP2]], i8** @dummy_global2, align 8
-; CHECK-NEXT:    ret i64 %i
+; CHECK-NEXT:    ret i64 [[I]]
 ;
 ; gep1 and gep2 have more than one uses
   %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
@@ -975,8 +1098,8 @@ define i64 @test59([100 x [100 x i8]]* %foo, i64 %i) {
 
 define i64 @test60([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test60(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO:%.*]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO]], i64 0, i64 42, i64 0
 ; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint i8* [[GEP1]] to i64
 ; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint i8* [[GEP2]] to i64
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
@@ -995,8 +1118,8 @@ define i64 @test60([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 
 define i64 @test61([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test61(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO:%.*]], i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* [[FOO]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]]
 ; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint i8* [[GEP1]] to i64
 ; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint i8* [[GEP2]] to i64
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll
index ea76115fc4497d3eca87692d62bf087aecc02e95..39bd40874160b2b9f516956dc72a56770f9d8755 100644
--- a/test/Transforms/InstCombine/vec_sext.ll
+++ b/test/Transforms/InstCombine/vec_sext.ll
@@ -1,15 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @psignd_3(
+define <4 x i32> @vec_select(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_select(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[T1]], [[A]]
-; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1]], [[SUB]]
-; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
-; CHECK-NEXT:    ret <4 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A]], <4 x i32> [[SUB]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = icmp slt <4 x i32> %b, zeroinitializer
   %sext = sext <4 x i1> %cmp to <4 x i32>
@@ -23,15 +20,12 @@ define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %cond
 }
 
-define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b) {
-; CHECK-LABEL: @test1(
+define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @vec_select_alternate_sign_bit_test(
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[B_LOBIT1:%.*]] = ashr <4 x i32> [[B:%.*]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[B_LOBIT1_NOT:%.*]] = xor <4 x i32> [[B_LOBIT1]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[B_LOBIT1]], [[A]]
-; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT1_NOT]], [[SUB]]
-; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
-; CHECK-NEXT:    ret <4 x i32> [[COND]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SUB]], <4 x i32> [[A]]
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
 ;
   %cmp = icmp sgt <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %sext = sext <4 x i1> %cmp to <4 x i32>
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index 8adb211b0a9354b3830d52cef054be97e1c8a6b9..7692fe3e05c046880233ecc4efd0b05e59eb7063 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -170,7 +170,7 @@ define <8 x i8> @test12a(<8 x i8> %t6, <8 x i8> %t2) {
   ret <8 x i8> %t3
 }
 
-; TODO: The mask length of the 1st shuffle can be reduced to eliminate the 2nd shuffle.
+; The mask length of the 1st shuffle can be reduced to eliminate the 2nd shuffle.
 
 define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: @extract_subvector_of_shuffle(
@@ -182,7 +182,6 @@ define <2 x i8> @extract_subvector_of_shuffle(<2 x i8> %x, <2 x i8> %y) {
   ret <2 x i8> %extract_subv
 }
 
-; TODO:
 ; Extra uses are ok.
 ; Undef elements in either mask are ok. Undefs from the 2nd shuffle mask should propagate to the new shuffle.
 ; The type of the inputs does not have to match the output type.
diff --git a/test/Transforms/InstCombine/xor.ll b/test/Transforms/InstCombine/xor.ll
index c149cef295bfcb494c76079ed0f2a72156bfcce2..b06abe2919b9f3fb855427908322ad6e06cbaf95 100644
--- a/test/Transforms/InstCombine/xor.ll
+++ b/test/Transforms/InstCombine/xor.ll
@@ -190,16 +190,6 @@ define void @test20(i32 %A, i32 %B) {
   ret void
 }
 
-define i32 @test21(i1 %C, i32 %A, i32 %B) {
-; CHECK-LABEL: @test21(
-; CHECK-NEXT:    [[D:%.*]] = select i1 [[C:%.*]], i32 [[B:%.*]], i32 [[A:%.*]]
-; CHECK-NEXT:    ret i32 [[D]]
-;
-  %C2 = xor i1 %C, true
-  %D = select i1 %C2, i32 %A, i32 %B
-  ret i32 %D
-}
-
 define i32 @test22(i1 %X) {
 ; CHECK-LABEL: @test22(
 ; CHECK-NEXT:    [[Z:%.*]] = zext i1 [[X:%.*]] to i32
diff --git a/test/Transforms/InstMerge/st_sink_check_debug.ll b/test/Transforms/InstMerge/st_sink_check_debug.ll
new file mode 100644
index 0000000000000000000000000000000000000000..94d46a58f4cdeaba9aba842fd842e304cd745232
--- /dev/null
+++ b/test/Transforms/InstMerge/st_sink_check_debug.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -S -debugify -mldst-motion -o - | FileCheck %s
+
+%struct.S = type { i32 }
+
+define dso_local void @foo(%struct.S* %this, i32 %bar) {
+entry:
+  %this.addr = alloca %struct.S*, align 8
+  %bar.addr = alloca i32, align 4
+  store %struct.S* %this, %struct.S** %this.addr, align 8
+  store i32 %bar, i32* %bar.addr, align 4
+  %this1 = load %struct.S*, %struct.S** %this.addr, align 8
+  %0 = load i32, i32* %bar.addr, align 4
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %foo = getelementptr inbounds %struct.S, %struct.S* %this1, i32 0, i32 0
+  store i32 1, i32* %foo, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %foo2 = getelementptr inbounds %struct.S, %struct.S* %this1, i32 0, i32 0
+  store i32 0, i32* %foo2, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; CHECK:      @foo
+; CHECK:      if.end: ; preds = %if.else, %if.then
+; CHECK-NEXT:   %.sink = phi {{.*}} !dbg
diff --git a/test/Transforms/InstSimplify/fcmp-select.ll b/test/Transforms/InstSimplify/fcmp-select.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eae885c8471f320dcd0d5ff8764a10361c3f57a2
--- /dev/null
+++ b/test/Transforms/InstSimplify/fcmp-select.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; X == 42.0 ? X : 42.0 --> 42.0
+
+define double @oeq(double %x) {
+; CHECK-LABEL: @oeq(
+; CHECK-NEXT:    ret double 4.200000e+01
+;
+  %cmp = fcmp oeq double %x, 42.0
+  %cond = select i1 %cmp, double %x, double 42.0
+  ret double %cond
+}
+
+; X == 42.0 ? 42.0 : X --> X
+
+define float @oeq_swapped(float %x) {
+; CHECK-LABEL: @oeq_swapped(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %cmp = fcmp oeq float %x, 42.0
+  %cond = select i1 %cmp, float 42.0, float %x
+  ret float %cond
+}
+
+; x != y ? x : y -> x if it's the right kind of != and at least
+; one of x and y is not negative zero.
+
+; X != 42.0 ? X : 42.0 --> X
+
+define double @une(double %x) {
+; CHECK-LABEL: @une(
+; CHECK-NEXT:    ret double [[X:%.*]]
+;
+  %cmp = fcmp une double %x, 42.0
+  %cond = select i1 %cmp, double %x, double 42.0
+  ret double %cond
+}
+
+; X != 42.0 ? 42.0 : X --> 42.0
+
+define double @une_swapped(double %x) {
+; CHECK-LABEL: @une_swapped(
+; CHECK-NEXT:    ret double 4.200000e+01
+;
+  %cmp = fcmp une double %x, 42.0
+  %cond = select i1 %cmp, double 42.0, double %x
+  ret double %cond
+}
+
+define double @une_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double [[Y]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp une double %x, %y
+  %cond = select i1 %cmp, double %x, double %y
+  ret double %cond
+}
+
+define double @une_swapped_could_be_negzero(double %x, double %y) {
+; CHECK-LABEL: @une_swapped_could_be_negzero(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[Y]], double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp une double %x, %y
+  %cond = select i1 %cmp, double %y, double %x
+  ret double %cond
+}
+
+define double @one(double %x) {
+; CHECK-LABEL: @one(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double [[X]], double -1.000000e+00
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp one double %x, -1.0
+  %cond = select i1 %cmp, double %x, double -1.0
+  ret double %cond
+}
+
+define double @one_swapped(double %x) {
+; CHECK-LABEL: @one_swapped(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double [[X:%.*]], -1.000000e+00
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], double -1.000000e+00, double [[X]]
+; CHECK-NEXT:    ret double [[COND]]
+;
+  %cmp = fcmp one double %x, -1.0
+  %cond = select i1 %cmp, double -1.0, double %x
+  ret double %cond
+}
+
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index 6b6ae48f516165ea761a3e8e0a6ce69317559f2f..acc24d9ba60fcf2563d4d41dd9f82f79b76eed3e 100644
--- a/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -790,6 +790,322 @@ define float @maxnum_neginf(float %x) {
   ret float %val
 }
 
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
+declare double @llvm.minimum.f64(double, double)
+declare double @llvm.maximum.f64(double, double)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+; From the LangRef for minimum/maximum:
+; "If either operand is a NaN, returns NaN."
+
+define double @maximum_nan_op0(double %x) {
+; CHECK-LABEL: @maximum_nan_op0(
+; CHECK-NEXT:    ret double 0x7FF8000000000000
+;
+  %r = call double @llvm.maximum.f64(double 0x7ff8000000000000, double %x)
+  ret double %r
+}
+
+define double @maximum_nan_op1(double %x) {
+; CHECK-LABEL: @maximum_nan_op1(
+; CHECK-NEXT:    ret double 0x7FF800000000DEAD
+;
+  %r = call double @llvm.maximum.f64(double %x, double 0x7ff800000000dead)
+  ret double %r
+}
+
+define double @minimum_nan_op0(double %x) {
+; CHECK-LABEL: @minimum_nan_op0(
+; CHECK-NEXT:    ret double 0x7FF8000DEAD00000
+;
+  %r = call double @llvm.minimum.f64(double 0x7ff8000dead00000, double %x)
+  ret double %r
+}
+
+define double @minimum_nan_op1(double %x) {
+; CHECK-LABEL: @minimum_nan_op1(
+; CHECK-NEXT:    ret double 0x7FF800DEAD00DEAD
+;
+  %r = call double @llvm.minimum.f64(double %x, double 0x7ff800dead00dead)
+  ret double %r
+}
+
+define <2 x double> @maximum_nan_op0_vec(<2 x double> %x) {
+; CHECK-LABEL: @maximum_nan_op0_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF8000000000000, double undef>
+;
+  %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> <double 0x7ff8000000000000, double undef>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) {
+; CHECK-LABEL: @maximum_nan_op1_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF800000000DEAD, double 0x7FF8FFFFFFFFFFFF>
+;
+  %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800000000dead, double 0x7ff8ffffffffffff>)
+  ret <2 x double> %r
+}
+
+define <2 x double> @minimum_nan_op0_vec(<2 x double> %x) {
+; CHECK-LABEL: @minimum_nan_op0_vec(
+; CHECK-NEXT:    ret <2 x double> <double undef, double 0x7FF8000DEAD00000>
+;
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double undef, double 0x7ff8000dead00000>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+define <2 x double> @minimum_nan_op1_vec(<2 x double> %x) {
+; CHECK-LABEL: @minimum_nan_op1_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF800DEAD00DEAD, double 0x7FF800DEAD00DEAD>
+;
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0x7ff800dead00dead, double 0x7ff800dead00dead>)
+  ret <2 x double> %r
+}
+
+define float @maximum_undef_op1(float %x) {
+; CHECK-LABEL: @maximum_undef_op1(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.maximum.f32(float %x, float undef)
+  ret float %val
+}
+
+define float @maximum_undef_op0(float %x) {
+; CHECK-LABEL: @maximum_undef_op0(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.maximum.f32(float undef, float %x)
+  ret float %val
+}
+
+define float @minimum_undef_op1(float %x) {
+; CHECK-LABEL: @minimum_undef_op1(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.minimum.f32(float %x, float undef)
+  ret float %val
+}
+
+define float @minimum_undef_op0(float %x) {
+; CHECK-LABEL: @minimum_undef_op0(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %val = call float @llvm.minimum.f32(float undef, float %x)
+  ret float %val
+}
+
+define float @minimum_undef_undef(float %x) {
+; CHECK-LABEL: @minimum_undef_undef(
+; CHECK-NEXT:    ret float undef
+;
+  %val = call float @llvm.minimum.f32(float undef, float undef)
+  ret float %val
+}
+
+define float @maximum_undef_undef(float %x) {
+; CHECK-LABEL: @maximum_undef_undef(
+; CHECK-NEXT:    ret float undef
+;
+  %val = call float @llvm.maximum.f32(float undef, float undef)
+  ret float %val
+}
+
+define float @minimum_same_args(float %x) {
+; CHECK-LABEL: @minimum_same_args(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %y = call float @llvm.minimum.f32(float %x, float %x)
+  ret float %y
+}
+
+define float @maximum_same_args(float %x) {
+; CHECK-LABEL: @maximum_same_args(
+; CHECK-NEXT:    ret float [[X:%.*]]
+;
+  %y = call float @llvm.maximum.f32(float %x, float %x)
+  ret float %y
+}
+
+define float @minimum_x_minimum_x_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %x, float %a)
+  ret float %b
+}
+
+define float @minimum_y_minimum_x_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_y_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %y, float %a)
+  ret float %b
+}
+
+define float @minimum_x_y_minimum_x(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_y_minimum_x(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %a, float %x)
+  ret float %b
+}
+
+define float @minimum_x_y_minimum_y(float %x, float %y) {
+; CHECK-LABEL: @minimum_x_y_minimum_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %a, float %y)
+  ret float %b
+}
+
+; negative test
+
+define float @minimum_z_minimum_x_y(float %x, float %y, float %z) {
+; CHECK-LABEL: @minimum_z_minimum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %z, float %a)
+  ret float %b
+}
+
+; negative test
+
+define float @minimum_x_y_minimum_z(float %x, float %y, float %z) {
+; CHECK-LABEL: @minimum_x_y_minimum_z(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.minimum.f32(float %x, float %y)
+  %b = call float @llvm.minimum.f32(float %a, float %z)
+  ret float %b
+}
+
+; minimum(X, -INF) --> -INF
+
+define float @minimum_neginf(float %x) {
+; CHECK-LABEL: @minimum_neginf(
+; CHECK-NEXT:    ret float 0xFFF0000000000000
+;
+  %val = call float @llvm.minimum.f32(float %x, float 0xFFF0000000000000)
+  ret float %val
+}
+
+define <2 x double> @minimum_neginf_commute_vec(<2 x double> %x) {
+; CHECK-LABEL: @minimum_neginf_commute_vec(
+; CHECK-NEXT:    ret <2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>
+;
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000>, <2 x double> %x)
+  ret <2 x double> %r
+}
+
+; negative test
+
+define float @minimum_inf(float %x) {
+; CHECK-LABEL: @minimum_inf(
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.minimum.f32(float 0x7FF0000000000000, float [[X:%.*]])
+; CHECK-NEXT:    ret float [[VAL]]
+;
+  %val = call float @llvm.minimum.f32(float 0x7FF0000000000000, float %x)
+  ret float %val
+}
+define float @maximum_x_maximum_x_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %x, float %a)
+  ret float %b
+}
+
+define float @maximum_y_maximum_x_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_y_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %y, float %a)
+  ret float %b
+}
+
+define float @maximum_x_y_maximum_x(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_y_maximum_x(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %a, float %x)
+  ret float %b
+}
+
+define float @maximum_x_y_maximum_y(float %x, float %y) {
+; CHECK-LABEL: @maximum_x_y_maximum_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    ret float [[A]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %a, float %y)
+  ret float %b
+}
+
+; negative test
+
+define float @maximum_z_maximum_x_y(float %x, float %y, float %z) {
+; CHECK-LABEL: @maximum_z_maximum_x_y(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[A]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %z, float %a)
+  ret float %b
+}
+
+; negative test
+
+define float @maximum_x_y_maximum_z(float %x, float %y, float %z) {
+; CHECK-LABEL: @maximum_x_y_maximum_z(
+; CHECK-NEXT:    [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[Z:%.*]])
+; CHECK-NEXT:    ret float [[B]]
+;
+  %a = call float @llvm.maximum.f32(float %x, float %y)
+  %b = call float @llvm.maximum.f32(float %a, float %z)
+  ret float %b
+}
+
+; maximum(X, INF) --> INF
+
+define <2 x double> @maximum_inf(<2 x double> %x) {
+; CHECK-LABEL: @maximum_inf(
+; CHECK-NEXT:    ret <2 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000>
+;
+  %val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double><double 0x7FF0000000000000, double 0x7FF0000000000000>)
+  ret <2 x double> %val
+}
+
+define float @maximum_inf_commute(float %x) {
+; CHECK-LABEL: @maximum_inf_commute(
+; CHECK-NEXT:    ret float 0x7FF0000000000000
+;
+  %val = call float @llvm.maximum.f32(float 0x7FF0000000000000, float %x)
+  ret float %val
+}
+
 ; Y - (Y - X) --> X
 
 define float @fsub_fsub_common_op(float %x, float %y) {
@@ -951,4 +1267,3 @@ define float @fsub_fadd_common_op_wrong_commute_commute(float %x, float %y) {
   %r = fadd reassoc nsz float %s, %y
   ret float %r
 }
-
diff --git a/test/Transforms/InstSimplify/floating-point-compare.ll b/test/Transforms/InstSimplify/floating-point-compare.ll
index bc5c58a698e3c96970e9597d761aedf73104bad8..14e6ccee7b274c3ef4d6f000f7c35303f52c0421 100644
--- a/test/Transforms/InstSimplify/floating-point-compare.ll
+++ b/test/Transforms/InstSimplify/floating-point-compare.ll
@@ -179,6 +179,7 @@ declare double @llvm.powi.f64(double,i32)
 declare float @llvm.exp.f32(float)
 declare float @llvm.minnum.f32(float, float)
 declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
 declare double @llvm.exp2.f64(double)
 declare float @llvm.fma.f32(float,float,float)
 
@@ -233,13 +234,148 @@ define i1 @orderedLessZeroPowi(double,double) {
   ret i1 %olt
 }
 
-define i1 @orderedLessZeroUIToFP(i32) {
-; CHECK-LABEL: @orderedLessZeroUIToFP(
+define i1 @UIToFP_is_nan_or_positive_or_zero(i32 %x) {
+; CHECK-LABEL: @UIToFP_is_nan_or_positive_or_zero(
 ; CHECK-NEXT:    ret i1 true
 ;
-  %a = uitofp i32 %0 to float
-  %uge = fcmp uge float %a, 0.000000e+00
-  ret i1 %uge
+  %a = uitofp i32 %x to float
+  %r = fcmp uge float %a, 0.000000e+00
+  ret i1 %r
+}
+
+define <2 x i1> @UIToFP_is_nan_or_positive_or_zero_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_is_nan_or_positive_or_zero_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %r = fcmp uge <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
+}
+
+define i1 @UIToFP_nnan_is_positive_or_zero(i32 %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_positive_or_zero(
+; CHECK-NEXT:    ret i1 true
+;
+  %a = uitofp i32 %x to float
+  %r = fcmp nnan oge float %a, 0.000000e+00
+  ret i1 %r
+}
+
+define <2 x i1> @UIToFP_nnan_is_positive_or_zero_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_positive_or_zero_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %r = fcmp nnan oge <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
+}
+
+define i1 @UIToFP_is_not_negative(i32 %x) {
+; CHECK-LABEL: @UIToFP_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %a = uitofp i32 %x to float
+  %r = fcmp olt float %a, 0.000000e+00
+  ret i1 %r
+}
+
+define <2 x i1> @UIToFP_is_not_negative_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_is_not_negative_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %r = fcmp olt <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
+}
+
+define i1 @UIToFP_nnan_is_not_negative(i32 %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %a = uitofp i32 %x to float
+  %r = fcmp nnan ult float %a, 0.000000e+00
+  ret i1 %r
+}
+
+define <2 x i1> @UIToFP_nnan_is_not_negative_vec(<2 x i32> %x) {
+; CHECK-LABEL: @UIToFP_nnan_is_not_negative_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %a = uitofp <2 x i32> %x to <2 x float>
+  %r = fcmp nnan ult <2 x float> %a, zeroinitializer
+  ret <2 x i1> %r
+}
+
+define i1 @fabs_is_nan_or_positive_or_zero(double %x) {
+; CHECK-LABEL: @fabs_is_nan_or_positive_or_zero(
+; CHECK-NEXT:    ret i1 true
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp uge double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_is_nan_or_positive_or_zero_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_is_nan_or_positive_or_zero_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp uge <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @fabs_nnan_is_positive_or_zero(double %x) {
+; CHECK-LABEL: @fabs_nnan_is_positive_or_zero(
+; CHECK-NEXT:    ret i1 true
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp nnan oge double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_nnan_is_positive_or_zero_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_nnan_is_positive_or_zero_vec(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp nnan oge <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @fabs_is_not_negative(double %x) {
+; CHECK-LABEL: @fabs_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp olt double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_is_not_negative_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_is_not_negative_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp olt <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+define i1 @fabs_nnan_is_not_negative(double %x) {
+; CHECK-LABEL: @fabs_nnan_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %fabs = tail call double @llvm.fabs.f64(double %x)
+  %cmp = fcmp nnan ult double %fabs, 0.0
+  ret i1 %cmp
+}
+
+define <2 x i1> @fabs_nnan_is_not_negative_vec(<2 x double> %x) {
+; CHECK-LABEL: @fabs_nnan_is_not_negative_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %fabs = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
+  %cmp = fcmp nnan ult <2 x double> %fabs, zeroinitializer
+  ret <2 x i1> %cmp
 }
 
 define i1 @orderedLessZeroSelect(float, float) {
@@ -282,6 +418,18 @@ define i1 @orderedLessZeroMaxNum(float, float) {
   ret i1 %uge
 }
 
+; But using maximum, we can simplify, since the NaN would be propagated
+
+define i1 @orderedLessZeroMaximum(float, float) {
+; CHECK-LABEL: @orderedLessZeroMaximum(
+; CHECK-NEXT:    ret i1 true
+;
+  %a = call float @llvm.exp.f32(float %0)
+  %b = call float @llvm.maximum.f32(float %a, float %1)
+  %uge = fcmp uge float %b, 0.000000e+00
+  ret i1 %uge
+}
+
 define i1 @known_positive_olt_with_negative_constant(double %a) {
 ; CHECK-LABEL: @known_positive_olt_with_negative_constant(
 ; CHECK-NEXT:    ret i1 false
@@ -375,4 +523,3 @@ define <2 x i1> @unorderedCompareWithNaNVector_undef_elt(<2 x double> %A) {
   %cmp = fcmp ult <2 x double> %A, <double undef, double 0xFFFFFFFFFFFFFFFF>
   ret <2 x i1> %cmp
 }
-
diff --git a/test/Transforms/InstSimplify/icmp-abs-nabs.ll b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..41ffc33ef57f1df40b12ec92a77e5f48214e9429
--- /dev/null
+++ b/test/Transforms/InstSimplify/icmp-abs-nabs.ll
@@ -0,0 +1,376 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; This is canonical form for this IR.
+
+define i1 @abs_nsw_is_positive(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, -1
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of abs().
+
+define i1 @abs_nsw_is_positive_sge(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_sge(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sge i32 %abs, 0
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any negative constant works.
+
+define i1 @abs_nsw_is_positive_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_reduced_range(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, -42
+  ret i1 %r
+}
+
+; Negative test - we need 'nsw' in the abs().
+
+define i1 @abs_is_positive_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_is_positive_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], 42
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, 42
+  ret i1 %r
+}
+
+; Negative test - range intersection is not subset.
+
+define i1 @abs_nsw_is_positive_wrong_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[ABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sgt i32 %abs, 0
+  ret i1 %r
+}
+
+; This is canonical form for this IR.
+
+define i1 @abs_nsw_is_not_negative(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp slt i32 %abs, 0
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of abs().
+
+define i1 @abs_nsw_is_not_negative_sle(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_sle(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sle i32 %abs, -1
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any negative constant works.
+
+define i1 @abs_nsw_is_not_negative_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_reduced_range(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp slt i32 %abs, -24
+  ret i1 %r
+}
+
+; Negative test - we need 'nsw' in the abs().
+
+define i1 @abs_is_not_negative_reduced_range(i32 %x) {
+; CHECK-LABEL: @abs_is_not_negative_reduced_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[ABS]], 42
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp slt i32 %abs, 42
+  ret i1 %r
+}
+
+; Negative test - range intersection is not empty.
+
+define i1 @abs_nsw_is_not_negative_wrong_range(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[NEGX:%.*]] = sub nsw i32 0, [[X]]
+; CHECK-NEXT:    [[ABS:%.*]] = select i1 [[CMP]], i32 [[NEGX]], i32 [[X]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sle i32 [[ABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp sle i32 %abs, 0
+  ret i1 %r
+}
+
+; This is canonical form for this IR. For nabs(), we don't require 'nsw'
+
+define i1 @nabs_is_negative_or_0(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp slt i32 %nabs, 1
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of nabs().
+
+define i1 @nabs_is_negative_or_0_sle(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_sle(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sle i32 %nabs, 0
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any positive constant works.
+
+define i1 @nabs_is_negative_or_0_reduced_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_reduced_range(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp slt i32 %nabs, 421
+  ret i1 %r
+}
+
+; Negative test - range intersection is not subset.
+
+define i1 @nabs_is_negative_or_0_wrong_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp slt i32 [[NABS]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp slt i32 %nabs, 0
+  ret i1 %r
+}
+
+; This is canonical form for this IR. For nabs(), we don't require 'nsw'
+
+define i1 @nabs_is_not_over_0(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 0
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sgt i32 %nabs, 0
+  ret i1 %r
+}
+
+; Test non-canonical predicate and non-canonical form of nabs().
+
+define i1 @nabs_is_not_over_0_sle(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_sle(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sge i32 %nabs, 1
+  ret i1 %r
+}
+
+; This is a range-based analysis. Any positive constant works.
+
+define i1 @nabs_is_not_over_0_reduced_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_reduced_range(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sgt i32 %nabs, 4223
+  ret i1 %r
+}
+
+; Negative test - range intersection is not subset.
+
+define i1 @nabs_is_not_over_0_wrong_range(i32 %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_wrong_range(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, [[X]]
+; CHECK-NEXT:    [[NABS:%.*]] = select i1 [[CMP]], i32 [[X]], i32 [[NEGX]]
+; CHECK-NEXT:    [[R:%.*]] = icmp sgt i32 [[NABS]], -1
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub i32 0, %x
+  %nabs = select i1 %cmp, i32 %x, i32 %negx
+  %r = icmp sgt i32 %nabs, -1
+  ret i1 %r
+}
+
+; More miscellaneous tests for predicates/types.
+
+; Equality predicates are ok.
+
+define i1 @abs_nsw_is_positive_eq(i32 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_eq(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i32 %x, 1
+  %negx = sub nsw i32 0, %x
+  %abs = select i1 %cmp, i32 %negx, i32 %x
+  %r = icmp eq i32 %abs, -8
+  ret i1 %r
+}
+
+; An unsigned compare may work.
+
+define i1 @abs_nsw_is_positive_ult(i8 %x) {
+; CHECK-LABEL: @abs_nsw_is_positive_ult(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i8 %x, 0
+  %negx = sub nsw i8 0, %x
+  %abs = select i1 %cmp, i8 %negx, i8 %x
+  %r = icmp ult i8 %abs, 139
+  ret i1 %r
+}
+
+; An unsigned compare may work.
+
+define i1 @abs_nsw_is_not_negative_ugt(i8 %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_ugt(
+; CHECK-NEXT:    ret i1 false
+;
+  %cmp = icmp slt i8 %x, 0
+  %negx = sub nsw i8 0, %x
+  %abs = select i1 %cmp, i8 %negx, i8 %x
+  %r = icmp ugt i8 %abs, 127
+  ret i1 %r
+}
+
+; Vector types are ok.
+
+define <2 x i1> @abs_nsw_is_not_negative_vec_splat(<2 x i32> %x) {
+; CHECK-LABEL: @abs_nsw_is_not_negative_vec_splat(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %cmp = icmp slt <2 x i32> %x, zeroinitializer
+  %negx = sub nsw <2 x i32> zeroinitializer, %x
+  %abs = select <2 x i1> %cmp, <2 x i32> %negx, <2 x i32> %x
+  %r = icmp slt <2 x i32> %abs, <i32 -8, i32 -8>
+  ret <2 x i1> %r
+}
+
+; Equality predicates are ok.
+
+define i1 @nabs_is_negative_or_0_ne(i8 %x) {
+; CHECK-LABEL: @nabs_is_negative_or_0_ne(
+; CHECK-NEXT:    ret i1 true
+;
+  %cmp = icmp slt i8 %x, 0
+  %negx = sub i8 0, %x
+  %nabs = select i1 %cmp, i8 %x, i8 %negx
+  %r = icmp ne i8 %nabs, 12
+  ret i1 %r
+}
+
+; Vector types are ok.
+
+define <3 x i1> @nabs_is_not_over_0_sle_vec_splat(<3 x i33> %x) {
+; CHECK-LABEL: @nabs_is_not_over_0_sle_vec_splat(
+; CHECK-NEXT:    ret <3 x i1> zeroinitializer
+;
+  %cmp = icmp slt <3 x i33> %x, <i33 1, i33 1, i33 1>
+  %negx = sub <3 x i33> zeroinitializer, %x
+  %nabs = select <3 x i1> %cmp, <3 x i33> %x, <3 x i33> %negx
+  %r = icmp sge <3 x i33> %nabs, <i33 1, i33 1, i33 1>
+  ret <3 x i1> %r
+}
+
+; Negative test - intersection does not equal absolute value range.
+; PR39510 - https://bugs.llvm.org/show_bug.cgi?id=39510
+
+define i1 @abs_no_intersection(i32 %a) {
+; CHECK-LABEL: @abs_no_intersection(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[A]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i32 [[COND]], 2
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp slt i32 %a, 0
+  %sub = sub nsw i32 0, %a
+  %cond = select i1 %cmp, i32 %sub, i32 %a
+  %r = icmp ne i32 %cond, 2
+  ret i1 %r
+}
+
+; Negative test - intersection does not equal absolute value range.
+
+define i1 @nabs_no_intersection(i32 %a) {
+; CHECK-LABEL: @nabs_no_intersection(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[A]]
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i32 [[COND]], -2
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp sgt i32 %a, 0
+  %sub = sub i32 0, %a
+  %cond = select i1 %cmp, i32 %sub, i32 %a
+  %r = icmp ne i32 %cond, -2
+  ret i1 %r
+}
+
diff --git a/test/Transforms/InstSimplify/sub.ll b/test/Transforms/InstSimplify/sub.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4e2064527c41fc66cc7b5806f33d9b2fb1c24d1b
--- /dev/null
+++ b/test/Transforms/InstSimplify/sub.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i32 @sub_self(i32 %A) {
+; CHECK-LABEL: @sub_self(
+; CHECK-NEXT:    ret i32 0
+;
+  %B = sub i32 %A, %A
+  ret i32 %B
+}
+
+define <2 x i32> @sub_self_vec(<2 x i32> %A) {
+; CHECK-LABEL: @sub_self_vec(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %B = sub <2 x i32> %A, %A
+  ret <2 x i32> %B
+}
+
+define i32 @sub_zero(i32 %A) {
+; CHECK-LABEL: @sub_zero(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
+  %B = sub i32 %A, 0
+  ret i32 %B
+}
+
+define <2 x i32> @sub_zero_vec(<2 x i32> %A) {
+; CHECK-LABEL: @sub_zero_vec(
+; CHECK-NEXT:    ret <2 x i32> [[A:%.*]]
+;
+  %B = sub <2 x i32> %A, <i32 0, i32 undef>
+  ret <2 x i32> %B
+}
+
+define i32 @neg_neg(i32 %A) {
+; CHECK-LABEL: @neg_neg(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
+  %B = sub i32 0, %A
+  %C = sub i32 0, %B
+  ret i32 %C
+}
+
+define <2 x i32> @neg_neg_vec(<2 x i32> %A) {
+; CHECK-LABEL: @neg_neg_vec(
+; CHECK-NEXT:    ret <2 x i32> [[A:%.*]]
+;
+  %B = sub <2 x i32> <i32 0, i32 undef>, %A
+  %C = sub <2 x i32> <i32 0, i32 undef>, %B
+  ret <2 x i32> %C
+}
+
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
index 3606e796cdd5f16f73e7ffb618590257370d7611..1156f39d4a2480aa470eb03ad4d0706c744f5238 100644
--- a/test/Transforms/JumpThreading/thread-loads.ll
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -246,13 +246,15 @@ bb3:
   ret i32 %res.0
 }
 
-; Make sure we merge the aliasing metadata. (If we don't, we have a load
-; with the wrong metadata, so the branch gets incorrectly eliminated.)
+; Make sure we merge the aliasing metadata. We keep the range metadata for the
+; first load, as it dominates the second load. Hence we can eliminate the
+; branch.
 define void @test8(i32*, i32*, i32*) {
 ; CHECK-LABEL: @test8(
-; CHECK: %a = load i32, i32* %0, !range !4
+; CHECK: %a = load i32, i32* %0, !range ![[RANGE4:[0-9]+]]
 ; CHECK-NEXT: store i32 %a
-; CHECK: br i1 %c
+; CHECK-NEXT: %xxx = tail call i32 (...) @f1()
+; CHECK-NEXT: ret void
   %a = load i32, i32* %0, !tbaa !0, !range !4, !alias.scope !9, !noalias !10
   %b = load i32, i32* %0, !range !5
   store i32 %a, i32* %1
@@ -525,6 +527,8 @@ right_x:
   ret i32 10
 }
 
+; CHECK: ![[RANGE4]] = !{i32 0, i32 1}
+
 !0 = !{!3, !3, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
new file mode 100644
index 0000000000000000000000000000000000000000..563a75f407f0e38ff725fc18d6bdc58440dacda6
--- /dev/null
+++ b/test/Transforms/LCSSA/rewrite-existing-dbg-values.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -lcssa < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Reproducer for PR39019.
+;
+; Verify that the llvm.dbg.value in the %for.cond.cleanup2 block is rewritten
+; to use the PHI node for %add that is created by LCSSA.
+
+; CHECK-LABEL: for.cond.cleanup2:
+; CHECK-NEXT: [[PN:%[^ ]*]] = phi i32 [ %add.lcssa, %for.cond.cleanup1 ]
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[PN]], metadata [[VAR:![0-9]+]], metadata !DIExpression())
+; CHECK-NEXT: call void @bar(i32 [[PN]])
+
+; CHECK-LABEL: for.body:
+; CHECK: %add = add nsw i32 0, 2
+; CHECK: call void @llvm.dbg.value(metadata i32 %add, metadata [[VAR]], metadata !DIExpression())
+
+; CHECK: [[VAR]] = !DILocalVariable(name: "sum",
+
+; Function Attrs: nounwind
+define void @foo() #0 !dbg !6 {
+entry:
+  br label %for.cond.preheader, !dbg !12
+
+for.cond.preheader:                               ; preds = %for.cond.cleanup1, %entry
+  br label %for.body, !dbg !12
+
+for.cond.cleanup2:                                ; preds = %for.cond.cleanup1
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12
+  tail call void @bar(i32 %add) #0, !dbg !12
+  ret void, !dbg !12
+
+for.cond.cleanup1:                                ; preds = %for.body
+  br i1 false, label %for.cond.preheader, label %for.cond.cleanup2, !dbg !12
+
+for.body:                                         ; preds = %for.body, %for.cond.preheader
+  %add = add nsw i32 0, 2, !dbg !12
+  call void @llvm.dbg.value(metadata i32 %add, metadata !9, metadata !DIExpression()), !dbg !12
+  br i1 false, label %for.body, label %for.cond.cleanup1, !dbg !12
+}
+
+; Function Attrs: nounwind
+declare void @bar(i32) #0
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, nameTableKind: None)
+!1 = !DIFile(filename: "foo.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 8.0.0"}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 10, type: !7, isLocal: false, isDefinition: true, scopeLine: 10, isOptimized: true, unit: !0, retainedNodes: !8)
+!7 = !DISubroutineType(types: !2)
+!8 = !{!9}
+!9 = !DILocalVariable(name: "sum", scope: !10, file: !1, line: 11, type: !11)
+!10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 0)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DILocation(line: 0, scope: !10)
diff --git a/test/Transforms/LICM/guards.ll b/test/Transforms/LICM/guards.ll
index b2f672104f8d1f2f94f3b11a36a77f4bbba47b22..b37c418928406f133f1803387d225e5cbe555288 100644
--- a/test/Transforms/LICM/guards.ll
+++ b/test/Transforms/LICM/guards.ll
@@ -85,15 +85,15 @@ loop:
 }
 
 
-; TODO: We can also hoist this load and guard from mustexec non-header block.
+; TODO: We can also hoist this guard from mustexec non-header block.
 define void @test4(i1 %c, i32* %p) {
 
 ; CHECK-LABEL: @test4(
 ; CHECK-LABEL: entry:
-; CHECK-LABEL: loop:
-; CHECK-LABEL: backedge:
 ; CHECK:       %a = load i32, i32* %p
 ; CHECK:       %invariant_cond = icmp ne i32 %a, 100
+; CHECK-LABEL: loop:
+; CHECK-LABEL: backedge:
 ; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %invariant_cond)
 
 entry:
diff --git a/test/Transforms/LICM/hoist-mustexec.ll b/test/Transforms/LICM/hoist-mustexec.ll
index 5bce1fbce1c8b95837670a8f06141c69072afe9d..53f78e88f72beea51c1a5c1b792cfec9d96d318b 100644
--- a/test/Transforms/LICM/hoist-mustexec.ll
+++ b/test/Transforms/LICM/hoist-mustexec.ll
@@ -456,3 +456,150 @@ backedge:
 exit:
   ret void
 }
+
+; Check that we can hoist a mustexecute load from backedge even if something
+; throws after it.
+define void @test_hoist_from_backedge_01(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_01(
+; CHECK:       entry:
+; CHECK-NEXT:  %load = load i32, i32* %p
+; CHECK-NOT:   load i32
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  call void @may_throw()
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't hoist the load if something before it can throw.
+define void @test_hoist_from_backedge_02(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_02(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  call void @may_throw()
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_backedge_03(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_03(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  call void @may_throw()
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_hoist_from_backedge_04(i32* %p, i32 %n) {
+
+; CHECK-LABEL: @test_hoist_from_backedge_04(
+; CHECK:       entry:
+; CHECK:       loop:
+; CHECK:       %load = load i32, i32* %p
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %dummy = phi i32 [ 0, %entry ], [ %merge, %backedge ]
+  call void @may_throw()
+  %cond = icmp slt i32 %iv, %n
+  br i1 %cond, label %if.true, label %if.false
+
+if.true:
+  %a = add i32 %iv, %iv
+  br label %backedge
+
+if.false:
+  %b = mul i32 %iv, %iv
+  br label %backedge
+
+backedge:
+  %merge = phi i32 [ %a, %if.true ], [ %b, %if.false ]
+  %iv.next = add i32 %iv, %merge
+  %load = load i32, i32* %p
+  %loop.cond = icmp ult i32 %iv.next, %load
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/LICM/hoist-nounwind.ll b/test/Transforms/LICM/hoist-nounwind.ll
index 9fc4903b83020cf9336ee2a843067f199da81a85..d53e4043af14086b6becec3b1f108e9b517fc80b 100644
--- a/test/Transforms/LICM/hoist-nounwind.ll
+++ b/test/Transforms/LICM/hoist-nounwind.ll
@@ -49,14 +49,16 @@ for.cond.cleanup:
   ret i32 0
 }
 
-; Don't hoist load past volatile load.
+; Hoist a non-volatile load past volatile load.
 define i32 @test3(i32* noalias nocapture readonly %a, i32* %v) nounwind uwtable {
 ; CHECK-LABEL: @test3(
 entry:
   br label %for.body
 
+; CHECK: load i32
+; CHECK: for.body:
 ; CHECK: load volatile i32
-; CHECK-NEXT: load i32
+; CHECK-NOT: load
 for.body:
   %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
   %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
@@ -70,3 +72,26 @@ for.body:
 for.cond.cleanup:
   ret i32 %add
 }
+
+; Don't a volatile load past volatile load.
+define i32 @test4(i32* noalias nocapture readonly %a, i32* %v) nounwind uwtable {
+; CHECK-LABEL: @test4(
+entry:
+  br label %for.body
+
+; CHECK: for.body:
+; CHECK: load volatile i32
+; CHECK-NEXT: load volatile i32
+for.body:
+  %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %x.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %xxx = load volatile i32, i32* %v, align 4
+  %i1 = load volatile i32, i32* %a, align 4
+  %add = add nsw i32 %i1, %x.05
+  %inc = add nuw nsw i32 %i.06, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %add
+}
\ No newline at end of file
diff --git a/test/Transforms/LICM/hoist-round.ll b/test/Transforms/LICM/hoist-round.ll
index 87a7050668defb4200e76c8b38f977b85ed78175..35851f39d2596000a4d4e7b7f87991a285921a98 100644
--- a/test/Transforms/LICM/hoist-round.ll
+++ b/test/Transforms/LICM/hoist-round.ll
@@ -4,8 +4,8 @@
 target datalayout = "E-m:e-p:32:32-i8:8:8-i16:16:16-i64:32:32-f64:32:32-v64:32:32-v128:32:32-a0:0:32-n32"
 
 ; This test verifies that ceil, floor, nearbyint, trunc, rint, round,
-; copysign, minnum, maxnum and fabs intrinsics are considered safe
-; to speculate.
+; copysign, minnum, maxnum, minimum, maximum, and fabs intrinsics are
+; considered safe to speculate.
 
 ; CHECK-LABEL: @test
 ; CHECK: call float @llvm.ceil.f32
@@ -41,8 +41,10 @@ for.body:
   %tmp.8 = call float @llvm.copysign.f32(float %tmp.7, float %arg2)
   %tmp.9 = call float @llvm.minnum.f32(float %tmp.8, float %arg2)
   %tmp.10 = call float @llvm.maxnum.f32(float %tmp.9, float %arg2)
-  %tmp.11 = call float @llvm.powi.f32(float %tmp.10, i32 4)
-  call void @consume(float %tmp.11)
+  %tmp.11 = call float @llvm.minimum.f32(float %tmp.10, float %arg2)
+  %tmp.12 = call float @llvm.maximum.f32(float %tmp.11, float %arg2)
+  %tmp.13 = call float @llvm.powi.f32(float %tmp.12, i32 4)
+  call void @consume(float %tmp.13)
   %IND.new = add i32 %IND, 1
   br label %for.head
 
@@ -62,4 +64,6 @@ declare float @llvm.fabs.f32(float)
 declare float @llvm.copysign.f32(float, float)
 declare float @llvm.minnum.f32(float, float)
 declare float @llvm.maxnum.f32(float, float)
+declare float @llvm.minimum.f32(float, float)
+declare float @llvm.maximum.f32(float, float)
 declare float @llvm.powi.f32(float, i32)
diff --git a/test/Transforms/LICM/loopsink-pr39570.ll b/test/Transforms/LICM/loopsink-pr39570.ll
new file mode 100644
index 0000000000000000000000000000000000000000..65d3e1f513959ab41c125b80e7ee3de6042ae3b5
--- /dev/null
+++ b/test/Transforms/LICM/loopsink-pr39570.ll
@@ -0,0 +1,112 @@
+; RUN: opt -S -loop-sink < %s | FileCheck %s
+
+; CHECK: pr39570
+; Make sure not to assert.
+
+%0 = type { i32, %1*, %2, %6*, %33* }
+%1 = type { i32 (...)** }
+%2 = type { %3* }
+%3 = type { %4, i32, %5* }
+%4 = type { i32 (...)**, i32 }
+%5 = type opaque
+%6 = type { %7, %1*, %31*, i8, %2, %32* }
+%7 = type <{ %8, %9*, %10, i32, %33*, %33*, %33*, %27, %28, i16 }>
+%8 = type { i32 (...)** }
+%9 = type opaque
+%10 = type { %11, %16, %18, %19 }
+%11 = type { %12*, i32, i32, %13* }
+%12 = type { i32 (...)** }
+%13 = type { %14*, %14* }
+%14 = type { %15, i32 }
+%15 = type { %12*, i32, i32, i16* }
+%16 = type { %12*, i32, i32, %17* }
+%17 = type { %13, %14* }
+%18 = type { %12*, i32, i32, %14** }
+%19 = type { %20, %21, %12*, float, i32, i32, %22, %22, %24, i32, i32 }
+%20 = type { i8 }
+%21 = type { i8 }
+%22 = type { %12*, %23*, %23* }
+%23 = type opaque
+%24 = type { %12*, i32, i32, %25* }
+%25 = type { %12*, i32, i32, %26* }
+%26 = type opaque
+%27 = type { %33* }
+%28 = type { %29, i32, i32, %14* }
+%29 = type { %30 }
+%30 = type { i32 (...)** }
+%31 = type opaque
+%32 = type { i32 (...)** }
+%33 = type <{ %8, %9*, %10, i32, %33*, %33*, %33*, %27, %28, i16, [2 x i8] }>
+
+define dso_local void @pr39570() local_unnamed_addr align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !prof !1 {
+  br i1 undef, label %8, label %1, !prof !2
+
+; <label>:1:                                      ; preds = %0
+  %2 = load %0*, %0** undef, align 4
+  br label %3
+
+; <label>:3:                                      ; preds = %7, %1
+  %4 = getelementptr inbounds %0, %0* %2, i32 undef, i32 0
+  br label %5
+
+; <label>:5:                                      ; preds = %3
+  %6 = getelementptr inbounds %0, %0* %2, i32 undef, i32 4
+  br i1 undef, label %18, label %7, !prof !3
+
+; <label>:7:                                      ; preds = %5
+  br label %3
+
+; <label>:8:                                      ; preds = %0
+  invoke void @baz()
+          to label %9 unwind label %12
+
+; <label>:9:                                      ; preds = %8
+  invoke void @bar()
+          to label %17 unwind label %10
+
+; <label>:10:                                     ; preds = %9
+  %11 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:12:                                     ; preds = %8
+  %13 = landingpad { i8*, i32 }
+          cleanup
+  invoke void @bar()
+          to label %16 unwind label %14
+
+; <label>:14:                                     ; preds = %12
+  %15 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:16:                                     ; preds = %12
+  resume { i8*, i32 } %13
+
+; <label>:17:                                     ; preds = %9
+  br label %18
+
+; <label>:18:                                     ; preds = %17, %5
+  invoke void @baz()
+          to label %19 unwind label %20
+
+; <label>:19:                                     ; preds = %18
+  invoke void @bar()
+          to label %22 unwind label %20
+
+; <label>:20:                                     ; preds = %19
+  %21 = landingpad { i8*, i32 }
+          catch i8* null
+  unreachable
+
+; <label>:22:                                     ; preds = %19
+  ret void
+}
+
+declare dso_local i32 @__gxx_personality_v0(...)
+declare dso_local void @bar() local_unnamed_addr
+declare dso_local void @baz() local_unnamed_addr align 2
+
+!1 = !{!"function_entry_count", i64 0}
+!2 = !{!"branch_weights", i32 1, i32 3215551}
+!3 = !{!"branch_weights", i32 3215551, i32 1}
diff --git a/test/Transforms/LICM/preheader-safe.ll b/test/Transforms/LICM/preheader-safe.ll
index 0bfe123862c67ed707f67f8e1d2e55dd1848657f..03a7258df117c0254339820453de908df9428724 100644
--- a/test/Transforms/LICM/preheader-safe.ll
+++ b/test/Transforms/LICM/preheader-safe.ll
@@ -112,11 +112,31 @@ loop-if:
 exit:
   ret void
 }
+
+; Positive test - can hoist something that happens before thrower.
+define void @nothrow_header_pos(i64 %x, i64 %y, i1 %cond) {
+; CHECK-LABEL: nothrow_header_pos
+; CHECK-LABEL: entry
+; CHECK: %div = udiv i64 %x, %y
+; CHECK-LABEL: loop
+; CHECK: call void @use(i64 %div)
+entry:
+  br label %loop
+loop:                                         ; preds = %entry, %for.inc
+  br label %loop-if
+loop-if:
+  %div = udiv i64 %x, %y
+  call void @use(i64 %div)
+  br label %loop
+}
+
+
 ; Negative test - can't move out of throwing block
 define void @nothrow_header_neg(i64 %x, i64 %y, i1 %cond) {
 ; CHECK-LABEL: nothrow_header_neg
 ; CHECK-LABEL: entry
 ; CHECK-LABEL: loop
+; CHECK: call void @maythrow()
 ; CHECK: %div = udiv i64 %x, %y
 ; CHECK: call void @use(i64 %div)
 entry:
@@ -124,6 +144,7 @@ entry:
 loop:                                         ; preds = %entry, %for.inc
   br label %loop-if
 loop-if:
+  call void @maythrow()
   %div = udiv i64 %x, %y
   call void @use(i64 %div)
   br label %loop
diff --git a/test/Transforms/LoopInterchange/inner-only-reductions.ll b/test/Transforms/LoopInterchange/inner-only-reductions.ll
new file mode 100644
index 0000000000000000000000000000000000000000..74543fb16472540f2fef41a1424c3355f4d01eb0
--- /dev/null
+++ b/test/Transforms/LoopInterchange/inner-only-reductions.ll
@@ -0,0 +1,124 @@
+; RUN: opt < %s -basicaa -loop-interchange -pass-remarks-missed='loop-interchange' -pass-remarks-output=%t -S \
+; RUN:     -verify-dom-info -verify-loop-info 2>&1 | FileCheck -check-prefix=IR %s
+; RUN: FileCheck --input-file=%t %s
+
+; Inner loop only reductions are not supported currently. See discussion at
+; D53027 for more information on the required checks.
+
+@A = common global [500 x [500 x i32]] zeroinitializer
+@X = common global i32 0
+@B = common global [500 x [500 x i32]] zeroinitializer
+@Y = common global i32 0
+
+;; global X
+
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++)
+;;      X+=A[j][i];
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHI
+; CHECK-NEXT: Function:        reduction_01
+
+; IR-LABEL: @reduction_01(
+; IR-NOT: split
+
+define void @reduction_01(i32 %N) {
+entry:
+  %cmp16 = icmp sgt i32 %N, 1
+  br i1 %cmp16, label %for.body3.lr.ph, label %for.end8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %add15 = phi i32 [ %X.promoted, %for.body3.lr.ph ], [ %add, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
+  %0 = load i32, i32* %arrayidx5
+  %add = add nsw i32 %add15, %0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body3
+
+for.cond1.for.inc6_crit_edge:                     ; preds = %for.body3
+  %add.lcssa = phi i32 [ %add, %for.body3 ]
+  store i32 %add.lcssa, i32* @X
+  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
+  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
+  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
+  br i1 %exitcond21, label %for.end8, label %for.body3.lr.ph
+
+for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
+  ret void
+}
+
+;; Not tightly nested. Do not interchange.
+;;  for( int i=1;i<N;i++)
+;;    for( int j=1;j<N;j++) {
+;;      for( int k=1;k<N;k++) {
+;;        X+=A[k][j];
+;;      }
+;;      Y+=B[j][i];
+;;    }
+
+;; Not tightly nested. Do not interchange.
+;; Not interchanged hence the phi's in the inner loop will not be split.
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK-NEXT: Name:            UnsupportedPHIOuter
+; CHECK-NEXT: Function:        reduction_03
+
+; IR-LABEL: @reduction_03(
+; IR-NOT: split
+
+define void @reduction_03(i32 %N) {
+entry:
+  %cmp35 = icmp sgt i32 %N, 1
+  br i1 %cmp35, label %for.cond4.preheader.lr.ph, label %for.end19
+
+for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
+  %indvars.iv41 = phi i64 [ %indvars.iv.next42, %for.cond1.for.inc17_crit_edge ], [ 1, %entry ]
+  %Y.promoted = load i32, i32* @Y
+  br label %for.body6.lr.ph
+
+for.body6.lr.ph:                                  ; preds = %for.cond4.for.end_crit_edge, %for.cond4.preheader.lr.ph
+  %indvars.iv37 = phi i64 [ 1, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next38, %for.cond4.for.end_crit_edge ]
+  %add1334 = phi i32 [ %Y.promoted, %for.cond4.preheader.lr.ph ], [ %add13, %for.cond4.for.end_crit_edge ]
+  %X.promoted = load i32, i32* @X
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv37
+  %0 = load i32, i32* %arrayidx8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond4.for.end_crit_edge, label %for.body6
+
+for.cond4.for.end_crit_edge:                      ; preds = %for.body6
+  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv37, i64 %indvars.iv41
+  %1 = load i32, i32* %arrayidx12
+  %add13 = add nsw i32 %add1334, %1
+  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
+  %lftr.wideiv39 = trunc i64 %indvars.iv.next38 to i32
+  %exitcond40 = icmp eq i32 %lftr.wideiv39, %N
+  br i1 %exitcond40, label %for.cond1.for.inc17_crit_edge, label %for.body6.lr.ph
+
+for.cond1.for.inc17_crit_edge:                    ; preds = %for.cond4.for.end_crit_edge
+  %add13.lcssa = phi i32 [ %add13, %for.cond4.for.end_crit_edge ]
+  store i32 %add13.lcssa, i32* @Y
+  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
+  %lftr.wideiv43 = trunc i64 %indvars.iv.next42 to i32
+  %exitcond44 = icmp eq i32 %lftr.wideiv43, %N
+  br i1 %exitcond44, label %for.end19, label %for.cond4.preheader.lr.ph
+
+for.end19:                                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopInterchange/lcssa.ll b/test/Transforms/LoopInterchange/lcssa.ll
index 8886cf4925fc35a608d569c06a8f3c19b6e1b755..2bd9ee69c163b99b52608859b171961e18b9c228 100644
--- a/test/Transforms/LoopInterchange/lcssa.ll
+++ b/test/Transforms/LoopInterchange/lcssa.ll
@@ -246,7 +246,6 @@ for.body3:                                        ; preds = %for.body3, %outer.h
 
 outer.inc:                                        ; preds = %for.body3, %outer.header
   %sv = phi i64 [ 0, %outer.header ], [ 1, %for.body3 ]
-  store i64 %sv, i64* %ptr
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
diff --git a/test/Transforms/LoopInterchange/phi-ordering.ll b/test/Transforms/LoopInterchange/phi-ordering.ll
index c7416973758ff1436364de19198755f245d07d01..2854fe19f7ae548f79db718105abf74af88e7ee5 100644
--- a/test/Transforms/LoopInterchange/phi-ordering.ll
+++ b/test/Transforms/LoopInterchange/phi-ordering.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -loop-interchange-threshold=-1000 -S 2>&1 | FileCheck %s
 ;; Checks the order of the inner phi nodes does not cause havoc.
 ;; The inner loop has a reduction into c. The IV is not the first phi.
 
@@ -23,8 +23,6 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
 ; CHECK-NEXT:    br label [[FOR2_HEADER:%.*]]
 ; CHECK:       for2.header:
 ; CHECK-NEXT:    [[J:%.*]] = phi i32 [ [[INC17:%.*]], [[FOR2_INC16:%.*]] ], [ 0, [[FOR2_HEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds [90 x i32], [90 x i32]* [[C:%.*]], i32 [[I]], i32 [[J]]
-; CHECK-NEXT:    [[ARRAYIDX14_PROMOTED:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    br label [[FOR3_SPLIT1:%.*]]
 ; CHECK:       for3.preheader:
 ; CHECK-NEXT:    br label [[FOR3:%.*]]
@@ -35,15 +33,14 @@ define void @test(i32 %T, [90 x i32]* noalias nocapture %C, i16* noalias nocaptu
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[K]], [[MUL]]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[ADD]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
-; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i32 [[CONV]], [[ARRAYIDX14_PROMOTED]]
+; CHECK-NEXT:    [[ADD15:%.*]] = add nsw i16 [[TMP0]], 1
+; CHECK-NEXT:    store i16 [[ADD15]], i16* [[ARRAYIDX]]
 ; CHECK-NEXT:    br label [[FOR2_INC16]]
 ; CHECK:       for3.split:
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[K]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 90
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR1_LOOPEXIT:%.*]], label [[FOR3]]
 ; CHECK:       for2.inc16:
-; CHECK-NEXT:    store i32 [[ADD15]], i32* [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INC17]] = add nuw nsw i32 [[J]], 1
 ; CHECK-NEXT:    [[EXITCOND47:%.*]] = icmp eq i32 [[INC17]], 90
 ; CHECK-NEXT:    br i1 [[EXITCOND47]], label [[FOR1_INC19]], label [[FOR2_HEADER]]
@@ -66,25 +63,20 @@ for1.header:                                  ; preds = %entry
 
 for2.header:                                  ; preds = %for2.inc16, %for1.header
   %j = phi i32 [ 0, %for1.header ], [ %inc17, %for2.inc16 ]
-  %arrayidx14 = getelementptr inbounds [90 x i32], [90 x i32]* %C, i32 %i, i32 %j
-  %arrayidx14.promoted = load i32, i32* %arrayidx14, align 4
   br label %for3
 
 for3:                                        ; preds = %for3, %for2.header
-  %add1541 = phi i32 [ %arrayidx14.promoted, %for2.header ], [ %add15, %for3 ]
   %k = phi i32 [ 1, %for2.header ], [ %inc, %for3 ]
   %add = add nsw i32 %k, %mul
   %arrayidx = getelementptr inbounds i16, i16* %A, i32 %add
   %0 = load i16, i16* %arrayidx, align 2
-  %conv = sext i16 %0 to i32
-  %add15 = add nsw i32 %conv, %add1541
+  %add15 = add nsw i16 %0, 1
+  store i16 %add15, i16* %arrayidx
   %inc = add nuw nsw i32 %k, 1
   %exitcond = icmp eq i32 %inc, 90
   br i1 %exitcond, label %for2.inc16, label %for3
 
 for2.inc16:                                        ; preds = %for.body6
-  %add15.lcssa = phi i32 [ %add15, %for3 ]
-  store i32 %add15.lcssa, i32* %arrayidx14, align 4
   %inc17 = add nuw nsw i32 %j, 1
   %exitcond47 = icmp eq i32 %inc17, 90
   br i1 %exitcond47, label %for1.inc19, label %for2.header
diff --git a/test/Transforms/LoopInterchange/reductions.ll b/test/Transforms/LoopInterchange/reductions.ll
deleted file mode 100644
index 28a2d8d6a66b6ded33fdbcf74d16f3208eb84663..0000000000000000000000000000000000000000
--- a/test/Transforms/LoopInterchange/reductions.ll
+++ /dev/null
@@ -1,272 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -basicaa -loop-interchange -verify-dom-info -verify-loop-info -verify-loop-lcssa -S -debug 2>&1 | FileCheck %s
-
-@A = common global [500 x [500 x i32]] zeroinitializer
-@X = common global i32 0
-@B = common global [500 x [500 x i32]] zeroinitializer
-@Y = common global i32 0
-
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      X+=A[j][i];
-
-;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi.
-; CHECK: Loops interchanged.
-
-define void @reduction_01(i32 %N) {
-entry:
-  %cmp16 = icmp sgt i32 %N, 1
-  br i1 %cmp16, label %for.body3.lr.ph, label %for.end8
-
-for.body3.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
-  %X.promoted = load i32, i32* @X
-  br label %for.body3
-
-for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
-  %add15 = phi i32 [ %X.promoted, %for.body3.lr.ph ], [ %add, %for.body3 ]
-  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
-  %0 = load i32, i32* %arrayidx5
-  %add = add nsw i32 %add15, %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body3
-
-for.cond1.for.inc6_crit_edge:                     ; preds = %for.body3
-  %add.lcssa = phi i32 [ %add, %for.body3 ]
-  store i32 %add.lcssa, i32* @X
-  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
-  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
-  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
-  br i1 %exitcond21, label %for.end8, label %for.body3.lr.ph
-
-for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  ret void
-}
-
-;; Test for more than 1 reductions inside a loop.
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      for( int k=1;k<N;k++) {
-;;        X+=A[k][j];
-;;        Y+=B[k][i];
-;;      }
-
-;; Loop is interchanged check that the phi nodes are split and the promoted value is used instead of the reduction phi.
-; CHECK: Loops interchanged.
-
-define void @reduction_02(i32 %N) {
-entry:
-  %cmp34 = icmp sgt i32 %N, 1
-  br i1 %cmp34, label %for.cond4.preheader.preheader, label %for.end19
-
-for.cond4.preheader.preheader:                    ; preds = %for.inc17, %entry
-  %indvars.iv40 = phi i64 [ %indvars.iv.next41, %for.inc17 ], [ 1, %entry ]
-  br label %for.body6.lr.ph
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.for.inc14_crit_edge, %for.cond4.preheader.preheader
-  %indvars.iv36 = phi i64 [ %indvars.iv.next37, %for.cond4.for.inc14_crit_edge ], [ 1, %for.cond4.preheader.preheader ]
-  %X.promoted = load i32, i32* @X
-  %Y.promoted = load i32, i32* @Y
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
-  %add1331 = phi i32 [ %Y.promoted, %for.body6.lr.ph ], [ %add13, %for.body6 ]
-  %add30 = phi i32 [ %X.promoted, %for.body6.lr.ph ], [ %add, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv36
-  %0 = load i32, i32* %arrayidx8
-  %add = add nsw i32 %add30, %0
-  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv, i64 %indvars.iv40
-  %1 = load i32, i32* %arrayidx12
-  %add13 = add nsw i32 %add1331, %1
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond4.for.inc14_crit_edge, label %for.body6
-
-for.cond4.for.inc14_crit_edge:                    ; preds = %for.body6
-  %add.lcssa = phi i32 [ %add, %for.body6 ]
-  %add13.lcssa = phi i32 [ %add13, %for.body6 ]
-  store i32 %add.lcssa, i32* @X
-  store i32 %add13.lcssa, i32* @Y
-  %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, 1
-  %lftr.wideiv38 = trunc i64 %indvars.iv.next37 to i32
-  %exitcond39 = icmp eq i32 %lftr.wideiv38, %N
-  br i1 %exitcond39, label %for.inc17, label %for.body6.lr.ph
-
-for.inc17:                                        ; preds = %for.cond4.for.inc14_crit_edge
-  %add.lcssa.lcssa = phi i32 [ %add.lcssa, %for.cond4.for.inc14_crit_edge ]
-  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next41 to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, %N
-  br i1 %exitcond43, label %for.end19, label %for.cond4.preheader.preheader
-
-for.end19:                                        ; preds = %for.inc17, %entry
-  %res1 = phi i32 [ 0, %entry ], [ %add.lcssa.lcssa, %for.inc17 ]
-  store i32 %res1, i32* @X
-  ret void
-}
-
-;; Not tightly nested. Do not interchange.
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++) {
-;;      for( int k=1;k<N;k++) {
-;;        X+=A[k][j];
-;;      }
-;;      Y+=B[j][i];
-;;    }
-
-;; Not tightly nested. Do not interchange.
-;; Not interchanged hence the phi's in the inner loop will not be split.
-; CHECK: Outer loops with reductions are not supported currently.
-
-define void @reduction_03(i32 %N) {
-entry:
-  %cmp35 = icmp sgt i32 %N, 1
-  br i1 %cmp35, label %for.cond4.preheader.lr.ph, label %for.end19
-
-for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
-  %indvars.iv41 = phi i64 [ %indvars.iv.next42, %for.cond1.for.inc17_crit_edge ], [ 1, %entry ]
-  %Y.promoted = load i32, i32* @Y
-  br label %for.body6.lr.ph
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.for.end_crit_edge, %for.cond4.preheader.lr.ph
-  %indvars.iv37 = phi i64 [ 1, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next38, %for.cond4.for.end_crit_edge ]
-  %add1334 = phi i32 [ %Y.promoted, %for.cond4.preheader.lr.ph ], [ %add13, %for.cond4.for.end_crit_edge ]
-  %X.promoted = load i32, i32* @X
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
-  %add31 = phi i32 [ %X.promoted, %for.body6.lr.ph ], [ %add, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv37
-  %0 = load i32, i32* %arrayidx8
-  %add = add nsw i32 %add31, %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond4.for.end_crit_edge, label %for.body6
-
-for.cond4.for.end_crit_edge:                      ; preds = %for.body6
-  %add.lcssa = phi i32 [ %add, %for.body6 ]
-  store i32 %add.lcssa, i32* @X
-  %arrayidx12 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @B, i64 0, i64 %indvars.iv37, i64 %indvars.iv41
-  %1 = load i32, i32* %arrayidx12
-  %add13 = add nsw i32 %add1334, %1
-  %indvars.iv.next38 = add nuw nsw i64 %indvars.iv37, 1
-  %lftr.wideiv39 = trunc i64 %indvars.iv.next38 to i32
-  %exitcond40 = icmp eq i32 %lftr.wideiv39, %N
-  br i1 %exitcond40, label %for.cond1.for.inc17_crit_edge, label %for.body6.lr.ph
-
-for.cond1.for.inc17_crit_edge:                    ; preds = %for.cond4.for.end_crit_edge
-  %add13.lcssa = phi i32 [ %add13, %for.cond4.for.end_crit_edge ]
-  store i32 %add13.lcssa, i32* @Y
-  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
-  %lftr.wideiv43 = trunc i64 %indvars.iv.next42 to i32
-  %exitcond44 = icmp eq i32 %lftr.wideiv43, %N
-  br i1 %exitcond44, label %for.end19, label %for.cond4.preheader.lr.ph
-
-for.end19:                                        ; preds = %for.cond1.for.inc17_crit_edge, %entry
-  ret void
-}
-
-;; Multiple use of reduction not safe. Do not interchange.
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      for( int k=1;k<N;k++) {
-;;        X+=A[k][j];
-;;        Y+=X;
-;;      }
-
-;; Not interchanged hence the phi's in the inner loop will not be split.
-; CHECK: Only inner loops with induction or reduction PHI nodes are supported currently.
-
-define void @reduction_04(i32 %N) {
-entry:
-  %cmp28 = icmp sgt i32 %N, 1
-  br i1 %cmp28, label %for.cond4.preheader.preheader, label %for.end15
-
-for.cond4.preheader.preheader:                    ; preds = %for.inc13, %entry
-  %i.029 = phi i32 [ %inc14, %for.inc13 ], [ 1, %entry ]
-  br label %for.body6.lr.ph
-
-for.body6.lr.ph:                                  ; preds = %for.cond4.for.inc10_crit_edge, %for.cond4.preheader.preheader
-  %indvars.iv30 = phi i64 [ %indvars.iv.next31, %for.cond4.for.inc10_crit_edge ], [ 1, %for.cond4.preheader.preheader ]
-  %X.promoted = load i32, i32* @X
-  %Y.promoted = load i32, i32* @Y
-  br label %for.body6
-
-for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
-  %add925 = phi i32 [ %Y.promoted, %for.body6.lr.ph ], [ %add9, %for.body6 ]
-  %add24 = phi i32 [ %X.promoted, %for.body6.lr.ph ], [ %add, %for.body6 ]
-  %arrayidx8 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv30
-  %0 = load i32, i32* %arrayidx8
-  %add = add nsw i32 %add24, %0
-  %add9 = add nsw i32 %add925, %add
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond4.for.inc10_crit_edge, label %for.body6
-
-for.cond4.for.inc10_crit_edge:                    ; preds = %for.body6
-  %add.lcssa = phi i32 [ %add, %for.body6 ]
-  %add9.lcssa = phi i32 [ %add9, %for.body6 ]
-  store i32 %add.lcssa, i32* @X
-  store i32 %add9.lcssa, i32* @Y
-  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
-  %lftr.wideiv32 = trunc i64 %indvars.iv.next31 to i32
-  %exitcond33 = icmp eq i32 %lftr.wideiv32, %N
-  br i1 %exitcond33, label %for.inc13, label %for.body6.lr.ph
-
-for.inc13:                                        ; preds = %for.cond4.for.inc10_crit_edge
-  %inc14 = add nuw nsw i32 %i.029, 1
-  %exitcond34 = icmp eq i32 %inc14, %N
-  br i1 %exitcond34, label %for.end15, label %for.cond4.preheader.preheader
-
-for.end15:                                        ; preds = %for.inc13, %entry
-  ret void
-}
-
-;;  for( int i=1;i<N;i++)
-;;    for( int j=1;j<N;j++)
-;;      X+=A[j][i];
-;;  Y = X
-; CHECK: Loops interchanged.
-define void @reduction_05(i32 %N) {
-entry:
-  %cmp16 = icmp sgt i32 %N, 1
-  br i1 %cmp16, label %for.body7.lr.ph, label %for.end8
-
-for.body7.lr.ph:                                  ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %indvars.iv18 = phi i64 [ %indvars.iv.next19, %for.cond1.for.inc6_crit_edge ], [ 1, %entry ]
-  %X.promoted = load i32, i32* @X
-  br label %for.body7
-
-for.body7:                                        ; preds = %for.body7, %for.body7.lr.ph
-  %indvars.iv = phi i64 [ 1, %for.body7.lr.ph ], [ %indvars.iv.next, %for.body7 ]
-  %add15 = phi i32 [ %X.promoted, %for.body7.lr.ph ], [ %add, %for.body7 ]
-  %arrayidx5 = getelementptr inbounds [500 x [500 x i32]], [500 x [500 x i32]]* @A, i64 0, i64 %indvars.iv, i64 %indvars.iv18
-  %0 = load i32, i32* %arrayidx5
-  %add = add nsw i32 %add15, %0
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %N
-  br i1 %exitcond, label %for.cond1.for.inc6_crit_edge, label %for.body7
-
-for.cond1.for.inc6_crit_edge:                     ; preds = %for.body7
-  %add.lcssa = phi i32 [ %add, %for.body7 ]
-  store i32 %add.lcssa, i32* @X
-  %indvars.iv.next19 = add nuw nsw i64 %indvars.iv18, 1
-  %lftr.wideiv20 = trunc i64 %indvars.iv.next19 to i32
-  %exitcond21 = icmp eq i32 %lftr.wideiv20, %N
-  br i1 %exitcond21, label %for.end8, label %for.body7.lr.ph
-
-for.end8:                                         ; preds = %for.cond1.for.inc6_crit_edge, %entry
-  %add.res = phi i32 [ %add.lcssa, %for.cond1.for.inc6_crit_edge ], [ 0, %entry ]
-  store i32 %add.res, i32* @Y
-  ret void
-}
diff --git a/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll b/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll
index 7036d2d9c3a94dec2b0c5cc72f8d2eadaee27ec7..a09a2290e0a54b5705a03ad9375b277f5f108014 100644
--- a/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll
+++ b/test/Transforms/LoopRotate/2009-01-25-SingleEntryPhi.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; PR3408
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/LoopRotate/PhiRename-1.ll b/test/Transforms/LoopRotate/PhiRename-1.ll
index 6d75888d70db55f62ca17f32d6cb4f3092226980..8bece445cf46354771b4299b855f17c8966cfb01 100644
--- a/test/Transforms/LoopRotate/PhiRename-1.ll
+++ b/test/Transforms/LoopRotate/PhiRename-1.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 ; CHECK-NOT: [ {{.}}tmp224
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/Transforms/LoopRotate/PhiSelfReference-1.ll b/test/Transforms/LoopRotate/PhiSelfReference-1.ll
index ed4944833913579d860e33f32bddffac20adef74..7726c53e55eee2649c8a4e81b4ac72f378add624 100644
--- a/test/Transforms/LoopRotate/PhiSelfReference-1.ll
+++ b/test/Transforms/LoopRotate/PhiSelfReference-1.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 ; ModuleID = 'PhiSelfReference-1.bc'
 
 define void @snrm2(i32 %incx) {
diff --git a/test/Transforms/LoopRotate/alloca.ll b/test/Transforms/LoopRotate/alloca.ll
index bbcfb39c37221c1d2d6cb9f21235e186f2107912..59da33f8802ad2f1840a0ffd1a880bc010ab88ea 100644
--- a/test/Transforms/LoopRotate/alloca.ll
+++ b/test/Transforms/LoopRotate/alloca.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 ; Test alloca in -loop-rotate.
 
diff --git a/test/Transforms/LoopRotate/basic.ll b/test/Transforms/LoopRotate/basic.ll
index 299c18c871e81551df3a3763db32527e78f93e9f..d01d19f7f128ab731841062f3adf8c95a67cc925 100644
--- a/test/Transforms/LoopRotate/basic.ll
+++ b/test/Transforms/LoopRotate/basic.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 ; RUN: opt -S -passes='require<targetir>,require<assumptions>,loop(rotate)' < %s | FileCheck %s
+; RUN: opt -S -passes='require<targetir>,require<assumptions>,loop(rotate)' -enable-mssa-loop-dependency=true -verify-memoryssa  < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LoopRotate/catchret.ll b/test/Transforms/LoopRotate/catchret.ll
index c035e49d79c214fce6c0a048df132f356069c2cf..f28af8aed601c2d09ff57e2d2069f162c8f0a9a5 100755
--- a/test/Transforms/LoopRotate/catchret.ll
+++ b/test/Transforms/LoopRotate/catchret.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 target triple = "x86_64-pc-windows-msvc"
 
diff --git a/test/Transforms/LoopRotate/convergent.ll b/test/Transforms/LoopRotate/convergent.ll
index c8b34fd75f0763fd4545664959d9849caa4222c1..37671562142eb4107028a5be21e71e3cb2e592fd 100644
--- a/test/Transforms/LoopRotate/convergent.ll
+++ b/test/Transforms/LoopRotate/convergent.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 
 @e = global i32 10
 
diff --git a/test/Transforms/LoopRotate/crash.ll b/test/Transforms/LoopRotate/crash.ll
index 5e2b66d6803da9ea6ee6b2e17da9771463320be9..2a45e370e18c91517fb98c2fef0816272fa31e07 100644
--- a/test/Transforms/LoopRotate/crash.ll
+++ b/test/Transforms/LoopRotate/crash.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info < %s
+; RUN: opt -loop-rotate -disable-output -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa < %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
diff --git a/test/Transforms/LoopRotate/dbg-value-duplicates.ll b/test/Transforms/LoopRotate/dbg-value-duplicates.ll
index 2fea06b5afe66fbcd8ae24f10546381bb646ff0e..ce7157c571f080fdfcc1b799f389fa8a54afcbb4 100644
--- a/test/Transforms/LoopRotate/dbg-value-duplicates.ll
+++ b/test/Transforms/LoopRotate/dbg-value-duplicates.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 source_filename = "/tmp/loop.c"
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.13.0"
diff --git a/test/Transforms/LoopRotate/dbgvalue.ll b/test/Transforms/LoopRotate/dbgvalue.ll
index bc0b20d0fea9fcdf3fa3c404c2e569caeeb1b96d..93e3c4c252cae320f5355e1fea7e90eba7309d40 100644
--- a/test/Transforms/LoopRotate/dbgvalue.ll
+++ b/test/Transforms/LoopRotate/dbgvalue.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
diff --git a/test/Transforms/LoopRotate/indirectbr.ll b/test/Transforms/LoopRotate/indirectbr.ll
index 8f059d505057f4db409e3b4a0dde897255688b64..a26ec375953d203abcfd1a2c65cac0f58ec1678d 100644
--- a/test/Transforms/LoopRotate/indirectbr.ll
+++ b/test/Transforms/LoopRotate/indirectbr.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt < %s -S -loop-rotate -o - -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 
 ; PR5502
 define void @z80_do_opcodes() nounwind {
diff --git a/test/Transforms/LoopRotate/loopexitinglatch.ll b/test/Transforms/LoopRotate/loopexitinglatch.ll
index c05e512831ec9bbc25a8a28383a5699e60b9bb33..dee29ec958e197c72dc86fe45614a6f98c74991f 100644
--- a/test/Transforms/LoopRotate/loopexitinglatch.ll
+++ b/test/Transforms/LoopRotate/loopexitinglatch.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8m.base-arm-none-eabi"
diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll
index f38c855b9c8c5f35bfc8dd018d24b10e8b058275..c6f153b8ca3ec1fbeedfd5a2faeff2da9f324642 100644
--- a/test/Transforms/LoopRotate/multiple-exits.ll
+++ b/test/Transforms/LoopRotate/multiple-exits.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info -enable-mssa-loop-dependency=true -verify-memoryssa | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/Transforms/LoopRotate/phi-dbgvalue.ll b/test/Transforms/LoopRotate/phi-dbgvalue.ll
index c4b2a6a76f2161356d4e2a9531a1a84227d8c682..1f7e129c26e3a60ce86becddad5eb26314efd15b 100644
--- a/test/Transforms/LoopRotate/phi-dbgvalue.ll
+++ b/test/Transforms/LoopRotate/phi-dbgvalue.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 
 ;CHECK-LABEL: func
 ;CHECK-LABEL: entry
diff --git a/test/Transforms/LoopRotate/phi-duplicate.ll b/test/Transforms/LoopRotate/phi-duplicate.ll
index 46ee5961ba5410fbeae8d78397b95d324f0585c1..d7f69d8c9cc3b659f98676475c0e28acf42f142c 100644
--- a/test/Transforms/LoopRotate/phi-duplicate.ll
+++ b/test/Transforms/LoopRotate/phi-duplicate.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-rotate < %s | FileCheck %s
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0"
 
diff --git a/test/Transforms/LoopRotate/pr22337.ll b/test/Transforms/LoopRotate/pr22337.ll
index 03e804b775e4ca990cb2792cd60130ce94c4d179..8195affbcd3b2c58c48284f8f9d11b0f42b6860a 100644
--- a/test/Transforms/LoopRotate/pr22337.ll
+++ b/test/Transforms/LoopRotate/pr22337.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 @a = external global i8, align 4
 @tmp = global i8* @a
diff --git a/test/Transforms/LoopRotate/pr33701.ll b/test/Transforms/LoopRotate/pr33701.ll
index ed162b12098285523a90e32b48c82123fc534ac5..8535e3176761938794477dae5f83e0da2a91d4e6 100644
--- a/test/Transforms/LoopRotate/pr33701.ll
+++ b/test/Transforms/LoopRotate/pr33701.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -verify-dom-info -verify-loop-info -enable-mssa-loop-dependency=true -verify-memoryssa -disable-output
 
 define void @func() {
 bb0:
diff --git a/test/Transforms/LoopRotate/pr35210.ll b/test/Transforms/LoopRotate/pr35210.ll
index 3033ca84732c589413a3832cb116f3afb011e881..a705642c435abf28089f236825bd2bb02de12f29 100644
--- a/test/Transforms/LoopRotate/pr35210.ll
+++ b/test/Transforms/LoopRotate/pr35210.ll
@@ -1,4 +1,5 @@
 ;RUN: opt %s -passes='adce,loop(rotate),adce' -S -debug-pass-manager -debug-only=loop-rotate 2>&1 | FileCheck %s
+;RUN: opt %s -passes='adce,loop(rotate),adce' -S -debug-pass-manager -debug-only=loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa 2>&1 | FileCheck %s --check-prefix=MSSA
 ;REQUIRES: asserts
 
 ; This test is to make sure we invalidate the post dominator pass after loop rotate simplifies the loop latch.
@@ -32,6 +33,36 @@
 ; CHECK-NEXT: Running analysis: PostDominatorTreeAnalysis on f
 ; CHECK-NEXT: Finished llvm::Function pass manager run.
 
+; MSSA: Starting llvm::Function pass manager run.
+; MSSA-NEXT: Running pass: ADCEPass on f
+; MSSA-NEXT: Running analysis: PostDominatorTreeAnalysis on f
+; MSSA-NEXT: Running pass: FunctionToLoopPassAdaptor{{.*}} on f
+; MSSA-NEXT: Starting llvm::Function pass manager run.
+; MSSA-NEXT: Running pass: LoopSimplifyPass on f
+; MSSA-NEXT: Running analysis: LoopAnalysis on f
+; MSSA-NEXT: Running analysis: DominatorTreeAnalysis on f
+; MSSA-NEXT: Running analysis: AssumptionAnalysis on f
+; MSSA-NEXT: Running pass: LCSSAPass on f
+; MSSA-NEXT: Finished llvm::Function pass manager run.
+; MSSA-NEXT: Running analysis: MemorySSAAnalysis on f
+; MSSA-NEXT: Running analysis: AAManager on f
+; MSSA-NEXT: Running analysis: TargetLibraryAnalysis on f
+; MSSA-NEXT: Running analysis: ScalarEvolutionAnalysis on f
+; MSSA-NEXT: Running analysis: TargetIRAnalysis on f
+; MSSA-NEXT: Running analysis: InnerAnalysisManagerProxy{{.*}} on f
+; MSSA-NEXT: Starting Loop pass manager run.
+; MSSA-NEXT: Running analysis: PassInstrumentationAnalysis on bb
+; MSSA-NEXT: Running pass: LoopRotatePass on Loop at depth 1 containing: %bb<header><exiting>,%bb4<latch>
+; MSSA-NEXT: Folding loop latch bb4 into bb
+; MSSA-NEXT: Invalidating all non-preserved analyses for: bb
+; MSSA-NEXT: Finished Loop pass manager run.
+; MSSA-NEXT: Invalidating all non-preserved analyses for: f
+; MSSA-NEXT: Invalidating analysis: PostDominatorTreeAnalysis on f
+; MSSA-NEXT: Running pass: ADCEPass on f
+; MSSA-NEXT: Running analysis: PostDominatorTreeAnalysis on f
+; MSSA-NEXT: Finished llvm::Function pass manager run.
+
+
 ; CHECK-LABEL: define i8 @f() {
 ; CHECK-NEXT : entry:
 ; CHECK-NEXT :   br label %bb
@@ -52,6 +83,26 @@
 ; CHECK-NEXT :
 ; CHECK-NEXT : attributes #0 = { noreturn }
 
+; MSSA-LABEL: define i8 @f() {
+; MSSA-NEXT : entry:
+; MSSA-NEXT :   br label %bb
+; MSSA-NEXT :
+; MSSA-NEXT : bb:                                               ; preds = %bb, %entry
+; MSSA-NEXT :   %mode.0 = phi i8 [ 0, %entry ], [ %indvar.next, %bb ]
+; MSSA-NEXT :   %tmp5 = icmp eq i8 %mode.0, 1
+; MSSA-NEXT :   %indvar.next = add i8 %mode.0, 1
+; MSSA-NEXT :   br i1 %tmp5, label %bb5, label %bb
+; MSSA-NEXT :
+; MSSA-NEXT : bb5:                                              ; preds = %bb
+; MSSA-NEXT :   tail call void @raise_exception() #0
+; MSSA-NEXT :   unreachable
+; MSSA-NEXT : }
+; MSSA-NEXT :
+; MSSA-NEXT : ; Function Attrs: noreturn
+; MSSA-NEXT : declare void @raise_exception() #0
+; MSSA-NEXT :
+; MSSA-NEXT : attributes #0 = { noreturn }
+
 define i8 @f() {
 entry:
   br label %bb
diff --git a/test/Transforms/LoopRotate/pr37205.ll b/test/Transforms/LoopRotate/pr37205.ll
index 3ba6c04545f80e53a6555bbcbfb5a5863e23a371..20ad7568189845e91205f7200c00f79f472f5ecf 100644
--- a/test/Transforms/LoopRotate/pr37205.ll
+++ b/test/Transforms/LoopRotate/pr37205.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom < %s | FileCheck %s
+; RUN: opt -S -indvars -verify -loop-rotate -loop-idiom -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Verify that we invalidate SCEV properly.
diff --git a/test/Transforms/LoopRotate/preserve-mssa.ll b/test/Transforms/LoopRotate/preserve-mssa.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d975f80cd9e473d60b7cb17dae2108b17fb27e54
--- /dev/null
+++ b/test/Transforms/LoopRotate/preserve-mssa.ll
@@ -0,0 +1,109 @@
+; RUN: opt -S -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+
+; CHECK-LABEL: @multiedge(
+define void @multiedge() {
+entry:
+  br label %retry
+
+retry:                                            ; preds = %sw.epilog, %entry
+  br i1 undef, label %cleanup, label %if.end
+
+if.end:                                           ; preds = %retry
+  switch i32 undef, label %sw.epilog [
+    i32 -3, label %cleanup
+    i32 -5, label %cleanup
+    i32 -16, label %cleanup
+    i32 -25, label %cleanup
+  ]
+
+sw.epilog:                                        ; preds = %if.end
+  br label %retry
+
+cleanup:                                          ; preds = %if.end, %if.end, %if.end, %if.end, %retry
+  ret void
+}
+
+; CHECK-LABEL: @read_line(
+define internal fastcc i32 @read_line(i8* nocapture %f) unnamed_addr {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end, %entry
+  %call = call i8* @prepbuffer(i8* nonnull undef)
+  %call1 = call i8* @fgets(i8* %call, i32 8192, i8* %f)
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.cond
+  ret i32 undef
+
+if.end:                                           ; preds = %for.cond
+  %call4 = call i64 @strlen(i8* %call)
+  br label %for.cond
+}
+
+declare dso_local i8* @prepbuffer(i8*) local_unnamed_addr
+declare dso_local i8* @fgets(i8*, i32, i8* nocapture) local_unnamed_addr
+declare dso_local i64 @strlen(i8* nocapture) local_unnamed_addr
+
+
+; CHECK-LABEL: @loop3
+define dso_local fastcc void @loop3() unnamed_addr {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  br i1 undef, label %for.body, label %for.end81
+
+for.body:                                         ; preds = %for.cond
+  %.idx122.val = load i32, i32* undef, align 8
+  call fastcc void @cont()
+  br label %for.cond
+
+for.end81:                                        ; preds = %for.cond
+  ret void
+}
+
+; CHECK-LABEL: @loop4
+define dso_local fastcc void @loop4() unnamed_addr {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  br i1 undef, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  call fastcc void @cont()
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  call fastcc void @cont()
+  call fastcc void @cont()
+  ret void
+}
+
+; Function Attrs: inlinehint nounwind uwtable
+declare dso_local fastcc void @cont() unnamed_addr
+
+@glob_array = internal unnamed_addr constant [3 x i32] [i32 1, i32 0, i32 2], align 4
+; Test against failure in MemorySSAUpdater, when rotate clones instructions as Value.
+; CHECK-LABEL: @loop5
+define dso_local fastcc void @loop5() unnamed_addr {
+entry:
+  br label %for.body
+
+do.cond:                          ; preds = %for.body
+  unreachable
+
+for.body:                               ; preds = %if.end, %entry
+  %indvar = phi i64 [ %indvar.next, %if.end ], [ 0, %entry ]
+  %array = getelementptr inbounds [3 x i32], [3 x i32]* @glob_array, i64 0, i64 %indvar
+  %0 = load i32, i32* %array, align 4
+  br i1 undef, label %do.cond, label %if.end
+
+if.end:                                 ; preds = %for.body
+  store i32 undef, i32* undef, align 4
+  %indvar.next = add nuw nsw i64 %indvar, 1
+  br label %for.body
+}
+
+
diff --git a/test/Transforms/LoopRotate/preserve-scev.ll b/test/Transforms/LoopRotate/preserve-scev.ll
index 7bd22326864a4e3882efc74d384a05e5ef004b14..2faf8ec487aaab1718cff4061a929ec0291f46a6 100644
--- a/test/Transforms/LoopRotate/preserve-scev.ll
+++ b/test/Transforms/LoopRotate/preserve-scev.ll
@@ -1,27 +1,48 @@
 ; RUN: opt < %s -loop-rotate -loop-reduce -verify-dom-info -verify-loop-info -disable-output
+; RUN: opt < %s -loop-rotate -loop-reduce -enable-mssa-loop-dependency=true -verify-memoryssa -verify-dom-info -verify-loop-info -disable-output
 
-define fastcc void @foo() nounwind {
+define fastcc void @foo(i32* %A, i64 %i) nounwind {
 BB:
   br label %BB1
 
 BB1:                                              ; preds = %BB19, %BB
+  %tttmp1 = getelementptr i32, i32* %A, i64 %i
+  %tttmp2 = load i32, i32* %tttmp1
+  %tttmp3 = add i32 %tttmp2, 1
+  store i32 %tttmp3, i32* %tttmp1
   br label %BB4
 
 BB2:                                              ; preds = %BB4
   %tmp = bitcast i32 undef to i32                 ; <i32> [#uses=1]
+  %tttmp7 = getelementptr i32, i32* %A, i64 %i
+  %tttmp8 = load i32, i32* %tttmp7
+  %tttmp9 = add i32 %tttmp8, 3
+  store i32 %tttmp9, i32* %tttmp7
   br label %BB4
 
-BB4:                                              ; preds = %BB3, %BB1
+BB4:                                              ; preds = %BB2, %BB1
   %tmp5 = phi i32 [ undef, %BB1 ], [ %tmp, %BB2 ] ; <i32> [#uses=1]
+  %tttmp4 = getelementptr i32, i32* %A, i64 %i
+  %tttmp5 = load i32, i32* %tttmp4
+  %tttmp6 = add i32 %tttmp5, 3
+  store i32 %tttmp6, i32* %tttmp4
   br i1 false, label %BB8, label %BB2
 
 BB8:                                              ; preds = %BB6
   %tmp7 = bitcast i32 %tmp5 to i32                ; <i32> [#uses=2]
+  %tttmp10 = getelementptr i32, i32* %A, i64 %i
+  %tttmp11 = load i32, i32* %tttmp10
+  %tttmp12 = add i32 %tttmp11, 3
+  store i32 %tttmp12, i32* %tttmp10
   br i1 false, label %BB9, label %BB13
 
 BB9:                                              ; preds = %BB12, %BB8
   %tmp10 = phi i32 [ %tmp11, %BB12 ], [ %tmp7, %BB8 ] ; <i32> [#uses=2]
   %tmp11 = add i32 %tmp10, 1                      ; <i32> [#uses=1]
+  %tttmp13 = getelementptr i32, i32* %A, i64 %i
+  %tttmp14 = load i32, i32* %tttmp13
+  %tttmp15 = add i32 %tttmp14, 3
+  store i32 %tttmp15, i32* %tttmp13
   br label %BB12
 
 BB12:                                             ; preds = %BB9
@@ -29,16 +50,28 @@ BB12:                                             ; preds = %BB9
 
 BB13:                                             ; preds = %BB15, %BB8
   %tmp14 = phi i32 [ %tmp16, %BB15 ], [ %tmp7, %BB8 ] ; <i32> [#uses=1]
+  %tttmp16 = getelementptr i32, i32* %A, i64 %i
+  %tttmp17 = load i32, i32* %tttmp16
+  %tttmp18 = add i32 %tttmp17, 3
+  store i32 %tttmp18, i32* %tttmp16
   br label %BB15
 
 BB15:                                             ; preds = %BB13
   %tmp16 = add i32 %tmp14, -1                     ; <i32> [#uses=1]
+  %tttmp19 = getelementptr i32, i32* %A, i64 %i
+  %tttmp20 = load i32, i32* %tttmp19
+  %tttmp21 = add i32 %tttmp20, 3
+  store i32 %tttmp21, i32* %tttmp19
   br i1 false, label %BB13, label %BB18
 
 BB17:                                             ; preds = %BB12
   br label %BB19
 
 BB18:                                             ; preds = %BB15
+  %tttmp22 = getelementptr i32, i32* %A, i64 %i
+  %tttmp23 = load i32, i32* %tttmp22
+  %tttmp24 = add i32 %tttmp23, 3
+  store i32 %tttmp24, i32* %tttmp22
   br label %BB19
 
 BB19:                                             ; preds = %BB18, %BB17
diff --git a/test/Transforms/LoopRotate/vect.omp.persistence.ll b/test/Transforms/LoopRotate/vect.omp.persistence.ll
index 6a1865499d30bdd06e7b2d7a29d4ebe29ffc474d..c4c987e7b2bafb4444848b67fc4c2b1c21a60f48 100644
--- a/test/Transforms/LoopRotate/vect.omp.persistence.ll
+++ b/test/Transforms/LoopRotate/vect.omp.persistence.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-rotate -S | FileCheck %s
+; RUN: opt < %s -loop-rotate -enable-mssa-loop-dependency=true -verify-memoryssa -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..44f1c0bcd889d858fef0b027718952233332be14
--- /dev/null
+++ b/test/Transforms/LoopSimplifyCFG/constant-fold-branch.ll
@@ -0,0 +1,1447 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-simplifycfg < %s | FileCheck %s
+; RUN: opt -S -passes='require<domtree>,loop(simplify-cfg)' < %s | FileCheck %s
+; RUN: opt -S -loop-simplifycfg -enable-mssa-loop-dependency=true -verify-memoryssa < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+; Make sure that we can eliminate a provably dead backedge.
+define i32 @dead_backedge_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_backedge_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_BE:%.*]], [[HEADER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[I_1:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[I_1]], 100
+; CHECK-NEXT:    br i1 [[CMP1]], label [[HEADER_BACKEDGE]], label [[DEAD_BACKEDGE:%.*]]
+; CHECK:       header.backedge:
+; CHECK-NEXT:    [[I_BE]] = phi i32 [ [[I_1]], [[HEADER]] ], [ [[I_2:%.*]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    br label [[HEADER]]
+; CHECK:       dead_backedge:
+; CHECK-NEXT:    [[I_2]] = add i32 [[I_1]], 10
+; CHECK-NEXT:    br i1 false, label [[HEADER_BACKEDGE]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_2_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.1, %header], [%i.2, %dead_backedge]
+  %i.1 = add i32 %i, 1
+  %cmp1 = icmp slt i32 %i.1, 100
+  br i1 %cmp1, label %header, label %dead_backedge
+
+dead_backedge:
+  %i.2 = add i32 %i.1, 10
+  br i1 false, label %header, label %exit
+
+exit:
+  ret i32 %i.2
+}
+
+; Make sure that we can eliminate a provably dead backedge with switch.
+define i32 @dead_backedge_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_backedge_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_BE:%.*]], [[HEADER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[I_1:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[I_1]], 100
+; CHECK-NEXT:    br i1 [[CMP1]], label [[HEADER_BACKEDGE]], label [[DEAD_BACKEDGE:%.*]]
+; CHECK:       header.backedge:
+; CHECK-NEXT:    [[I_BE]] = phi i32 [ [[I_1]], [[HEADER]] ], [ [[I_2:%.*]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    br label [[HEADER]]
+; CHECK:       dead_backedge:
+; CHECK-NEXT:    [[I_2]] = add i32 [[I_1]], 10
+; CHECK-NEXT:    switch i32 1, label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[HEADER_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_2_LCSSA:%.*]] = phi i32 [ [[I_2]], [[DEAD_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_2_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.1, %header], [%i.2, %dead_backedge]
+  %i.1 = add i32 %i, 1
+  %cmp1 = icmp slt i32 %i.1, 100
+  br i1 %cmp1, label %header, label %dead_backedge
+
+dead_backedge:
+  %i.2 = add i32 %i.1, 10
+  switch i32 1, label %exit [i32 0, label %header]
+
+exit:
+  ret i32 %i.2
+}
+
+; Check that we can eliminate a triangle.
+define i32 @dead_block_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can eliminate dead branches of a switch.
+define i32 @dead_block_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can eliminate several dead blocks.
+define i32 @dead_block_propogate_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_propogate_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can eliminate several blocks while removing a switch.
+define i32 @dead_block_propogate_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_block_propogate_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we preserve static reachibility of a dead exit block while deleting
+; a branch.
+define i32 @dead_exit_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_exit_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[DUMMY:%.*]]
+; CHECK:       dummy:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I_LCSSA]], [[DUMMY]] ], [ [[I_INC_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_1]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  br label %dummy
+
+dummy:
+  br label %exit
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  %i.1 = phi i32 [%i.inc, %backedge], [%i, %dummy]
+  ret i32 %i.1
+}
+
+; Check that we preserve static reachibility of a dead exit block while deleting
+; a switch.
+define i32 @dead_exit_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_exit_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I]], [[HEADER]] ], [ [[I]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[DUMMY:%.*]]
+; CHECK:       dummy:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I_LCSSA]], [[DUMMY]] ], [ [[I_INC_LCSSA]], [[EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_1]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  br label %dummy
+
+dummy:
+  br label %exit
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  %i.1 = phi i32 [%i.inc, %backedge], [%i, %dummy]
+  ret i32 %i.1
+}
+
+; Check that we can completely eliminate the current loop, branch case.
+define i32 @dead_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_loop_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 false, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can completely eliminate the current loop, switch case.
+define i32 @dead_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_loop_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 false, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can delete a dead inner loop entirely.
+define i32 @dead_sub_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @dead_sub_loop_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[LIVE_PREHEADER:%.*]], label [[DEAD_PREHEADER:%.*]]
+; CHECK:       live_preheader:
+; CHECK-NEXT:    br label [[LIVE_LOOP:%.*]]
+; CHECK:       live_loop:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 0, [[LIVE_PREHEADER]] ], [ [[A_INC:%.*]], [[LIVE_LOOP]] ]
+; CHECK-NEXT:    [[A_INC]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp slt i32 [[A_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_A]], label [[LIVE_LOOP]], label [[EXIT_A:%.*]]
+; CHECK:       exit.a:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       dead_preheader:
+; CHECK-NEXT:    br label [[DEAD_LOOP:%.*]]
+; CHECK:       dead_loop:
+; CHECK-NEXT:    [[B:%.*]] = phi i32 [ 0, [[DEAD_PREHEADER]] ], [ [[B_INC:%.*]], [[DEAD_LOOP]] ]
+; CHECK-NEXT:    [[B_INC]] = add i32 [[B]], 1
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp slt i32 [[B_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_B]], label [[DEAD_LOOP]], label [[EXIT_B:%.*]]
+; CHECK:       exit.b:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %live_preheader, label %dead_preheader
+
+live_preheader:
+  br label %live_loop
+
+live_loop:
+  %a = phi i32 [0, %live_preheader], [%a.inc, %live_loop]
+  %a.inc = add i32 %a, 1
+  %cmp.a = icmp slt i32 %a.inc, %end
+  br i1 %cmp.a, label %live_loop, label %exit.a
+
+exit.a:
+  br label %backedge
+
+dead_preheader:
+  br label %dead_loop
+
+dead_loop:
+  %b = phi i32 [0, %dead_preheader], [%b.inc, %dead_loop]
+  %b.inc = add i32 %b, 1
+  %cmp.b = icmp slt i32 %b.inc, %end
+  br i1 %cmp.b, label %dead_loop, label %exit.b
+
+exit.b:
+  br label %backedge
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @dead_sub_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @dead_sub_loop_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD_PREHEADER:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD_PREHEADER]]
+; CHECK-NEXT:    i32 1, label [[LIVE_PREHEADER:%.*]]
+; CHECK-NEXT:    i32 2, label [[DEAD_PREHEADER]]
+; CHECK-NEXT:    ]
+; CHECK:       live_preheader:
+; CHECK-NEXT:    br label [[LIVE_LOOP:%.*]]
+; CHECK:       live_loop:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 0, [[LIVE_PREHEADER]] ], [ [[A_INC:%.*]], [[LIVE_LOOP]] ]
+; CHECK-NEXT:    [[A_INC]] = add i32 [[A]], 1
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp slt i32 [[A_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_A]], label [[LIVE_LOOP]], label [[EXIT_A:%.*]]
+; CHECK:       exit.a:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       dead_preheader:
+; CHECK-NEXT:    br label [[DEAD_LOOP:%.*]]
+; CHECK:       dead_loop:
+; CHECK-NEXT:    [[B:%.*]] = phi i32 [ 0, [[DEAD_PREHEADER]] ], [ [[B_INC:%.*]], [[DEAD_LOOP]] ]
+; CHECK-NEXT:    [[B_INC]] = add i32 [[B]], 1
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp slt i32 [[B_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_B]], label [[DEAD_LOOP]], label [[EXIT_B:%.*]]
+; CHECK:       exit.b:
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead_preheader [i32 0, label %dead_preheader
+  i32 1, label %live_preheader
+  i32 2, label %dead_preheader]
+
+live_preheader:
+  br label %live_loop
+
+live_loop:
+  %a = phi i32 [0, %live_preheader], [%a.inc, %live_loop]
+  %a.inc = add i32 %a, 1
+  %cmp.a = icmp slt i32 %a.inc, %end
+  br i1 %cmp.a, label %live_loop, label %exit.a
+
+exit.a:
+  br label %backedge
+
+dead_preheader:
+  br label %dead_loop
+
+dead_loop:
+  %b = phi i32 [0, %dead_preheader], [%b.inc, %dead_loop]
+  %b.inc = add i32 %b, 1
+  %cmp.b = icmp slt i32 %b.inc, %end
+  br i1 %cmp.b, label %dead_loop, label %exit.b
+
+exit.b:
+  br label %backedge
+
+backedge:
+  %i.inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we preserve static reachability of an exit block even if we prove
+; that the loop is infinite. Branch case.
+define i32 @inf_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @inf_loop_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+
+dummy:
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 true, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @inf_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @inf_loop_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+dead:
+  %i.2 = add i32 %i, 1
+  br label %dummy
+dummy:
+  br label %backedge
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dummy]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 true, label %header, label %exit
+exit:
+  ret i32 %i.inc
+}
+
+; Check that when the block is not actually dead, we don't remove it.
+define i32 @live_block_test_branch_loop(i1 %c, i32 %end) {
+; CHECK-LABEL: @live_block_test_branch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
+; CHECK:       check:
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[LIVE]]
+; CHECK:       live:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[CHECK]] ], [ [[I_2]], [[LIVE]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 %c, label %check, label %live
+
+check:
+  br i1 true, label %backedge, label %live
+
+live:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %check], [%i.2, %live]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @live_block_test_switch_loop(i1 %c, i32 %end) {
+; CHECK-LABEL: @live_block_test_switch_loop(
+; CHECK-NEXT:  preheader:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[PREHEADER:%.*]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[CHECK:%.*]], label [[LIVE:%.*]]
+; CHECK:       check:
+; CHECK-NEXT:    switch i32 1, label [[LIVE]] [
+; CHECK-NEXT:    i32 0, label [[LIVE]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[LIVE]]
+; CHECK-NEXT:    ]
+; CHECK:       live:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[CHECK]] ], [ [[I_2]], [[LIVE]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA]]
+;
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 %c, label %check, label %live
+
+check:
+  switch i32 1, label %live [i32 0, label %live
+  i32 1, label %backedge
+  i32 2, label %live]
+
+live:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %check], [%i.2, %live]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can remove part of blocks of inner loop while the loop still
+; preserves, in presence of outer loop.
+define i32 @partial_sub_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @partial_sub_loop_test_branch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @partial_sub_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @partial_sub_loop_test_switch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[DEAD]]
+; CHECK-NEXT:    i32 1, label [[BACKEDGE]]
+; CHECK-NEXT:    i32 2, label [[DEAD]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[I_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  switch i32 1, label %dead [i32 0, label %dead
+  i32 1, label %backedge
+  i32 2, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %header], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  %cmp = icmp slt i32 %i.inc, %end
+  br i1 %cmp, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Check that we can completely delete inner loop and preserve the outer loop.
+define i32 @full_sub_loop_test_branch_loop(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 false, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 false, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 false, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[OUTER_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[HEADER]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %dead [i32 0, label %backedge]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %outer_backedge [i32 0, label %header]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+; Inverted condition in live_part.
+define i32 @full_sub_loop_test_branch_loop_inverse_1(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop_inverse_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 false, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 false, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop_inverse_1(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop_inverse_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[DEAD:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[OUTER_BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[HEADER]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %backedge [i32 0, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %outer_backedge [i32 0, label %header]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_branch_loop_inverse_2(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop_inverse_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 false, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 false, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 true, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop_inverse_2(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop_inverse_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[DEAD:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[HEADER]] [
+; CHECK-NEXT:    i32 0, label [[OUTER_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %dead [i32 0, label %backedge]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %header [i32 0, label %outer_backedge]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+
+define i32 @full_sub_loop_test_branch_loop_inverse_3(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_branch_loop_inverse_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    br i1 true, label [[BACKEDGE]], label [[DEAD:%.*]]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    br i1 true, label [[HEADER]], label [[OUTER_BACKEDGE]]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  br i1 true, label %backedge, label %dead
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  br i1 true, label %header, label %outer_backedge
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
+
+define i32 @full_sub_loop_test_switch_loop_inverse_3(i32 %end) {
+; CHECK-LABEL: @full_sub_loop_test_switch_loop_inverse_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer_header:
+; CHECK-NEXT:    [[J:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[J_INC:%.*]], [[OUTER_BACKEDGE:%.*]] ]
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[I_INC:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[I]], [[I]]
+; CHECK-NEXT:    switch i32 1, label [[BACKEDGE]] [
+; CHECK-NEXT:    i32 0, label [[DEAD:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       dead:
+; CHECK-NEXT:    [[I_2:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[BACKEDGE]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[I_1:%.*]] = phi i32 [ [[I]], [[HEADER]] ], [ [[I_2]], [[DEAD]] ]
+; CHECK-NEXT:    [[I_INC]] = add i32 [[I_1]], 1
+; CHECK-NEXT:    switch i32 1, label [[HEADER]] [
+; CHECK-NEXT:    i32 0, label [[OUTER_BACKEDGE]]
+; CHECK-NEXT:    ]
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    [[I_INC_LCSSA:%.*]] = phi i32 [ [[I_INC]], [[BACKEDGE]] ]
+; CHECK-NEXT:    [[J_INC]] = add i32 [[J]], 1
+; CHECK-NEXT:    [[CMP_J:%.*]] = icmp slt i32 [[J_INC]], [[END:%.*]]
+; CHECK-NEXT:    br i1 [[CMP_J]], label [[OUTER_HEADER]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[I_INC_LCSSA_LCSSA:%.*]] = phi i32 [ [[I_INC_LCSSA]], [[OUTER_BACKEDGE]] ]
+; CHECK-NEXT:    ret i32 [[I_INC_LCSSA_LCSSA]]
+;
+entry:
+  br label %outer_header
+
+outer_header:
+  %j = phi i32 [0, %entry], [%j.inc, %outer_backedge]
+  br label %preheader
+
+preheader:
+  br label %header
+
+header:
+  %i = phi i32 [0, %preheader], [%i.inc, %backedge]
+  br label  %live_part
+
+live_part:
+  %mul = mul i32 %i, %i
+  switch i32 1, label %backedge [i32 0, label %dead]
+
+dead:
+  %i.2 = add i32 %i, 1
+  br label %backedge
+
+backedge:
+  %i.1 = phi i32 [%i, %live_part], [%i.2, %dead]
+  %i.inc = add i32 %i.1, 1
+  switch i32 1, label %header [i32 0, label %outer_backedge]
+
+outer_backedge:
+  %j.inc = add i32 %j, 1
+  %cmp.j = icmp slt i32 %j.inc, %end
+  br i1 %cmp.j, label %outer_header, label %exit
+
+exit:
+  ret i32 %i.inc
+}
diff --git a/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll b/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
index c159a88e723c2aa4e346d318323f461b5e6b574b..bb5277bedc0b668165ae896d3eb93266e5ac5bfb 100644
--- a/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
+++ b/test/Transforms/LoopUnroll/ARM/loop-unrolling.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -mtriple=armv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
 ; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a57 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
+; RUN: opt -mtriple=thumbv7 -mcpu=cortex-a72 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-A
 ; RUN: opt -mtriple=thumbv8m -mcpu=cortex-m23 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T1
 ; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T2
 ; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m7 -loop-unroll -S %s -o - | FileCheck %s --check-prefix=CHECK-UNROLL-T2
diff --git a/test/Transforms/LoopUnroll/peel-loop.ll b/test/Transforms/LoopUnroll/peel-loop.ll
index d535414b3ebfcd786629e46e14203a86529bec00..eb3d29cb49436c9c676505f2ac889e14bb38ce06 100644
--- a/test/Transforms/LoopUnroll/peel-loop.ll
+++ b/test/Transforms/LoopUnroll/peel-loop.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=3 -verify-dom-info -simplifycfg -instcombine | FileCheck %s
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll,simplify-cfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll<peeling;no-runtime>,simplify-cfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s
 
 ; Basic loop peeling - check that we can peel-off the first 3 loop iterations
 ; when explicitly requested.
diff --git a/test/Transforms/LoopUnroll/runtime-loop.ll b/test/Transforms/LoopUnroll/runtime-loop.ll
index 720d0d76e4e650c5ee84b91e31eca00d581ef647..19072855d2592edcdef1b5114a7d44db04515c19 100644
--- a/test/Transforms/LoopUnroll/runtime-loop.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop.ll
@@ -1,33 +1,53 @@
-; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefix=EPILOG
-; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
-
-; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefix=EPILOG
-; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefix=PROLOG
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
+;
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=EPILOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON
+;
+; Restricted versions of unroll (unroll<peeling;noruntime>, unroll-full) should not be doing runtime unrolling
+; even if it is globally enabled through -unroll-runtime option
+;
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll<peeling;no-runtime>' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=NOEPILOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,unroll<peeling;no-runtime>' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=NOPROLOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-runtime=true -unroll-runtime-epilog=true  | FileCheck %s -check-prefixes=NOEPILOG,COMMON
+; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(unroll-full)' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=NOPROLOG,COMMON
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Tests for unrolling loops with run-time trip counts
 
+; COMMON-LABEL: @test(
+
 ; EPILOG: %xtraiter = and i32 %n
 ; EPILOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
 ; EPILOG:  br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end.loopexit
 
+; NOEPILOG-NOT: %xtraiter = and i32 %n
+
 ; PROLOG: %xtraiter = and i32 %n
 ; PROLOG:  %lcmp.mod = icmp ne i32 %xtraiter, 0
 ; PROLOG:  br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit
 
+; NOPROLOG-NOT: %xtraiter = and i32 %n
+
 ; EPILOG: for.body.epil:
 ; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ],  [ %indvars.iv.unr, %for.body.epil.preheader ]
 ; EPILOG:  %epil.iter.sub = sub i32 %epil.iter, 1
 ; EPILOG:  %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0
 ; EPILOG:  br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0
 
+; NOEPILOG: for.body:
+; NOEPILOG-NOT: for.body.epil:
+
 ; PROLOG: for.body.prol:
 ; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ]
 ; PROLOG:  %prol.iter.sub = sub i32 %prol.iter, 1
 ; PROLOG:  %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0
 ; PROLOG:  br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop !0
 
+; NOPROLOG: for.body:
+; NOPROLOG-NOT: for.body.prol:
+
 
 define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly {
 entry:
@@ -54,11 +74,10 @@ for.end:                                          ; preds = %for.body, %entry
 ; Still try to completely unroll loops with compile-time trip counts
 ; even if the -unroll-runtime is specified
 
-; EPILOG: for.body:
-; EPILOG-NOT: for.body.epil:
-
-; PROLOG: for.body:
-; PROLOG-NOT: for.body.prol:
+; COMMON-LABEL: @test1(
+; COMMON: for.body:
+; COMMON-NOT: for.body.epil:
+; COMMON-NOT: for.body.prol:
 
 define i32 @test1(i32* nocapture %a) nounwind uwtable readonly {
 entry:
@@ -82,8 +101,11 @@ for.end:                                          ; preds = %for.body
 ; This is test 2007-05-09-UnknownTripCount.ll which can be unrolled now
 ; if the -unroll-runtime option is turned on
 
+; COMMON-LABEL: @foo(
 ; EPILOG: bb72.2:
 ; PROLOG: bb72.2:
+; NOEPILOG-NOT: bb72.2:
+; NOPROLOG-NOT: bb72.2:
 
 define void @foo(i32 %trips) {
 entry:
@@ -105,12 +127,19 @@ cond_true138:
 
 ; Test run-time unrolling for a loop that counts down by -2.
 
+; COMMON-LABEL: @down(
 ; EPILOG: for.body.epil:
 ; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.cond.for.end_crit_edge.epilog-lcssa
 
+; NOEPILOG: for.body:
+; NOEPILOG-NOT: for.body.epil:
+
 ; PROLOG: for.body.prol:
 ; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit
 
+; NOPROLOG: for.body:
+; NOPROLOG-NOT: for.body.prol:
+
 define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
   %cmp2 = icmp eq i32 %len, 0
@@ -138,12 +167,20 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 }
 
 ; Test run-time unrolling disable metadata.
+; COMMON-LABEL: @test2(
+
 ; EPILOG: for.body:
 ; EPILOG-NOT: for.body.epil:
 
+; NOEPILOG: for.body:
+; NOEPILOG-NOT: for.body.epil:
+
 ; PROLOG: for.body:
 ; PROLOG-NOT: for.body.prol:
 
+; NOPROLOG: for.body:
+; NOPROLOG-NOT: for.body.prol:
+
 define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly {
 entry:
   %cmp2 = icmp eq i32 %len, 0
@@ -174,11 +211,9 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; -runtime-unroll-multi-exit=true
 ; single exit, multiple exiting blocks.
 define void @unique_exit(i32 %arg) {
-; PROLOG: unique_exit(
-; PROLOG-NOT: .unr
+; COMMON-LABEL: @unique_exit(
+; COMMON-NOT: .unr
 
-; EPILOG: unique_exit(
-; EPILOG-NOT: .unr
 entry:
   %tmp = icmp sgt i32 undef, %arg
   br i1 %tmp, label %preheader, label %returnblock
@@ -206,11 +241,9 @@ latch:                                            ; preds = %header
 
 ; multiple exit blocks. don't unroll
 define void @multi_exit(i64 %trip, i1 %cond) {
-; PROLOG: multi_exit(
-; PROLOG-NOT: .unr
+; COMMON-LABEL: @multi_exit(
+; COMMON-NOT: .unr
 
-; EPILOG: multi_exit(
-; EPILOG-NOT: .unr
 entry:
   br label %loop_header
 
@@ -238,11 +271,15 @@ exit1:
 exit2.loopexit:
   ret void
 }
+
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.unroll.runtime.disable"}
 
-; EPILOG: !0 = distinct !{!0, !1}
+; need to use LABEL here to separate function IR matching from metadata matching
+; COMMON-LABEL: {{^}}!0 =
+
+; EPILOG-SAME: distinct !{!0, !1}
 ; EPILOG: !1 = !{!"llvm.loop.unroll.disable"}
 
-; PROLOG: !0 = distinct !{!0, !1}
+; PROLOG-SAME: distinct !{!0, !1}
 ; PROLOG: !1 = !{!"llvm.loop.unroll.disable"}
diff --git a/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1925527eacf16c79e31a4412df7803f9b47a407f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
@@ -0,0 +1,27 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get operands scalarization costs added.
+
+define void @fun(i64* %data, i64 %n, i64 %s, double* %Src) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %mul = mul nsw i64 %iv, %s
+  %gep = getelementptr inbounds double, double* %Src, i64 %mul
+  %bct = bitcast double* %gep to i64*
+  %ld = load i64, i64* %bct
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cmp110.us = icmp slt i64 %iv.next, %n
+  br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %mul = mul nsw i64 %iv, %s
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction:   %ld = load i64, i64* %bct
+}
diff --git a/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fbf8b1145424c1bed3dc9c156e37653430fb0311
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \
+; RUN:   | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get a zero cost in a vectorized
+; loop. It can only be folded into the add operand in the scalar loop.
+
+define i32 @fun(i64* %data, i64 %n, i64 %s, i32* %Src) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %acc = phi i32 [ 0, %entry ], [ %acc_next, %for.body ]
+  %gep = getelementptr inbounds i32, i32* %Src, i64 %iv
+  %ld = load i32, i32* %gep
+  %acc_next = add i32 %acc, %ld
+  %iv.next = add nuw nsw i64 %iv, 2
+  %cmp110.us = icmp slt i64 %iv.next, %n
+  br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+  ret i32 %acc_next
+
+; CHECK: Found an estimated cost of 4 for VF 4 For instruction:   %ld = load i32, i32* %gep
+}
diff --git a/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4c992cedd884a483ef0db263314907ef85ad38a5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that some cost estimations for interleave groups make sense.
+
+; This loop is loading four i16 values at indices [0, 1, 2, 3], with a stride
+; of 4. At VF=4, memory interleaving means loading 4 * 4 * 16 bits = 2 vector
+; registers. Each of the 4 vector values must then be constructed from the
+; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
+;
+; CHECK: LV: Checking a loop in "fun0"
+; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction:   %ld0 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld1 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld2 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %ld3 = load i16
+define void @fun0(i16 *%ptr, i16 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i16* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i16, i16* %ivptr, i64 0
+  %ld0 = load i16, i16* %ptr0
+  %ptr1 = getelementptr inbounds i16, i16* %ivptr, i64 1
+  %ld1 = load i16, i16* %ptr1
+  %ptr2 = getelementptr inbounds i16, i16* %ivptr, i64 2
+  %ld2 = load i16, i16* %ptr2
+  %ptr3 = getelementptr inbounds i16, i16* %ivptr, i64 3
+  %ld3 = load i16, i16* %ptr3
+  %a1 = add i16 %ld0, %ld1
+  %a2 = add i16 %a1, %ld2
+  %a3 = add i16 %a2, %ld3
+  %dstptr = getelementptr inbounds i16, i16* %dst, i64 %iv
+  store i16 %a3, i16* %dstptr
+  %ptr.next = getelementptr inbounds i16, i16* %ivptr, i64 4
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop loads one i8 value in a stride of 3. At VF=16, this means loading
+; 3 vector registers, and then constructing the vector value with two vperms,
+; which gives a cost of 5.
+;
+; CHECK: LV: Checking a loop in "fun1"
+; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction:   %ld0 = load i8
+define void @fun1(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %ld0, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop is loading 4 i8 values at indexes [0, 1, 2, 3], with a stride of
+; 32. At VF=2, this means loading 2 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 6.
+;
+; CHECK: LV: Checking a loop in "fun2"
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+define void @fun2(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+  %ld1 = load i8, i8* %ptr1
+  %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+  %ld2 = load i8, i8* %ptr2
+  %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %ld3 = load i8, i8* %ptr3
+  %a1 = add i8 %ld0, %ld1
+  %a2 = add i8 %a1, %ld2
+  %a3 = add i8 %a2, %ld3
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %a3, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 32
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; This loop is loading 4 i8 values at indexes [0, 1, 2, 3], with a stride of
+; 30. At VF=2, this means loading 3 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 7. This is the same loop
+; as in fun2, except the stride makes the second iterations values overlap a
+; vector register boundary.
+;
+; CHECK: LV: Checking a loop in "fun3"
+; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction:   %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %ld3 = load i8
+define void @fun3(i8 *%ptr, i8 *%dst) {
+entry:
+  br label %for.body
+
+for.body:
+  %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+  %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %inc = add i64 %iv, 4
+  %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+  %ld0 = load i8, i8* %ptr0
+  %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+  %ld1 = load i8, i8* %ptr1
+  %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+  %ld2 = load i8, i8* %ptr2
+  %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+  %ld3 = load i8, i8* %ptr3
+  %a1 = add i8 %ld0, %ld1
+  %a2 = add i8 %a1, %ld2
+  %a3 = add i8 %a2, %ld3
+  %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %a3, i8* %dstptr
+  %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 30
+  %cmp = icmp eq i64 %inc, 100
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index cbba5300b9cb742cf211ac1078adad7087405f52..c78bcdd172147d6e10bdbd50bb4f5cd0a6166549 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -3,9 +3,23 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-;CHECK-LABEL: @foo(
-;CHECK-NOT: <4 x i32>
-;CHECK: ret void
+; CHECK-LABEL: @foo(
+; CHECK: <4 x i32>
+; CHECK: ret void
+
+; PR15794
+; incorrect addition of llvm.mem.parallel_loop_access metadata is undefined
+; behaviour. Vectorizer ignores the memory dependency checks and goes ahead and
+; vectorizes this loop with uniform stores which has an output dependency.
+
+; void foo(int *a, int *b, int k, int m) {
+;   for (int i = 0; i < m; i++) {
+;     for (int j = 0; j < m; j++) {
+;       a[i] = a[i + j + k] + 1; <<<
+;     }
+;     b[i] = b[i] + 3;
+;   }
+; }
 
 ; Function Attrs: nounwind uwtable 
 define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
@@ -48,6 +62,53 @@ for.end15:                                        ; preds = %for.end.us, %entry
   ret void
 }
 
+; Same test as above, but without the invalid parallel_loop_access metadata.
+
+; Here we can see the vectorizer does the mem dep checks and decides it is
+; unsafe to vectorize.
+; CHECK-LABEL: no-par-mem-metadata(
+; CHECK-NOT: <4 x i32>
+; CHECK:     ret void
+define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+entry:
+  %cmp27 = icmp sgt i32 %m, 0
+  br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us:                                       ; preds = %for.body3.us
+  %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+  %0 = load i32, i32* %arrayidx9.us, align 4
+  %add10.us = add nsw i32 %0, 3
+  store i32 %add10.us, i32* %arrayidx9.us, align 4
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
+
+for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+  %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+  %1 = trunc i64 %indvars.iv29 to i32
+  %add4.us = add i32 %add.us, %1
+  %idxprom.us = sext i32 %add4.us to i64
+  %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+  %2 = load i32, i32* %arrayidx.us, align 4
+  %add5.us = add nsw i32 %2, 1
+  store i32 %add5.us, i32* %arrayidx7.us, align 4
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
+
+for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
+  %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+  %3 = trunc i64 %indvars.iv33 to i32
+  %add.us = add i32 %3, %k
+  %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+  br label %for.body3.us
+
+for.end15:                                        ; preds = %for.end.us, %entry
+  ret void
+}
+
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !3 = !{!4, !5}
diff --git a/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
index e8c369da3be2cb646a46629c5180882c8789ff19..9428a6d6f7404e81baa64c8caf21f4fae0065bd7 100644
--- a/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -130,3 +130,108 @@ latch:
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) {
+; CHECK-LABEL: @variant_val_store_to_inv_address_conditional(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[B]]
+; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
+; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[A]]
+; CHECK-NEXT:    [[BOUND113:%.*]] = icmp ugt i8* [[UGLYGEP]], [[C5]]
+; CHECK-NEXT:    [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT:    [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT16]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT17]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT:    store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 8
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp eq i32 %tmp2, %k
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+  %tmp3 = getelementptr inbounds i32, i32* %c, i64 %i
+  %tmp4 = load i32, i32* %tmp3, align 8
+  store i32 %tmp4, i32* %a
+  br label %latch
+
+latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/optsize.ll b/test/Transforms/LoopVectorize/X86/optsize.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9fa65534f3203ee2dafd613cf14c1742dc1e471d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; This test verifies that the loop vectorizer will NOT vectorize loops that
+; will produce a tail loop with the optimize for size or the minimize size
+; attributes. This is a target-dependent version of the test.
+; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
+; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT:    call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT:    store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK:       for.end:
+; CHECK-NEXT:    ret i32 0
+;
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, 202
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #1 = { minsize }
+
+
+; We can't vectorize this one because we version for stride==1; even having TC
+; a multiple of VF.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+; AUTOVF-LABEL: @scev4stride1
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+for.body.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = mul nsw i32 %i.07, %k
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+  store i32 %0, i32* %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 256
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  ret void
+}
+
+attributes #2 = { optsize }
+
+
+; PR39497
+; We can't vectorize this one because we version for overflow check and tiny
+; trip count leads to opt-for-size (which otherwise could fold the tail by
+; masking).
+; CHECK-LABEL: @main
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.cond:
+; AUTOVF-LABEL: @main
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.cond:
+define i32 @main() local_unnamed_addr {
+while.cond:
+  br label %for.cond
+
+for.cond:
+  %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
+  %conv = and i32 %d.0, 65535
+  %cmp = icmp ult i32 %conv, 4
+  %add = add nuw nsw i32 %conv, 1
+  br i1 %cmp, label %for.cond, label %while.cond.loopexit
+
+while.cond.loopexit:
+  ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll
index 89d69e232f5b524042e885075682bc594a7ce5d6..2027963f071902b44b55587421b75083e02ee593 100644
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -20,12 +21,33 @@ target triple = "x86_64-apple-macosx10.8.0"
 @dj = common global [1024 x i32] zeroinitializer, align 16
 
 ; We can optimize this test without a tail.
-;CHECK-LABEL: @example1(
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret void
 define void @example1() optsize {
+; CHECK-LABEL: @example1(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP9:%.*]]
+; CHECK:         br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2
+; CHECK:         ret void
+;
   br label %1
 
 ; <label>:1                                       ; preds = %1, %0
@@ -46,11 +68,68 @@ define void @example1() optsize {
   ret void
 }
 
-; Can't vectorize in 'optsize' mode because we need a tail.
-;CHECK-LABEL: @example2(
-;CHECK-NOT: store <4 x i32>
-;CHECK: ret void
+; Can vectorize in 'optsize' mode by masking the needed tail.
 define void @example2(i32 %n, i32 %x) optsize {
+; CHECK-LABEL: @example2(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]]
+; CHECK:       .lr.ph5.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = and i32 [[TMP2]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP10]], align 16
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK:       pred.store.if3:
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE4]]
+; CHECK:       pred.store.continue4:
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP14]], align 8
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT:    br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]]
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP16]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK:       ._crit_edge:
+; CHECK-NEXT:    ret void
+;
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph5, label %.preheader
 
@@ -91,7 +170,8 @@ define void @example2(i32 %n, i32 %x) optsize {
   ret void
 }
 
-; N is unknown, we need a tail. Can't vectorize.
+; N is unknown, we need a tail. Can't vectorize because loop has no primary
+; induction.
 ;CHECK-LABEL: @example3(
 ;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
@@ -142,10 +222,31 @@ define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
 
 
 ; We CAN vectorize this example because the pointers are marked as noalias.
-;CHECK-LABEL: @example23b(
-;CHECK: <4 x i32>
-;CHECK: ret void
 define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23b(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP6:%.*]]
+; CHECK:         br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
+; CHECK:         ret void
+;
   br label %1
 
 ; <label>:1                                       ; preds = %1, %0
@@ -166,4 +267,144 @@ define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst
   ret void
 }
 
+; We CAN vectorize this example by folding the tail it entails.
+define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23c(
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK:       pred.load.if:
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; CHECK:       pred.load.continue:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK:       pred.load.if11:
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE12]]
+; CHECK:       pred.load.continue12:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK:       pred.load.if13:
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE14]]
+; CHECK:       pred.load.continue14:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK:       pred.load.if15:
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2
+; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE16]]
+; CHECK:       pred.load.continue16:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK:       pred.store.if17:
+; CHECK-NEXT:    [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
+; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
+; CHECK-NEXT:    store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
+; CHECK:       pred.store.continue18:
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK:       pred.store.if19:
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
+; CHECK-NEXT:    store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
+; CHECK:       pred.store.continue20:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.if21:
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
+; CHECK-NEXT:    store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; CHECK:       pred.store.continue22:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    br label [[TMP33:%.*]]
+; CHECK:         br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK:         ret void
+;
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+; We CAN'T vectorize this example because it would entail a tail and an
+; induction is used outside the loop.
+define i64 @example23d(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+;CHECK-LABEL: @example23d(
+; CHECK-NOT: <4 x
+; CHECK: ret i64
+  br label %1
 
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret i64 %7
+}
diff --git a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
index 387eec4d5ede4b3024814a16bc1a98289a24a887..e08ef002d0ec7f33b43c798a79a29bbae6617db1 100644
--- a/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -5,8 +5,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 
-; CHECK: cost of 10 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
-; CHECK: cost of 20 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 4 for VF 1 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 5 for VF 2 For instruction:   %conv = uitofp i64 %tmp to double
+; CHECK: cost of 6 for VF 4 For instruction:   %conv = uitofp i64 %tmp to double
 define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
 entry:
   br label %for.body
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
index 46fd022af6653cf162448528527f5bc3139ccef0..81f3113bf22fe47b8a6d1759f731e25e6b74890b 100644
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -1,16 +1,8 @@
-; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
-; CHECK: LV: Loop hints: force=enabled
-; CHECK: LV: Loop hints: force=?
-; CHECK: LV: Loop hints: force=?
-; No more loops in the module
-; CHECK-NOT: LV: Loop hints: force=
-; CHECK: 3 loop-vectorize               - Number of loops analyzed for vectorization
-; CHECK: 2 loop-vectorize               - Number of loops vectorized
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S -vectorizer-min-trip-count=21 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
+target triple = "x86_64-unknown-linux"
 
 ;
 ; The source code for the test:
@@ -25,6 +17,51 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata.
 ;
 define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access !3
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !3
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !3
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
@@ -48,9 +85,42 @@ for.end:
 !2 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;
-; This loop will not be vectorized as the trip count is below the threshold.
+; This loop will be vectorized as the trip count is below the threshold but no
+; scalar iterations are needed thanks to folding its tail.
+;
+define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
-define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
 entry:
   br label %for.body
 
@@ -77,6 +147,51 @@ for.end:
 ; scalar iterations are needed.
 ;
 define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !9
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access !7
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !7
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.mem.parallel_loop_access !7
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %for.body
 
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0b23d6286f9c7b9291f95e1d4bf52b5434f02a8e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -0,0 +1,826 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED 
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses  -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED 
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved-group but rather as a scalarized accesses.
+; (For SKX, Gather is not supported by the compiler for chars, therefore
+;  the only remaining alternative is to scalarize).
+; In this case a scalar epilogue is not needed.
+;
+; When  masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED: for.body:
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Exactly the same scenario except we are now optimizing for size, therefore
+; we check that no scalar epilogue is created. Since we can't create an epilog
+; we need the ability to mask out the gaps.
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads with the mask properly shuffled and
+; And-ed with the gaps mask.
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT:  entry:
+;ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+;ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+;ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED:       vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+;ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]]
+;ENABLED_MASKED_STRIDED-NOT:   for.body:
+;ENABLED_MASKED_STRIDED:       for.end:
+;ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.09, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.09, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.09, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+; Accesses with gaps under Optsize scenario again, with unknown trip-count
+; this time, in order to check the behavior of folding-the-tail (folding the
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality checks; So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard,
+;                      int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED-NOT:   for.body:
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.010, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Same, with stride 3. This is to check the gaps-mask and the shuffled mask
+; with a different stride.
+; So accesses are with gaps under Optsize scenario again, with unknown trip-
+; count, in order to check the behavior of folding-the-tail (folding the
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard,
+;                      int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char t = p[3*ix];
+;         q[ix] = t;
+;     }
+;   }
+; }
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+;
+define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.010, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = mul nsw i32 %ix.010, 3
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+  store i8 %0, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Back to stride 2 with gaps with a known trip count under opt for size,
+; but this time the load/store are not predicated. 
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks because we have gaps and we can't
+; create an epilog. The access is thus scalarized.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads (masking away the gaps).
+;
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+;                                unsigned char* restrict q,
+;                                unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED:     %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0       
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED:     for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT:  entry:
+;ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED:       vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT:    store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
+;ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
+;ENABLED_MASKED_STRIDED-NOT:   for.body:
+;ENABLED_MASKED_STRIDED:       for.end:
+;ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = shl nuw nsw i32 %ix.06, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.06, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+
+; Unconditioal accesses with gaps under Optsize scenario again, with unknown
+; trip-count this time, in order to check the behavior of folding-the-tail 
+; (folding the remainder loop into the main loop using masking) together with
+; interleaved-groups. Folding-the-tail turns the accesses to conditional which
+; requires proper masking. In addition we need to mask out the gaps (all
+; because we are not allowed to use an epilog due to optsize).
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks. So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The shuffled mask is also And-ed with the gaps mask.
+;
+;   for(ix=0; ix < n; ++ix) {
+;         char t = p[2*ix];
+;         q[ix] = t;
+;   }
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} 
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISBLED_MASKED_STRIDED-NOT:    for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT:  entry:
+; ENABLED_MASKED_STRIDED-NEXT:    [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED-NOT:   for.body:
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.07, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.07, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleave-group is disabled
+; the predicated loads (and stores) are not vectorized as an
+; interleaved-group but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When  masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load/store for the two interleave-
+; groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT:  %index = phi i32 
+;DISABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED:        %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT:  %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT:  br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT:   %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT:   call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT:   %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT:  %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT:  %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED:       %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED:       %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT:  %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT:  %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED:       call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr  {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; Full groups again, this time checking an Optsize scenario, with unknown trip-
+; count, to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality check, so nothing to check here.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+;
+; void masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard,
+;                     int n) {
+; for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = or i32 [[TMP1]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}} 
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP13]], 
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp22 = icmp sgt i32 %n, 0
+  br i1 %cmp22, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %ix.023, %guard
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.023, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx3, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx5, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx9, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.023, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Full groups under Optsize scenario again, with unknown trip-count, again in
+; order to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; This time the accesses are not conditional, they become conditional only
+; due to tail folding.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during cost-model checks, so we check for no epilogue and
+; scalarized conditional accesses.
+; When masked-interleave-group is enabled we check for no epilogue,
+; and interleave-groups vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; (Same vectorization scheme as for the previous loop with conditional accesses
+; except here the mask only masks away the remainder iterations.)
+;
+; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     int n) {
+; for(ix=0; ix < n; ++ix) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+; }
+;}
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; DISABLED_MASKED_STRIDED:       vector.body:
+; DISABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; DISABLED_MASKED_STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED:       pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT:   for.body:
+; DISABLED_MASKED_STRIDED:       for.end:
+; DISABLED_MASKED_STRIDED-NEXT:    ret void
+
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED:       vector.body:
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX:%.*]] = phi i32 
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:    [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP0]], 1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]]
+; ENABLED_MASKED_STRIDED:       for.end:
+; ENABLED_MASKED_STRIDED-NEXT:    ret void
+
+define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp20 = icmp sgt i32 %n, 0
+  br i1 %cmp20, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.021, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx2, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx4, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx8, align 1
+  %inc = add nuw nsw i32 %ix.021, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9ed66a22dbfd757978117cbb47af2a956eabec79
--- /dev/null
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-groups from 
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-acceses are enabled. When masked-interleaved-acceses
+; are disabled we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         char left = p[2*ix];
+;         char right = p[2*ix + 1];
+;         char max = max(left, right);
+;         q[2*ix] = max;
+;         q[2*ix+1] = 0 - max;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  store i8  %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with  store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:   %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted:  %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT:     into the interleave group with   %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because the first access is predicated, and the second isn't.
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported. 
+; If masked-interleaved-accesses is not enabled we create only one interleave
+; group of stores (for the non-predicated store) and it is later invalidated
+; due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     q[2*ix+1] = 2;
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the accesses,
+; (which are later invalidated because interleave-groups of stores with gaps are 
+; not supported).
+; If masked-interleaved-accesses is not enabled we don't create any interleave
+; group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+;                     unsigned char* restrict q,
+;                     unsigned char guard1,
+;                     unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard1) {
+;         q[2*ix] = 1;
+;     }
+;     if (ix > guard2) {
+;         q[2*ix+1] = 2;
+;     }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group 
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3" 
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with:  store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx6, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx11, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.024, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.012, 1
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  %cmp1 = icmp ugt i32 %ix.012, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %add = or i32 %mul, 1
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx3, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.012, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+  %conv = zext i8 %guard1 to i32
+  %conv3 = zext i8 %guard2 to i32
+  br label %for.body
+
+for.body:
+  %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %mul = shl nuw nsw i32 %ix.018, 1
+  %cmp1 = icmp ugt i32 %ix.018, %conv
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 1, i8* %arrayidx, align 1
+  br label %if.end
+
+if.end:
+  %cmp4 = icmp ugt i32 %ix.018, %conv3
+  br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+  %add = or i32 %mul, 1
+  %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 2, i8* %arrayidx7, align 1
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %ix.018, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+attributes #0 = {  "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"  }
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
index 89c0ac109167625f70b62c416c94346ea6a7cfe0..c647f586b18e5b3f89fbd5515826744e939426d9 100644
--- a/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 %pair = type { i64, i64 }
diff --git a/test/Transforms/LoopVectorize/intrinsic.ll b/test/Transforms/LoopVectorize/intrinsic.ll
index 178d602e7f3359e37ec4c55cb7644f812bf99220..203c4435c88c8fb21f555319a3026f68561e7598 100644
--- a/test/Transforms/LoopVectorize/intrinsic.ll
+++ b/test/Transforms/LoopVectorize/intrinsic.ll
@@ -1247,3 +1247,59 @@ for.body:                                         ; preds = %entry, %for.body
 for.end:                                          ; preds = %for.body, %entry
   ret void
 }
+
+declare float @llvm.minimum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minimum_f32(
+;CHECK: llvm.minimum.v4f32
+;CHECK: ret void
+define void @minimum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.minimum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+declare float @llvm.maximum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maximum_f32(
+;CHECK: llvm.maximum.v4f32
+;CHECK: ret void
+define void @maximum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4
+  %call = tail call float @llvm.maximum.f32(float %0, float %1) nounwind readnone
+  %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+  store float %call, float* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
index 7bdfd405455cec8b92091794d0088c7dbe317999..e341576e5316e21ba74347b0957c19e85340e6b7 100644
--- a/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -162,10 +162,74 @@ for.end:                                          ; preds = %for.body
 
 ; Instcombine'd version of above test. Now the store is no longer of invariant
 ; value.
-; TODO: We should be able to vectorize this loop once we support vectorizing
-; stores of variant values to invariant addresses.
+; scalar store the value extracted from the last element of the vector value.
 ; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
-; CHECK-NOT:   <4 x
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    store i32 [[TMP6]], i32* [[A]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -199,10 +263,74 @@ for.end:                                          ; preds = %for.body
 ; invariant val stored to invariant address predicated on invariant condition
 ; This is not treated as a predicated store since the block the store belongs to
 ; is the latch block (which doesn't need to be predicated).
-; TODO: We should vectorize this loop once we relax the check for
 ; variant/invariant values being stored to invariant address.
+; test checks that the last element of the phi is extracted and scalar stored
+; into the uniform address within the loop.
+; Since the condition and the phi is loop invariant, they are LICM'ed after
+; vectorization.
 ; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
-; CHECK-NOT: <4 x
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[K]], i32 3
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP5]], i32* [[A]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK:       cond_store:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       cond_store_k:
+; CHECK-NEXT:    br label [[LATCH]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT:    store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    br label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
 define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -233,10 +361,67 @@ for.end:                                          ; preds = %for.body
   ret void
 }
 
-; TODO: This loop can be vectorized once we support variant value being
-; stored into invariant address.
+; variant value stored to uniform address tests that the code gen extracts the
+; last element from the variant vector and scalar stores it into the uniform
+; address.
 ; CHECK-LABEL: variant_val_store_to_inv_address
-; CHECK-NOT: <4 x i32>
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT:    [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT:    [[SMAX3:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_END]]
 define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
 entry:
   %ntrunc = trunc i64 %n to i32
@@ -255,6 +440,112 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %cond, label %for.body, label %for.end
 
 for.end:                                          ; preds = %for.body
-  %rdx.lcssa = phi i32 [ %tmp0, %for.body ]
+  %rdx.lcssa = phi i32 [ %tmp3, %for.body ]
   ret i32 %rdx.lcssa
 }
+
+; Multiple variant stores to the same uniform address
+; We do not vectorize such loops currently.
+;  for(; i < itr; i++) {
+;    for(; j < itr; j++) {
+;      var1[i] = var2[j] + var1[i];
+;      var1[i]++;
+;    }
+;  }
+
+; CHECK-LABEL: multiple_uniform_stores
+; CHECK-NOT:     <4 x i32>
+define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+  %cmp20 = icmp eq i32 %itr, 0
+  br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+  %cmp218 = icmp ult i32 %j.022, %itr
+  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %0 = zext i32 %j.022 to i64
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  store i32 %4, i32* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %itr
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
+  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %for.body3 ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10:                                        ; preds = %for.inc8, %entry
+  ret i32 undef
+}
+
+; second uniform store to the same address is conditional.
+; we do not vectorize this.
+; CHECK-LABEL: multiple_uniform_stores_conditional
+; CHECK-NOT:    <4 x i32>
+define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+  %cmp20 = icmp eq i32 %itr, 0
+  br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %entry, %for.inc8
+  %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+  %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+  %cmp218 = icmp ult i32 %j.022, %itr
+  br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+  %0 = zext i32 %j.022 to i64
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ]
+  %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %2 = load i32, i32* %arrayidx5, align 4
+  %add = add nsw i32 %2, %1
+  store i32 %add, i32* %arrayidx5, align 4
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nsw i32 %3, 1
+  %5 = icmp ugt i32 %3, 42
+  br i1 %5, label %cond_store, label %latch
+
+cond_store:
+  store i32 %4, i32* %arrayidx5, align 4
+  br label %latch
+
+latch:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %itr
+  br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %for.body3, %for.cond1.preheader
+  %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %latch ]
+  %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+  %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+  %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+  br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10:                                        ; preds = %for.inc8, %entry
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6032fb18a387c32b4eba2b650cf3c8b4de5a5957
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR39417
+; Check that the need for overflow check prevents vectorizing a loop with tiny
+; trip count (which implies opt for size).
+; CHECK-LABEL: @func_34
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: bb67:
+define void @func_34() {
+bb1:
+  br label %bb67
+
+bb67:
+  %storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ]
+  %sext = shl i32 %storemerge2, 16
+  %_tmp2299 = ashr exact i32 %sext, 16
+  %_tmp2300 = add nsw i32 %_tmp2299, 1
+  %_tmp2310 = trunc i32 %_tmp2300 to i16
+  %_tmp2312 = icmp slt i16 %_tmp2310, 3
+  br i1 %_tmp2312, label %bb67, label %bb68
+
+bb68:
+  ret void
+}
+
+; Check that the need for stride==1 check prevents vectorizing a loop under opt
+; for size.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = mul nsw i32 %i.07, %k
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+  store i32 %0, i32* %arrayidx1, align 4
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  ret void
+}
+
+attributes #0 = { optsize }
diff --git a/test/Transforms/MergeFunc/external-before-local.ll b/test/Transforms/MergeFunc/external-before-local.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7dcdb0153dfb89b995574b2d80965e961e95a95f
--- /dev/null
+++ b/test/Transforms/MergeFunc/external-before-local.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; We should normalize to test2 rather than test1,
+; because it allows us to drop test1 entirely
+
+; CHECK-NOT: define internal void @test1() unnamed_addr
+; CHECK: define void @test3() unnamed_addr
+; CHECK-NEXT: call void @test2()
+; CHECK-NEXT: call void @test2()
+  
+declare void @dummy()
+
+define internal void @test1() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
+
+define void @test2() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
+
+define void @test3() unnamed_addr {
+    call void @test1()
+    call void @test2()
+    ret void
+}
+
+; We should normalize to the existing test6 rather than
+; to a new anonymous strong backing function
+
+; CHECK: define weak void @test5()
+; CHECK-NEXT: tail call void @test6()
+; CHECK: define weak void @test4()
+; CHECK-NEXT: tail call void @test6()
+
+declare void @dummy2()
+  
+define weak void @test4() {
+    call void @dummy2()
+    call void @dummy2()
+    ret void
+}
+define weak void @test5() {
+    call void @dummy2()
+    call void @dummy2()
+    ret void
+}
+define void @test6() {
+    call void @dummy2()
+    call void @dummy2()
+    ret void
+}
diff --git a/test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll b/test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5902edc0e88b4afb912d1a60bed07eb522f20da0
--- /dev/null
+++ b/test/Transforms/MergeFunc/unnamed-addr-reprocessing.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -mergefunc < %s | FileCheck %s
+
+; After test3 and test4 have been merged, we should detect that
+; test1 and test2 can also be merged.
+
+; CHECK: define void @test4() unnamed_addr
+; CHECK-NEXT: tail call void @test3()
+; CHECK: define void @test2() unnamed_addr
+; CHECK-NEXT: tail call void @test1()
+
+declare void @dummy()
+  
+define void @test1() unnamed_addr {
+    call void @test3()
+    call void @test3()
+    ret void
+}
+
+define void @test2() unnamed_addr {
+    call void @test4()
+    call void @test4()
+    ret void
+}
+
+define void @test3() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
+
+define void @test4() unnamed_addr {
+    call void @dummy()
+    call void @dummy()
+    ret void
+}
diff --git a/test/Transforms/MergeICmps/X86/gep-used-outside.ll b/test/Transforms/MergeICmps/X86/gep-used-outside.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9c944d525002632c2cc3278d9fb8df7f44abfaaf
--- /dev/null
+++ b/test/Transforms/MergeICmps/X86/gep-used-outside.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s
+
+%"struct.std::pair" = type { i32, i32 }
+
+; Check that the transformation is avoided when GEP has a use outside of the
+; parant block of the load instruction.
+
+define zeroext i32 @opeq1(
+; CHECK-LABEL: @opeq1(
+; CHECK-NOT:    [[MEMCMP:%.*]] = call i32 @memcmp
+
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1 
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i,  %land.rhs.i]
+  %5 = load i32, i32* %first.i, align 4
+  %6 = select i1 %4, i32 %5, i32 0
+  ret i32 %6
+}
diff --git a/test/Transforms/MergeICmps/X86/int64-and-ptr.ll b/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..78924aea9bec1ad359c16239e6ec6be062c3a84a
--- /dev/null
+++ b/test/Transforms/MergeICmps/X86/int64-and-ptr.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mergeicmps -S | FileCheck %s --check-prefix=X86
+
+; 8-byte int and 8-byte pointer should merge into a 16-byte memcpy.
+; X86: memcmp(i8* {{.*}}, i8* {{.*}}, i64 16)
+
+%struct.outer = type { i64, %struct.inner* }
+%struct.inner = type { i32, i32, i32 }
+
+; Function Attrs: nounwind uwtable
+define dso_local i1 @"?foo@@YAHAEAUouter@@0@Z"(%struct.outer* align 8 dereferenceable(16) %o1, %struct.outer* align 8 dereferenceable(116) %o2) local_unnamed_addr #0 {
+entry:
+  %p1 = getelementptr inbounds %struct.outer, %struct.outer* %o1, i64 0, i32 0
+  %0 = load i64, i64* %p1, align 8
+  %p11 = getelementptr inbounds %struct.outer, %struct.outer* %o2, i64 0, i32 0
+  %1 = load i64, i64* %p11, align 8
+  %cmp = icmp eq i64 %0, %1
+  br i1 %cmp, label %if.then, label %if.end5
+
+if.then:                                          ; preds = %entry
+  %p2 = getelementptr inbounds %struct.outer, %struct.outer* %o1, i64 0, i32 1
+  %2 = load %struct.inner*, %struct.inner** %p2, align 8
+  %p22 = getelementptr inbounds %struct.outer, %struct.outer* %o2, i64 0, i32 1
+  %3 = load %struct.inner*, %struct.inner** %p22, align 8
+  %cmp3 = icmp eq %struct.inner* %2, %3
+  br label %if.end5
+
+if.end5:                                          ; preds = %if.then, %entry
+  %rez.0 = phi i1 [ %cmp3, %if.then ], [ false, %entry ]
+  ret i1 %rez.0
+}
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
diff --git a/test/Transforms/MergeICmps/X86/tuple-four-int8.ll b/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
index c7f2d45257da0bbc1d301cbf1739622570d65ec5..097a1c232fc039b81629e48d0e956270f2ea89a8 100644
--- a/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
+++ b/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s
 
diff --git a/test/Transforms/NewGVN/range.ll b/test/Transforms/NewGVN/range.ll
index 55efa5955b19900a2d8319836a6a66f40bd29620..29b911cb5f6ecabce232634f4f99b7839d955bb8 100644
--- a/test/Transforms/NewGVN/range.ll
+++ b/test/Transforms/NewGVN/range.ll
@@ -2,7 +2,7 @@
 
 define i32 @test1(i32* %p) {
 ; CHECK-LABEL: @test1(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range !0
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !0
@@ -12,8 +12,7 @@ define i32 @test1(i32* %p) {
 
 define i32 @test2(i32* %p) {
 ; CHECK-LABEL: @test2(i32* %p)
-; CHECK: %a = load i32, i32* %p
-; CHECK-NOT: range
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p
@@ -23,7 +22,7 @@ define i32 @test2(i32* %p) {
 
 define i32 @test3(i32* %p) {
 ; CHECK-LABEL: @test3(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[DISJOINT_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !1
@@ -33,7 +32,7 @@ define i32 @test3(i32* %p) {
 
 define i32 @test4(i32* %p) {
 ; CHECK-LABEL: @test4(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE0]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !0
   %b = load i32, i32* %p, !range !2
@@ -43,7 +42,7 @@ define i32 @test4(i32* %p) {
 
 define i32 @test5(i32* %p) {
 ; CHECK-LABEL: @test5(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_SIGNED_RANGE:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE3:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !3
   %b = load i32, i32* %p, !range !4
@@ -53,7 +52,7 @@ define i32 @test5(i32* %p) {
 
 define i32 @test6(i32* %p) {
 ; CHECK-LABEL: @test6(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST6:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE5:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !5
   %b = load i32, i32* %p, !range !6
@@ -63,7 +62,7 @@ define i32 @test6(i32* %p) {
 
 define i32 @test7(i32* %p) {
 ; CHECK-LABEL: @test7(i32* %p)
-; CHECK: %a = load i32, i32* %p, !range ![[MERGED_TEST7:[0-9]+]]
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE7:[0-9]+]]
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !7
   %b = load i32, i32* %p, !range !8
@@ -73,7 +72,7 @@ define i32 @test7(i32* %p) {
 
 define i32 @test8(i32* %p) {
 ; CHECK-LABEL: @test8(i32* %p)
-; CHECK: %a = load i32, i32* %p
+; CHECK: %a = load i32, i32* %p, !range ![[RANGE9:[0-9]+]]
 ; CHECK-NOT: range
 ; CHECK: %c = add i32 %a, %a
   %a = load i32, i32* %p, !range !9
@@ -82,11 +81,11 @@ define i32 @test8(i32* %p) {
   ret i32 %c
 }
 
-; CHECK: ![[DISJOINT_RANGE]] = !{i32 0, i32 2, i32 3, i32 5}
-; CHECK: ![[MERGED_RANGE]] = !{i32 0, i32 5}
-; CHECK: ![[MERGED_SIGNED_RANGE]] = !{i32 -5, i32 -2, i32 1, i32 5}
-; CHECK: ![[MERGED_TEST6]] = !{i32 10, i32 1}
-; CHECK: ![[MERGED_TEST7]] = !{i32 3, i32 4, i32 5, i32 2}
+; CHECK: ![[RANGE0]] = !{i32 0, i32 2}
+; CHECK: ![[RANGE3]] = !{i32 -5, i32 -2}
+; CHECK: ![[RANGE5]] = !{i32 10, i32 1}
+; CHECK: ![[RANGE7]] = !{i32 1, i32 2, i32 3, i32 4}
+; CHECK: ![[RANGE9]] = !{i32 1, i32 5}
 
 !0 = !{i32 0, i32 2}
 !1 = !{i32 3, i32 5}
diff --git a/test/Transforms/NewGVN/simp-to-self.ll b/test/Transforms/NewGVN/simp-to-self.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ca46af76849b767148bd7609466fe42fe8e87f04
--- /dev/null
+++ b/test/Transforms/NewGVN/simp-to-self.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S < %s -newgvn | FileCheck %s
+
+; CHECK-LABEL: for.cond:
+; CHECK-NEXT:    %lv = load i32, i32* bitcast (i64* @a to i32*)
+; CHECK-NEXT:    %bf.clear = and i32 %lv, -131072
+; CHECK-NEXT:    %bf.set = or i32 1, %bf.clear
+; CHECK-NEXT:    br i1 %bc, label %for.cond, label %exit
+@a = external global i64
+
+define void @fn1(i1 %bc) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1.1, %entry
+  %tmp = phi i1 [ undef, %entry ], [ 1, %for.cond ]
+  %conv = zext i1 %tmp to i32
+  %lv = load i32, i32* bitcast (i64* @a to i32*)
+  %bf.clear = and i32 %lv, -131072
+  %bf.set = or i32 %conv, %bf.clear
+  %bf.clear.1 = and i32 %bf.set, -131072
+  %bf.set.1 = or i32 1, %bf.clear.1
+  br i1 %bc, label %for.cond, label %exit
+
+exit:                              ; preds = %for.cond1
+  store i32 %bf.set.1, i32* bitcast (i64* @a to i32*)
+  ret void
+}
diff --git a/test/Transforms/PGOProfile/Inputs/func_entry.proftext b/test/Transforms/PGOProfile/Inputs/func_entry.proftext
new file mode 100644
index 0000000000000000000000000000000000000000..2dc2c2ec9f36e4036508b954a99d9cecb80421c6
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/func_entry.proftext
@@ -0,0 +1,17 @@
+# IR level Instrumentation Flag
+:ir
+foo
+# Func Hash:
+12884901887
+# Num Counters:
+1
+# Counter Values:
+9999
+
+bar
+# Func Hash:
+12884901887
+# Num Counters:
+1
+# Counter Values:
+0
diff --git a/test/Transforms/PGOProfile/Inputs/remap.map b/test/Transforms/PGOProfile/Inputs/remap.map
new file mode 100644
index 0000000000000000000000000000000000000000..df3d82d38bd673ed99a74c883d7e6e51ffa52bc2
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/remap.map
@@ -0,0 +1,8 @@
+# foo:: and foo::detail:: are equivalent
+name 3foo N3foo6detailE
+
+# foo::qux and foo::quux are equivalent
+type N3foo3quxE N3foo4quuxE
+
+# N::X and M::X are equivalent
+name N1N1XE N1M1XE
diff --git a/test/Transforms/PGOProfile/Inputs/remap.proftext b/test/Transforms/PGOProfile/Inputs/remap.proftext
new file mode 100644
index 0000000000000000000000000000000000000000..40054d78f5aab58b2d12f61d73a259229bfd1d91
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/remap.proftext
@@ -0,0 +1,8 @@
+# :ir is the flag to indicate this is IR level profile.
+:ir
+_ZN3foo3barERKN1N1XINS_4quuxEEE
+25571299074
+2
+3
+2
+
diff --git a/test/Transforms/PGOProfile/func_entry.ll b/test/Transforms/PGOProfile/func_entry.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dac996e35cb3da17c479c6e6c9db84c86cd541a0
--- /dev/null
+++ b/test/Transforms/PGOProfile/func_entry.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-profdata merge %S/Inputs/func_entry.proftext -o %t.profdata
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@s = common dso_local local_unnamed_addr global i32 0, align 4
+
+define void @bar() {
+; CHECK-LABEL: @bar
+; CHECK-SAME: !prof ![[FUNC_ENTRY_COUNT_ZERO:[0-9]+]]
+
+entry:
+  store i32 1, i32* @s, align 4
+  ret void
+}
+
+define void @foo() {
+; CHECK-LABEL: @foo
+; CHECK-SAME: !prof ![[FUNC_ENTRY_COUNT_NON_ZERO:[0-9]+]]
+entry:
+  %0 = load i32, i32* @s, align 4
+  %add = add nsw i32 %0, 4
+  store i32 %add, i32* @s, align 4
+  ret void
+}
+
+; CHECK-DAG: ![[FUNC_ENTRY_COUNT_ZERO]] = !{!"function_entry_count", i64 0}
+; CHECK-DAG: ![[FUNC_ENTRY_COUNT_NON_ZERO]] = !{!"function_entry_count", i64 9999}
diff --git a/test/Transforms/PGOProfile/remap.ll b/test/Transforms/PGOProfile/remap.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2fdca9e33d18759d820848cc65235a98fc647eac
--- /dev/null
+++ b/test/Transforms/PGOProfile/remap.ll
@@ -0,0 +1,28 @@
+; RUN: llvm-profdata merge %S/Inputs/remap.proftext -o %t.profdata
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -pgo-test-profile-remapping-file=%S/Inputs/remap.map -S | FileCheck %s --check-prefix=USE
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @_ZN3foo3barERKN1M1XINS_6detail3quxEEE(i32 %i) {
+; USE-LABEL: @_ZN3foo3barERKN1M1XINS_6detail3quxEEE
+; USE-SAME: !prof ![[FUNC_ENTRY_COUNT:[0-9]+]]
+entry:
+  %cmp = icmp sgt i32 %i, 0
+  br i1 %cmp, label %if.then, label %if.end
+; USE: br i1 %cmp, label %if.then, label %if.end
+; USE-SAME: !prof ![[BW_ENTRY:[0-9]+]]
+
+if.then:
+  %add = add nsw i32 %i, 2
+  br label %if.end
+
+if.end:
+  %retv = phi i32 [ %add, %if.then ], [ %i, %entry ]
+  ret i32 %retv
+}
+
+; USE-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
+; USE-DAG: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
+; USE-DAG: ![[FUNC_ENTRY_COUNT]] = !{!"function_entry_count", i64 3}
+; USE-DAG: ![[BW_ENTRY]] = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/Transforms/Reassociate/fp-expr.ll b/test/Transforms/Reassociate/fp-expr.ll
index e616c52f28e66e97d070294879ae29161c8d52d7..dcbf835ba54402018fa7da71b33dcea0835f2ee7 100644
--- a/test/Transforms/Reassociate/fp-expr.ll
+++ b/test/Transforms/Reassociate/fp-expr.ll
@@ -4,8 +4,8 @@
 define void @test1() {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[T1:%.*]] = tail call <4 x float> @blam()
-; CHECK-NEXT:    [[T1_NEG:%.*]] = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[T1]]
-; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T1_NEG]], undef
+; CHECK-NEXT:    [[T23:%.*]] = fsub fast <4 x float> undef, [[T1]]
+; CHECK-NEXT:    [[T24:%.*]] = fadd fast <4 x float> [[T23]], undef
 ; CHECK-NEXT:    tail call void @wombat(<4 x float> [[T24]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/test/Transforms/Reassociate/inverses.ll b/test/Transforms/Reassociate/inverses.ll
index 8500cd867fd3e178f7bc556f98fc2e3ac2e153b8..14753b1724b810cc04539935435ea3fb154ff196 100644
--- a/test/Transforms/Reassociate/inverses.ll
+++ b/test/Transforms/Reassociate/inverses.ll
@@ -1,46 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -reassociate -die -S | FileCheck %s
 
+; (A&B)&~A == 0
 define i32 @test1(i32 %a, i32 %b) {
-	%tmp.2 = and i32 %b, %a
-	%tmp.4 = xor i32 %a, -1
-        ; (A&B)&~A == 0
-	%tmp.5 = and i32 %tmp.2, %tmp.4
-	ret i32 %tmp.5
 ; CHECK-LABEL: @test1(
-; CHECK: ret i32 0
+; CHECK-NEXT:    ret i32 0
+;
+  %t2 = and i32 %b, %a
+  %t4 = xor i32 %a, -1
+  %t5 = and i32 %t2, %t4
+  ret i32 %t5
 }
 
+define <2 x i32> @not_op_vec_undef(<2 x i32> %a, <2 x i32> %b) {
+; CHECK-LABEL: @not_op_vec_undef(
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+  %t2 = and <2 x i32> %b, %a
+  %t4 = xor <2 x i32> %a, <i32 -1, i32 undef>
+  %t5 = and <2 x i32> %t2, %t4
+  ret <2 x i32> %t5
+}
+
+; A&~A == 0
 define i32 @test2(i32 %a, i32 %b) {
-	%tmp.1 = and i32 %a, 1234
-	%tmp.2 = and i32 %b, %tmp.1
-	%tmp.4 = xor i32 %a, -1
-	; A&~A == 0
-        %tmp.5 = and i32 %tmp.2, %tmp.4
-	ret i32 %tmp.5
 ; CHECK-LABEL: @test2(
-; CHECK: ret i32 0
+; CHECK-NEXT:    ret i32 0
+;
+  %t1 = and i32 %a, 1234
+  %t2 = and i32 %b, %t1
+  %t4 = xor i32 %a, -1
+  %t5 = and i32 %t2, %t4
+  ret i32 %t5
 }
 
+; (b+(a+1234))+-a -> b+1234
 define i32 @test3(i32 %b, i32 %a) {
-	%tmp.1 = add i32 %a, 1234
-	%tmp.2 = add i32 %b, %tmp.1
-	%tmp.4 = sub i32 0, %a
-        ; (b+(a+1234))+-a -> b+1234
-  	%tmp.5 = add i32 %tmp.2, %tmp.4
-	ret i32 %tmp.5
 ; CHECK-LABEL: @test3(
-; CHECK: %tmp.5 = add i32 %b, 1234
-; CHECK: ret i32 %tmp.5
+; CHECK-NEXT:    [[T5:%.*]] = add i32 [[B:%.*]], 1234
+; CHECK-NEXT:    ret i32 [[T5]]
+;
+  %t1 = add i32 %a, 1234
+  %t2 = add i32 %b, %t1
+  %t4 = sub i32 0, %a
+  %t5 = add i32 %t2, %t4
+  ret i32 %t5
 }
 
+; (b+(a+1234))+~a -> b+1233
 define i32 @test4(i32 %b, i32 %a) {
-        %tmp.1 = add i32 %a, 1234
-        %tmp.2 = add i32 %b, %tmp.1
-        %tmp.4 = xor i32 %a, -1
-        ; (b+(a+1234))+~a -> b+1233
-        %tmp.5 = add i32 %tmp.2, %tmp.4
-        ret i32 %tmp.5
 ; CHECK-LABEL: @test4(
-; CHECK: %tmp.5 = add i32 %b, 1233
-; CHECK: ret i32 %tmp.5
+; CHECK-NEXT:    [[T5:%.*]] = add i32 [[B:%.*]], 1233
+; CHECK-NEXT:    ret i32 [[T5]]
+;
+  %t1 = add i32 %a, 1234
+  %t2 = add i32 %b, %t1
+  %t4 = xor i32 %a, -1
+  %t5 = add i32 %t2, %t4
+  ret i32 %t5
 }
+
diff --git a/test/Transforms/Reassociate/negation.ll b/test/Transforms/Reassociate/negation.ll
index 12d2c86192bb9fabb1bbd0e6cb056c1c7c6f2a5b..f443083ff3f112148c1713698d0d4d7338016f34 100644
--- a/test/Transforms/Reassociate/negation.ll
+++ b/test/Transforms/Reassociate/negation.ll
@@ -1,14 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
 
 ; Test that we can turn things like X*-(Y*Z) -> X*-1*Y*Z.
 
 define i32 @test1(i32 %a, i32 %b, i32 %z) {
-; CHECK-LABEL: test1
-; CHECK-NEXT: %e = mul i32 %a, 12345
-; CHECK-NEXT: %f = mul i32 %e, %b
-; CHECK-NEXT: %g = mul i32 %f, %z
-; CHECK-NEXT: ret i32 %g
-
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], 12345
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[E]], [[B:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = mul i32 [[F]], [[Z:%.*]]
+; CHECK-NEXT:    ret i32 [[G]]
+;
   %c = sub i32 0, %z
   %d = mul i32 %a, %b
   %e = mul i32 %c, %d
@@ -18,14 +19,28 @@ define i32 @test1(i32 %a, i32 %b, i32 %z) {
 }
 
 define i32 @test2(i32 %a, i32 %b, i32 %z) {
-; CHECK-LABEL: test2
-; CHECK-NEXT: %e = mul i32 %a, 40
-; CHECK-NEXT: %f = mul i32 %e, %z
-; CHECK-NEXT: ret i32 %f
-
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], 40
+; CHECK-NEXT:    [[F:%.*]] = mul i32 [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret i32 [[F]]
+;
   %d = mul i32 %z, 40
   %c = sub i32 0, %d
   %e = mul i32 %a, %c
   %f = sub i32 0, %e
   ret i32 %f
 }
+
+define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) {
+; CHECK-LABEL: @negate_vec_undefs(
+; CHECK-NEXT:    [[E:%.*]] = mul <2 x i32> [[A:%.*]], <i32 40, i32 40>
+; CHECK-NEXT:    [[F:%.*]] = mul <2 x i32> [[E]], [[Z:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[F]]
+;
+  %d = mul <2 x i32> %z, <i32 40, i32 40>
+  %c = sub <2 x i32> <i32 0, i32 undef>, %d
+  %e = mul <2 x i32> %a, %c
+  %f = sub <2 x i32> <i32 0, i32 undef>, %e
+  ret <2 x i32> %f
+}
+
diff --git a/test/Transforms/Reassociate/negation1.ll b/test/Transforms/Reassociate/negation1.ll
index 34b943cf496deaac6328d2bd20c7a1c428b73121..674e57df956e1d22ea1c17455d5138b8605b6a42 100644
--- a/test/Transforms/Reassociate/negation1.ll
+++ b/test/Transforms/Reassociate/negation1.ll
@@ -1,11 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -reassociate -instcombine -S | FileCheck %s
 
 ; Test that we can turn things like A*B + X - A*B -> X.
 
 define i32 @test1(i32 %a, i32 %b, i32 %x) {
-; CHECK-LABEL: test1
-; CHECK: ret i32 %x
-
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 [[X:%.*]]
+;
   %c = mul i32 %a, %b
   %d = add i32 %c, %x
   %c1 = mul i32 %a, %b
diff --git a/test/Transforms/SCCP/ipsccp-basic.ll b/test/Transforms/SCCP/ipsccp-basic.ll
index ae08b4823c94dcd9292d425653c5f621b00cebe4..b1660b54565238e823dc1d10d709793e3f1cb9c1 100644
--- a/test/Transforms/SCCP/ipsccp-basic.ll
+++ b/test/Transforms/SCCP/ipsccp-basic.ll
@@ -258,3 +258,16 @@ define i64 @test11b() {
 }
 
 declare i64 @llvm.ctpop.i64(i64)
+
+;;======================== test12
+;; Ensure that a struct as an arg to a potentially constant-foldable
+;; function does not crash SCCP (for now it'll just ignores it)
+
+define i1 @test12() {
+  %c = call i1 @llvm.is.constant.sl_i32i32s({i32, i32} {i32 -1, i32 32})
+  ret i1 %c
+; CHECK-LABEL: define i1 @test12
+; CHECK: ret i1 %c
+}
+
+declare i1 @llvm.is.constant.sl_i32i32s({i32, i32} %a)
diff --git a/test/Transforms/SCCP/latticeval-invalidate.ll b/test/Transforms/SCCP/latticeval-invalidate.ll
new file mode 100644
index 0000000000000000000000000000000000000000..19ea425312f689b73eeb2c16546eb2eaf5419e32
--- /dev/null
+++ b/test/Transforms/SCCP/latticeval-invalidate.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -sccp %s
+
+@A = external constant i32
+
+define void @test1() {
+BB4:
+  %A20 = alloca i1
+  %A15 = alloca i64
+  %A7 = alloca i64
+  %A3 = alloca i32**
+  %P = getelementptr i32, i32* @A, i32 0
+  %B = ptrtoint i32* %P to i64
+  %B8 = shl i64 %B, 9223372036854775807
+  %G10 = getelementptr i32*, i32** undef, i64 %B
+  %B10 = urem i64 %B, %B8
+  %B12 = shl i64 %B, %B
+  %BB = and i64 %B, %B8
+  %B1 = xor i64 %B, %B
+  %B23 = lshr i64 %B8, undef
+  %C5 = icmp uge i64 %B, %B10
+  %C17 = fcmp ord double 4.940660e-324, 0x7FEFFFFFFFFFFFFF
+  %C2 = icmp uge i1 %C17, false
+  %G = getelementptr i32, i32* %P, i1 %C17
+  %X = select i1 false, i712 0, i712 1
+  %C4 = icmp ule i1 true, false
+  %B3 = xor i1 %C17, %C2
+  %C33 = icmp slt i1 false, %C5
+  %B15 = sub i64 %B8, %B23
+  %C18 = icmp slt i64 undef, %BB
+  %G29 = getelementptr i32**, i32*** undef, i64 %B15
+  %C35 = icmp eq i1 %C17, undef
+  %C31 = icmp ult i1 %C35, %C5
+  %C29 = icmp sle i1 true, %C5
+  %C16 = icmp ne i16 -1, -32768
+  %A24 = alloca i1
+  %A21 = alloca i1
+  %A25 = alloca i32**
+  %C7 = icmp ule i1 %C4, %B3
+  %C14 = icmp slt i64 %B8, 0
+  ret void
+}
diff --git a/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 780665d94edefd7d889d0ef9022808a552beb969..9e9f40825ef152491d6f1686774be39d8032cb33 100644
--- a/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -5,10 +6,10 @@ target triple = "aarch64--linux-gnu"
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
@@ -36,12 +37,12 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* %a, i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* %b, i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* %c, i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* %a, align 8
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
 ; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* %b, align 8
+; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
 ; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
@@ -49,7 +50,7 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* %c, align 8
+; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
 ; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -76,16 +77,16 @@ define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
@@ -122,13 +123,13 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
@@ -155,10 +156,10 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
@@ -197,13 +198,13 @@ define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> %v0, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
@@ -239,16 +240,16 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> %v0, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
diff --git a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
index 8ee37df4e90c47b6fca9dfab935933f4685bde6a..80c9044e80a26ff6697198a14d4c30b49820a70e 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -3,19 +3,19 @@
 ; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck --check-prefix=SSE2 %s
 ; REQUIRES: asserts
 
-; int test(unsigned int *p) {
-;   int sum = 0;
+; int test_add(unsigned int *p) {
+;   int result = 0;
 ;   for (int i = 0; i < 8; i++)
-;     sum += p[i];
-;   return sum;
+;     result += p[i];
+;   return result;
 ; }
 
 ; Vector cost is 5, Scalar cost is 7
 ; CHECK: Adding cost -2 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
 ; Vector cost is 11, Scalar cost is 7
 ; SSE2:  Adding cost 4 for reduction that starts with   %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
-define i32 @test(i32* nocapture readonly %p) {
-; CHECK-LABEL: @test(
+define i32 @test_add(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_add(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
@@ -42,7 +42,7 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
-; SSE2-LABEL: @test(
+; SSE2-LABEL: @test_add(
 ; SSE2-NEXT:  entry:
 ; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
 ; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
@@ -94,3 +94,350 @@ entry:
   %mul.714 = add i32 %7, %mul.613
   ret i32 %mul.714
 }
+
+; int test_mul(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result *= p[i];
+;   return result;
+; }
+
+define i32 @test_mul(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_mul(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_mul(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; SSE2-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; SSE2-NEXT:    [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; SSE2-NEXT:    [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; SSE2-NEXT:    [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; SSE2-NEXT:    [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; SSE2-NEXT:    [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; SSE2-NEXT:    [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[MUL_714]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = mul i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = mul i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = mul i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = mul i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = mul i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = mul i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = mul i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_and(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result &= p[i];
+;   return result;
+; }
+
+define i32 @test_and(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_and(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = and i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = and i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = and i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = and i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = and i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = and i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = and i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_and(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = and i32 undef, undef
+; SSE2-NEXT:    [[MUL_29:%.*]] = and i32 undef, [[MUL_18]]
+; SSE2-NEXT:    [[MUL_310:%.*]] = and i32 undef, [[MUL_29]]
+; SSE2-NEXT:    [[MUL_411:%.*]] = and i32 undef, [[MUL_310]]
+; SSE2-NEXT:    [[MUL_512:%.*]] = and i32 undef, [[MUL_411]]
+; SSE2-NEXT:    [[MUL_613:%.*]] = and i32 undef, [[MUL_512]]
+; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SSE2-NEXT:    [[MUL_714:%.*]] = and i32 undef, [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = and i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = and i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = and i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = and i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = and i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = and i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = and i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_or(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result |= p[i];
+;   return result;
+; }
+
+define i32 @test_or(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_or(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = or i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = or i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = or i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = or i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = or i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = or i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = or i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_or(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = or i32 undef, undef
+; SSE2-NEXT:    [[MUL_29:%.*]] = or i32 undef, [[MUL_18]]
+; SSE2-NEXT:    [[MUL_310:%.*]] = or i32 undef, [[MUL_29]]
+; SSE2-NEXT:    [[MUL_411:%.*]] = or i32 undef, [[MUL_310]]
+; SSE2-NEXT:    [[MUL_512:%.*]] = or i32 undef, [[MUL_411]]
+; SSE2-NEXT:    [[MUL_613:%.*]] = or i32 undef, [[MUL_512]]
+; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SSE2-NEXT:    [[MUL_714:%.*]] = or i32 undef, [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = or i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = or i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = or i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = or i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = or i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = or i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = or i32 %7, %mul.613
+  ret i32 %mul.714
+}
+
+; int test_xor(unsigned int *p) {
+;   int result = 0;
+;   for (int i = 0; i < 8; i++)
+;     result ^= p[i];
+;   return result;
+; }
+
+define i32 @test_xor(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT:    [[MUL_18:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT:    [[MUL_29:%.*]] = xor i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT:    [[MUL_310:%.*]] = xor i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT:    [[MUL_411:%.*]] = xor i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT:    [[MUL_512:%.*]] = xor i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT:    [[MUL_613:%.*]] = xor i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT:    [[MUL_714:%.*]] = xor i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT:    ret i32 [[MUL_714]]
+;
+; SSE2-LABEL: @test_xor(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; SSE2-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; SSE2-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; SSE2-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; SSE2-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; SSE2-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; SSE2-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; SSE2-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; SSE2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; SSE2-NEXT:    [[MUL_18:%.*]] = xor i32 undef, undef
+; SSE2-NEXT:    [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]]
+; SSE2-NEXT:    [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]]
+; SSE2-NEXT:    [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]]
+; SSE2-NEXT:    [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]]
+; SSE2-NEXT:    [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]]
+; SSE2-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; SSE2-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; SSE2-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SSE2-NEXT:    [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; SSE2-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; SSE2-NEXT:    [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]]
+; SSE2-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %0 = load i32, i32* %p, align 4
+  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = load i32, i32* %arrayidx.1, align 4
+  %mul.18 = xor i32 %1, %0
+  %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = load i32, i32* %arrayidx.2, align 4
+  %mul.29 = xor i32 %2, %mul.18
+  %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+  %3 = load i32, i32* %arrayidx.3, align 4
+  %mul.310 = xor i32 %3, %mul.29
+  %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+  %4 = load i32, i32* %arrayidx.4, align 4
+  %mul.411 = xor i32 %4, %mul.310
+  %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+  %5 = load i32, i32* %arrayidx.5, align 4
+  %mul.512 = xor i32 %5, %mul.411
+  %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+  %6 = load i32, i32* %arrayidx.6, align 4
+  %mul.613 = xor i32 %6, %mul.512
+  %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+  %7 = load i32, i32* %arrayidx.7, align 4
+  %mul.714 = xor i32 %7, %mul.613
+  ret i32 %mul.714
+}
diff --git a/test/Transforms/SLPVectorizer/X86/uitofp.ll b/test/Transforms/SLPVectorizer/X86/uitofp.ll
index ff63fe35bddfe6d896dcf15a0b701a137ce5089f..652184094234bb4638ea959e22af436c1005e6bb 100644
--- a/test/Transforms/SLPVectorizer/X86/uitofp.ll
+++ b/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -20,29 +20,11 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 ;
 
 define void @uitofp_2i64_2f64() #0 {
-; SSE-LABEL: @uitofp_2i64_2f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    ret void
-;
-; AVX256-LABEL: @uitofp_2i64_2f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @uitofp_2i64_2f64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
-; AVX512-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
-; AVX512-NEXT:    ret void
+; CHECK-LABEL: @uitofp_2i64_2f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -55,40 +37,19 @@ define void @uitofp_2i64_2f64() #0 {
 
 define void @uitofp_4i64_4f64() #0 {
 ; SSE-LABEL: @uitofp_4i64_4f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
-; AVX256-LABEL: @uitofp_4i64_4f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX256-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; AVX256-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @uitofp_4i64_4f64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
-; AVX512-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
-; AVX512-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
-; AVX512-NEXT:    ret void
+; AVX-LABEL: @uitofp_4i64_4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX-NEXT:    [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
   %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
@@ -107,57 +68,27 @@ define void @uitofp_4i64_4f64() #0 {
 
 define void @uitofp_8i64_8f64() #0 {
 ; SSE-LABEL: @uitofp_8i64_8f64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; SSE-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; SSE-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; SSE-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; SSE-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; SSE-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; SSE-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; SSE-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; SSE-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; SSE-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to double
-; SSE-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to double
-; SSE-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to double
-; SSE-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to double
-; SSE-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; SSE-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; SSE-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; SSE-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; SSE-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; SSE-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; SSE-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
+; SSE-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
+; SSE-NEXT:    [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT:    [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT:    [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
+; SSE-NEXT:    [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
+; SSE-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
 ; SSE-NEXT:    ret void
 ;
 ; AVX256-LABEL: @uitofp_8i64_8f64(
-; AVX256-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
-; AVX256-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
-; AVX256-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
-; AVX256-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
-; AVX256-NEXT:    [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
-; AVX256-NEXT:    [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
-; AVX256-NEXT:    [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
-; AVX256-NEXT:    [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
-; AVX256-NEXT:    [[CVT0:%.*]] = uitofp i64 [[LD0]] to double
-; AVX256-NEXT:    [[CVT1:%.*]] = uitofp i64 [[LD1]] to double
-; AVX256-NEXT:    [[CVT2:%.*]] = uitofp i64 [[LD2]] to double
-; AVX256-NEXT:    [[CVT3:%.*]] = uitofp i64 [[LD3]] to double
-; AVX256-NEXT:    [[CVT4:%.*]] = uitofp i64 [[LD4]] to double
-; AVX256-NEXT:    [[CVT5:%.*]] = uitofp i64 [[LD5]] to double
-; AVX256-NEXT:    [[CVT6:%.*]] = uitofp i64 [[LD6]] to double
-; AVX256-NEXT:    [[CVT7:%.*]] = uitofp i64 [[LD7]] to double
-; AVX256-NEXT:    store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
-; AVX256-NEXT:    store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
-; AVX256-NEXT:    store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
-; AVX256-NEXT:    store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
-; AVX256-NEXT:    store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
-; AVX256-NEXT:    store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
-; AVX256-NEXT:    store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
-; AVX256-NEXT:    store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256-NEXT:    [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256-NEXT:    [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
 ; AVX256-NEXT:    ret void
 ;
 ; AVX512-LABEL: @uitofp_8i64_8f64(
diff --git a/test/Transforms/SROA/pointer-offset-size.ll b/test/Transforms/SROA/pointer-offset-size.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c632c37988b0b29ee494d2c1bd08852c04f02a64
--- /dev/null
+++ b/test/Transforms/SROA/pointer-offset-size.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64:32"
+
+%struct.test = type { %struct.basic, %struct.basic }
+%struct.basic = type { i16, i8 }
+
+define i16 @test(%struct.test* %ts2.i) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[S_SROA_0:%.*]] = alloca [3 x i8], align 2
+; CHECK-NEXT:    [[S_SROA_0_0__SROA_CAST:%.*]] = bitcast %struct.test* [[TS2_I:%.*]] to i8*
+; CHECK-NEXT:    [[S_SROA_0_0__SROA_IDX:%.*]] = getelementptr inbounds [3 x i8], [3 x i8]* [[S_SROA_0]], i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[S_SROA_0_0__SROA_CAST]], i8* align 2 [[S_SROA_0_0__SROA_IDX]], i32 3, i1 false)
+; CHECK-NEXT:    [[X1_I_I:%.*]] = getelementptr inbounds [[STRUCT_TEST:%.*]], %struct.test* [[TS2_I]], i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[X1_I_I]]
+; CHECK-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  %s = alloca %struct.test
+  %0 = bitcast %struct.test* %ts2.i to i8*
+  %1 = bitcast %struct.test* %s to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 3, i1 false)
+  %x1.i.i = getelementptr inbounds %struct.test, %struct.test* %ts2.i, i32 0, i32 0, i32 0
+  %2 = load i16, i16* %x1.i.i
+  ret i16 %2
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i1)
diff --git a/test/Transforms/SampleProfile/Inputs/remap.map b/test/Transforms/SampleProfile/Inputs/remap.map
new file mode 100644
index 0000000000000000000000000000000000000000..df3d82d38bd673ed99a74c883d7e6e51ffa52bc2
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/remap.map
@@ -0,0 +1,8 @@
+# foo:: and foo::detail:: are equivalent
+name 3foo N3foo6detailE
+
+# foo::qux and foo::quux are equivalent
+type N3foo3quxE N3foo4quuxE
+
+# N::X and M::X are equivalent
+name N1N1XE N1M1XE
diff --git a/test/Transforms/SampleProfile/Inputs/remap.prof b/test/Transforms/SampleProfile/Inputs/remap.prof
new file mode 100644
index 0000000000000000000000000000000000000000..8244a51a165ad98ca5d48eccd6b9228a0b2c2588
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/remap.prof
@@ -0,0 +1,10 @@
+_ZN3foo3barERKN1N1XINS_4quuxEEE:15680:2500
+ 1: 2500
+ 4: 1000
+ 5: 1000
+ 6: 800
+ 7: 500
+ 9: 10226
+ 10: 2243
+ 16: 0
+ 18: 0
diff --git a/test/Transforms/SampleProfile/remap.ll b/test/Transforms/SampleProfile/remap.ll
new file mode 100644
index 0000000000000000000000000000000000000000..206962a3befa927112fbde49c27c90c4e7bae8c9
--- /dev/null
+++ b/test/Transforms/SampleProfile/remap.ll
@@ -0,0 +1,60 @@
+; RUN: opt %s -passes=sample-profile -sample-profile-file=%S/Inputs/remap.prof -sample-profile-remapping-file=%S/Inputs/remap.map | opt -analyze -branch-prob | FileCheck %s
+
+; Reduced from branch.ll
+
+declare i1 @foo()
+
+define void @_ZN3foo3barERKN1M1XINS_6detail3quxEEE() !dbg !2 {
+; CHECK: Printing analysis 'Branch Probability Analysis' for function '_ZN3foo3barERKN1M1XINS_6detail3quxEEE':
+
+entry:
+  %cmp = call i1 @foo(), !dbg !6
+  br i1 %cmp, label %if.then, label %if.end
+; CHECK:  edge entry -> if.then probability is 0x4ccf6b16 / 0x80000000 = 60.01%
+; CHECK:  edge entry -> if.end probability is 0x333094ea / 0x80000000 = 39.99%
+
+if.then:
+  br label %return
+
+if.end:
+  %cmp1 = call i1 @foo(), !dbg !7
+  br i1 %cmp1, label %if.then.2, label %if.else
+; CHECK: edge if.end -> if.then.2 probability is 0x6652c748 / 0x80000000 = 79.94%
+; CHECK: edge if.end -> if.else probability is 0x19ad38b8 / 0x80000000 = 20.06%
+
+if.then.2:
+  call i1 @foo(), !dbg !8
+  br label %for.cond
+
+for.cond:
+  %cmp5 = call i1 @foo()
+  br i1 %cmp5, label %for.body, label %for.end, !prof !9
+; CHECK: edge for.cond -> for.body probability is 0x73333333 / 0x80000000 = 90.00%
+; CHECK: edge for.cond -> for.end probability is 0x0ccccccd / 0x80000000 = 10.00%
+
+for.body:
+  br label %for.cond
+
+for.end:
+  br label %return
+
+if.else:
+  br label %return
+
+return:
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!4, !5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "foo++", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !{}, retainedTypes: !{})
+!1 = !DIFile(filename: "test.cc", directory: "/foo/bar")
+!2 = distinct !DISubprogram(name: "_ZN3foo3barERKN1M1XINS_6detail3quxEEE", scope: !1, file: !1, line: 4, type: !3, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !{})
+!3 = !DISubroutineType(types: !{})
+!4 = !{i32 2, !"Dwarf Version", i32 4}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !DILocation(line: 5, column: 8, scope: !2)
+!7 = !DILocation(line: 8, column: 6, scope: !2)
+!8 = !DILocation(line: 10, column: 11, scope: !2)
+!9 = !{!"branch_weights", i32 90, i32 10}
diff --git a/test/Transforms/Scalarizer/intrinsics.ll b/test/Transforms/Scalarizer/intrinsics.ll
index 6c85ac3d0925e14b31fb6f5c1b0fa246b0d4579d..7cebdffab7c40d673cb191f7962f19a2de4058d8 100644
--- a/test/Transforms/Scalarizer/intrinsics.ll
+++ b/test/Transforms/Scalarizer/intrinsics.ll
@@ -5,6 +5,8 @@ declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
 
 ; Binary fp
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
 
 ; Ternary fp
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
@@ -40,6 +42,28 @@ define <2 x float> @scalarize_minnum_v2f32(<2 x float> %x, <2 x float> %y) #0 {
   ret <2 x float> %minnum
 }
 
+; CHECK-LABEL: @scalarize_minimum_v2f32(
+; CHECK: %minimum.i0 = call float @llvm.minimum.f32(float %x.i0, float %y.i0)
+; CHECK: %minimum.i1 = call float @llvm.minimum.f32(float %x.i1, float %y.i1)
+; CHECK: %minimum.upto0 = insertelement <2 x float> undef, float %minimum.i0, i32 0
+; CHECK: %minimum = insertelement <2 x float> %minimum.upto0, float %minimum.i1, i32 1
+; CHECK: ret <2 x float> %minimum
+define <2 x float> @scalarize_minimum_v2f32(<2 x float> %x, <2 x float> %y) #0 {
+  %minimum = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> %y)
+  ret <2 x float> %minimum
+}
+
+; CHECK-LABEL: @scalarize_maximum_v2f32(
+; CHECK: %maximum.i0 = call float @llvm.maximum.f32(float %x.i0, float %y.i0)
+; CHECK: %maximum.i1 = call float @llvm.maximum.f32(float %x.i1, float %y.i1)
+; CHECK: %maximum.upto0 = insertelement <2 x float> undef, float %maximum.i0, i32 0
+; CHECK: %maximum = insertelement <2 x float> %maximum.upto0, float %maximum.i1, i32 1
+; CHECK: ret <2 x float> %maximum
+define <2 x float> @scalarize_maximum_v2f32(<2 x float> %x, <2 x float> %y) #0 {
+  %maximum = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> %y)
+  ret <2 x float> %maximum
+}
+
 ; CHECK-LABEL: @scalarize_fma_v2f32(
 ; CHECK: %fma.i0 = call float @llvm.fma.f32(float %x.i0, float %y.i0, float %z.i0)
 ; CHECK: %fma.i1 = call float @llvm.fma.f32(float %x.i1, float %y.i1, float %z.i1)
diff --git a/test/Transforms/SimpleLoopUnswitch/guards.ll b/test/Transforms/SimpleLoopUnswitch/guards.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95661c425e184f203659b8c171cf3046c7cca27f
--- /dev/null
+++ b/test/Transforms/SimpleLoopUnswitch/guards.ll
@@ -0,0 +1,238 @@
+; RUN: opt -passes='loop(unswitch),verify<loops>' -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s
+; RUN: opt -simple-loop-unswitch -enable-nontrivial-unswitch -simple-loop-unswitch-guards -S < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test_simple_case(i1 %cond, i32 %N) {
+; CHECK-LABEL: @test_simple_case(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US]]
+; CHECK:       guarded.us:
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @test_two_guards(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]]
+; CHECK:       entry.split.us.split.us:
+; CHECK-NEXT:    br label [[LOOP_US_US:%.*]]
+; CHECK:       loop.us.us:
+; CHECK-NEXT:    [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US_US:%.*]]
+; CHECK:       guarded.us.us:
+; CHECK-NEXT:    br label [[GUARDED_US2]]
+; CHECK:       guarded.us2:
+; CHECK-NEXT:    [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]]
+; CHECK:       deopt1:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_conditional_guards(i1 %cond, i32 %N) {
+; CHECK-LABEL: @test_conditional_guards(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ]
+; CHECK-NEXT:    [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123
+; CHECK-NEXT:    br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]]
+; CHECK:       guard.us:
+; CHECK-NEXT:    br label [[GUARDED_US:%.*]]
+; CHECK:       backedge.us:
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123
+; CHECK-NEXT:    br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]]
+; CHECK:       guard:
+; CHECK-NEXT:    br label [[DEOPT:%.*]]
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       backedge:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]]
+;
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %condition = icmp eq i32 %iv, 123
+  br i1 %condition, label %guard, label %backedge
+
+guard:
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  br label %backedge
+
+backedge:
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_nested_loop(i1 %cond, i32 %N) {
+; CHECK-LABEL: @test_nested_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[ENTRY_SPLIT:%.*]], label [[OUTER_LOOP_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
+; CHECK:       outer_loop:
+; CHECK-NEXT:    br label [[OUTER_LOOP_SPLIT_US:%.*]]
+; CHECK:       outer_loop.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ 0, [[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US]]
+; CHECK:       guarded.us:
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[OUTER_BACKEDGE_SPLIT_US:%.*]]
+; CHECK:       outer_backedge.split.us:
+; CHECK-NEXT:    br label [[OUTER_BACKEDGE:%.*]]
+; CHECK:       deopt:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:       outer_backedge:
+; CHECK-NEXT:    br i1 false, label [[OUTER_LOOP]], label [[EXIT:%.*]]
+;
+
+entry:
+  br label %outer_loop
+
+outer_loop:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %outer_loop ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  %loop.cond = icmp slt i32 %iv.next, %N
+  br i1 %loop.cond, label %loop, label %outer_backedge
+
+outer_backedge:
+  br i1 undef, label %outer_loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) {
+; CHECK-LABEL: @test_sibling_loops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:         [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US]]
+; CHECK:         call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+; CHECK:         [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ]
+; CHECK-NEXT:    br label [[GUARDED_US2]]
+; CHECK:         call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+; CHECK-NEXT:    unreachable
+;
+
+entry:
+  br label %loop1
+
+loop1:
+  %iv1 = phi i32 [ 0, %entry ], [ %iv1.next, %loop1 ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+  %iv1.next = add i32 %iv1, 1
+  %loop1.cond = icmp slt i32 %iv1.next, %N
+  br i1 %loop1.cond, label %loop1, label %between
+
+between:
+  br label %loop2
+
+loop2:
+  %iv2 = phi i32 [ 0, %between ], [ %iv2.next, %loop2 ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond2) [ "deopt"() ]
+  %iv2.next = add i32 %iv2, 1
+  %loop2.cond = icmp slt i32 %iv2.next, %N
+  br i1 %loop2.cond, label %loop2, label %exit
+
+exit:
+  ret void
+}
+
+; Check that we don't do anything because of cleanuppad.
+; CHECK-LABEL: @test_cleanuppad(
+; CHECK:       call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+; CHECK-NOT:   call void (i1, ...) @llvm.experimental.guard(
+define void @test_cleanuppad(i1 %cond, i32 %N) personality i32 (...)* @__CxxFrameHandler3 {
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  %iv.next = add i32 %iv, 1
+  invoke void @may_throw(i32 %iv) to label %loop unwind label %exit
+
+exit:
+  %cp = cleanuppad within none []
+  cleanupret from %cp unwind to caller
+
+}
+
+declare void @may_throw(i32 %i)
+declare i32 @__CxxFrameHandler3(...)
diff --git a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
index fc8cd5be25cf27acfdc676d41dd0bc0c81005ded..367d6fe28e978eae4aa21aed0a577d04b6f88c0b 100644
--- a/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll
@@ -2796,10 +2796,10 @@ loop_begin:
 ; CHECK:       loop_begin.us:
 ; CHECK-NEXT:    %[[V1_US:.*]] = load i1, i1* %ptr1
 ; CHECK-NEXT:    %[[V2_US:.*]] = load i1, i1* %ptr2
-; CHECK-NEXT:    %[[AND1_US:.*]] = and i1 %[[V1_US]], false
+; CHECK-NEXT:    %[[AND1_US:.*]] = and i1 %[[V1_US]], %cond1
 ; CHECK-NEXT:    %[[OR1_US:.*]] = or i1 %[[V2_US]], %cond2
 ; CHECK-NEXT:    %[[AND2_US:.*]] = and i1 %[[AND1_US]], %[[OR1_US]]
-; CHECK-NEXT:    %[[AND3_US:.*]] = and i1 %[[AND2_US]], false
+; CHECK-NEXT:    %[[AND3_US:.*]] = and i1 %[[AND2_US]], %cond3
 ; CHECK-NEXT:    br label %loop_b.us
 ;
 ; CHECK:       loop_b.us:
@@ -2857,12 +2857,99 @@ loop_exit:
 ; CHECK-NEXT:    ret
 }
 
-; Non-trivial unswitching of a switch.
-define i32 @test27(i1* %ptr, i32 %cond) {
+; Non-trivial partial loop unswitching of multiple invariant inputs to an `or`
+; chain. Basically an inverted version of corresponding `and` test (test26).
+define i32 @test27(i1* %ptr1, i1* %ptr2, i1* %ptr3, i1 %cond1, i1 %cond2, i1 %cond3) {
 ; CHECK-LABEL: @test27(
 entry:
   br label %loop_begin
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %[[INV_OR:.*]] = or i1 %cond3, %cond1
+; CHECK-NEXT:    br i1 %[[INV_OR]], label %entry.split.us, label %entry.split
+
+loop_begin:
+  %v1 = load i1, i1* %ptr1
+  %v2 = load i1, i1* %ptr2
+  %cond_or1 = or i1 %v1, %cond1
+  %cond_and1 = and i1 %v2, %cond2
+  %cond_or2 = or i1 %cond_or1, %cond_and1
+  %cond_or3 = or i1 %cond_or2, %cond3
+  br i1 %cond_or3, label %loop_b, label %loop_a
+; The 'loop_b' unswitched loop.
+;
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label %loop_begin.us
+;
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    %[[V1_US:.*]] = load i1, i1* %ptr1
+; CHECK-NEXT:    %[[V2_US:.*]] = load i1, i1* %ptr2
+; CHECK-NEXT:    %[[OR1_US:.*]] = or i1 %[[V1_US]], %cond1
+; CHECK-NEXT:    %[[AND1_US:.*]] = and i1 %[[V2_US]], %cond2
+; CHECK-NEXT:    %[[OR2_US:.*]] = or i1 %[[OR1_US]], %[[AND1_US]]
+; CHECK-NEXT:    %[[OR3_US:.*]] = or i1 %[[OR2_US]], %cond3
+; CHECK-NEXT:    br label %loop_b.us
+;
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    call i32 @b()
+; CHECK-NEXT:    br label %latch.us
+;
+; CHECK:       latch.us:
+; CHECK-NEXT:    %[[V3_US:.*]] = load i1, i1* %ptr3
+; CHECK-NEXT:    br i1 %[[V3_US]], label %loop_begin.us, label %loop_exit.split.us
+;
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    br label %loop_exit
+
+; The original loop.
+;
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label %loop_begin
+;
+; CHECK:       loop_begin:
+; CHECK-NEXT:    %[[V1:.*]] = load i1, i1* %ptr1
+; CHECK-NEXT:    %[[V2:.*]] = load i1, i1* %ptr2
+; CHECK-NEXT:    %[[OR1:.*]] = or i1 %[[V1]], false
+; CHECK-NEXT:    %[[AND1:.*]] = and i1 %[[V2]], %cond2
+; CHECK-NEXT:    %[[OR2:.*]] = or i1 %[[OR1]], %[[AND1]]
+; CHECK-NEXT:    %[[OR3:.*]] = or i1 %[[OR2]], false
+; CHECK-NEXT:    br i1 %[[OR3]], label %loop_b, label %loop_a
+
+loop_a:
+  call i32 @a()
+  br label %latch
+; CHECK:       loop_a:
+; CHECK-NEXT:    call i32 @a()
+; CHECK-NEXT:    br label %latch
+
+loop_b:
+  call i32 @b()
+  br label %latch
+; CHECK:       loop_b:
+; CHECK-NEXT:    call i32 @b()
+; CHECK-NEXT:    br label %latch
+
+latch:
+  %v3 = load i1, i1* %ptr3
+  br i1 %v3, label %loop_begin, label %loop_exit
+; CHECK:       latch:
+; CHECK-NEXT:    %[[V3:.*]] = load i1, i1* %ptr3
+; CHECK-NEXT:    br i1 %[[V3]], label %loop_begin, label %loop_exit.split
+
+loop_exit:
+  ret i32 0
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    br label %loop_exit
+;
+; CHECK:       loop_exit:
+; CHECK-NEXT:    ret
+}
+
+; Non-trivial unswitching of a switch.
+define i32 @test28(i1* %ptr, i32 %cond) {
+; CHECK-LABEL: @test28(
+entry:
+  br label %loop_begin
+; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    switch i32 %cond, label %[[ENTRY_SPLIT_LATCH:.*]] [
 ; CHECK-NEXT:      i32 0, label %[[ENTRY_SPLIT_A:.*]]
 ; CHECK-NEXT:      i32 1, label %[[ENTRY_SPLIT_B:.*]]
@@ -2970,8 +3057,8 @@ loop_exit:
 ; can introduce multiple edges to successors. These need lots of special case
 ; handling as they get collapsed in many cases (domtree, the unswitch itself)
 ; but not in all cases (the PHI node operands).
-define i32 @test28(i32 %arg) {
-; CHECK-LABEL: @test28(
+define i32 @test29(i32 %arg) {
+; CHECK-LABEL: @test29(
 entry:
   br label %header
 ; CHECK-NEXT:  entry:
@@ -3149,12 +3236,12 @@ exit:
 ; CHECK-NEXT:    ret i32 %[[EXIT_PHI2]]
 }
 
-; Similar to @test28 but designed to have one of the duplicate edges be
+; Similar to @test29 but designed to have one of the duplicate edges be
 ; a loop exit edge as those can in some cases be special. Among other things,
 ; this includes an LCSSA phi with multiple entries despite being a dedicated
 ; exit block.
-define i32 @test29(i32 %arg) {
-; CHECK-LABEL: define i32 @test29(
+define i32 @test30(i32 %arg) {
+; CHECK-LABEL: define i32 @test30(
 entry:
   br label %header
 ; CHECK-NEXT:  entry:
@@ -3946,8 +4033,8 @@ exit:
 ; viable for unswitching the inner-most loop. This lets us check that the
 ; unswitching doesn't end up cycling infinitely even when the cycle is
 ; indirect and due to revisiting a loop after cloning.
-define void @test30(i32 %arg) {
-; CHECK-LABEL: define void @test30(
+define void @test31(i32 %arg) {
+; CHECK-LABEL: define void @test31(
 entry:
   br label %outer.header
 ; CHECK-NEXT:  entry:
diff --git a/test/Transforms/SimplifyCFG/speculate-math.ll b/test/Transforms/SimplifyCFG/speculate-math.ll
index 5655d5d788218486883a847eaf16855f1a0f0444..87e01663edf42a4b5c84091b3ad3b63539c9c0f9 100644
--- a/test/Transforms/SimplifyCFG/speculate-math.ll
+++ b/test/Transforms/SimplifyCFG/speculate-math.ll
@@ -7,6 +7,8 @@ declare float @llvm.fmuladd.f32(float, float, float) nounwind readonly
 declare float @llvm.fabs.f32(float) nounwind readonly
 declare float @llvm.minnum.f32(float, float) nounwind readonly
 declare float @llvm.maxnum.f32(float, float) nounwind readonly
+declare float @llvm.minimum.f32(float, float) nounwind readonly
+declare float @llvm.maximum.f32(float, float) nounwind readonly
 
 ; ALL-LABEL: @fdiv_test(
 ; EXPENSIVE: select i1 %cmp, double %div, double 0.0
@@ -127,3 +129,37 @@ test_maxnum.exit:                                   ; preds = %cond.else.i, %ent
   store float %cond.i, float addrspace(1)* %out, align 4
   ret void
 }
+
+; ALL-LABEL: @minimum_test(
+; ALL: select
+define void @minimum_test(float addrspace(1)* noalias nocapture %out, float %a, float %b) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_minimum.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.minimum.f32(float %a, float %b) nounwind readnone
+  br label %test_minimum.exit
+
+test_minimum.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; ALL-LABEL: @maximum_test(
+; ALL: select
+define void @maximum_test(float addrspace(1)* noalias nocapture %out, float %a, float %b) nounwind {
+entry:
+  %cmp.i = fcmp olt float %a, 0.000000e+00
+  br i1 %cmp.i, label %test_maximum.exit, label %cond.else.i
+
+cond.else.i:                                      ; preds = %entry
+  %0 = tail call float @llvm.maximum.f32(float %a, float %b) nounwind readnone
+  br label %test_maximum.exit
+
+test_maximum.exit:                                   ; preds = %cond.else.i, %entry
+  %cond.i = phi float [ %0, %cond.else.i ], [ 0x7FF8000000000000, %entry ]
+  store float %cond.i, float addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
index b4f448ace2aecc0a08e11b4d4ecf7ec7a0326011..92af617dab82b1151d3240cc7779cbc15ea9ca9a 100644
--- a/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
+++ b/test/Transforms/StraightLineStrengthReduce/slsr-add.ll
@@ -1,51 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slsr -gvn -S | FileCheck %s
 
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 
 define void @shl(i32 %b, i32 %s) {
 ; CHECK-LABEL: @shl(
-  %1 = add i32 %b, %s
-; [[BASIS:%[a-zA-Z0-9]+]] = add i32 %b, %s
-  call void @foo(i32 %1)
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[S]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = add i32 %b, %s
+  call void @foo(i32 %t1)
   %s2 = shl i32 %s, 1
-  %2 = add i32 %b, %s2
-; add i32 [[BASIS]], %s
-  call void @foo(i32 %2)
+  %t2 = add i32 %b, %s2
+  call void @foo(i32 %t2)
   ret void
 }
 
 define void @stride_is_2s(i32 %b, i32 %s) {
 ; CHECK-LABEL: @stride_is_2s(
+; CHECK-NEXT:    [[S2:%.*]] = shl i32 [[S:%.*]], 1
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i32 [[T2]], [[S2]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
   %s2 = shl i32 %s, 1
-; CHECK: %s2 = shl i32 %s, 1
-  %1 = add i32 %b, %s2
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i32 %b, %s2
-  call void @foo(i32 %1)
+  %t1 = add i32 %b, %s2
+  call void @foo(i32 %t1)
   %s4 = shl i32 %s, 2
-  %2 = add i32 %b, %s4
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = add i32 [[t1]], %s2
-  call void @foo(i32 %2)
+  %t2 = add i32 %b, %s4
+  call void @foo(i32 %t2)
   %s6 = mul i32 %s, 6
-  %3 = add i32 %b, %s6
-; CHECK: add i32 [[t2]], %s2
-  call void @foo(i32 %3)
+  %t3 = add i32 %b, %s6
+  call void @foo(i32 %t3)
   ret void
 }
 
 define void @stride_is_3s(i32 %b, i32 %s) {
 ; CHECK-LABEL: @stride_is_3s(
-  %1 = add i32 %s, %b
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i32 %s, %b
-  call void @foo(i32 %1)
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[S:%.*]], [[B:%.*]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[S]], 3
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[T1]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = add i32 [[T2]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %t1 = add i32 %s, %b
+  call void @foo(i32 %t1)
   %s4 = shl i32 %s, 2
-  %2 = add i32 %s4, %b
-; CHECK: [[bump:%[a-zA-Z0-9]+]] = mul i32 %s, 3
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = add i32 [[t1]], [[bump]]
-  call void @foo(i32 %2)
+  %t2 = add i32 %s4, %b
+  call void @foo(i32 %t2)
   %s7 = mul i32 %s, 7
-  %3 = add i32 %s7, %b
-; CHECK: add i32 [[t2]], [[bump]]
-  call void @foo(i32 %3)
+  %t3 = add i32 %s7, %b
+  call void @foo(i32 %t3)
   ret void
 }
 
@@ -62,22 +77,53 @@ define void @stride_is_3s(i32 %b, i32 %s) {
 ; foo(t3);
 define void @stride_is_minus_2s(i32 %b, i32 %s) {
 ; CHECK-LABEL: @stride_is_minus_2s(
+; CHECK-NEXT:    [[S6:%.*]] = mul i32 [[S:%.*]], 6
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S6]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[S]], 1
+; CHECK-NEXT:    [[T2:%.*]] = sub i32 [[T1]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    [[T3:%.*]] = sub i32 [[T2]], [[TMP1]]
+; CHECK-NEXT:    call void @foo(i32 [[T3]])
+; CHECK-NEXT:    ret void
+;
   %s6 = mul i32 %s, 6
-  %1 = add i32 %b, %s6
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i32 %b, %s6
-; CHECK: call void @foo(i32 [[t1]])
-  call void @foo(i32 %1)
+  %t1 = add i32 %b, %s6
+  call void @foo(i32 %t1)
   %s4 = shl i32 %s, 2
-  %2 = add i32 %b, %s4
-; CHECK: [[bump:%[a-zA-Z0-9]+]] = shl i32 %s, 1
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = sub i32 [[t1]], [[bump]]
-  call void @foo(i32 %2)
-; CHECK: call void @foo(i32 [[t2]])
+  %t2 = add i32 %b, %s4
+  call void @foo(i32 %t2)
   %s2 = shl i32 %s, 1
-  %3 = add i32 %b, %s2
-; CHECK: [[t3:%[a-zA-Z0-9]+]] = sub i32 [[t2]], [[bump]]
-  call void @foo(i32 %3)
-; CHECK: call void @foo(i32 [[t3]])
+  %t3 = add i32 %b, %s2
+  call void @foo(i32 %t3)
+  ret void
+}
+
+; TODO: This pass is targeted at simple address-calcs, so it is artificially limited to
+; match scalar values. The code could be modified to handle vector types too.
+
+define void @stride_is_minus_2s_vec(<2 x i32> %b, <2 x i32> %s) {
+; CHECK-LABEL: @stride_is_minus_2s_vec(
+; CHECK-NEXT:    [[S6:%.*]] = mul <2 x i32> [[S:%.*]], <i32 6, i32 6>
+; CHECK-NEXT:    [[T1:%.*]] = add <2 x i32> [[B:%.*]], [[S6]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T1]])
+; CHECK-NEXT:    [[S4:%.*]] = shl <2 x i32> [[S]], <i32 2, i32 2>
+; CHECK-NEXT:    [[T2:%.*]] = add <2 x i32> [[B]], [[S4]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T2]])
+; CHECK-NEXT:    [[S2:%.*]] = shl <2 x i32> [[S]], <i32 1, i32 1>
+; CHECK-NEXT:    [[T3:%.*]] = add <2 x i32> [[B]], [[S2]]
+; CHECK-NEXT:    call void @voo(<2 x i32> [[T3]])
+; CHECK-NEXT:    ret void
+;
+  %s6 = mul <2 x i32> %s, <i32 6, i32 6>
+  %t1 = add <2 x i32> %b, %s6
+  call void @voo(<2 x i32> %t1)
+  %s4 = shl <2 x i32> %s, <i32 2, i32 2>
+  %t2 = add <2 x i32> %b, %s4
+  call void @voo(<2 x i32> %t2)
+  %s2 = shl <2 x i32> %s, <i32 1, i32 1>
+  %t3 = add <2 x i32> %b, %s2
+  call void @voo(<2 x i32> %t3)
   ret void
 }
 
@@ -88,29 +134,39 @@ define void @stride_is_minus_2s(i32 %b, i32 %s) {
 ; do not rewrite b + s to t - 7 * s because the latter is more complicated.
 define void @simple_enough(i32 %b, i32 %s) {
 ; CHECK-LABEL: @simple_enough(
+; CHECK-NEXT:    [[S8:%.*]] = shl i32 [[S:%.*]], 3
+; CHECK-NEXT:    [[T1:%.*]] = add i32 [[B:%.*]], [[S8]]
+; CHECK-NEXT:    call void @foo(i32 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i32 [[B]], [[S]]
+; CHECK-NEXT:    call void @foo(i32 [[T2]])
+; CHECK-NEXT:    ret void
+;
   %s8 = shl i32 %s, 3
-  %1 = add i32 %b, %s8
-  call void @foo(i32 %1)
-  %2 = add i32 %b, %s
-; CHECK: [[t:%[a-zA-Z0-9]+]] = add i32 %b, %s{{$}}
-  call void @foo(i32 %2)
-; CHECK: call void @foo(i32 [[t]])
+  %t1 = add i32 %b, %s8
+  call void @foo(i32 %t1)
+  %t2 = add i32 %b, %s
+  call void @foo(i32 %t2)
   ret void
 }
 
 define void @slsr_strided_add_128bit(i128 %b, i128 %s) {
 ; CHECK-LABEL: @slsr_strided_add_128bit(
+; CHECK-NEXT:    [[S125:%.*]] = shl i128 [[S:%.*]], 125
+; CHECK-NEXT:    [[T1:%.*]] = add i128 [[B:%.*]], [[S125]]
+; CHECK-NEXT:    call void @bar(i128 [[T1]])
+; CHECK-NEXT:    [[T2:%.*]] = add i128 [[T1]], [[S125]]
+; CHECK-NEXT:    call void @bar(i128 [[T2]])
+; CHECK-NEXT:    ret void
+;
   %s125 = shl i128 %s, 125
   %s126 = shl i128 %s, 126
-  %1 = add i128 %b, %s125
-; CHECK: [[t1:%[a-zA-Z0-9]+]] = add i128 %b, %s125
-  call void @bar(i128 %1)
-  %2 = add i128 %b, %s126
-; CHECK: [[t2:%[a-zA-Z0-9]+]] = add i128 [[t1]], %s125
-  call void @bar(i128 %2)
-; CHECK: call void @bar(i128 [[t2]])
+  %t1 = add i128 %b, %s125
+  call void @bar(i128 %t1)
+  %t2 = add i128 %b, %s126
+  call void @bar(i128 %t2)
   ret void
 }
 
 declare void @foo(i32)
+declare void @voo(<2 x i32>)
 declare void @bar(i128)
diff --git a/test/Transforms/StructurizeCFG/invert-constantexpr.ll b/test/Transforms/StructurizeCFG/invert-constantexpr.ll
index ac12b5d6b6539fcbe8b7098e8e07eb5bec97b4f6..61482bb73ad05714f3d8698b0291dc886dd806db 100644
--- a/test/Transforms/StructurizeCFG/invert-constantexpr.ll
+++ b/test/Transforms/StructurizeCFG/invert-constantexpr.ll
@@ -12,13 +12,12 @@ define void @invert_constantexpr_condition(i32 %arg, i32 %arg1) #0 {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[FLOW]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP1:%.*]], [[FLOW]] ], [ [[TMP7:%.*]], [[BB6:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ undef, [[FLOW]] ], [ [[TMP7:%.*]], [[BB6:%.*]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP4]], icmp eq (i32 ptrtoint (i32* @g to i32), i32 0)
 ; CHECK-NEXT:    br label [[BB8:%.*]]
 ; CHECK:       Flow:
-; CHECK-NEXT:    [[TMP1]] = phi i1 [ undef, [[BB2]] ], [ undef, [[BB:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ [[TMP0]], [[BB2]] ], [ icmp ne (i32 ptrtoint (i32* @g to i32), i32 0), [[BB]] ]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[BB6]], label [[BB3:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[BB2]] ], [ icmp ne (i32 ptrtoint (i32* @g to i32), i32 0), [[BB:%.*]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB6]], label [[BB3:%.*]]
 ; CHECK:       bb6:
 ; CHECK-NEXT:    [[TMP7]] = icmp slt i32 [[ARG]], [[ARG1:%.*]]
 ; CHECK-NEXT:    br label [[BB3]]
diff --git a/test/Transforms/StructurizeCFG/loop-continue-phi.ll b/test/Transforms/StructurizeCFG/loop-continue-phi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2300aea077f64ad74ee30ce8cc03148766d18300
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/loop-continue-phi.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -o - -structurizecfg < %s | FileCheck %s
+
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       Flow:
+; CHECK-NEXT:    br label [[FLOW1:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CTR_NEXT:%.*]], [[FLOW1]] ]
+; CHECK-NEXT:    [[CTR_NEXT]] = add i32 [[CTR]], 1
+; CHECK-NEXT:    br i1 undef, label [[LOOP_A:%.*]], label [[FLOW1]]
+; CHECK:       loop.a:
+; CHECK-NEXT:    br i1 undef, label [[LOOP_B:%.*]], label [[FLOW:%.*]]
+; CHECK:       loop.b:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[FLOW]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop.a ], [ %ctr.next, %loop.b ]
+  %ctr.next = add i32 %ctr, 1
+  br i1 undef, label %exit, label %loop.a
+
+loop.a:
+  br i1 undef, label %loop, label %loop.b
+
+loop.b:
+  br label %loop
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll b/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
index 668a1e99d814dea1b282a1cff0c9c7f10f6c7ee2..0af25d61b92c379643d01fe8fcc94d812bc71abc 100644
--- a/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
+++ b/test/Transforms/StructurizeCFG/one-loop-multiple-backedges.ll
@@ -8,33 +8,36 @@ bb:
   br label %bb3
 
 ; CHECK: bb3:
+; CHECK:   %0 = xor i1 %tmp4, true
+; CHECK:   br i1 %0, label %bb5, label %Flow
 bb3:                                              ; preds = %bb7, %bb
   %tmp = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ]
   %tmp4 = fcmp ult float %arg1, 3.500000e+00
-; CHECK: %0 = xor i1 %tmp4, true
-; CHECK: br i1 %0, label %bb5, label %Flow
   br i1 %tmp4, label %bb7, label %bb5
 
 ; CHECK: bb5:
+; CHECK:   %1 = xor i1 %tmp6, true
+; CHECK:   br label %Flow
 bb5:                                              ; preds = %bb3
   %tmp6 = fcmp olt float 0.000000e+00, %arg2
-; CHECK: br label %Flow
   br i1 %tmp6, label %bb10, label %bb7
 
 ; CHECK: Flow:
-; CHECK: br i1 %3, label %bb7, label %Flow1
+; CHECK:   %2 = phi i1 [ %1, %bb5 ], [ %tmp4, %bb3 ]
+; CHECK:   br i1 %2, label %bb7, label %Flow1
 
-; CHECK: bb7
+; CHECK: bb7:
+; CHECK:   br label %Flow1
 bb7:                                              ; preds = %bb5, %bb3
   %tmp8 = add nuw nsw i64 %tmp, 1
   %tmp9 = icmp slt i64 %tmp8, 5
-; CHECK: br label %Flow1
   br i1 %tmp9, label %bb3, label %bb10
 
 ; CHECK: Flow1:
-; CHECK: br i1 %7, label %bb10, label %bb3
+; CHECK:   %6 = phi i1 [ %3, %bb7 ], [ true, %Flow ]
+; CHECK:   br i1 %6, label %bb10, label %bb3
 
-; CHECK: bb10
+; CHECK: bb10:
 bb10:                                             ; preds = %bb7, %bb5
   %tmp11 = phi i32 [ 15, %bb5 ], [ 255, %bb7 ]
   store i32 %tmp11, i32 addrspace(1)* %arg, align 4
diff --git a/test/Transforms/Util/call-promotion-utils-ptrcast.ll b/test/Transforms/Util/call-promotion-utils-ptrcast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..351ec292f18152e296351c2b1074bc167dcf717d
--- /dev/null
+++ b/test/Transforms/Util/call-promotion-utils-ptrcast.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -pgo-icall-prom -icp-total-percent-threshold=0 -icp-max-prom=4 < %s 2>&1 | FileCheck %s
+
+; Test that CallPromotionUtils will promote calls which require pointer casts.
+
+@foo = common global i64 (i64)* null, align 8
+
+; Check ptrcast arguments.
+define i64 @func1(i8* %a) {
+  ret i64 undef
+}
+
+; Check ptrcast return.
+define i8* @func2(i64 %a) {
+  ret i8* undef
+}
+
+; Check ptrcast arguments and return.
+define i8* @func3(i8 *%a) {
+  ret i8* undef
+}
+
+; Check mixed ptrcast and bitcast.
+define i8* @func4(double %f) {
+  ret i8* undef
+}
+
+define i64 @bar() {
+  %tmp = load i64 (i64)*, i64 (i64)** @foo, align 8
+
+; CHECK: [[ARG:%[0-9]+]] = bitcast i64 1 to double
+; CHECK-NEXT: [[RET:%[0-9]+]] = call i8* @func4(double [[ARG]])
+; CHECK-NEXT: ptrtoint i8* [[RET]] to i64
+
+; CHECK: [[RET:%[0-9]+]] = call i8* @func2(i64 1)
+; CHECK-NEXT: ptrtoint i8* [[RET]] to i64
+
+; CHECK: [[ARG:%[0-9]+]] = inttoptr i64 1 to i8*
+; CHECK-NEXT: [[RET:%[0-9]+]] = call i8* @func3(i8* [[ARG]])
+; CHECK-NEXT: ptrtoint i8* [[RET]] to i64
+
+; CHECK: [[ARG:%[0-9]+]] = inttoptr i64 1 to i8*
+; CHECK-NEXT: call i64 @func1(i8* [[ARG]])
+; CHECK-NOT: ptrtoint
+; CHECK-NOT: bitcast
+
+  %call = call i64 %tmp(i64 1), !prof !1
+  ret i64 %call
+}
+
+!1 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410, i64 -6929281286627296573, i64 150, i64 -2545542355363006406, i64 10}
diff --git a/test/tools/dsymutil/X86/dummy-debug-map.map b/test/tools/dsymutil/X86/dummy-debug-map.map
index f9bc7b099858a78fc13b18af897042b08505ea65..aa000182e47b7c33cc7480be784f5bed73bbe6ca 100644
--- a/test/tools/dsymutil/X86/dummy-debug-map.map
+++ b/test/tools/dsymutil/X86/dummy-debug-map.map
@@ -1,6 +1,6 @@
 # This is a dummy debug map used for some tests where the contents of the
 # map are just an implementation detail. The tests wanting to use that file
-# should put all there object files in an explicitely named sub-directory
+# should put all their object files in an explicitely named sub-directory
 # of Inputs, and they should be named 1.o, 2.o, ...
 # As not finding an object file or symbols isn't a fatal error for dsymutil,
 # you can extend this file with as much object files and symbols as needed.
diff --git a/test/tools/gold/X86/cache.ll b/test/tools/gold/X86/cache.ll
index 51ffee282b1ebf6a89a0ca99b537d92afb71610e..4446aa6d8878da5a5ac700a2096496967c6c5af9 100644
--- a/test/tools/gold/X86/cache.ll
+++ b/test/tools/gold/X86/cache.ll
@@ -53,6 +53,9 @@
 ; RUN: ls %t.cache | count 5
 
 
+; Increase the age of llvmcache-foo
+; RUN: touch -r %t.cache/llvmcache-foo -d '-2 minutes' %t.cache/llvmcache-foo
+
 ; This should remove it.
 ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold%shlibext \
 ; RUN:     --plugin-opt=thinlto \
diff --git a/test/tools/llvm-ar/Inputs/add-lib1.yaml b/test/tools/llvm-ar/Inputs/add-lib1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7ae9fd95a62f532c5b031ff420c3aef7ba5e0221
--- /dev/null
+++ b/test/tools/llvm-ar/Inputs/add-lib1.yaml
@@ -0,0 +1,30 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x0000000000000001
+    Content:         00636C616E672076657273696F6E20332E392E3020287472756E6B203237333632342920286C6C766D2F7472756E6B203237333633362900
+  - Name:            .note.GNU-stack
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x0000000000000001
+    Content:         ''
+Symbols:
+  Global:
+    - Name:     lib1
+      Index:    SHN_ABS
+      Value:    0x1234
+  Local:
+    - Name:            '-'
+      Type:            STT_FILE
+...
diff --git a/test/tools/llvm-ar/Inputs/add-lib2.yaml b/test/tools/llvm-ar/Inputs/add-lib2.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8d224b95a4d76eeef65f82ea07781883fef15092
--- /dev/null
+++ b/test/tools/llvm-ar/Inputs/add-lib2.yaml
@@ -0,0 +1,30 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x0000000000000001
+    Content:         00636C616E672076657273696F6E20332E392E3020287472756E6B203237333632342920286C6C766D2F7472756E6B203237333633362900
+  - Name:            .note.GNU-stack
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x0000000000000001
+    Content:         ''
+Symbols:
+  Global:
+    - Name:     lib2
+      Index:    SHN_ABS
+      Value:    0x1234
+  Local:
+    - Name:            '-'
+      Type:            STT_FILE
+...
diff --git a/test/tools/llvm-ar/Inputs/add-lib3.yaml b/test/tools/llvm-ar/Inputs/add-lib3.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f1cfe7d80663613299a0afc714259e753b9289f
--- /dev/null
+++ b/test/tools/llvm-ar/Inputs/add-lib3.yaml
@@ -0,0 +1,30 @@
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+  - Name:            .comment
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_MERGE, SHF_STRINGS ]
+    AddressAlign:    0x0000000000000001
+    Content:         00636C616E672076657273696F6E20332E392E3020287472756E6B203237333632342920286C6C766D2F7472756E6B203237333633362900
+  - Name:            .note.GNU-stack
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x0000000000000001
+    Content:         ''
+Symbols:
+  Global:
+    - Name:     lib3
+      Index:    SHN_ABS
+      Value:    0x1234
+  Local:
+    - Name:            '-'
+      Type:            STT_FILE
+...
diff --git a/test/tools/llvm-ar/add-library.test b/test/tools/llvm-ar/add-library.test
new file mode 100644
index 0000000000000000000000000000000000000000..bd44a7e9a4a8c83acb67be78720f150fd955f199
--- /dev/null
+++ b/test/tools/llvm-ar/add-library.test
@@ -0,0 +1,43 @@
+RUN: yaml2obj %S/Inputs/add-lib1.yaml -o %t-add-lib1.o
+RUN: yaml2obj %S/Inputs/add-lib2.yaml -o %t-add-lib2.o
+RUN: yaml2obj %S/Inputs/add-lib2.yaml -o %t-add-lib3.o
+
+RUN: rm -f %t.ar
+RUN: llvm-ar crs %t.ar %t-add-lib1.o
+RUN: llvm-ar cqs %t.ar %t-add-lib2.o
+
+RUN: llvm-ar tv %t.ar | FileCheck %s --check-prefix=CHECK-NAMES-NO-ADDLIB
+CHECK-NAMES-NO-ADDLIB: add-library.test.tmp-add-lib1.o
+CHECK-NAMES-NO-ADDLIB: add-library.test.tmp-add-lib2.o
+
+RUN: llvm-nm %t.ar | FileCheck %s --check-prefix=CHECK-SYMBOLS-NO-ADDLIB
+CHECK-SYMBOLS-NO-ADDLIB: add-lib1
+CHECK-SYMBOLS-NO-ADDLIB: add-lib2
+
+RUN: rm -f %t1.ar
+RUN: llvm-ar crs %t1.ar %t-add-lib3.o
+RUN: llvm-ar cqLs %t1.ar %t.ar
+
+RUN: llvm-ar tv %t1.ar | FileCheck %s --check-prefix=CHECK-NAMES-ADDLIB
+CHECK-NAMES-ADDLIB: add-library.test.tmp-add-lib3.o
+CHECK-NAMES-ADDLIB: add-library.test.tmp-add-lib1.o
+CHECK-NAMES-ADDLIB: add-library.test.tmp-add-lib2.o
+
+RUN: llvm-nm %t1.ar | FileCheck %s --check-prefix=CHECK-SYMBOLS-ADDLIB
+CHECK-SYMBOLS-ADDLIB: add-lib3
+CHECK-SYMBOLS-ADDLIB: add-lib1
+CHECK-SYMBOLS-ADDLIB: add-lib2
+
+RUN: llvm-ar cqLs %t1.ar %t-add-lib1.o
+
+RUN: llvm-ar tv %t1.ar | FileCheck %s --check-prefix=CHECK-NAMES-DUPLICATE
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib3.o
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib1.o
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib2.o
+CHECK-NAMES-DUPLICATE: add-library.test.tmp-add-lib1.o
+
+RUN: llvm-nm %t1.ar | FileCheck %s --check-prefix=CHECK-SYMBOLS-DUPLICATE
+CHECK-SYMBOLS-DUPLICATE: add-lib3
+CHECK-SYMBOLS-DUPLICATE: add-lib1
+CHECK-SYMBOLS-DUPLICATE: add-lib2
+CHECK-SYMBOLS-DUPLICATE: add-lib1
diff --git a/test/tools/llvm-dwarfdump/X86/debug-verify-object.s b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
new file mode 100644
index 0000000000000000000000000000000000000000..be79c95c0b15b0e3224cf72302b265eacc6fe2b3
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/debug-verify-object.s
@@ -0,0 +1,57 @@
+# RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -o - %s | llvm-dwarfdump --verify -
+
+	.text
+
+	.section	.text.f,"ax",@progbits
+	.globl	f
+	.type	f,@function
+f:
+.Lfunc_begin0:
+	pushq	$32
+	popq	%rax
+	retq
+.Lfunc_end0:
+	.size	f, .Lfunc_end0-f
+
+	.section	.text.g,"ax",@progbits
+	.globl	g
+	.type	g,@function
+g:
+.Lfunc_begin1:
+	pushq   $64
+	popq    %rax
+	retq
+.Lfunc_end1:
+	.size	g, .Lfunc_end1-g
+
+	.section	.debug_abbrev,"",@progbits
+	.byte	1                       # Abbreviation Code
+	.byte	17                      # DW_TAG_compile_unit
+	.byte	0                       # DW_CHILDREN_no
+	.byte	17                      # DW_AT_low_pc
+	.byte	1                       # DW_FORM_addr
+	.byte	85                      # DW_AT_ranges
+	.byte	23                      # DW_FORM_sec_offset
+	.byte	0                       # EOM(1)
+	.byte	0                       # EOM(2)
+	.byte	0                       # EOM(3)
+
+	.section	.debug_info,"",@progbits
+.Lcu_begin0:
+	.long	20                      # Length of Unit
+	.short	4                       # DWARF version number
+	.long	.debug_abbrev           # Offset Into Abbrev. Section
+	.byte	8                       # Address Size (in bytes)
+	.byte	1                       # Abbrev [1] 0xb:0x1f DW_TAG_compile_unit
+	.quad	0                       # DW_AT_low_pc
+	.long	.Ldebug_ranges0         # DW_AT_ranges
+
+	.section        .debug_ranges,"",@progbits
+.Ldebug_ranges0:
+	.quad	.Lfunc_begin0
+	.quad	.Lfunc_end0
+	.quad	.Lfunc_begin1
+	.quad	.Lfunc_end1
+	.quad	0
+	.quad	0
+
diff --git a/test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s b/test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s
new file mode 100644
index 0000000000000000000000000000000000000000..07c68ab2618f157256856ac01c0a1d96ed375728
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/debug_loclists_startx_length.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mc %s -filetype obj -triple x86_64-pc-linux -o %t.o
+# RUN: llvm-dwarfdump -v %t.o | FileCheck %s
+
+# DW_LLE_startx_length has different `length` encoding in pre-DWARF 5
+# and final DWARF 5 versions. This test checks we are able to parse
+# the final version which uses ULEB128 and not the U32.
+
+# CHECK:         .debug_loclists contents:
+# CHECK-NEXT:    0x00000000: locations list header: length = 0x0000000f, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+# CHECK-NEXT:    0x00000000:
+# CHECK-NEXT:    Addr idx 1 (w/ length 16): DW_OP_reg5 RDI
+
+.section .debug_loclists,"",@progbits
+ .long  .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0
+.Ldebug_loclist_table_start0:
+ .short 5         # Version.
+ .byte 8          # Address size.
+ .byte 0          # Segmen selector size.
+ .long 0          # Offset entry count.
+ 
+ .byte 3          # DW_LLE_startx_length
+ .byte 0x01       # Index
+ .uleb128 0x10    # Length
+ .short 1         # Loc expr size
+ .byte 85         # DW_OP_reg5
+ .byte 0          # DW_LLE_end_of_list
+.Ldebug_loclist_table_end0:
diff --git a/test/tools/llvm-dwarfdump/X86/debug_rnglists.s b/test/tools/llvm-dwarfdump/X86/debug_rnglists.s
index 8f718b699f55e4d0c860a3b8278913b5bbb4b2ff..60533ca27217ed7bfbd6ae17b35e5be932c885aa 100644
--- a/test/tools/llvm-dwarfdump/X86/debug_rnglists.s
+++ b/test/tools/llvm-dwarfdump/X86/debug_rnglists.s
@@ -57,6 +57,29 @@
 # BOTH:         ranges:
 # BOTH-NOT:     [
 
+# TERSE-NEXT:   range list header: length = 0x0000000b, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# VERBOSE-NEXT: 0x{{[0-9a-f]*}}:
+# VERBOSE-SAME: range list header: length = 0x0000000b, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# BOTH-NEXT:    ranges:
+# TERSE-NEXT:   <End of list>
+
+# VERBOSE-NEXT: 0x00000082: [DW_RLE_base_addressx]:  0x0000000000000000
+# VERBOSE-NEXT: 0x00000084: [DW_RLE_end_of_list ]
+
+# TERSE-NEXT:   range list header: length = 0x0000000c, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# VERBOSE-NEXT: 0x{{[0-9a-f]*}}:
+# VERBOSE-SAME: range list header: length = 0x0000000c, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
+
+# BOTH-NEXT:    ranges:
+# TERSE-NEXT:   [0x0000000000000000, 0x000000000000002a)
+# TERSE-NEXT:   <End of list>
+
+# VERBOSE-NEXT: 0x000000a1: [DW_RLE_startx_length]:  0x0000000000000002, 0x000000000000002a => [0x0000000000000000, 0x000000000000002a)
+# VERBOSE-NEXT: 0x000000a4: [DW_RLE_end_of_list ]
+
 # TERSE-NEXT:   range list header: length = 0x0000000e, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000000
 
 # VERBOSE-NEXT: 0x{{[0-9a-f]*}}:
@@ -87,9 +110,7 @@
 # BOTH-NOT:     range list header:
 
 # ERR-NOT:  error:
-# ERR:      error: unsupported rnglists encoding DW_RLE_base_addressx at offset 0x82
-# ERR-NEXT: error: unsupported rnglists encoding DW_RLE_startx_endx at offset 0x91
-# ERR-NEXT: error: unsupported rnglists encoding DW_RLE_startx_length at offset 0xa1
+# ERR: error: unsupported rnglists encoding DW_RLE_startx_endx at offset 0x91
 # ERR-NOT:  error:
 
 .section .debug_rnglists,"",@progbits
diff --git a/test/tools/llvm-dwarfdump/X86/typeunit-name.s b/test/tools/llvm-dwarfdump/X86/typeunit-name.s
new file mode 100644
index 0000000000000000000000000000000000000000..7b60ac96ffe8d4a708b5f6080159561f6346f3d7
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/typeunit-name.s
@@ -0,0 +1,100 @@
+# Demonstrate that -name works with type units.
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
+# RUN: llvm-dwarfdump -name=V4_type_unit -name=V5_split_type_unit %t.o | FileCheck %s
+#
+# The names should appear twice, once for the unit and once for the type DIE,
+# because we give them the same name.
+# CHECK: V4_type_unit
+# CHECK: V4_type_unit
+# CHECK: V5_split_type_unit
+# CHECK: V5_split_type_unit
+
+        .section .debug_str,"MS",@progbits,1
+str_TU_4:
+        .asciz "V4_type_unit"
+
+        .section .debug_str.dwo,"MS",@progbits,1
+dwo_TU_5:
+        .asciz "V5_split_type_unit"
+
+# Abbrev section for the normal type unit.
+        .section .debug_abbrev,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x41  # DW_TAG_type_unit
+        .byte 0x01  # DW_CHILDREN_yes
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x02  # Abbrev code
+        .byte 0x13  # DW_TAG_structure_type
+        .byte 0x00  # DW_CHILDREN_no (no members)
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x00  # EOM(3)
+
+# And a .dwo copy for the .dwo section.
+        .section .debug_abbrev.dwo,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x41  # DW_TAG_type_unit
+        .byte 0x01  # DW_CHILDREN_yes
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x02  # Abbrev code
+        .byte 0x13  # DW_TAG_structure_type
+        .byte 0x00  # DW_CHILDREN_no (no members)
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x00  # EOM(3)
+
+        .section .debug_types,"",@progbits
+
+# DWARF v4 Type unit header. Normal/split are identical so we do only one.
+TU_4_start:
+        .long  TU_4_end-TU_4_version  # Length of Unit
+TU_4_version:
+        .short 4               # DWARF version number
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+        .byte 8                # Address Size (in bytes)
+        .quad 0x0011223344556677 # Type Signature
+        .long TU_4_type-TU_4_start # Type offset
+# The type-unit DIE, which has a name.
+        .byte 1
+        .long str_TU_4
+# The type DIE, which has the same name.
+TU_4_type:
+        .byte 2
+        .long str_TU_4
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_4_end:
+
+        .section .debug_types.dwo,"",@progbits
+# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
+
+# DWARF v5 split type unit header.
+TU_split_5_start:
+        .long  TU_split_5_end-TU_split_5_version  # Length of Unit
+TU_split_5_version:
+        .short 5               # DWARF version number
+        .byte 6                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev.dwo    # Offset Into Abbrev. Section
+        .quad 0x8899aabbccddeeff # Type Signature
+        .long TU_split_5_type-TU_split_5_start  # Type offset
+# The type-unit DIE, which has a name.
+        .byte 1
+        .long dwo_TU_5
+# The type DIE, which has the same name.
+TU_split_5_type:
+        .byte 2
+        .long dwo_TU_5
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_split_5_end:
diff --git a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s
index e5a748b89f98a3db6fe2cf5e45f96aa8365ff62b..e3eae9b986f5fa5e2aeefe6041f4cefd114e1dfd 100644
--- a/test/tools/llvm-dwarfdump/X86/verify_debug_info.s
+++ b/test/tools/llvm-dwarfdump/X86/verify_debug_info.s
@@ -7,7 +7,7 @@
 # CHECK-NEXT: DW_AT_producer [DW_FORM_strp]	( .debug_str[0x00000000] = "clang version 5.0.0 (trunk 308185) (llvm/trunk 308186)")
 # CHECK-NEXT: DW_AT_language [DW_FORM_data2]	(DW_LANG_C99)
 # CHECK-NEXT: DW_AT_name [DW_FORM_strp]	( .debug_str[0x00000037] = "basic.c")
-# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_strx4]	( indexed (00000000) string = )
+# CHECK-NEXT: DW_AT_stmt_list [DW_FORM_block4]
 # CHECK-NEXT: DW_AT_comp_dir [DW_FORM_strp]	( .debug_str[0x0000003f] = "/Users/sgravani/Development/tests")
 # CHECK-NEXT: DW_AT_low_pc [DW_FORM_addr]	(0x0000000000000000)
 # CHECK-NEXT: DW_AT_high_pc [DW_FORM_data4]	(0x00000016){{[[:space:]]}}
@@ -82,7 +82,7 @@ Lsection_abbrev:
 	.byte	3                       ## DW_AT_name
 	.byte	14                      ## DW_FORM_strp
 	.byte	16                      ## DW_AT_stmt_list
-	.byte	40                      ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding:
+	.byte	4                       ## DW_FORM_sec_offset -- error: DIE has invalid DW_AT_stmt_list encoding:
 	.byte	27                      ## DW_AT_comp_dir
 	.byte	14                      ## DW_FORM_strp
 	.byte	17                      ## DW_AT_low_pc
diff --git a/test/tools/llvm-dwarfdump/X86/verify_strings.s b/test/tools/llvm-dwarfdump/X86/verify_strings.s
new file mode 100644
index 0000000000000000000000000000000000000000..e09ffd502cbdbe4602ef58377028aa21e6208f26
--- /dev/null
+++ b/test/tools/llvm-dwarfdump/X86/verify_strings.s
@@ -0,0 +1,88 @@
+# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o
+# RUN: not llvm-dwarfdump -verify %t.o | FileCheck --check-prefix=VERIFY %s
+
+# Check that the verifier correctly diagnoses various error conditions with
+# the usage of string indices and string offsets tables.
+
+        .section .debug_str,"MS",@progbits,1
+str_producer:
+        .asciz "Handmade DWARF producer"
+
+        .section .debug_str_offsets,"",@progbits
+# The string offsets table
+        .long .debug_str_offsets_segment0_end-.debug_str_offsets_base0+4
+        .short 5    # DWARF version
+        .short 0    # Padding
+.debug_str_offsets_base0:
+        .long str_producer
+        .long 1000  # Invalid string address.
+.debug_str_offsets_segment0_end:
+
+# A simple abbrev section with a basic compile unit DIE.
+        .section .debug_abbrev,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x11  # DW_TAG_compile_unit
+        .byte 0x01  # DW_CHILDREN_no
+        .byte 0x25  # DW_AT_producer
+        .byte 0x1a  # DW_FORM_strx
+        .byte 0x72  # DW_AT_str_offsets_base
+        .byte 0x17  # DW_FORM_sec_offset
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+
+        .section .debug_info,"",@progbits
+
+# The first unit's CU DIE has an invalid DW_AT_str_offsets_base which
+# renders any string index unresolvable.
+
+# DWARF v5 CU header.
+        .long  CU1_5_end-CU1_5_version  # Length of Unit
+CU1_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer and DW_AT_str_offsets.
+        .byte 1                # Abbreviation code
+        .byte 0                # Index of string for DW_AT_producer.
+        .long 1000             # Bad value for DW_AT_str_offsets_base
+        .byte 0 # NULL
+CU1_5_end:
+
+# The second unit's CU DIE uses an invalid string index.
+
+# DWARF v5 CU header
+        .long  CU2_5_end-CU2_5_version  # Length of Unit
+CU2_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer and DW_AT_str_offsets.
+        .byte 1                # Abbreviation code
+        .byte 100              # Invalid string index
+        .long .debug_str_offsets_base0
+        .byte 0 # NULL
+CU2_5_end:
+
+# The third unit's CU DIE uses a valid string index but the entry in the 
+# string offsets table is invalid. 
+
+# DWARF v5 CU header
+        .long  CU3_5_end-CU3_5_version  # Length of Unit
+CU3_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has DW_AT_producer and DW_AT_str_offsets.
+        .byte 1                # Abbreviation code
+        .byte 1                # Index of string for DW_AT_producer.
+        .long .debug_str_offsets_base0
+        .byte 0 # NULL
+CU3_5_end:
+        
+# VERIFY-DAG:      error: DW_FORM_strx used without a valid string offsets table:
+# VERIFY-DAG:      error: DW_FORM_strx uses index 100, which is too large:
+# VERIFY-DAG:      error: DW_FORM_strx uses index 1, but the referenced string offset 
+# VERIFY-DAG-SAME: is beyond .debug_str bounds:
diff --git a/test/tools/llvm-dwarfdump/cmdline.test b/test/tools/llvm-dwarfdump/cmdline.test
index 1314990a7b97d97be0acd4449b036bf0704e09f0..5b5ea618c1a3bc9eb4c547e1689c320c5f81e400 100644
--- a/test/tools/llvm-dwarfdump/cmdline.test
+++ b/test/tools/llvm-dwarfdump/cmdline.test
@@ -24,3 +24,6 @@ HELP-NOT: -reverse-iterate
 
 RUN: llvm-dwarfdump --version 2>&1 | FileCheck --check-prefix=VERSION %s
 VERSION: {{ version }}
+
+RUN: llvm-dwarfdump -diff -verbose 2>&1 | FileCheck --check-prefix=INCOMPATIBLE %s
+INCOMPATIBLE: error: incompatible arguments: specifying both -diff and -verbose is currently not supported
diff --git a/test/tools/llvm-exegesis/X86/uops-ADD32mi8.s b/test/tools/llvm-exegesis/X86/uops-ADD32mi8.s
new file mode 100644
index 0000000000000000000000000000000000000000..e3b3b80efc2586b07e69cc12ad2fbbf05addd22e
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-ADD32mi8.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=ADD32mi8 | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     ADD32mi8
diff --git a/test/tools/llvm-exegesis/X86/uops-ADD32mr.s b/test/tools/llvm-exegesis/X86/uops-ADD32mr.s
new file mode 100644
index 0000000000000000000000000000000000000000..80ecb3033b0fc285e5cf7a2ad349df6f40fcadcd
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-ADD32mr.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=ADD32mr | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     ADD32mr
diff --git a/test/tools/llvm-exegesis/X86/uops-ADD32rm.s b/test/tools/llvm-exegesis/X86/uops-ADD32rm.s
new file mode 100644
index 0000000000000000000000000000000000000000..0e6bdb587b9a14dca1e91b61ac22bfd750a1c24c
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-ADD32rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=ADD32rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     ADD32rm
diff --git a/test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s b/test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s
new file mode 100644
index 0000000000000000000000000000000000000000..c4d2c7d840f12e28a3a81b99316aba381201c50f
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-BEXTR32rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=BEXTR32rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     BEXTR32rm
diff --git a/test/tools/llvm-exegesis/X86/uops-BSF16rm.s b/test/tools/llvm-exegesis/X86/uops-BSF16rm.s
new file mode 100644
index 0000000000000000000000000000000000000000..9cb278dc0524a12dcfbad69062960f5ca117660e
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-BSF16rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=BSF16rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     BSF16rm
diff --git a/test/tools/llvm-exegesis/X86/uops-BTR64mr.s b/test/tools/llvm-exegesis/X86/uops-BTR64mr.s
new file mode 100644
index 0000000000000000000000000000000000000000..6d4544b5c521f11b767c8c7a638253f10402d759
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-BTR64mr.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=BTR64mr | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     BTR64mr
diff --git a/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s b/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
new file mode 100644
index 0000000000000000000000000000000000000000..c323395ef5b0ab6f15e31a7cf4ad511a6d7233e6
--- /dev/null
+++ b/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
@@ -0,0 +1,6 @@
+# RUN: llvm-exegesis -mode=uops -opcode-name=VFMADDSS4rm | FileCheck %s
+
+CHECK:      mode:            uops
+CHECK-NEXT: key:
+CHECK-NEXT:   instructions:
+CHECK-NEXT:     VFMADDSS4rm
diff --git a/test/tools/llvm-extract/extract-block.ll b/test/tools/llvm-extract/extract-block.ll
index c812a567523b6d89771e91cef128e1440df375b4..7cf0f16033794f5438caf71a53b845e6690a408a 100644
--- a/test/tools/llvm-extract/extract-block.ll
+++ b/test/tools/llvm-extract/extract-block.ll
@@ -12,7 +12,7 @@ bb:
   ret void
 }
 
-; CHECK: @foo_bb4
+; CHECK: @foo.bb4
 ; CHECK: call void @bar()
 ; CHECK: %tmp5
 define i32 @foo(i32 %arg) {
diff --git a/test/tools/llvm-extract/extract-multiple-blocks.ll b/test/tools/llvm-extract/extract-multiple-blocks.ll
index a7f270bdcd6f903db99a15d0f1e90906db0e04f1..343edff342fce2ef569f3b74ff729e302beceff3 100644
--- a/test/tools/llvm-extract/extract-multiple-blocks.ll
+++ b/test/tools/llvm-extract/extract-multiple-blocks.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-extract -S -bb foo:bb4 -bb foo:bb7 %s | FileCheck %s
 
-; CHECK: @foo_bb4
-; CHECK: @foo_bb7
+; CHECK: @foo.bb4
+; CHECK: @foo.bb7
 define i32 @foo(i32 %arg) {
 bb:
   %tmp = alloca i32, align 4
diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll
index 29276d8d13a78a57026654504fc91983a293fdfc..9ab81ac70a7dccce4b4b664ecd151fe8ccef4160 100644
--- a/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/test/tools/llvm-lto2/X86/pipeline.ll
@@ -32,11 +32,11 @@ define void @patatino() {
 ; RUN:  -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=ERR
 
-; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo
+; ERR: LLVM ERROR: unable to parse pass pipeline description 'foogoo': unknown pass name 'foogoo'
 
 ; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px -aa-pipeline patatino \
 ; RUN:  -opt-pipeline loweratomic 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=AAERR
 
-; AAERR: LLVM ERROR: unable to parse AA pipeline description: patatino
+; AAERR: LLVM ERROR: unable to parse AA pipeline description 'patatino': unknown alias analysis name 'patatino'
diff --git a/test/tools/llvm-mca/ARM/unsupported-write-variant.s b/test/tools/llvm-mca/ARM/unsupported-write-variant.s
new file mode 100644
index 0000000000000000000000000000000000000000..f4511f54ab55ba4483178c845d3099206cf8c029
--- /dev/null
+++ b/test/tools/llvm-mca/ARM/unsupported-write-variant.s
@@ -0,0 +1,6 @@
+# RUN: not llvm-mca -march=arm -mcpu=swift -all-views=false 2>&1 < %s | FileCheck %s
+
+add r3, r1, r12, lsl #2
+
+# CHECK:      error: unable to resolve scheduling class for write variant.
+# CHECK-NEXT: note: instruction:    add r3, r1, r12, lsl #2
diff --git a/test/tools/llvm-mca/X86/BdVer2/add-sequence.s b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
new file mode 100644
index 0000000000000000000000000000000000000000..004def6ab71be99157173b21707e4fcbdf1c857e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1000 -timeline < %s | FileCheck %s
+
+add %eax, %ecx
+add %esi, %eax
+add %eax, %edx
+
+# CHECK:      Iterations:        1000
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %ecx
+# CHECK-NEXT:  1      1     0.50                        addl	%esi, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %edx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [0,1]     DeER .    .    . .   addl	%esi, %eax
+# CHECK-NEXT: [0,2]     D=eER.    .    . .   addl	%eax, %edx
+# CHECK-NEXT: [1,0]     D==eER    .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [1,1]     .DeE-R    .    . .   addl	%esi, %eax
+# CHECK-NEXT: [1,2]     .D=eER    .    . .   addl	%eax, %edx
+# CHECK-NEXT: [2,0]     .D==eER   .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [2,1]     .D==eER   .    . .   addl	%esi, %eax
+# CHECK-NEXT: [2,2]     . D==eER  .    . .   addl	%eax, %edx
+# CHECK-NEXT: [3,0]     . D===eER .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [3,1]     . D==eE-R .    . .   addl	%esi, %eax
+# CHECK-NEXT: [3,2]     . D===eER .    . .   addl	%eax, %edx
+# CHECK-NEXT: [4,0]     .  D===eER.    . .   addl	%eax, %ecx
+# CHECK-NEXT: [4,1]     .  D===eER.    . .   addl	%esi, %eax
+# CHECK-NEXT: [4,2]     .  D====eER    . .   addl	%eax, %edx
+# CHECK-NEXT: [5,0]     .  D=====eER   . .   addl	%eax, %ecx
+# CHECK-NEXT: [5,1]     .   D===eE-R   . .   addl	%esi, %eax
+# CHECK-NEXT: [5,2]     .   D====eER   . .   addl	%eax, %edx
+# CHECK-NEXT: [6,0]     .   D=====eER  . .   addl	%eax, %ecx
+# CHECK-NEXT: [6,1]     .   D=====eER  . .   addl	%esi, %eax
+# CHECK-NEXT: [6,2]     .    D=====eER . .   addl	%eax, %edx
+# CHECK-NEXT: [7,0]     .    D======eER. .   addl	%eax, %ecx
+# CHECK-NEXT: [7,1]     .    D=====eE-R. .   addl	%esi, %eax
+# CHECK-NEXT: [7,2]     .    D======eER. .   addl	%eax, %edx
+# CHECK-NEXT: [8,0]     .    .D======eER .   addl	%eax, %ecx
+# CHECK-NEXT: [8,1]     .    .D======eER .   addl	%esi, %eax
+# CHECK-NEXT: [8,2]     .    .D=======eER.   addl	%eax, %edx
+# CHECK-NEXT: [9,0]     .    .D========eER   addl	%eax, %ecx
+# CHECK-NEXT: [9,1]     .    . D======eE-R   addl	%esi, %eax
+# CHECK-NEXT: [9,2]     .    . D=======eER   addl	%eax, %edx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     10    5.0    0.6    0.0       addl	%eax, %ecx
+# CHECK-NEXT: 1.     10    4.2    0.5    0.5       addl	%esi, %eax
+# CHECK-NEXT: 2.     10    5.0    0.0    0.0       addl	%eax, %edx
diff --git a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..973ce7d8a04287d169ebfc349a844daa86c07db4
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+## Sets register RAX.
+imulq $5, %rcx, %rax
+
+## Kills the previous definition of RAX.
+## The upper portion of RAX is cleared.
+lzcnt %ecx, %eax
+
+## The AND can start immediately after the LZCNT.
+## It doesn't need to wait for the IMUL.
+and   %rcx, %rax
+bsf   %rax, %rcx
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      702
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.42
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     4.00                        imulq	$5, %rcx, %rax
+# CHECK-NEXT:  2      2     0.50                        lzcntl	%ecx, %eax
+# CHECK-NEXT:  1      1     0.50                        andq	%rcx, %rax
+# CHECK-NEXT:  6      3     2.00                        bsfq	%rax, %rcx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    .   imulq	$5, %rcx, %rax
+# CHECK-NEXT: [0,1]     DeeE----R .    .   lzcntl	%ecx, %eax
+# CHECK-NEXT: [0,2]     D==eE---R .    .   andq	%rcx, %rax
+# CHECK-NEXT: [0,3]     .D==eeeER .    .   bsfq	%rax, %rcx
+# CHECK-NEXT: [1,0]     . D====eeeeeeER.   imulq	$5, %rcx, %rax
+# CHECK-NEXT: [1,1]     .  D====eeE---R.   lzcntl	%ecx, %eax
+# CHECK-NEXT: [1,2]     .  D======eE--R.   andq	%rcx, %rax
+# CHECK-NEXT: [1,3]     .   D======eeeER   bsfq	%rax, %rcx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     3.0    0.5    0.0       imulq	$5, %rcx, %rax
+# CHECK-NEXT: 1.     2     3.0    1.0    3.5       lzcntl	%ecx, %eax
+# CHECK-NEXT: 2.     2     5.0    0.0    2.5       andq	%rcx, %rax
+# CHECK-NEXT: 3.     2     5.0    0.0    0.0       bsfq	%rax, %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..edbe726a9cbe7ea74bbdea88c34a38a32385f135
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
@@ -0,0 +1,135 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+# In this test, the VDIVPS takes 38 cycles to write to register YMM3.  The first
+# VADDPS does not depend on the VDIVPS (the WAW dependency is eliminated at
+# register renaming stage). So the first VADDPS can be executed in parallel to
+# the VDIVPS. That VADDPS also writes to register XMM3, and the upper half of
+# YMM3 is implicitly cleared. As a consequence, the definition of YMM3 from the
+# VDIVPS is killed, and the subsequent VADDPS instructions don't need to wait
+# for the VDIVPS to complete.
+# The block reciprocal throughput is limited by the VDIVPS reciprocal throughput
+# (which is 38 cycles). The sequence of VADDPS can be executed in parallel on
+# the FPA unit; their latency is "hidden" by the long latency of the VDIVPS.
+
+vdivps %ymm0, %ymm1, %ymm3
+vaddps %xmm0, %xmm1, %xmm3
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vaddps %ymm3, %ymm1, %ymm4
+vandps %xmm4, %xmm1, %xmm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1800
+# CHECK-NEXT: Total Cycles:      4003
+# CHECK-NEXT: Total uOps:        3400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.85
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 31.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      9     19.00                       vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT:  1      2     0.50                        vandps	%xmm4, %xmm1, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .    .    .    .    .    .   .   vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: [0,1]     DeeeeeE----R   .    .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,3]     .D======eeeeeER.    .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,4]     . D=======eeeeeER   .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,5]     . D=========eeeeeER .    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,6]     .  D==========eeeeeER    .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,7]     .  D============eeeeeER  .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,8]     .   D=============eeeeeER.    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,9]     .   D===============eeeeeER   .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,10]    .    D================eeeeeER .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,11]    .    D==================eeeeeER    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,12]    .    .D===================eeeeeER  .    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,13]    .    .D=====================eeeeeER.    .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,14]    .    . D======================eeeeeER   .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,15]    .    . D========================eeeeeER .    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,16]    .    .  D=========================eeeeeER    .    .    .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [0,17]    .    .  D==============================eeER  .    .    .    .    .    .    .   .   vandps	%xmm4, %xmm1, %xmm0
+# CHECK-NEXT: [1,0]     .    .   D===============================eeeeeeeeeER   .    .    .    .    .   .   vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: [1,1]     .    .   D===============================eeeeeE----R   .    .    .    .    .   .   vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: [1,2]     .    .    D===================================eeeeeER  .    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,3]     .    .    D=====================================eeeeeER.    .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,4]     .    .    .D======================================eeeeeER   .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,5]     .    .    .D========================================eeeeeER .    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,6]     .    .    . D=========================================eeeeeER    .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,7]     .    .    . D===========================================eeeeeER  .    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,8]     .    .    .  D============================================eeeeeER.    .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,9]     .    .    .  D==============================================eeeeeER   .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,10]    .    .    .   D===============================================eeeeeER .    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,11]    .    .    .   D=================================================eeeeeER    .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,12]    .    .    .    D==================================================eeeeeER  .   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,13]    .    .    .    D====================================================eeeeeER.   .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,14]    .    .    .    .D=====================================================eeeeeER  .   vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: [1,15]    .    .    .    .D=======================================================eeeeeER.   vaddps	%ymm3, %ymm1, %ymm4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     16.5   0.5    0.0       vdivps	%ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.     2     16.5   0.5    4.0       vaddps	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 2.     2     20.5   0.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 3.     2     22.5   2.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 4.     2     23.5   4.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 5.     2     25.5   6.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 6.     2     26.5   8.0    0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 7.     2     28.5   10.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 8.     2     29.5   12.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 9.     2     31.5   14.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 10.    2     32.5   16.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 11.    2     34.5   18.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 12.    2     35.5   20.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 13.    2     37.5   22.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 14.    2     38.5   23.5   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 15.    2     40.5   25.5   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 16.    2     41.5   27.0   0.0       vaddps	%ymm3, %ymm1, %ymm4
+# CHECK-NEXT: 17.    2     46.5   0.0    0.0       vandps	%xmm4, %xmm1, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
new file mode 100644
index 0000000000000000000000000000000000000000..12bf3748cb1dfa4fedd628c48e8ff8c037c4cf56
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# Perf stat reports an IPC of 1.97 for this block of code.
+
+# The CMP instruction doesn't depend on the value of EAX.  It can set the flags
+# without having to read the inputs.
+
+cmp %eax, %eax
+cmovae %ebx, %eax
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        cmpl	%eax, %eax
+# CHECK-NEXT:  1      1     0.50                        cmovael	%ebx, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%ebx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeER ..   cmpl	%eax, %eax
+# CHECK-NEXT: [0,1]     D=eER..   cmovael	%ebx, %eax
+# CHECK-NEXT: [1,0]     DeE-R..   cmpl	%eax, %eax
+# CHECK-NEXT: [1,1]     D==eER.   cmovael	%ebx, %eax
+# CHECK-NEXT: [2,0]     .DeE-R.   cmpl	%eax, %eax
+# CHECK-NEXT: [2,1]     .D==eER   cmovael	%ebx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    0.7       cmpl	%eax, %eax
+# CHECK-NEXT: 1.     3     2.7    0.0    0.0       cmovael	%ebx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
new file mode 100644
index 0000000000000000000000000000000000000000..4f869e656f3d6f228110b0374dcad6e064df79a0
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports an IPC of 2.00 for this block of code.
+
+# All of the vector packed compares from this test are dependency breaking
+# instructions. That means, there is no RAW dependency between any of the
+# instructions, and the code can be fully parallelized in hardware.
+
+vpcmpeqb %xmm0, %xmm0, %xmm1
+vpcmpeqw %xmm1, %xmm1, %xmm2
+vpcmpeqd %xmm2, %xmm2, %xmm3
+vpcmpeqq %xmm3, %xmm3, %xmm0
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      6000
+# CHECK-NEXT: Total Cycles:      3005
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               2.00
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     2.00   2.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     D=eeER    .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     DeeE-R    .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     D==eeER   .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     .DeeE-R   .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D==eeER  .   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .D=eeE-R  .   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .D===eeER .   vpcmpeqq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     . D=eeE-R .   vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     . D===eeER.   vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     . D==eeE-R.   vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     . D====eeER   vpcmpeqq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.3    1.3    0.7       vpcmpeqb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     3.0    3.0    0.0       vpcmpeqw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     2.0    2.0    1.0       vpcmpeqd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     4.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
new file mode 100644
index 0000000000000000000000000000000000000000..019d3fd5067e1c7b1cb4865d2cfc6aae5e9d6114
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports an IPC of 2.00 for this block of code.
+
+# All of the vector packed compares from this test are zero idioms.  These zero
+# idioms are all detected and removed by the register renamer.  That means, no
+# uOp is executed, and there is no RAW dependency for any of the packed
+# compares.
+
+vpcmpgtb %xmm0, %xmm0, %xmm1
+vpcmpgtw %xmm1, %xmm1, %xmm2
+vpcmpgtd %xmm2, %xmm2, %xmm3
+vpcmpgtq %xmm3, %xmm3, %xmm0
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      6000
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    3.99
+# CHECK-NEXT: IPC:               3.99
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DR   ..   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DR   ..   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     DR   ..   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3]     DeeER..   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0]     .D--R..   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D--R..   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .D--R..   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3]     .DeeER.   vpcmpgtq	%xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0]     . D--R.   vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     . D--R.   vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2]     . D--R.   vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3]     . DeeER   vpcmpgtq	%xmm3, %xmm3, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    1.3       vpcmpgtb	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     3     0.0    0.0    1.3       vpcmpgtw	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2.     3     0.0    0.0    1.3       vpcmpgtd	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3.     3     1.0    1.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..0503bd8552bccc2600150ccf17c2b301b2c9c980
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
@@ -0,0 +1,84 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports an IPC of 1.00 for this code block.
+
+# Although both SBB are dependency breaking instructions, there is still an
+# implicit dependency on EFLAGS which limits the ILP. So, the hardware backend
+# can only execute one instruction per cycle.
+
+sbb %edx, %edx
+sbb %eax, %eax
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      3000
+# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        sbbl	%edx, %edx
+# CHECK-NEXT:  1      1     1.00                        sbbl	%eax, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%edx, %edx
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeER .  .   sbbl	%edx, %edx
+# CHECK-NEXT: [0,1]     D=eER.  .   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     D==eER  .   sbbl	%edx, %edx
+# CHECK-NEXT: [1,1]     D===eER .   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     .D===eER.   sbbl	%edx, %edx
+# CHECK-NEXT: [2,1]     .D====eER   sbbl	%eax, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     2.7    0.3    0.0       sbbl	%edx, %edx
+# CHECK-NEXT: 1.     3     3.7    0.0    0.0       sbbl	%eax, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..ba29a29e7dcdbb11abf6705edb589c00aa780bb5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s
+
+# perf stat reports a throughput of 1.51 IPC for this block of code.
+
+# The SBB does not depend on the value of register EAX. That means, it doesn't
+# have to wait for the IMUL to write-back on EAX. However, it still depends on
+# the ADD for EFLAGS.
+
+imul %edx, %eax
+add %edx, %edx
+sbb %eax, %eax
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      3006
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.50
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        imull	%edx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%edx, %edx
+# CHECK-NEXT:  1      1     1.00                        sbbl	%eax, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %edx
+# CHECK-NEXT:  -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%eax, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     D=eeeeER  ..   imull	%edx, %eax
+# CHECK-NEXT: [0,1]     DeE----R  ..   addl	%edx, %edx
+# CHECK-NEXT: [0,2]     D==eE--R  ..   sbbl	%eax, %eax
+# CHECK-NEXT: [1,0]     D===eeeeER..   imull	%edx, %eax
+# CHECK-NEXT: [1,1]     .DeE-----R..   addl	%edx, %edx
+# CHECK-NEXT: [1,2]     .D===eE--R..   sbbl	%eax, %eax
+# CHECK-NEXT: [2,0]     .D====eeeeER   imull	%edx, %eax
+# CHECK-NEXT: [2,1]     .D=eE------R   addl	%edx, %edx
+# CHECK-NEXT: [2,2]     . D====eE--R   sbbl	%eax, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     3.7    0.7    0.0       imull	%edx, %eax
+# CHECK-NEXT: 1.     3     1.3    0.3    5.0       addl	%edx, %edx
+# CHECK-NEXT: 2.     3     4.0    2.0    2.0       sbbl	%eax, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
new file mode 100644
index 0000000000000000000000000000000000000000..bd5b724bbd1a1fa6273582002dd5828549942faa
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
@@ -0,0 +1,107 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=500 -timeline < %s | FileCheck %s
+
+vpmuld %xmm0, %xmm0, %xmm1
+vpaddd %xmm1, %xmm1, %xmm0
+vpaddd %xmm0, %xmm0, %xmm3
+
+# CHECK:      Iterations:        500
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      3005
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      2     0.50                        vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  1      2     0.50                        vpaddd	%xmm0, %xmm0, %xmm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     2.00   1.00    -     1.50   1.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpaddd	%xmm0, %xmm0, %xmm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     D====eeER .    .    .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [0,2]     D======eeER    .    .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [1,0]     D======eeeeER  .    .    .    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1]     .D=========eeER.    .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [1,2]     .D===========eeER   .    .    .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [2,0]     .D===========eeeeER .    .    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1]     .D===============eeER    .    .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [2,2]     . D================eeER  .    .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [3,0]     . D================eeeeER.    .    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [3,1]     . D====================eeER   .    .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [3,2]     . D======================eeER .    .    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [4,0]     .  D=====================eeeeER    .    .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [4,1]     .  D=========================eeER  .    .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [4,2]     .  D===========================eeER.    .    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [5,0]     .  D===========================eeeeER   .    .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [5,1]     .   D==============================eeER .    .    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [5,2]     .   D================================eeER    .    .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [6,0]     .   D================================eeeeER  .    .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [6,1]     .   D====================================eeER.    .    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [6,2]     .    D=====================================eeER   .    .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [7,0]     .    D=====================================eeeeER .    .    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [7,1]     .    D=========================================eeER    .    .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [7,2]     .    D===========================================eeER  .    .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [8,0]     .    .D==========================================eeeeER.    .   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [8,1]     .    .D==============================================eeER   .   .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [8,2]     .    .D================================================eeER .   .   vpaddd	%xmm0, %xmm0, %xmm3
+# CHECK-NEXT: [9,0]     .    .D================================================eeeeER   .   vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [9,1]     .    . D===================================================eeER .   vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: [9,2]     .    . D=====================================================eeER   vpaddd	%xmm0, %xmm0, %xmm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     10    25.0   0.1    0.0       vpmuldq	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     10    28.7   0.0    0.0       vpaddd	%xmm1, %xmm1, %xmm0
+# CHECK-NEXT: 2.     10    30.5   0.0    0.0       vpaddd	%xmm0, %xmm0, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/dot-product.s b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
new file mode 100644
index 0000000000000000000000000000000000000000..d83cda27b0aaff8aa947d66bbe0777be44fe3f95
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/dot-product.s
@@ -0,0 +1,86 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+vmulps   %xmm0, %xmm1, %xmm2
+vhaddps  %xmm2, %xmm2, %xmm3
+vhaddps  %xmm3, %xmm3, %xmm4
+
+# CHECK:      Iterations:        300
+# CHECK-NEXT: Instructions:      900
+# CHECK-NEXT: Total Cycles:      627
+# CHECK-NEXT: Total uOps:        2100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    3.35
+# CHECK-NEXT: IPC:               1.44
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  3      11    1.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  3      11    1.00                        vhaddps	%xmm3, %xmm3, %xmm4
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.49   1.51    -      -      -      -     2.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.49   0.51    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm3, %xmm3, %xmm4
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1]     D=====eeeeeeeeeeeER .    .    . .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,2]     .D===============eeeeeeeeeeeER. .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [1,0]     .DeeeeeE---------------------R. .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     . D====eeeeeeeeeeeE----------R. .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,2]     .  D==============eeeeeeeeeeeER .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [2,0]     .  DeeeeeE--------------------R .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [2,1]     .   D====eeeeeeeeeeeE---------R .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,2]     .    D==============eeeeeeeeeeeER   vhaddps	%xmm3, %xmm3, %xmm4
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    13.7      vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     3     5.3    0.0    6.3       vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 2.     3     15.3   0.0    0.0       vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..c2ea467ef54402a06f01bc4f33e616be883b6b2d
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vshufps $0, %xmm0, %xmm1, %xmm1
+vhaddps (%rdi), %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      20
+# CHECK-NEXT: Total uOps:        5
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50                        vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT:  4      16    1.00    *                   vhaddps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .   .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeeeeeER   vhaddps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       vhaddps	(%rdi), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..8988498705a39628e657f952df3dc2f357181175
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vshufps $0, %xmm0, %xmm1, %xmm1
+vhaddps (%rdi), %ymm1, %ymm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      20
+# CHECK-NEXT: Total uOps:        11
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.55
+# CHECK-NEXT: IPC:               0.10
+# CHECK-NEXT: Block RThroughput: 2.8
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50                        vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT:  10     16    2.00    *                   vhaddps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .   .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeeeeeeeeER   vhaddps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       vhaddps	(%rdi), %ymm1, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s b/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
new file mode 100644
index 0000000000000000000000000000000000000000..dfa9aaa6d89459908cee7aaf236c0977cd857642
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
@@ -0,0 +1,36 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info=true < %s | FileCheck %s --check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefix=DISABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info < %s | FileCheck %s -check-prefix=ENABLED
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false < %s | FileCheck %s -check-prefix=ENABLED
+
+vmulps   %xmm0, %xmm1, %xmm2
+vhaddps  %xmm2, %xmm2, %xmm3
+vhaddps  %xmm3, %xmm3, %xmm4
+
+# DISABLED-NOT: Instruction Info:
+
+
+# ENABLED:       Iterations:        100
+# ENABLED-NEXT:  Instructions:      300
+# ENABLED-NEXT:  Total Cycles:      228
+# ENABLED-NEXT:  Total uOps:        700
+
+
+# ENABLED:       Dispatch Width:    4
+# ENABLED-NEXT:  uOps Per Cycle:    3.07
+# ENABLED-NEXT:  IPC:               1.32
+# ENABLED-NEXT:  Block RThroughput: 2.0
+
+# ENABLED:       Instruction Info:
+# ENABLED-NEXT:  [1]: #uOps
+# ENABLED-NEXT:  [2]: Latency
+# ENABLED-NEXT:  [3]: RThroughput
+# ENABLED-NEXT:  [4]: MayLoad
+# ENABLED-NEXT:  [5]: MayStore
+# ENABLED-NEXT:  [6]: HasSideEffects (U)
+
+# ENABLED:       [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# ENABLED-NEXT:   1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
+# ENABLED-NEXT:   3      11    1.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# ENABLED-NEXT:   3      11    1.00                        vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
new file mode 100644
index 0000000000000000000000000000000000000000..90d0d392977f7c5f409505220b8b770ce01aae63
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
@@ -0,0 +1,105 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     8.00    -      -      -      -      -      -      -     4.00    -      -      -     4.00   3.99   4.01    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     0.99   0.01    -      -      -      -     vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: [0,1]     D=====eER .    .    .    ..   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    ..   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: [0,3]     D===========eER.    .    ..   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4]     .D===========eeeeeER.    ..   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: [0,5]     .D================eER    ..   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6]     .D=================eeeeeER.   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: [0,7]     .D======================eER   vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 2.     1     7.0    0.0    0.0       vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 3.     1     12.0   0.0    0.0       vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 4.     1     12.0   0.0    0.0       vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 5.     1     17.0   0.0    0.0       vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 6.     1     18.0   0.0    0.0       vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 7.     1     23.0   0.0    0.0       vmovaps	%xmm0, 48(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
new file mode 100644
index 0000000000000000000000000000000000000000..b69f77b36938b1b551cfd9c791c721834ade6142
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/memcpy-like-test.s
@@ -0,0 +1,105 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 -timeline -timeline-max-iterations=1 < %s | FileCheck %s
+
+vmovaps (%rsi), %xmm0
+vmovaps %xmm0, (%rdi)
+vmovaps 16(%rsi), %xmm0
+vmovaps %xmm0, 16(%rdi)
+vmovaps 32(%rsi), %xmm0
+vmovaps %xmm0, 32(%rdi)
+vmovaps 48(%rsi), %xmm0
+vmovaps %xmm0, 48(%rdi)
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      408
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.96
+# CHECK-NEXT: IPC:               1.96
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 4.05   3.95    -      -      -      -      -      -     3.95   0.05    -      -      -     4.00   3.95   4.05    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     0.97   0.03    -      -      -      -     0.97   0.03    -      -      -      -     vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -      -     vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 0.02   0.98    -      -      -      -      -      -     1.00    -      -      -      -      -     0.98   0.02    -      -      -      -     vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 0.98   0.02    -      -      -      -      -      -     0.98   0.02    -      -      -      -     1.00    -      -      -      -      -     vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 0.03   0.97    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: [0,1]     D=====eER ..   vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: [0,2]     DeeeeeE-R ..   vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: [0,3]     D======eER..   vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: [0,4]     .DeeeeeE-R..   vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: [0,5]     .D======eER.   vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: [0,6]     .DeeeeeE--R.   vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: [0,7]     .D=======eER   vmovaps	%xmm0, 48(%rdi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vmovaps	(%rsi), %xmm0
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       vmovaps	%xmm0, (%rdi)
+# CHECK-NEXT: 2.     1     1.0    1.0    1.0       vmovaps	16(%rsi), %xmm0
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       vmovaps	%xmm0, 16(%rdi)
+# CHECK-NEXT: 4.     1     1.0    1.0    1.0       vmovaps	32(%rsi), %xmm0
+# CHECK-NEXT: 5.     1     7.0    0.0    0.0       vmovaps	%xmm0, 32(%rdi)
+# CHECK-NEXT: 6.     1     1.0    1.0    2.0       vmovaps	48(%rsi), %xmm0
+# CHECK-NEXT: 7.     1     8.0    0.0    0.0       vmovaps	%xmm0, 48(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/one-idioms.s b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
new file mode 100644
index 0000000000000000000000000000000000000000..c2e0debcf351ffb9e919e1c08c04e0f285ec15e5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/one-idioms.s
@@ -0,0 +1,164 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s
+
+# These are dependency-breaking one-idioms.
+# Much like zero-idioms, but they produce ones, and do consume resources.
+
+# perf stats reports a throughput of 2.00 IPC.
+
+pcmpeqb   %mm2, %mm2
+pcmpeqd   %mm2, %mm2
+pcmpeqw   %mm2, %mm2
+
+pcmpeqb   %xmm2, %xmm2
+pcmpeqd   %xmm2, %xmm2
+pcmpeqq   %xmm2, %xmm2
+pcmpeqw   %xmm2, %xmm2
+
+vpcmpeqb  %xmm3, %xmm3, %xmm3
+vpcmpeqd  %xmm3, %xmm3, %xmm3
+vpcmpeqq  %xmm3, %xmm3, %xmm3
+vpcmpeqw  %xmm3, %xmm3, %xmm3
+
+vpcmpeqb  %xmm3, %xmm3, %xmm5
+vpcmpeqd  %xmm3, %xmm3, %xmm5
+vpcmpeqq  %xmm3, %xmm3, %xmm5
+vpcmpeqw  %xmm3, %xmm3, %xmm5
+
+# FIXME: their handling is broken in llvm-mca.
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      754
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.99
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    1500
+# CHECK-NEXT: Max number of mappings used:         72
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 1500
+# CHECK-NEXT:    Max number of mappings used:      72
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     7.50   7.50    -      -     7.50   7.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -      -      -      -     pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    ..   pcmpeqb	%mm2, %mm2
+# CHECK-NEXT: [0,1]     DeeER.    ..   pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: [0,2]     D=eeER    ..   pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: [0,3]     D==eeER   ..   pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: [0,4]     .DeeE-R   ..   pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: [0,5]     .D==eeER  ..   pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: [0,6]     .D=eeE-R  ..   pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: [0,7]     .D===eeER ..   vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,8]     . D=eeE-R ..   vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,9]     . D===eeER..   vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,10]    . D==eeE-R..   vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,11]    . D===eeER..   vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,12]    .  D===eeER.   vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,13]    .  D===eeER.   vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,14]    .  D====eeER   vpcmpeqw	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       pcmpeqb	%mm2, %mm2
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       pcmpeqd	%mm2, %mm2
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       pcmpeqw	%mm2, %mm2
+# CHECK-NEXT: 3.     1     3.0    3.0    0.0       pcmpeqb	%xmm2, %xmm2
+# CHECK-NEXT: 4.     1     1.0    1.0    1.0       pcmpeqd	%xmm2, %xmm2
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       pcmpeqq	%xmm2, %xmm2
+# CHECK-NEXT: 6.     1     2.0    2.0    1.0       pcmpeqw	%xmm2, %xmm2
+# CHECK-NEXT: 7.     1     4.0    4.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 8.     1     2.0    2.0    1.0       vpcmpeqd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 10.    1     3.0    3.0    1.0       vpcmpeqw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 11.    1     4.0    4.0    0.0       vpcmpeqb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 12.    1     4.0    4.0    0.0       vpcmpeqd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 13.    1     4.0    0.0    0.0       vpcmpeqq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 14.    1     5.0    5.0    0.0       vpcmpeqw	%xmm3, %xmm3, %xmm5
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..e5dcf7d761f178217d5da9854c77f49e0f48b59b
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-2.s
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
+
+imul   %rax, %rbx
+lzcnt  %ax,  %bx
+add    %ecx, %ebx
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      6     4.00                        imulq	%rax, %rbx
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%ax, %bx
+# CHECK-NEXT:  1      1     0.50                        addl	%ecx, %ebx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   imulq	%rax, %rbx
+# CHECK-NEXT: [0,1]     D=====eeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     D=======eER   addl	%ecx, %ebx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulq	%rax, %rbx
+# CHECK-NEXT: 1.     1     6.0    0.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
new file mode 100644
index 0000000000000000000000000000000000000000..4aad4729a5f7d22e0b14835ee0970177cf84cf33
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-3.s
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 1.00 IPC for this code snippet.
+
+# The ILP is limited by the false dependency on %dx. So, the mov cannot execute
+# in parallel with the add.
+
+add %cx, %dx
+mov %ax, %dx
+xor %bx, %dx
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      4503
+# CHECK-NEXT: Total uOps:        4500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        addw	%cx, %dx
+# CHECK-NEXT:  1      1     0.50                        movw	%ax, %dx
+# CHECK-NEXT:  1      1     0.50                        xorw	%bx, %dx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movw	%ax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%bx, %dx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    ..   addw	%cx, %dx
+# CHECK-NEXT: [0,1]     D=eER.    ..   movw	%ax, %dx
+# CHECK-NEXT: [0,2]     D==eER    ..   xorw	%bx, %dx
+# CHECK-NEXT: [1,0]     D===eER   ..   addw	%cx, %dx
+# CHECK-NEXT: [1,1]     .D===eER  ..   movw	%ax, %dx
+# CHECK-NEXT: [1,2]     .D====eER ..   xorw	%bx, %dx
+# CHECK-NEXT: [2,0]     .D=====eER..   addw	%cx, %dx
+# CHECK-NEXT: [2,1]     .D======eER.   movw	%ax, %dx
+# CHECK-NEXT: [2,2]     . D======eER   xorw	%bx, %dx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     3.7    0.3    0.0       addw	%cx, %dx
+# CHECK-NEXT: 1.     3     4.3    0.0    0.0       movw	%ax, %dx
+# CHECK-NEXT: 2.     3     5.0    0.0    0.0       xorw	%bx, %dx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
new file mode 100644
index 0000000000000000000000000000000000000000..6194ecbb127d8e2e0803c70e8470edd4c860bcd8
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-4.s
@@ -0,0 +1,91 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 0.60 IPC for this code snippet.
+
+# The lzcnt cannot execute in parallel with the imul because there is a false
+# dependency on %bx.
+
+imul %ax, %bx
+lzcnt %ax, %bx
+add %cx, %bx
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      9003
+# CHECK-NEXT: Total uOps:        6000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.67
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        imulw	%ax, %bx
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%ax, %bx
+# CHECK-NEXT:  1      1     0.50                        addw	%cx, %bx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%ax, %bx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%cx, %bx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeeER   .    .    .   imulw	%ax, %bx
+# CHECK-NEXT: [0,1]     D===eeER  .    .    .   lzcntw	%ax, %bx
+# CHECK-NEXT: [0,2]     D=====eER .    .    .   addw	%cx, %bx
+# CHECK-NEXT: [1,0]     .D=====eeeeER  .    .   imulw	%ax, %bx
+# CHECK-NEXT: [1,1]     .D========eeER .    .   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,2]     .D==========eER.    .   addw	%cx, %bx
+# CHECK-NEXT: [2,0]     . D==========eeeeER .   imulw	%ax, %bx
+# CHECK-NEXT: [2,1]     . D=============eeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,2]     . D===============eER   addw	%cx, %bx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     6.0    0.3    0.0       imulw	%ax, %bx
+# CHECK-NEXT: 1.     3     9.0    0.0    0.0       lzcntw	%ax, %bx
+# CHECK-NEXT: 2.     3     11.0   0.0    0.0       addw	%cx, %bx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
new file mode 100644
index 0000000000000000000000000000000000000000..ee892a4231fe7bc9d72093b4c9131dc542098967
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-5.s
@@ -0,0 +1,73 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 1.00 IPC for this code snippet.
+
+lzcnt %ax, %bx  ## partial register stall.
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      1500
+# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total uOps:        3000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%ax, %bx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%ax, %bx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456
+
+# CHECK:      [0,0]     DeeER..   lzcntw	%ax, %bx
+# CHECK-NEXT: [1,0]     D=eeER.   lzcntw	%ax, %bx
+# CHECK-NEXT: [2,0]     .D=eeER   lzcntw	%ax, %bx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    0.3    0.0       lzcntw	%ax, %bx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
new file mode 100644
index 0000000000000000000000000000000000000000..8723744aaa681ba353189ca91fbefbade476da75
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update-6.s
@@ -0,0 +1,92 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1500 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# perf stat reports a throughput of 0.60 IPC for this code snippet.
+# Each lzcnt has a false dependency on %ecx; the first lzcnt has to wait on the
+# imul. However, the folded load can start immediately.
+# The last lzcnt has a false dependency on %cx. However, even in this case, the
+# folded load can start immediately.
+
+imul %edx, %ecx
+lzcnt (%rsp), %cx
+lzcnt 2(%rsp), %cx
+
+# CHECK:      Iterations:        1500
+# CHECK-NEXT: Instructions:      4500
+# CHECK-NEXT: Total Cycles:      10503
+# CHECK-NEXT: Total uOps:        7500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: Block RThroughput: 1.3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        imull	%edx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntw	(%rsp), %cx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntw	2(%rsp), %cx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     2.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -     1.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edx, %ecx
+# CHECK-NEXT:  -     1.00    -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	(%rsp), %cx
+# CHECK-NEXT:  -     1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	2(%rsp), %cx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeER   .    .    .  .   imull	%edx, %ecx
+# CHECK-NEXT: [0,1]     DeeeeeeER .    .    .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    .    .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [1,0]     .D======eeeeER .    .  .   imull	%edx, %ecx
+# CHECK-NEXT: [1,1]     . D=====eeeeeeER    .  .   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [1,2]     . D======eeeeeeER   .  .   lzcntw	2(%rsp), %cx
+# CHECK-NEXT: [2,0]     .  D===========eeeeER  .   imull	%edx, %ecx
+# CHECK-NEXT: [2,1]     .  D===========eeeeeeER.   lzcntw	(%rsp), %cx
+# CHECK-NEXT: [2,2]     .   D===========eeeeeeER   lzcntw	2(%rsp), %cx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     6.7    0.3    0.0       imull	%edx, %ecx
+# CHECK-NEXT: 1.     3     6.3    0.0    0.0       lzcntw	(%rsp), %cx
+# CHECK-NEXT: 2.     3     6.7    0.0    0.0       lzcntw	2(%rsp), %cx
diff --git a/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
new file mode 100644
index 0000000000000000000000000000000000000000..6c1146b2224877dc84b4fc34d28fb4ca2e8187b2
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/partial-reg-update.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s
+
+imul %ax, %cx
+add  %al, %cl
+add  %ecx, %ebx
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        imulw	%ax, %cx
+# CHECK-NEXT:  1      1     0.50                        addb	%al, %cl
+# CHECK-NEXT:  1      1     0.50                        addl	%ecx, %ebx
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeeeER .   imulw	%ax, %cx
+# CHECK-NEXT: [0,1]     D====eER.   addb	%al, %cl
+# CHECK-NEXT: [0,2]     D=====eER   addl	%ecx, %ebx
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       imulw	%ax, %cx
+# CHECK-NEXT: 1.     1     5.0    0.0    0.0       addb	%al, %cl
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       addl	%ecx, %ebx
diff --git a/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
new file mode 100644
index 0000000000000000000000000000000000000000..86fee396350869ec3282f7888ffebd0fb633c75a
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/pipes-fpu.s
@@ -0,0 +1,120 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+# VALU0/VALU1
+vpmulld     %xmm0, %xmm1, %xmm2
+vpand       %xmm0, %xmm1, %xmm2
+
+# VIMUL/STC
+vcvttps2dq  %xmm0, %xmm2
+vpclmulqdq  $0, %xmm0, %xmm1, %xmm2
+
+# FPA/FPM
+vaddps      %xmm0, %xmm1, %xmm2
+vsqrtps     %xmm0, %xmm2
+
+# FPA/FPM YMM
+vaddps      %ymm0, %ymm1, %ymm2
+vsqrtps     %ymm0, %ymm2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      3244
+# CHECK-NEXT: Total uOps:        1500
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.46
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 32.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     2.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  6      13    1.00                        vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      9     21.00                       vsqrtps	%ymm0, %ymm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     32.71  32.29   -     2.00   3.00   1.00   6.00   6.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00   2.00    -     2.00   1.00    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.71  10.29   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	%ymm0, %ymm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    .    .  .   vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,1]     D=eeE--R  .    .    .    .    .    .    .    .    .    .  .   vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     D==eeeeER .    .    .    .    .    .    .    .    .    .  .   vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: [0,3]     .D=eeeeeeeeeeeeeER  .    .    .    .    .    .    .    .  .   vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,4]     . D=eeeeeE-------R  .    .    .    .    .    .    .    .  .   vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [0,5]     . D=eeeeeeeeeE---R  .    .    .    .    .    .    .    .  .   vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: [0,6]     .  D=eeeeeE------R  .    .    .    .    .    .    .    .  .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,7]     .  D==eeeeeeeeeE--R .    .    .    .    .    .    .    .  .   vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: [1,0]     .   D===eeeeeE----R .    .    .    .    .    .    .    .  .   vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     .   DeeE----------R .    .    .    .    .    .    .    .  .   vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .   D====eeeeE----R .    .    .    .    .    .    .    .  .   vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: [1,3]     .    D=eeeeeeeeeeeeeER   .    .    .    .    .    .    .  .   vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,4]     .    .D==================eeeeeER   .    .    .    .    .  .   vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .    .D===================eeeeeeeeeER   .    .    .    .  .   vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: [1,6]     .    . D=======================================eeeeeER .  .   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,7]     .    . D========================================eeeeeeeeeER   vsqrtps	%ymm0, %ymm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     2.5    2.5    2.0       vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     2     1.5    1.5    6.0       vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 2.     2     4.0    4.0    2.0       vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 3.     2     2.0    2.0    0.0       vpclmulqdq	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 4.     2     10.5   10.5   3.5       vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 5.     2     11.0   11.0   1.5       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 6.     2     21.0   21.0   3.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 7.     2     22.0   22.0   1.0       vsqrtps	%ymm0, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/pr37790.s b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
new file mode 100644
index 0000000000000000000000000000000000000000..2471c42e4455d6ac46e4a16d756f77bf14b10758
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/pr37790.s
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -lqueue=2 -iterations=2 -resource-pressure=false -timeline -timeline-max-cycles=104 < %s | FileCheck %s
+
+int3
+stmxcsr (%rsp)
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      4
+# CHECK-NEXT: Total Cycles:      205
+# CHECK-NEXT: Total uOps:        6
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.03
+# CHECK-NEXT: IPC:               0.02
+# CHECK-NEXT: Block RThroughput: 0.8
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.50    *      *      U     int3
+# CHECK-NEXT:  2      1     0.50    *      *      U     stmxcsr	(%rsp)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER.   int3
+# CHECK-NEXT: [0,1]     D====================================================================================================eER   stmxcsr	(%rsp)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     1.0    0.5    0.0       int3
+# CHECK-NEXT: 1.     2     100.5  0.0    0.0       stmxcsr	(%rsp)
diff --git a/test/tools/llvm-mca/X86/BdVer2/rank.s b/test/tools/llvm-mca/X86/BdVer2/rank.s
new file mode 100644
index 0000000000000000000000000000000000000000..87f7d527c03b4893989016fa17b171ffc40daede
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/rank.s
@@ -0,0 +1,121 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+add %eax, %ecx
+add %eax, %edx
+add %eax, %ebx
+add %edx, %esi
+add %ebx, %eax
+add %edx, %esi
+add %ebx, %eax
+add %ebx, %eax
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      800
+# CHECK-NEXT: Total Cycles:      503
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.59
+# CHECK-NEXT: IPC:               1.59
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %ecx
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %edx
+# CHECK-NEXT:  1      1     0.50                        addl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.50                        addl	%edx, %esi
+# CHECK-NEXT:  1      1     0.50                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%edx, %esi
+# CHECK-NEXT:  1      1     0.50                        addl	%ebx, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	%ebx, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.01   0.99    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %edx
+# CHECK-NEXT:  -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edx, %esi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%ebx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    .    . .   addl	%eax, %ecx
+# CHECK-NEXT: [0,1]     DeER .    .    . .   addl	%eax, %edx
+# CHECK-NEXT: [0,2]     D=eER.    .    . .   addl	%eax, %ebx
+# CHECK-NEXT: [0,3]     D=eER.    .    . .   addl	%edx, %esi
+# CHECK-NEXT: [0,4]     .D=eER    .    . .   addl	%ebx, %eax
+# CHECK-NEXT: [0,5]     .D=eER    .    . .   addl	%edx, %esi
+# CHECK-NEXT: [0,6]     .D==eER   .    . .   addl	%ebx, %eax
+# CHECK-NEXT: [0,7]     .D===eER  .    . .   addl	%ebx, %eax
+# CHECK-NEXT: [1,0]     . D====eER.    . .   addl	%eax, %ecx
+# CHECK-NEXT: [1,1]     . D===eE-R.    . .   addl	%eax, %edx
+# CHECK-NEXT: [1,2]     . D===eE-R.    . .   addl	%eax, %ebx
+# CHECK-NEXT: [1,3]     . D====eER.    . .   addl	%edx, %esi
+# CHECK-NEXT: [1,4]     .  D====eER    . .   addl	%ebx, %eax
+# CHECK-NEXT: [1,5]     .  D====eER    . .   addl	%edx, %esi
+# CHECK-NEXT: [1,6]     .  D=====eER   . .   addl	%ebx, %eax
+# CHECK-NEXT: [1,7]     .  D======eER  . .   addl	%ebx, %eax
+# CHECK-NEXT: [2,0]     .   D=======eER. .   addl	%eax, %ecx
+# CHECK-NEXT: [2,1]     .   D======eE-R. .   addl	%eax, %edx
+# CHECK-NEXT: [2,2]     .   D======eE-R. .   addl	%eax, %ebx
+# CHECK-NEXT: [2,3]     .   D=======eER. .   addl	%edx, %esi
+# CHECK-NEXT: [2,4]     .    D=======eER .   addl	%ebx, %eax
+# CHECK-NEXT: [2,5]     .    D=======eER .   addl	%edx, %esi
+# CHECK-NEXT: [2,6]     .    D========eER.   addl	%ebx, %eax
+# CHECK-NEXT: [2,7]     .    D=========eER   addl	%ebx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     4.7    1.0    0.0       addl	%eax, %ecx
+# CHECK-NEXT: 1.     3     4.0    0.3    0.7       addl	%eax, %edx
+# CHECK-NEXT: 2.     3     4.3    0.7    0.7       addl	%eax, %ebx
+# CHECK-NEXT: 3.     3     5.0    0.0    0.0       addl	%edx, %esi
+# CHECK-NEXT: 4.     3     5.0    0.7    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 5.     3     5.0    0.0    0.0       addl	%edx, %esi
+# CHECK-NEXT: 6.     3     6.0    0.0    0.0       addl	%ebx, %eax
+# CHECK-NEXT: 7.     3     7.0    0.0    0.0       addl	%ebx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
new file mode 100644
index 0000000000000000000000000000000000000000..42467f7b3a18ecf1916c2e77fc4f5da61b989451
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/rcu-statistics.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -retire-stats -iterations=1 < %s | FileCheck %s
+
+  vsqrtps %xmm0, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+  vaddps  %xmm0, %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      16
+# CHECK-NEXT: Total Cycles:      22
+# CHECK-NEXT: Total uOps:        16
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.73
+# CHECK-NEXT: IPC:               0.73
+# CHECK-NEXT: Block RThroughput: 18.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      9     10.50                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+
+# CHECK:      Retire Control Unit - number of cycles where we saw N instructions retired:
+# CHECK-NEXT: [# retired], [# cycles]
+# CHECK-NEXT:  0,           11  (50.0%)
+# CHECK-NEXT:  1,           9  (40.9%)
+# CHECK-NEXT:  3,           1  (4.5%)
+# CHECK-NEXT:  4,           1  (4.5%)
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..912b11b2ddd506a7cf00c50f0d17b746df8de521
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-1.s
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+# The vmul can start executing 3cy in advance. That is beause the first use
+# operand (i.e. %xmm1) is a ReadAfterLd. That means, the memory operand is
+# evaluated before %xmm1.
+
+vaddps  %xmm0, %xmm0, %xmm1
+vmulps  (%rdi), %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total uOps:        2
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.15
+# CHECK-NEXT: IPC:               0.15
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      10    1.00    *                   vmulps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  . .   vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DeeeeeeeeeeER   vmulps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vmulps	(%rdi), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..7f2d1ae9c6c2feefa2e73e21d9c9c4e5395d4ac5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-2.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=0 -timeline < %s | FileCheck %s
+
+  imull  %esi
+  imull  (%rdi)
+
+# The second integer multiply can start at cycle 2 because the implicit reads
+# can start after the load operand is evaluated.
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total uOps:        2
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.17
+# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     1.00                        imull	%esi
+# CHECK-NEXT:  1      8     1.00    *                   imull	(%rdi)
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   ..   imull	%esi
+# CHECK-NEXT: [0,1]     D=eeeeeeeeER   imull	(%rdi)
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       imull	%esi
+# CHECK-NEXT: 1.     1     2.0    1.0    0.0       imull	(%rdi)
diff --git a/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s b/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
new file mode 100644
index 0000000000000000000000000000000000000000..44cea0a42533bd3e2e31edb2f806fafa731d994c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/read-advance-3.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=0 -timeline -dispatch=3 < %s | FileCheck %s
+
+  add %rdi, %rsi
+  add (%rsp), %rsi
+  add %rdx, %r8
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      3
+# CHECK-NEXT: Total Cycles:      8
+# CHECK-NEXT: Total uOps:        3
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        addq	%rdi, %rsi
+# CHECK-NEXT:  1      5     0.50    *                   addq	(%rsp), %rsi
+# CHECK-NEXT:  1      1     0.50                        addq	%rdx, %r8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     01234567
+
+# CHECK:      [0,0]     DeER . .   addq	%rdi, %rsi
+# CHECK-NEXT: [0,1]     DeeeeeER   addq	(%rsp), %rsi
+# CHECK-NEXT: [0,2]     D=eE---R   addq	%rdx, %r8
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       addq	%rdi, %rsi
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       addq	(%rsp), %rsi
+# CHECK-NEXT: 2.     1     2.0    2.0    3.0       addq	%rdx, %r8
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..19737e85d191dd3cbf366c9385e9554b7fc8c0ec
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-1.s
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+# The register move from XMM0 to XMM1 can be eliminated at register renaming
+# stage. So, it should not consume pipeline resources.
+
+vxorps %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vaddps %xmm1, %xmm1, %xmm2
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      9
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        9
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.82
+# CHECK-NEXT: IPC:               0.82
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm1, %xmm1, %xmm2
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    6
+# CHECK-NEXT: Max number of mappings used:         6
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 6
+# CHECK-NEXT:    Max number of mappings used:      6
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   1.33    -      -      -      -     1.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -      -     1.00    -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .    .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     D=eeeeeER .   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,0]     D-------R .   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     .DeE----R .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     .D=eeeeeER.   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,0]     .D-------R.   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .D=eE----R.   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     . D=eeeeeER   vaddps	%xmm1, %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    4.7       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     1.3    1.3    2.7       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     2.0    0.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..ee9fddec67390fe68640c02676f1b34a622882e7
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-2.s
@@ -0,0 +1,143 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+pxor %mm0, %mm0
+movq %mm0, %mm1
+
+xorps %xmm0, %xmm0
+movaps %xmm0, %xmm1
+movups %xmm1, %xmm2
+movapd %xmm2, %xmm3
+movupd %xmm3, %xmm4
+movdqa %xmm4, %xmm5
+movdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      27
+# CHECK-NEXT: Total Cycles:      18
+# CHECK-NEXT: Total uOps:        27
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.50
+# CHECK-NEXT: IPC:               1.50
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        pxor	%mm0, %mm0
+# CHECK-NEXT:  1      2     0.50                        movq	%mm0, %mm1
+# CHECK-NEXT:  1      0     0.25                        xorps	%xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        movaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        movups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        movapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        movupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      2     0.50                        movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      2     0.50                        movdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    21
+# CHECK-NEXT: Max number of mappings used:         16
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 21
+# CHECK-NEXT:    Max number of mappings used:      16
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.67   1.33    -     3.00    -      -     3.33   3.67    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     1.00    -      -      -      -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     0.33   0.67    -      -      -      -     movdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .    . .   pxor	%mm0, %mm0
+# CHECK-NEXT: [0,1]     DeeER.    .    . .   movq	%mm0, %mm1
+# CHECK-NEXT: [0,2]     D---R.    .    . .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,3]     DeE-R.    .    . .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,4]     .DeER.    .    . .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [0,5]     .D=eER    .    . .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,6]     .D==eER   .    . .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,7]     .D===eeER .    . .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,8]     . D====eeER    . .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     . D-------R    . .   pxor	%mm0, %mm0
+# CHECK-NEXT: [1,1]     . DeeE----R    . .   movq	%mm0, %mm1
+# CHECK-NEXT: [1,2]     . D-------R    . .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [1,3]     .  DeE-----R   . .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,4]     .  D=eE----R   . .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .  D==eE---R   . .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,6]     .  D===eE--R   . .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,7]     .   D===eeE-R  . .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,8]     .   D=====eeER . .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .   D--------R . .   pxor	%mm0, %mm0
+# CHECK-NEXT: [2,1]     .   D=eeE----R . .   movq	%mm0, %mm1
+# CHECK-NEXT: [2,2]     .    D-------R . .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [2,3]     .    D==eE----R. .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,4]     .    D===eE---R. .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [2,5]     .    D====eE--R. .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,6]     .    .D====eE-R. .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,7]     .    .D=====eeER .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,8]     .    .D=======eeER   movdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    5.0       pxor	%mm0, %mm0
+# CHECK-NEXT: 1.     3     1.3    1.3    2.7       movq	%mm0, %mm1
+# CHECK-NEXT: 2.     3     0.0    0.0    5.7       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 3.     3     1.7    1.7    3.3       movaps	%xmm0, %xmm1
+# CHECK-NEXT: 4.     3     2.3    0.0    2.3       movups	%xmm1, %xmm2
+# CHECK-NEXT: 5.     3     3.3    0.0    1.7       movapd	%xmm2, %xmm3
+# CHECK-NEXT: 6.     3     4.0    0.0    1.0       movupd	%xmm3, %xmm4
+# CHECK-NEXT: 7.     3     4.7    0.0    0.3       movdqa	%xmm4, %xmm5
+# CHECK-NEXT: 8.     3     6.3    0.0    0.0       movdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
new file mode 100644
index 0000000000000000000000000000000000000000..ada52545a9b184b14ddb3a6210de0566feb75602
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-3.s
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+vxorps  %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vmovups %xmm1, %xmm2
+vmovapd %xmm2, %xmm3
+vmovupd %xmm3, %xmm4
+vmovdqa %xmm4, %xmm5
+vmovdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      21
+# CHECK-NEXT: Total Cycles:      17
+# CHECK-NEXT: Total uOps:        21
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.24
+# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      2     0.50                        vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    18
+# CHECK-NEXT: Max number of mappings used:         15
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 18
+# CHECK-NEXT:    Max number of mappings used:      15
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00   1.33   0.67    -      -     3.00   3.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.67   0.33    -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.67   0.33    -      -      -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -     0.33   0.67    -      -      -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -     0.33   0.67    -      -      -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     0.33   0.67    -      -      -      -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.67   0.33    -      -     0.67   0.33    -      -      -      -     vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DeER .    .    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     D=eER.    .    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [0,3]     D==eER    .    ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,4]     .D==eER   .    ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,5]     .D===eeER .    ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,6]     .D=====eeER    ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .D--------R    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     . DeE-----R    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     . D=eE----R    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [1,3]     . D==eE----R   ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,4]     . D===eE---R   ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,5]     .  D===eeE-R   ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,6]     .  D=====eeER  ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .  D--------R  ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .  D==eE----R  ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     .   D===eE--R  ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [2,3]     .   D====eE--R ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,4]     .   D=====eE-R ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,5]     .   D======eeER..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,6]     .    D=======eeER   vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    5.3       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     1.7    1.7    3.0       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     2.7    0.3    2.0       vmovups	%xmm1, %xmm2
+# CHECK-NEXT: 3.     3     3.7    0.0    2.0       vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: 4.     3     4.3    0.0    1.3       vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: 5.     3     5.0    0.0    0.3       vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: 6.     3     6.7    0.0    0.0       vmovdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
new file mode 100644
index 0000000000000000000000000000000000000000..e651ff0becb307b91cec51bb6bd37e2142d25b67
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-4.s
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %eax, %eax
+mov %eax, %ebx
+mov %ebx, %ecx
+mov %ecx, %edx
+mov %edx, %eax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.36
+# CHECK-NEXT: IPC:               1.36
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        xorl	%eax, %eax
+# CHECK-NEXT:  1      1     0.50                        movl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.50                        movl	%ebx, %ecx
+# CHECK-NEXT:  1      1     0.50                        movl	%ecx, %edx
+# CHECK-NEXT:  1      1     0.50                        movl	%edx, %eax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         11
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 12
+# CHECK-NEXT:    Max number of mappings used:      11
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   xorl	%eax, %eax
+# CHECK-NEXT: [0,1]     DeER .    .   movl	%eax, %ebx
+# CHECK-NEXT: [0,2]     D=eER.    .   movl	%ebx, %ecx
+# CHECK-NEXT: [0,3]     D==eER    .   movl	%ecx, %edx
+# CHECK-NEXT: [0,4]     .D==eER   .   movl	%edx, %eax
+# CHECK-NEXT: [1,0]     .D----R   .   xorl	%eax, %eax
+# CHECK-NEXT: [1,1]     .DeE--R   .   movl	%eax, %ebx
+# CHECK-NEXT: [1,2]     .D=eE-R   .   movl	%ebx, %ecx
+# CHECK-NEXT: [1,3]     . D=eE-R  .   movl	%ecx, %edx
+# CHECK-NEXT: [1,4]     . D==eER  .   movl	%edx, %eax
+# CHECK-NEXT: [2,0]     . D----R  .   xorl	%eax, %eax
+# CHECK-NEXT: [2,1]     . D==eER  .   movl	%eax, %ebx
+# CHECK-NEXT: [2,2]     .  D==eER .   movl	%ebx, %ecx
+# CHECK-NEXT: [2,3]     .  D===eER.   movl	%ecx, %edx
+# CHECK-NEXT: [2,4]     .  D====eER   movl	%edx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    2.7       xorl	%eax, %eax
+# CHECK-NEXT: 1.     3     1.7    1.7    0.7       movl	%eax, %ebx
+# CHECK-NEXT: 2.     3     2.3    0.0    0.3       movl	%ebx, %ecx
+# CHECK-NEXT: 3.     3     3.0    0.0    0.3       movl	%ecx, %edx
+# CHECK-NEXT: 4.     3     3.7    0.0    0.0       movl	%edx, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
new file mode 100644
index 0000000000000000000000000000000000000000..188eb5dd15808dbe84846e85d22206f4e7767195
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/reg-move-elimination-5.s
@@ -0,0 +1,114 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %rax, %rax
+mov %rax, %rbx
+mov %rbx, %rcx
+mov %rcx, %rdx
+mov %rdx, %rax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.36
+# CHECK-NEXT: IPC:               1.36
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        xorq	%rax, %rax
+# CHECK-NEXT:  1      1     0.50                        movq	%rax, %rbx
+# CHECK-NEXT:  1      1     0.50                        movq	%rbx, %rcx
+# CHECK-NEXT:  1      1     0.50                        movq	%rcx, %rdx
+# CHECK-NEXT:  1      1     0.50                        movq	%rdx, %rax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    12
+# CHECK-NEXT: Max number of mappings used:         11
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 12
+# CHECK-NEXT:    Max number of mappings used:      11
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
+# CHECK-NEXT:  -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   xorq	%rax, %rax
+# CHECK-NEXT: [0,1]     DeER .    .   movq	%rax, %rbx
+# CHECK-NEXT: [0,2]     D=eER.    .   movq	%rbx, %rcx
+# CHECK-NEXT: [0,3]     D==eER    .   movq	%rcx, %rdx
+# CHECK-NEXT: [0,4]     .D==eER   .   movq	%rdx, %rax
+# CHECK-NEXT: [1,0]     .D----R   .   xorq	%rax, %rax
+# CHECK-NEXT: [1,1]     .DeE--R   .   movq	%rax, %rbx
+# CHECK-NEXT: [1,2]     .D=eE-R   .   movq	%rbx, %rcx
+# CHECK-NEXT: [1,3]     . D=eE-R  .   movq	%rcx, %rdx
+# CHECK-NEXT: [1,4]     . D==eER  .   movq	%rdx, %rax
+# CHECK-NEXT: [2,0]     . D----R  .   xorq	%rax, %rax
+# CHECK-NEXT: [2,1]     . D==eER  .   movq	%rax, %rbx
+# CHECK-NEXT: [2,2]     .  D==eER .   movq	%rbx, %rcx
+# CHECK-NEXT: [2,3]     .  D===eER.   movq	%rcx, %rdx
+# CHECK-NEXT: [2,4]     .  D====eER   movq	%rdx, %rax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    2.7       xorq	%rax, %rax
+# CHECK-NEXT: 1.     3     1.7    1.7    0.7       movq	%rax, %rbx
+# CHECK-NEXT: 2.     3     2.3    0.0    0.3       movq	%rbx, %rcx
+# CHECK-NEXT: 3.     3     3.0    0.0    0.3       movq	%rcx, %rdx
+# CHECK-NEXT: 4.     3     3.7    0.0    0.0       movq	%rdx, %rax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-1.s b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..70685f1726add50f98e7f4bf0dcd2abaeef2ecf8
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-1.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+vaddps %xmm0, %xmm0, %xmm0
+vmulps %xmm0, %xmm0, %xmm0
+
+# CHECK:      Iterations:        5
+# CHECK-NEXT: Instructions:      10
+# CHECK-NEXT: Total Cycles:      53
+# CHECK-NEXT: Total uOps:        10
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.19
+# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              50  (94.3%)
+# CHECK-NEXT:  2,              1  (1.9%)
+# CHECK-NEXT:  4,              2  (3.8%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    10
+# CHECK-NEXT: Max number of mappings used:         10
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 10
+# CHECK-NEXT:    Max number of mappings used:      10
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,0]     D==========eeeeeER  .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     D===============eeeeeER  .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,0]     .D===================eeeeeER  .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .D========================eeeeeER  .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,0]     .D=============================eeeeeER  .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,1]     .D==================================eeeeeER  .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,0]     . D======================================eeeeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,1]     . D===========================================eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     5     20.2   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     5     25.2   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-2.s b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..354876befcdddf7e1ceca39855273e559fa982b7
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-2.s
@@ -0,0 +1,99 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -register-file-size=5 -iterations=5 -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+vaddps %xmm0, %xmm0, %xmm0
+vmulps %xmm0, %xmm0, %xmm0
+
+# CHECK:      Iterations:        5
+# CHECK-NEXT: Instructions:      10
+# CHECK-NEXT: Total Cycles:      53
+# CHECK-NEXT: Total uOps:        10
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.19
+# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      26  (49.1%)
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              46  (86.8%)
+# CHECK-NEXT:  1,              6  (11.3%)
+# CHECK-NEXT:  4,              1  (1.9%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    10
+# CHECK-NEXT: Max number of mappings used:         5
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 10
+# CHECK-NEXT:    Max number of mappings used:      5
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     1.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,0]     D==========eeeeeER  .    .    .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     D===============eeeeeER  .    .    .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,0]     .D===================eeeeeER  .    .    .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .    . D==================eeeeeER  .    .    .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,0]     .    .    . D==================eeeeeER  .    .    . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [3,1]     .    .    .    . D==================eeeeeER  .    . .   vmulps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,0]     .    .    .    .    . D==================eeeeeER  . .   vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [4,1]     .    .    .    .    .    . D==================eeeeeER   vmulps	%xmm0, %xmm0, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     5     14.0   0.2    0.0       vaddps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     5     15.8   0.0    0.0       vmulps	%xmm0, %xmm0, %xmm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-3.s b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
new file mode 100644
index 0000000000000000000000000000000000000000..a5f5746d7f9be5700184c861ef5061621c81cf07
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-3.s
@@ -0,0 +1,98 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -register-file-size=5 -iterations=2 -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+idiv %eax
+
+# CHECK:      Iterations:        2
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      42
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.10
+# CHECK-NEXT: IPC:               0.05
+# CHECK-NEXT: Block RThroughput: 25.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      14    25.00                 U     idivl	%eax
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      16  (38.1%)
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              40  (95.2%)
+# CHECK-NEXT:  2,              2  (4.8%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    6
+# CHECK-NEXT: Max number of mappings used:         3
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 6
+# CHECK-NEXT:    Max number of mappings used:      3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeER   .    .    .    .    ..   idivl	%eax
+# CHECK-NEXT: [1,0]     .    .    .    .D=========eeeeeeeeeeeeeeER   idivl	%eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     5.5    5.5    0.0       idivl	%eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-4.s b/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
new file mode 100644
index 0000000000000000000000000000000000000000..09c9e4af7e0c5f442247ec7c2aa5423df1d2b779
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-4.s
@@ -0,0 +1,69 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=22 -dispatch-stats -register-file-stats -resource-pressure=false -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+idiv %eax
+
+# CHECK:      Iterations:        22
+# CHECK-NEXT: Instructions:      22
+# CHECK-NEXT: Total Cycles:      542
+# CHECK-NEXT: Total uOps:        44
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.08
+# CHECK-NEXT: IPC:               0.04
+# CHECK-NEXT: Block RThroughput: 25.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      14    25.00                 U     idivl	%eax
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              531  (98.0%)
+# CHECK-NEXT:  4,              11  (2.0%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    66
+# CHECK-NEXT: Max number of mappings used:         66
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 66
+# CHECK-NEXT:    Max number of mappings used:      66
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeeeeeeeeeER   .    .    .    .    .    .    .    .    .    ..   idivl	%eax
+# CHECK-NEXT: [1,0]     D=========================eeeeeeeeeeeeeeER   .    .    .    .    ..   idivl	%eax
+# CHECK-NEXT: [2,0]     .D=================================================eeeeeeeeeeeeeeER   idivl	%eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     25.7   7.7    0.0       idivl	%eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/register-files-5.s b/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
new file mode 100644
index 0000000000000000000000000000000000000000..28922197333ebaeb0c6736aa9a554ec840ff7cc4
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/register-files-5.s
@@ -0,0 +1,153 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -instruction-info=false -dispatch-stats -register-file-stats -timeline < %s | FileCheck %s
+
+  vdivps %ymm0, %ymm0, %ymm1
+  vaddps %ymm0, %ymm0, %ymm2
+  vaddps %ymm0, %ymm0, %ymm3
+  vaddps %ymm0, %ymm0, %ymm4
+  vaddps %ymm0, %ymm0, %ymm5
+  vaddps %ymm0, %ymm0, %ymm6
+  vaddps %ymm0, %ymm0, %ymm7
+  vaddps %ymm0, %ymm0, %ymm8
+  vaddps %ymm0, %ymm0, %ymm9
+  vaddps %ymm0, %ymm0, %ymm10
+  vaddps %ymm0, %ymm0, %ymm11
+  vaddps %ymm0, %ymm0, %ymm12
+  vaddps %ymm0, %ymm0, %ymm13
+  vaddps %ymm0, %ymm0, %ymm14
+  vaddps %ymm0, %ymm0, %ymm15
+  vaddps %ymm2, %ymm0, %ymm0
+  vaddps %ymm2, %ymm0, %ymm3
+  vaddps %ymm2, %ymm0, %ymm4
+  vaddps %ymm2, %ymm0, %ymm5
+  vaddps %ymm2, %ymm0, %ymm6
+  vaddps %ymm2, %ymm0, %ymm7
+  vaddps %ymm2, %ymm0, %ymm8
+  vaddps %ymm2, %ymm0, %ymm9
+  vaddps %ymm2, %ymm0, %ymm10
+  vaddps %ymm2, %ymm0, %ymm11
+  vaddps %ymm2, %ymm0, %ymm12
+  vaddps %ymm2, %ymm0, %ymm13
+  vaddps %ymm2, %ymm0, %ymm14
+  vaddps %ymm2, %ymm0, %ymm15
+  vaddps %ymm3, %ymm0, %ymm2
+  vaddps %ymm3, %ymm0, %ymm4
+  vaddps %ymm3, %ymm0, %ymm5
+  vaddps %ymm3, %ymm0, %ymm6
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      33
+# CHECK-NEXT: Total Cycles:      70
+# CHECK-NEXT: Total uOps:        66
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.94
+# CHECK-NEXT: IPC:               0.47
+# CHECK-NEXT: Block RThroughput: 64.0
+
+# CHECK:      Dynamic Dispatch Stall Cycles:
+# CHECK-NEXT: RAT     - Register unavailable:                      0
+# CHECK-NEXT: RCU     - Retire tokens unavailable:                 0
+# CHECK-NEXT: SCHEDQ  - Scheduler full:                            0
+# CHECK-NEXT: LQ      - Load queue full:                           0
+# CHECK-NEXT: SQ      - Store queue full:                          0
+# CHECK-NEXT: GROUP   - Static restrictions on the dispatch group: 0
+
+# CHECK:      Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
+# CHECK-NEXT: [# dispatched], [# cycles]
+# CHECK-NEXT:  0,              53  (75.7%)
+# CHECK-NEXT:  2,              1  (1.4%)
+# CHECK-NEXT:  4,              16  (22.9%)
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    66
+# CHECK-NEXT: Max number of mappings used:         54
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 66
+# CHECK-NEXT:    Max number of mappings used:      54
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   vdivps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeeeeE----R   .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm2
+# CHECK-NEXT: [0,2]     .D=eeeeeE--R   .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm3
+# CHECK-NEXT: [0,3]     .D===eeeeeER   .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm4
+# CHECK-NEXT: [0,4]     . D====eeeeeER .    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm5
+# CHECK-NEXT: [0,5]     . D======eeeeeER    .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm6
+# CHECK-NEXT: [0,6]     .  D=======eeeeeER  .    .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm7
+# CHECK-NEXT: [0,7]     .  D===========eeeeeER   .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm8
+# CHECK-NEXT: [0,8]     .   D============eeeeeER .    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm9
+# CHECK-NEXT: [0,9]     .   D==============eeeeeER    .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm10
+# CHECK-NEXT: [0,10]    .    D===============eeeeeER  .    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm11
+# CHECK-NEXT: [0,11]    .    D=================eeeeeER.    .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm12
+# CHECK-NEXT: [0,12]    .    .D==================eeeeeER   .    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm13
+# CHECK-NEXT: [0,13]    .    .D======================eeeeeER    .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm14
+# CHECK-NEXT: [0,14]    .    . D=======================eeeeeER  .    .    .    .    .    .   .   vaddps	%ymm0, %ymm0, %ymm15
+# CHECK-NEXT: [0,15]    .    . D=====eeeeeE------------------R  .    .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm0
+# CHECK-NEXT: [0,16]    .    .  D==================eeeeeE----R  .    .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm3
+# CHECK-NEXT: [0,17]    .    .  D========================eeeeeER.    .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm4
+# CHECK-NEXT: [0,18]    .    .   D=========================eeeeeER   .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm5
+# CHECK-NEXT: [0,19]    .    .   D===========================eeeeeER .    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm6
+# CHECK-NEXT: [0,20]    .    .    D============================eeeeeER    .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm7
+# CHECK-NEXT: [0,21]    .    .    D==============================eeeeeER  .    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm8
+# CHECK-NEXT: [0,22]    .    .    .D===============================eeeeeER.    .    .    .   .   vaddps	%ymm2, %ymm0, %ymm9
+# CHECK-NEXT: [0,23]    .    .    .D=================================eeeeeER   .    .    .   .   vaddps	%ymm2, %ymm0, %ymm10
+# CHECK-NEXT: [0,24]    .    .    . D==================================eeeeeER .    .    .   .   vaddps	%ymm2, %ymm0, %ymm11
+# CHECK-NEXT: [0,25]    .    .    . D====================================eeeeeER    .    .   .   vaddps	%ymm2, %ymm0, %ymm12
+# CHECK-NEXT: [0,26]    .    .    .  D=====================================eeeeeER  .    .   .   vaddps	%ymm2, %ymm0, %ymm13
+# CHECK-NEXT: [0,27]    .    .    .  D=======================================eeeeeER.    .   .   vaddps	%ymm2, %ymm0, %ymm14
+# CHECK-NEXT: [0,28]    .    .    .   D========================================eeeeeER   .   .   vaddps	%ymm2, %ymm0, %ymm15
+# CHECK-NEXT: [0,29]    .    .    .   D==========================================eeeeeER .   .   vaddps	%ymm3, %ymm0, %ymm2
+# CHECK-NEXT: [0,30]    .    .    .    D===========================================eeeeeER   .   vaddps	%ymm3, %ymm0, %ymm4
+# CHECK-NEXT: [0,31]    .    .    .    D=============================================eeeeeER .   vaddps	%ymm3, %ymm0, %ymm5
+# CHECK-NEXT: [0,32]    .    .    .    .D==============================================eeeeeER   vaddps	%ymm3, %ymm0, %ymm6
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vdivps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     1     1.0    1.0    4.0       vaddps	%ymm0, %ymm0, %ymm2
+# CHECK-NEXT: 2.     1     2.0    2.0    2.0       vaddps	%ymm0, %ymm0, %ymm3
+# CHECK-NEXT: 3.     1     4.0    4.0    0.0       vaddps	%ymm0, %ymm0, %ymm4
+# CHECK-NEXT: 4.     1     5.0    5.0    0.0       vaddps	%ymm0, %ymm0, %ymm5
+# CHECK-NEXT: 5.     1     7.0    7.0    0.0       vaddps	%ymm0, %ymm0, %ymm6
+# CHECK-NEXT: 6.     1     8.0    8.0    0.0       vaddps	%ymm0, %ymm0, %ymm7
+# CHECK-NEXT: 7.     1     12.0   12.0   0.0       vaddps	%ymm0, %ymm0, %ymm8
+# CHECK-NEXT: 8.     1     13.0   13.0   0.0       vaddps	%ymm0, %ymm0, %ymm9
+# CHECK-NEXT: 9.     1     15.0   15.0   0.0       vaddps	%ymm0, %ymm0, %ymm10
+# CHECK-NEXT: 10.    1     16.0   16.0   0.0       vaddps	%ymm0, %ymm0, %ymm11
+# CHECK-NEXT: 11.    1     18.0   18.0   0.0       vaddps	%ymm0, %ymm0, %ymm12
+# CHECK-NEXT: 12.    1     19.0   19.0   0.0       vaddps	%ymm0, %ymm0, %ymm13
+# CHECK-NEXT: 13.    1     23.0   23.0   0.0       vaddps	%ymm0, %ymm0, %ymm14
+# CHECK-NEXT: 14.    1     24.0   24.0   0.0       vaddps	%ymm0, %ymm0, %ymm15
+# CHECK-NEXT: 15.    1     6.0    6.0    18.0      vaddps	%ymm2, %ymm0, %ymm0
+# CHECK-NEXT: 16.    1     19.0   9.0    4.0       vaddps	%ymm2, %ymm0, %ymm3
+# CHECK-NEXT: 17.    1     25.0   15.0   0.0       vaddps	%ymm2, %ymm0, %ymm4
+# CHECK-NEXT: 18.    1     26.0   17.0   0.0       vaddps	%ymm2, %ymm0, %ymm5
+# CHECK-NEXT: 19.    1     28.0   19.0   0.0       vaddps	%ymm2, %ymm0, %ymm6
+# CHECK-NEXT: 20.    1     29.0   21.0   0.0       vaddps	%ymm2, %ymm0, %ymm7
+# CHECK-NEXT: 21.    1     31.0   23.0   0.0       vaddps	%ymm2, %ymm0, %ymm8
+# CHECK-NEXT: 22.    1     32.0   25.0   0.0       vaddps	%ymm2, %ymm0, %ymm9
+# CHECK-NEXT: 23.    1     34.0   27.0   0.0       vaddps	%ymm2, %ymm0, %ymm10
+# CHECK-NEXT: 24.    1     35.0   29.0   0.0       vaddps	%ymm2, %ymm0, %ymm11
+# CHECK-NEXT: 25.    1     37.0   31.0   0.0       vaddps	%ymm2, %ymm0, %ymm12
+# CHECK-NEXT: 26.    1     38.0   33.0   0.0       vaddps	%ymm2, %ymm0, %ymm13
+# CHECK-NEXT: 27.    1     40.0   35.0   0.0       vaddps	%ymm2, %ymm0, %ymm14
+# CHECK-NEXT: 28.    1     41.0   37.0   0.0       vaddps	%ymm2, %ymm0, %ymm15
+# CHECK-NEXT: 29.    1     43.0   25.0   0.0       vaddps	%ymm3, %ymm0, %ymm2
+# CHECK-NEXT: 30.    1     44.0   27.0   0.0       vaddps	%ymm3, %ymm0, %ymm4
+# CHECK-NEXT: 31.    1     46.0   29.0   0.0       vaddps	%ymm3, %ymm0, %ymm5
+# CHECK-NEXT: 32.    1     47.0   31.0   0.0       vaddps	%ymm3, %ymm0, %ymm6
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
new file mode 100644
index 0000000000000000000000000000000000000000..12d3e6f2cc03bc32f6ffac6d1a2ebe405312d3e6
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-3dnow.s
@@ -0,0 +1,220 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+femms
+
+pavgusb     %mm0, %mm2
+pavgusb     (%rax), %mm2
+
+pf2id       %mm0, %mm2
+pf2id       (%rax), %mm2
+
+pf2iw       %mm0, %mm2
+pf2iw       (%rax), %mm2
+
+pfacc       %mm0, %mm2
+pfacc       (%rax), %mm2
+
+pfadd       %mm0, %mm2
+pfadd       (%rax), %mm2
+
+pfcmpeq     %mm0, %mm2
+pfcmpeq     (%rax), %mm2
+
+pfcmpge     %mm0, %mm2
+pfcmpge     (%rax), %mm2
+
+pfcmpgt     %mm0, %mm2
+pfcmpgt     (%rax), %mm2
+
+pfmax       %mm0, %mm2
+pfmax       (%rax), %mm2
+
+pfmin       %mm0, %mm2
+pfmin       (%rax), %mm2
+
+pfmul       %mm0, %mm2
+pfmul       (%rax), %mm2
+
+pfnacc      %mm0, %mm2
+pfnacc      (%rax), %mm2
+
+pfpnacc     %mm0, %mm2
+pfpnacc     (%rax), %mm2
+
+pfrcp       %mm0, %mm2
+pfrcp       (%rax), %mm2
+
+pfrcpit1    %mm0, %mm2
+pfrcpit1    (%rax), %mm2
+
+pfrcpit2    %mm0, %mm2
+pfrcpit2    (%rax), %mm2
+
+pfrsqit1    %mm0, %mm2
+pfrsqit1    (%rax), %mm2
+
+pfrsqrt     %mm0, %mm2
+pfrsqrt     (%rax), %mm2
+
+pfsub       %mm0, %mm2
+pfsub       (%rax), %mm2
+
+pfsubr      %mm0, %mm2
+pfsubr      (%rax), %mm2
+
+pi2fd       %mm0, %mm2
+pi2fd       (%rax), %mm2
+
+pi2fw       %mm0, %mm2
+pi2fw       (%rax), %mm2
+
+pmulhrw     %mm0, %mm2
+pmulhrw     (%rax), %mm2
+
+prefetch    (%rax)
+prefetchw   (%rax)
+
+pswapd      %mm0, %mm2
+pswapd      (%rax), %mm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50    *      *      U     femms
+# CHECK-NEXT:  1      2     0.50                        pavgusb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgusb	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pf2id	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pf2id	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pf2iw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pf2iw	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfacc	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfacc	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfadd	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfadd	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfcmpeq	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfcmpge	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfcmpgt	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfmax	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfmax	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfmin	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfmin	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfmul	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfmul	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfnacc	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfnacc	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfpnacc	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrcp	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrcp	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrcpit1	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrcpit2	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrsqit1	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfrsqrt	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfsub	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfsub	(%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        pfsubr	%mm0, %mm2
+# CHECK-NEXT:  1      10    1.00    *                   pfsubr	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pi2fd	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pi2fd	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pi2fw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pi2fw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmulhrw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhrw	(%rax), %mm2
+# CHECK-NEXT:  1      5     0.50    *      *            prefetch	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetchw	(%rax)
+# CHECK-NEXT:  1      2     0.50                        pswapd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pswapd	(%rax), %mm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 13.00  13.00   -      -      -      -      -      -     17.50  17.50  2.00   2.00   2.00   8.00   38.50  10.50   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     femms
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2id	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2id	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2iw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pf2iw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfadd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfadd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpeq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpeq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpge	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpge	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpgt	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfcmpgt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmax	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmax	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmin	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmin	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmul	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfmul	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfnacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfpnacc	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfpnacc	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcp	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcp	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit1	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit2	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrcpit2	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqit1	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqit1	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqrt	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfrsqrt	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsubr	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pfsubr	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pi2fw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrw	(%rax), %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetch	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pswapd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pswapd	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-adx.s b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
new file mode 100644
index 0000000000000000000000000000000000000000..a24213966edcc14c3cd67f38e426fe6dd6e6f376
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-adx.s
@@ -0,0 +1,67 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+adcx        %ebx, %ecx
+adcx        (%rbx), %ecx
+adcx        %rbx, %rcx
+adcx        (%rbx), %rcx
+
+adox        %ebx, %ecx
+adox        (%rbx), %ecx
+adox        %rbx, %rcx
+adox        (%rbx), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        adcxl	%ebx, %ecx
+# CHECK-NEXT:  1      5     1.00    *                   adcxl	(%rbx), %ecx
+# CHECK-NEXT:  1      1     1.00                        adcxq	%rbx, %rcx
+# CHECK-NEXT:  1      5     1.00    *                   adcxq	(%rbx), %rcx
+# CHECK-NEXT:  1      1     1.00                        adoxl	%ebx, %ecx
+# CHECK-NEXT:  1      5     1.00    *                   adoxl	(%rbx), %ecx
+# CHECK-NEXT:  1      1     1.00                        adoxq	%rbx, %rcx
+# CHECK-NEXT:  1      5     1.00    *                   adoxq	(%rbx), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 2.00   2.00    -      -      -     8.00   8.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	%ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	%rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcxq	(%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	%ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxl	(%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	%rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adoxq	(%rbx), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-aes.s b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
new file mode 100644
index 0000000000000000000000000000000000000000..c8d400142df75c7c8485083e19c1151cbd1be895
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-aes.s
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+aesdec          %xmm0, %xmm2
+aesdec          (%rax), %xmm2
+
+aesdeclast      %xmm0, %xmm2
+aesdeclast      (%rax), %xmm2
+
+aesenc          %xmm0, %xmm2
+aesenc          (%rax), %xmm2
+
+aesenclast      %xmm0, %xmm2
+aesenclast      (%rax), %xmm2
+
+aesimc          %xmm0, %xmm2
+aesimc          (%rax), %xmm2
+
+aeskeygenassist $22, %xmm0, %xmm2
+aeskeygenassist $22, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      9     1.00                        aesdec	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesdec	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesenc	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesenc	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00                        aesenclast	%xmm0, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        aesimc	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   aesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   aeskeygenassist	$22, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 3.00   3.00    -      -      -      -      -      -      -      -      -      -     12.00   -     12.00   -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdec	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdec	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdeclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesdeclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenclast	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesenclast	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     aeskeygenassist	$22, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
new file mode 100644
index 0000000000000000000000000000000000000000..837127d4e58498949d0b84335e140a87801f2934
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-avx1.s
@@ -0,0 +1,2443 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+vaddpd            %xmm0, %xmm1, %xmm2
+vaddpd            (%rax), %xmm1, %xmm2
+
+vaddpd            %ymm0, %ymm1, %ymm2
+vaddpd            (%rax), %ymm1, %ymm2
+
+vaddps            %xmm0, %xmm1, %xmm2
+vaddps            (%rax), %xmm1, %xmm2
+
+vaddps            %ymm0, %ymm1, %ymm2
+vaddps            (%rax), %ymm1, %ymm2
+
+vaddsd            %xmm0, %xmm1, %xmm2
+vaddsd            (%rax), %xmm1, %xmm2
+
+vaddss            %xmm0, %xmm1, %xmm2
+vaddss            (%rax), %xmm1, %xmm2
+
+vaddsubpd         %xmm0, %xmm1, %xmm2
+vaddsubpd         (%rax), %xmm1, %xmm2
+
+vaddsubpd         %ymm0, %ymm1, %ymm2
+vaddsubpd         (%rax), %ymm1, %ymm2
+
+vaddsubps         %xmm0, %xmm1, %xmm2
+vaddsubps         (%rax), %xmm1, %xmm2
+
+vaddsubps         %ymm0, %ymm1, %ymm2
+vaddsubps         (%rax), %ymm1, %ymm2
+
+vaesdec           %xmm0, %xmm1, %xmm2
+vaesdec           (%rax), %xmm1, %xmm2
+
+vaesdeclast       %xmm0, %xmm1, %xmm2
+vaesdeclast       (%rax), %xmm1, %xmm2
+
+vaesenc           %xmm0, %xmm1, %xmm2
+vaesenc           (%rax), %xmm1, %xmm2
+
+vaesenclast       %xmm0, %xmm1, %xmm2
+vaesenclast       (%rax), %xmm1, %xmm2
+
+vaesimc           %xmm0, %xmm2
+vaesimc           (%rax), %xmm2
+
+vaeskeygenassist  $22, %xmm0, %xmm2
+vaeskeygenassist  $22, (%rax), %xmm2
+
+vandnpd           %xmm0, %xmm1, %xmm2
+vandnpd           (%rax), %xmm1, %xmm2
+
+vandnpd           %ymm0, %ymm1, %ymm2
+vandnpd           (%rax), %ymm1, %ymm2
+
+vandnps           %xmm0, %xmm1, %xmm2
+vandnps           (%rax), %xmm1, %xmm2
+
+vandnps           %ymm0, %ymm1, %ymm2
+vandnps           (%rax), %ymm1, %ymm2
+
+vandpd            %xmm0, %xmm1, %xmm2
+vandpd            (%rax), %xmm1, %xmm2
+
+vandpd            %ymm0, %ymm1, %ymm2
+vandpd            (%rax), %ymm1, %ymm2
+
+vandps            %xmm0, %xmm1, %xmm2
+vandps            (%rax), %xmm1, %xmm2
+
+vandps            %ymm0, %ymm1, %ymm2
+vandps            (%rax), %ymm1, %ymm2
+
+vblendpd          $11, %xmm0, %xmm1, %xmm2
+vblendpd          $11, (%rax), %xmm1, %xmm2
+
+vblendpd          $11, %ymm0, %ymm1, %ymm2
+vblendpd          $11, (%rax), %ymm1, %ymm2
+
+vblendps          $11, %xmm0, %xmm1, %xmm2
+vblendps          $11, (%rax), %xmm1, %xmm2
+
+vblendps          $11, %ymm0, %ymm1, %ymm2
+vblendps          $11, (%rax), %ymm1, %ymm2
+
+vblendvpd         %xmm3, %xmm0, %xmm1, %xmm2
+vblendvpd         %xmm3, (%rax), %xmm1, %xmm2
+
+vblendvpd         %ymm3, %ymm0, %ymm1, %ymm2
+vblendvpd         %ymm3, (%rax), %ymm1, %ymm2
+
+vblendvps         %xmm3, %xmm0, %xmm1, %xmm2
+vblendvps         %xmm3, (%rax), %xmm1, %xmm2
+
+vblendvps         %ymm3, %ymm0, %ymm1, %ymm2
+vblendvps         %ymm3, (%rax), %ymm1, %ymm2
+
+vbroadcastf128    (%rax), %ymm2
+
+vbroadcastsd      (%rax), %ymm2
+
+vbroadcastss      (%rax), %xmm2
+vbroadcastss      (%rax), %ymm2
+
+vcmppd            $0, %xmm0, %xmm1, %xmm2
+vcmppd            $0, (%rax), %xmm1, %xmm2
+
+vcmppd            $0, %ymm0, %ymm1, %ymm2
+vcmppd            $0, (%rax), %ymm1, %ymm2
+
+vcmpps            $0, %xmm0, %xmm1, %xmm2
+vcmpps            $0, (%rax), %xmm1, %xmm2
+
+vcmpps            $0, %ymm0, %ymm1, %ymm2
+vcmpps            $0, (%rax), %ymm1, %ymm2
+
+vcmpsd            $0, %xmm0, %xmm1, %xmm2
+vcmpsd            $0, (%rax), %xmm1, %xmm2
+
+vcmpss            $0, %xmm0, %xmm1, %xmm2
+vcmpss            $0, (%rax), %xmm1, %xmm2
+
+vcomisd           %xmm0, %xmm1
+vcomisd           (%rax), %xmm1
+
+vcomiss           %xmm0, %xmm1
+vcomiss           (%rax), %xmm1
+
+vcvtdq2pd         %xmm0, %xmm2
+vcvtdq2pd         (%rax), %xmm2
+
+vcvtdq2pd         %xmm0, %ymm2
+vcvtdq2pd         (%rax), %ymm2
+
+vcvtdq2ps         %xmm0, %xmm2
+vcvtdq2ps         (%rax), %xmm2
+
+vcvtdq2ps         %ymm0, %ymm2
+vcvtdq2ps         (%rax), %ymm2
+
+vcvtpd2dqx        %xmm0, %xmm2
+vcvtpd2dqx        (%rax), %xmm2
+
+vcvtpd2dqy        %ymm0, %xmm2
+vcvtpd2dqy        (%rax), %xmm2
+
+vcvtpd2psx        %xmm0, %xmm2
+vcvtpd2psx        (%rax), %xmm2
+
+vcvtpd2psy        %ymm0, %xmm2
+vcvtpd2psy        (%rax), %xmm2
+
+vcvtps2dq         %xmm0, %xmm2
+vcvtps2dq         (%rax), %xmm2
+
+vcvtps2dq         %ymm0, %ymm2
+vcvtps2dq         (%rax), %ymm2
+
+vcvtps2pd         %xmm0, %xmm2
+vcvtps2pd         (%rax), %xmm2
+
+vcvtps2pd         %xmm0, %ymm2
+vcvtps2pd         (%rax), %ymm2
+
+vcvtsd2si         %xmm0, %ecx
+vcvtsd2si         %xmm0, %rcx
+vcvtsd2si         (%rax), %ecx
+vcvtsd2si         (%rax), %rcx
+
+vcvtsd2ss         %xmm0, %xmm1, %xmm2
+vcvtsd2ss         (%rax), %xmm1, %xmm2
+
+vcvtsi2sdl        %ecx, %xmm0, %xmm2
+vcvtsi2sdq        %rcx, %xmm0, %xmm2
+vcvtsi2sdl        (%rax), %xmm0, %xmm2
+vcvtsi2sdq        (%rax), %xmm0, %xmm2
+
+vcvtsi2ssl        %ecx, %xmm0, %xmm2
+vcvtsi2ssq        %rcx, %xmm0, %xmm2
+vcvtsi2ssl        (%rax), %xmm0, %xmm2
+vcvtsi2ssq        (%rax), %xmm0, %xmm2
+
+vcvtss2sd         %xmm0, %xmm1, %xmm2
+vcvtss2sd         (%rax), %xmm1, %xmm2
+
+vcvtss2si         %xmm0, %ecx
+vcvtss2si         %xmm0, %rcx
+vcvtss2si         (%rax), %ecx
+vcvtss2si         (%rax), %rcx
+
+vcvttpd2dqx       %xmm0, %xmm2
+vcvttpd2dqx       (%rax), %xmm2
+
+vcvttpd2dqy       %ymm0, %xmm2
+vcvttpd2dqy       (%rax), %xmm2
+
+vcvttps2dq        %xmm0, %xmm2
+vcvttps2dq        (%rax), %xmm2
+
+vcvttps2dq        %ymm0, %ymm2
+vcvttps2dq        (%rax), %ymm2
+
+vcvttsd2si        %xmm0, %ecx
+vcvttsd2si        %xmm0, %rcx
+vcvttsd2si        (%rax), %ecx
+vcvttsd2si        (%rax), %rcx
+
+vcvttss2si        %xmm0, %ecx
+vcvttss2si        %xmm0, %rcx
+vcvttss2si        (%rax), %ecx
+vcvttss2si        (%rax), %rcx
+
+vdivpd            %xmm0, %xmm1, %xmm2
+vdivpd            (%rax), %xmm1, %xmm2
+
+vdivpd            %ymm0, %ymm1, %ymm2
+vdivpd            (%rax), %ymm1, %ymm2
+
+vdivps            %xmm0, %xmm1, %xmm2
+vdivps            (%rax), %xmm1, %xmm2
+
+vdivps            %ymm0, %ymm1, %ymm2
+vdivps            (%rax), %ymm1, %ymm2
+
+vdivsd            %xmm0, %xmm1, %xmm2
+vdivsd            (%rax), %xmm1, %xmm2
+
+vdivss            %xmm0, %xmm1, %xmm2
+vdivss            (%rax), %xmm1, %xmm2
+
+vdppd             $22, %xmm0, %xmm1, %xmm2
+vdppd             $22, (%rax), %xmm1, %xmm2
+
+vdpps             $22, %xmm0, %xmm1, %xmm2
+vdpps             $22, (%rax), %xmm1, %xmm2
+
+vdpps             $22, %ymm0, %ymm1, %ymm2
+vdpps             $22, (%rax), %ymm1, %ymm2
+
+vextractf128      $1, %ymm0, %xmm2
+vextractf128      $1, %ymm0, (%rax)
+
+vextractps        $1, %xmm0, %rcx
+vextractps        $1, %xmm0, (%rax)
+
+vhaddpd           %xmm0, %xmm1, %xmm2
+vhaddpd           (%rax), %xmm1, %xmm2
+
+vhaddpd           %ymm0, %ymm1, %ymm2
+vhaddpd           (%rax), %ymm1, %ymm2
+
+vhaddps           %xmm0, %xmm1, %xmm2
+vhaddps           (%rax), %xmm1, %xmm2
+
+vhaddps           %ymm0, %ymm1, %ymm2
+vhaddps           (%rax), %ymm1, %ymm2
+
+vhsubpd           %xmm0, %xmm1, %xmm2
+vhsubpd           (%rax), %xmm1, %xmm2
+
+vhsubpd           %ymm0, %ymm1, %ymm2
+vhsubpd           (%rax), %ymm1, %ymm2
+
+vhsubps           %xmm0, %xmm1, %xmm2
+vhsubps           (%rax), %xmm1, %xmm2
+
+vhsubps           %ymm0, %ymm1, %ymm2
+vhsubps           (%rax), %ymm1, %ymm2
+
+vinsertf128       $1, %xmm0, %ymm1, %ymm2
+vinsertf128       $1, (%rax), %ymm1, %ymm2
+
+vinsertps         $1, %xmm0, %xmm1, %xmm2
+vinsertps         $1, (%rax), %xmm1, %xmm2
+
+vlddqu            (%rax), %xmm2
+vlddqu            (%rax), %ymm2
+
+vldmxcsr          (%rax)
+
+vmaskmovdqu       %xmm0, %xmm1
+
+vmaskmovpd        (%rax), %xmm0, %xmm2
+vmaskmovpd        (%rax), %ymm0, %ymm2
+
+vmaskmovpd        %xmm0, %xmm1, (%rax)
+vmaskmovpd        %ymm0, %ymm1, (%rax)
+
+vmaskmovps        (%rax), %xmm0, %xmm2
+vmaskmovps        (%rax), %ymm0, %ymm2
+
+vmaskmovps        %xmm0, %xmm1, (%rax)
+vmaskmovps        %ymm0, %ymm1, (%rax)
+
+vmaxpd            %xmm0, %xmm1, %xmm2
+vmaxpd            (%rax), %xmm1, %xmm2
+
+vmaxpd            %ymm0, %ymm1, %ymm2
+vmaxpd            (%rax), %ymm1, %ymm2
+
+vmaxps            %xmm0, %xmm1, %xmm2
+vmaxps            (%rax), %xmm1, %xmm2
+
+vmaxps            %ymm0, %ymm1, %ymm2
+vmaxps            (%rax), %ymm1, %ymm2
+
+vmaxsd            %xmm0, %xmm1, %xmm2
+vmaxsd            (%rax), %xmm1, %xmm2
+
+vmaxss            %xmm0, %xmm1, %xmm2
+vmaxss            (%rax), %xmm1, %xmm2
+
+vminpd            %xmm0, %xmm1, %xmm2
+vminpd            (%rax), %xmm1, %xmm2
+
+vminpd            %ymm0, %ymm1, %ymm2
+vminpd            (%rax), %ymm1, %ymm2
+
+vminps            %xmm0, %xmm1, %xmm2
+vminps            (%rax), %xmm1, %xmm2
+
+vminps            %ymm0, %ymm1, %ymm2
+vminps            (%rax), %ymm1, %ymm2
+
+vminsd            %xmm0, %xmm1, %xmm2
+vminsd            (%rax), %xmm1, %xmm2
+
+vminss            %xmm0, %xmm1, %xmm2
+vminss            (%rax), %xmm1, %xmm2
+
+vmovapd           %xmm0, %xmm2
+vmovapd           %xmm0, (%rax)
+vmovapd           (%rax), %xmm2
+
+vmovapd           %ymm0, %ymm2
+vmovapd           %ymm0, (%rax)
+vmovapd           (%rax), %ymm2
+
+vmovaps           %xmm0, %xmm2
+vmovaps           %xmm0, (%rax)
+vmovaps           (%rax), %xmm2
+
+vmovaps           %ymm0, %ymm2
+vmovaps           %ymm0, (%rax)
+vmovaps           (%rax), %ymm2
+
+vmovd             %eax, %xmm2
+vmovd             (%rax), %xmm2
+
+vmovd             %xmm0, %ecx
+vmovd             %xmm0, (%rax)
+
+vmovddup          %xmm0, %xmm2
+vmovddup          (%rax), %xmm2
+
+vmovddup          %ymm0, %ymm2
+vmovddup          (%rax), %ymm2
+
+vmovdqa           %xmm0, %xmm2
+vmovdqa           %xmm0, (%rax)
+vmovdqa           (%rax), %xmm2
+
+vmovdqa           %ymm0, %ymm2
+vmovdqa           %ymm0, (%rax)
+vmovdqa           (%rax), %ymm2
+
+vmovdqu           %xmm0, %xmm2
+vmovdqu           %xmm0, (%rax)
+vmovdqu           (%rax), %xmm2
+
+vmovdqu           %ymm0, %ymm2
+vmovdqu           %ymm0, (%rax)
+vmovdqu           (%rax), %ymm2
+
+vmovhlps          %xmm0, %xmm1, %xmm2
+vmovlhps          %xmm0, %xmm1, %xmm2
+
+vmovhpd           %xmm0, (%rax)
+vmovhpd           (%rax), %xmm1, %xmm2
+
+vmovhps           %xmm0, (%rax)
+vmovhps           (%rax), %xmm1, %xmm2
+
+vmovlpd           %xmm0, (%rax)
+vmovlpd           (%rax), %xmm1, %xmm2
+
+vmovlps           %xmm0, (%rax)
+vmovlps           (%rax), %xmm1, %xmm2
+
+vmovmskpd         %xmm0, %rcx
+vmovmskpd         %ymm0, %rcx
+
+vmovmskps         %xmm0, %rcx
+vmovmskps         %ymm0, %rcx
+
+vmovntdq          %xmm0, (%rax)
+vmovntdq          %ymm0, (%rax)
+
+vmovntdqa         (%rax), %xmm2
+vmovntdqa         (%rax), %ymm2
+
+vmovntpd          %xmm0, (%rax)
+vmovntpd          %ymm0, (%rax)
+
+vmovntps          %xmm0, (%rax)
+vmovntps          %ymm0, (%rax)
+
+vmovq             %xmm0, %xmm2
+
+vmovq             %rax, %xmm2
+vmovq             (%rax), %xmm2
+
+vmovq             %xmm0, %rcx
+vmovq             %xmm0, (%rax)
+
+vmovsd            %xmm0, %xmm1, %xmm2
+vmovsd            %xmm0, (%rax)
+vmovsd            (%rax), %xmm2
+
+vmovshdup         %xmm0, %xmm2
+vmovshdup         (%rax), %xmm2
+
+vmovshdup         %ymm0, %ymm2
+vmovshdup         (%rax), %ymm2
+
+vmovsldup         %xmm0, %xmm2
+vmovsldup         (%rax), %xmm2
+
+vmovsldup         %ymm0, %ymm2
+vmovsldup         (%rax), %ymm2
+
+vmovss            %xmm0, %xmm1, %xmm2
+vmovss            %xmm0, (%rax)
+vmovss            (%rax), %xmm2
+
+vmovupd           %xmm0, %xmm2
+vmovupd           %xmm0, (%rax)
+vmovupd           (%rax), %xmm2
+
+vmovupd           %ymm0, %ymm2
+vmovupd           %ymm0, (%rax)
+vmovupd           (%rax), %ymm2
+
+vmovups           %xmm0, %xmm2
+vmovups           %xmm0, (%rax)
+vmovups           (%rax), %xmm2
+
+vmovups           %ymm0, %ymm2
+vmovups           %ymm0, (%rax)
+vmovups           (%rax), %ymm2
+
+vmpsadbw          $1, %xmm0, %xmm1, %xmm2
+vmpsadbw          $1, (%rax), %xmm1, %xmm2
+
+vmulpd            %xmm0, %xmm1, %xmm2
+vmulpd            (%rax), %xmm1, %xmm2
+
+vmulpd            %ymm0, %ymm1, %ymm2
+vmulpd            (%rax), %ymm1, %ymm2
+
+vmulps            %xmm0, %xmm1, %xmm2
+vmulps            (%rax), %xmm1, %xmm2
+
+vmulps            %ymm0, %ymm1, %ymm2
+vmulps            (%rax), %ymm1, %ymm2
+
+vmulsd            %xmm0, %xmm1, %xmm2
+vmulsd            (%rax), %xmm1, %xmm2
+
+vmulss            %xmm0, %xmm1, %xmm2
+vmulss            (%rax), %xmm1, %xmm2
+
+vorpd             %xmm0, %xmm1, %xmm2
+vorpd             (%rax), %xmm1, %xmm2
+
+vorpd             %ymm0, %ymm1, %ymm2
+vorpd             (%rax), %ymm1, %ymm2
+
+vorps             %xmm0, %xmm1, %xmm2
+vorps             (%rax), %xmm1, %xmm2
+
+vorps             %ymm0, %ymm1, %ymm2
+vorps             (%rax), %ymm1, %ymm2
+
+vpabsb            %xmm0, %xmm2
+vpabsb            (%rax), %xmm2
+
+vpabsd            %xmm0, %xmm2
+vpabsd            (%rax), %xmm2
+
+vpabsw            %xmm0, %xmm2
+vpabsw            (%rax), %xmm2
+
+vpackssdw         %xmm0, %xmm1, %xmm2
+vpackssdw         (%rax), %xmm1, %xmm2
+
+vpacksswb         %xmm0, %xmm1, %xmm2
+vpacksswb         (%rax), %xmm1, %xmm2
+
+vpackusdw         %xmm0, %xmm1, %xmm2
+vpackusdw         (%rax), %xmm1, %xmm2
+
+vpackuswb         %xmm0, %xmm1, %xmm2
+vpackuswb         (%rax), %xmm1, %xmm2
+
+vpaddb            %xmm0, %xmm1, %xmm2
+vpaddb            (%rax), %xmm1, %xmm2
+
+vpaddd            %xmm0, %xmm1, %xmm2
+vpaddd            (%rax), %xmm1, %xmm2
+
+vpaddq            %xmm0, %xmm1, %xmm2
+vpaddq            (%rax), %xmm1, %xmm2
+
+vpaddsb           %xmm0, %xmm1, %xmm2
+vpaddsb           (%rax), %xmm1, %xmm2
+
+vpaddsw           %xmm0, %xmm1, %xmm2
+vpaddsw           (%rax), %xmm1, %xmm2
+
+vpaddusb          %xmm0, %xmm1, %xmm2
+vpaddusb          (%rax), %xmm1, %xmm2
+
+vpaddusw          %xmm0, %xmm1, %xmm2
+vpaddusw          (%rax), %xmm1, %xmm2
+
+vpaddw            %xmm0, %xmm1, %xmm2
+vpaddw            (%rax), %xmm1, %xmm2
+
+vpalignr          $1, %xmm0, %xmm1, %xmm2
+vpalignr          $1, (%rax), %xmm1, %xmm2
+
+vpand             %xmm0, %xmm1, %xmm2
+vpand             (%rax), %xmm1, %xmm2
+
+vpandn            %xmm0, %xmm1, %xmm2
+vpandn            (%rax), %xmm1, %xmm2
+
+vpavgb            %xmm0, %xmm1, %xmm2
+vpavgb            (%rax), %xmm1, %xmm2
+
+vpavgw            %xmm0, %xmm1, %xmm2
+vpavgw            (%rax), %xmm1, %xmm2
+
+vpblendvb         %xmm3, %xmm0, %xmm1, %xmm2
+vpblendvb         %xmm3, (%rax), %xmm1, %xmm2
+
+vpblendw          $11, %xmm0, %xmm1, %xmm2
+vpblendw          $11, (%rax), %xmm1, %xmm2
+
+vpclmulqdq        $11, %xmm0, %xmm1, %xmm2
+vpclmulqdq        $11, (%rax), %xmm1, %xmm2
+
+vpcmpeqb          %xmm0, %xmm1, %xmm2
+vpcmpeqb          (%rax), %xmm1, %xmm2
+
+vpcmpeqd          %xmm0, %xmm1, %xmm2
+vpcmpeqd          (%rax), %xmm1, %xmm2
+
+vpcmpeqq          %xmm0, %xmm1, %xmm2
+vpcmpeqq          (%rax), %xmm1, %xmm2
+
+vpcmpeqw          %xmm0, %xmm1, %xmm2
+vpcmpeqw          (%rax), %xmm1, %xmm2
+
+vpcmpestri        $1, %xmm0, %xmm2
+vpcmpestri        $1, (%rax), %xmm2
+
+vpcmpestrm        $1, %xmm0, %xmm2
+vpcmpestrm        $1, (%rax), %xmm2
+
+vpcmpgtb          %xmm0, %xmm1, %xmm2
+vpcmpgtb          (%rax), %xmm1, %xmm2
+
+vpcmpgtd          %xmm0, %xmm1, %xmm2
+vpcmpgtd          (%rax), %xmm1, %xmm2
+
+vpcmpgtq          %xmm0, %xmm1, %xmm2
+vpcmpgtq          (%rax), %xmm1, %xmm2
+
+vpcmpgtw          %xmm0, %xmm1, %xmm2
+vpcmpgtw          (%rax), %xmm1, %xmm2
+
+vpcmpistri        $1, %xmm0, %xmm2
+vpcmpistri        $1, (%rax), %xmm2
+
+vpcmpistrm        $1, %xmm0, %xmm2
+vpcmpistrm        $1, (%rax), %xmm2
+
+vperm2f128        $1, %ymm0, %ymm1, %ymm2
+vperm2f128        $1, (%rax), %ymm1, %ymm2
+
+vpermilpd         $1, %xmm0, %xmm2
+vpermilpd         $1, (%rax), %xmm2
+vpermilpd         %xmm0, %xmm1, %xmm2
+vpermilpd         (%rax), %xmm1, %xmm2
+
+vpermilpd         $1, %ymm0, %ymm2
+vpermilpd         $1, (%rax), %ymm2
+vpermilpd         %ymm0, %ymm1, %ymm2
+vpermilpd         (%rax), %ymm1, %ymm2
+
+vpermilps         $1, %xmm0, %xmm2
+vpermilps         $1, (%rax), %xmm2
+vpermilps         %xmm0, %xmm1, %xmm2
+vpermilps         (%rax), %xmm1, %xmm2
+
+vpermilps         $1, %ymm0, %ymm2
+vpermilps         $1, (%rax), %ymm2
+vpermilps         %ymm0, %ymm1, %ymm2
+vpermilps         (%rax), %ymm1, %ymm2
+
+vpextrb           $1, %xmm0, %ecx
+vpextrb           $1, %xmm0, (%rax)
+
+vpextrd           $1, %xmm0, %ecx
+vpextrd           $1, %xmm0, (%rax)
+
+vpextrq           $1, %xmm0, %rcx
+vpextrq           $1, %xmm0, (%rax)
+
+vpextrw           $1, %xmm0, %ecx
+vpextrw           $1, %xmm0, (%rax)
+
+vphaddd           %xmm0, %xmm1, %xmm2
+vphaddd           (%rax), %xmm1, %xmm2
+
+vphaddsw          %xmm0, %xmm1, %xmm2
+vphaddsw          (%rax), %xmm1, %xmm2
+
+vphaddw           %xmm0, %xmm1, %xmm2
+vphaddw           (%rax), %xmm1, %xmm2
+
+vphminposuw       %xmm0, %xmm2
+vphminposuw       (%rax), %xmm2
+
+vphsubd           %xmm0, %xmm1, %xmm2
+vphsubd           (%rax), %xmm1, %xmm2
+
+vphsubsw          %xmm0, %xmm1, %xmm2
+vphsubsw          (%rax), %xmm1, %xmm2
+
+vphsubw           %xmm0, %xmm1, %xmm2
+vphsubw           (%rax), %xmm1, %xmm2
+
+vpinsrb           $1, %eax, %xmm1, %xmm2
+vpinsrb           $1, (%rax), %xmm1, %xmm2
+
+vpinsrd           $1, %eax, %xmm1, %xmm2
+vpinsrd           $1, (%rax), %xmm1, %xmm2
+
+vpinsrq           $1, %rax, %xmm1, %xmm2
+vpinsrq           $1, (%rax), %xmm1, %xmm2
+
+vpinsrw           $1, %eax, %xmm1, %xmm2
+vpinsrw           $1, (%rax), %xmm1, %xmm2
+
+vpmaddubsw        %xmm0, %xmm1, %xmm2
+vpmaddubsw        (%rax), %xmm1, %xmm2
+
+vpmaddwd          %xmm0, %xmm1, %xmm2
+vpmaddwd          (%rax), %xmm1, %xmm2
+
+vpmaxsb           %xmm0, %xmm1, %xmm2
+vpmaxsb           (%rax), %xmm1, %xmm2
+
+vpmaxsd           %xmm0, %xmm1, %xmm2
+vpmaxsd           (%rax), %xmm1, %xmm2
+
+vpmaxsw           %xmm0, %xmm1, %xmm2
+vpmaxsw           (%rax), %xmm1, %xmm2
+
+vpmaxub           %xmm0, %xmm1, %xmm2
+vpmaxub           (%rax), %xmm1, %xmm2
+
+vpmaxud           %xmm0, %xmm1, %xmm2
+vpmaxud           (%rax), %xmm1, %xmm2
+
+vpmaxuw           %xmm0, %xmm1, %xmm2
+vpmaxuw           (%rax), %xmm1, %xmm2
+
+vpminsb           %xmm0, %xmm1, %xmm2
+vpminsb           (%rax), %xmm1, %xmm2
+
+vpminsd           %xmm0, %xmm1, %xmm2
+vpminsd           (%rax), %xmm1, %xmm2
+
+vpminsw           %xmm0, %xmm1, %xmm2
+vpminsw           (%rax), %xmm1, %xmm2
+
+vpminub           %xmm0, %xmm1, %xmm2
+vpminub           (%rax), %xmm1, %xmm2
+
+vpminud           %xmm0, %xmm1, %xmm2
+vpminud           (%rax), %xmm1, %xmm2
+
+vpminuw           %xmm0, %xmm1, %xmm2
+vpminuw           (%rax), %xmm1, %xmm2
+
+vpmovmskb         %xmm0, %rcx
+
+vpmovsxbd         %xmm0, %xmm2
+vpmovsxbd         (%rax), %xmm2
+
+vpmovsxbq         %xmm0, %xmm2
+vpmovsxbq         (%rax), %xmm2
+
+vpmovsxbw         %xmm0, %xmm2
+vpmovsxbw         (%rax), %xmm2
+
+vpmovsxdq         %xmm0, %xmm2
+vpmovsxdq         (%rax), %xmm2
+
+vpmovsxwd         %xmm0, %xmm2
+vpmovsxwd         (%rax), %xmm2
+
+vpmovsxwq         %xmm0, %xmm2
+vpmovsxwq         (%rax), %xmm2
+
+vpmovzxbd         %xmm0, %xmm2
+vpmovzxbd         (%rax), %xmm2
+
+vpmovzxbq         %xmm0, %xmm2
+vpmovzxbq         (%rax), %xmm2
+
+vpmovzxbw         %xmm0, %xmm2
+vpmovzxbw         (%rax), %xmm2
+
+vpmovzxdq         %xmm0, %xmm2
+vpmovzxdq         (%rax), %xmm2
+
+vpmovzxwd         %xmm0, %xmm2
+vpmovzxwd         (%rax), %xmm2
+
+vpmovzxwq         %xmm0, %xmm2
+vpmovzxwq         (%rax), %xmm2
+
+vpmuldq           %xmm0, %xmm1, %xmm2
+vpmuldq           (%rax), %xmm1, %xmm2
+
+vpmulhrsw         %xmm0, %xmm1, %xmm2
+vpmulhrsw         (%rax), %xmm1, %xmm2
+
+vpmulhuw          %xmm0, %xmm1, %xmm2
+vpmulhuw          (%rax), %xmm1, %xmm2
+
+vpmulhw           %xmm0, %xmm1, %xmm2
+vpmulhw           (%rax), %xmm1, %xmm2
+
+vpmulld           %xmm0, %xmm1, %xmm2
+vpmulld           (%rax), %xmm1, %xmm2
+
+vpmullw           %xmm0, %xmm1, %xmm2
+vpmullw           (%rax), %xmm1, %xmm2
+
+vpmuludq          %xmm0, %xmm1, %xmm2
+vpmuludq          (%rax), %xmm1, %xmm2
+
+vpor              %xmm0, %xmm1, %xmm2
+vpor              (%rax), %xmm1, %xmm2
+
+vpsadbw           %xmm0, %xmm1, %xmm2
+vpsadbw           (%rax), %xmm1, %xmm2
+
+vpshufb           %xmm0, %xmm1, %xmm2
+vpshufb           (%rax), %xmm1, %xmm2
+
+vpshufd           $1, %xmm0, %xmm2
+vpshufd           $1, (%rax), %xmm2
+
+vpshufhw          $1, %xmm0, %xmm2
+vpshufhw          $1, (%rax), %xmm2
+
+vpshuflw          $1, %xmm0, %xmm2
+vpshuflw          $1, (%rax), %xmm2
+
+vpsignb           %xmm0, %xmm1, %xmm2
+vpsignb           (%rax), %xmm1, %xmm2
+
+vpsignd           %xmm0, %xmm1, %xmm2
+vpsignd           (%rax), %xmm1, %xmm2
+
+vpsignw           %xmm0, %xmm1, %xmm2
+vpsignw           (%rax), %xmm1, %xmm2
+
+vpslld            $1, %xmm0, %xmm2
+vpslld            %xmm0, %xmm1, %xmm2
+vpslld            (%rax), %xmm1, %xmm2
+
+vpslldq           $1, %xmm1, %xmm2
+
+vpsllq            $1, %xmm0, %xmm2
+vpsllq            %xmm0, %xmm1, %xmm2
+vpsllq            (%rax), %xmm1, %xmm2
+
+vpsllw            $1, %xmm0, %xmm2
+vpsllw            %xmm0, %xmm1, %xmm2
+vpsllw            (%rax), %xmm1, %xmm2
+
+vpsrad            $1, %xmm0, %xmm2
+vpsrad            %xmm0, %xmm1, %xmm2
+vpsrad            (%rax), %xmm1, %xmm2
+
+vpsraw            $1, %xmm0, %xmm2
+vpsraw            %xmm0, %xmm1, %xmm2
+vpsraw            (%rax), %xmm1, %xmm2
+
+vpsrld            $1, %xmm0, %xmm2
+vpsrld            %xmm0, %xmm1, %xmm2
+vpsrld            (%rax), %xmm1, %xmm2
+
+vpsrldq           $1, %xmm1, %xmm2
+
+vpsrlq            $1, %xmm0, %xmm2
+vpsrlq            %xmm0, %xmm1, %xmm2
+vpsrlq            (%rax), %xmm1, %xmm2
+
+vpsrlw            $1, %xmm0, %xmm2
+vpsrlw            %xmm0, %xmm1, %xmm2
+vpsrlw            (%rax), %xmm1, %xmm2
+
+vpsubb            %xmm0, %xmm1, %xmm2
+vpsubb            (%rax), %xmm1, %xmm2
+
+vpsubd            %xmm0, %xmm1, %xmm2
+vpsubd            (%rax), %xmm1, %xmm2
+
+vpsubq            %xmm0, %xmm1, %xmm2
+vpsubq            (%rax), %xmm1, %xmm2
+
+vpsubsb           %xmm0, %xmm1, %xmm2
+vpsubsb           (%rax), %xmm1, %xmm2
+
+vpsubsw           %xmm0, %xmm1, %xmm2
+vpsubsw           (%rax), %xmm1, %xmm2
+
+vpsubusb          %xmm0, %xmm1, %xmm2
+vpsubusb          (%rax), %xmm1, %xmm2
+
+vpsubusw          %xmm0, %xmm1, %xmm2
+vpsubusw          (%rax), %xmm1, %xmm2
+
+vpsubw            %xmm0, %xmm1, %xmm2
+vpsubw            (%rax), %xmm1, %xmm2
+
+vptest            %xmm0, %xmm1
+vptest            (%rax), %xmm1
+
+vptest            %ymm0, %ymm1
+vptest            (%rax), %ymm1
+
+vpunpckhbw        %xmm0, %xmm1, %xmm2
+vpunpckhbw        (%rax), %xmm1, %xmm2
+
+vpunpckhdq        %xmm0, %xmm1, %xmm2
+vpunpckhdq        (%rax), %xmm1, %xmm2
+
+vpunpckhqdq       %xmm0, %xmm1, %xmm2
+vpunpckhqdq       (%rax), %xmm1, %xmm2
+
+vpunpckhwd        %xmm0, %xmm1, %xmm2
+vpunpckhwd        (%rax), %xmm1, %xmm2
+
+vpunpcklbw        %xmm0, %xmm1, %xmm2
+vpunpcklbw        (%rax), %xmm1, %xmm2
+
+vpunpckldq        %xmm0, %xmm1, %xmm2
+vpunpckldq        (%rax), %xmm1, %xmm2
+
+vpunpcklqdq       %xmm0, %xmm1, %xmm2
+vpunpcklqdq       (%rax), %xmm1, %xmm2
+
+vpunpcklwd        %xmm0, %xmm1, %xmm2
+vpunpcklwd        (%rax), %xmm1, %xmm2
+
+vpxor             %xmm0, %xmm1, %xmm2
+vpxor             (%rax), %xmm1, %xmm2
+
+vrcpps            %xmm0, %xmm2
+vrcpps            (%rax), %xmm2
+
+vrcpps            %ymm0, %ymm2
+vrcpps            (%rax), %ymm2
+
+vrcpss            %xmm0, %xmm1, %xmm2
+vrcpss            (%rax), %xmm1, %xmm2
+
+vroundpd          $1, %xmm0, %xmm2
+vroundpd          $1, (%rax), %xmm2
+
+vroundpd          $1, %ymm0, %ymm2
+vroundpd          $1, (%rax), %ymm2
+
+vroundps          $1, %xmm0, %xmm2
+vroundps          $1, (%rax), %xmm2
+
+vroundps          $1, %ymm0, %ymm2
+vroundps          $1, (%rax), %ymm2
+
+vroundsd          $1, %xmm0, %xmm1, %xmm2
+vroundsd          $1, (%rax), %xmm1, %xmm2
+
+vroundss          $1, %xmm0, %xmm1, %xmm2
+vroundss          $1, (%rax), %xmm1, %xmm2
+
+vrsqrtps          %xmm0, %xmm2
+vrsqrtps          (%rax), %xmm2
+
+vrsqrtps          %ymm0, %ymm2
+vrsqrtps          (%rax), %ymm2
+
+vrsqrtss          %xmm0, %xmm1, %xmm2
+vrsqrtss          (%rax), %xmm1, %xmm2
+
+vshufpd           $1, %xmm0, %xmm1, %xmm2
+vshufpd           $1, (%rax), %xmm1, %xmm2
+
+vshufpd           $1, %ymm0, %ymm1, %ymm2
+vshufpd           $1, (%rax), %ymm1, %ymm2
+
+vshufps           $1, %xmm0, %xmm1, %xmm2
+vshufps           $1, (%rax), %xmm1, %xmm2
+
+vshufps           $1, %ymm0, %ymm1, %ymm2
+vshufps           $1, (%rax), %ymm1, %ymm2
+
+vsqrtpd           %xmm0, %xmm2
+vsqrtpd           (%rax), %xmm2
+
+vsqrtpd           %ymm0, %ymm2
+vsqrtpd           (%rax), %ymm2
+
+vsqrtps           %xmm0, %xmm2
+vsqrtps           (%rax), %xmm2
+
+vsqrtps           %ymm0, %ymm2
+vsqrtps           (%rax), %ymm2
+
+vsqrtsd           %xmm0, %xmm1, %xmm2
+vsqrtsd           (%rax), %xmm1, %xmm2
+
+vsqrtss           %xmm0, %xmm1, %xmm2
+vsqrtss           (%rax), %xmm1, %xmm2
+
+vstmxcsr          (%rax)
+
+vsubpd            %xmm0, %xmm1, %xmm2
+vsubpd            (%rax), %xmm1, %xmm2
+
+vsubpd            %ymm0, %ymm1, %ymm2
+vsubpd            (%rax), %ymm1, %ymm2
+
+vsubps            %xmm0, %xmm1, %xmm2
+vsubps            (%rax), %xmm1, %xmm2
+
+vsubps            %ymm0, %ymm1, %ymm2
+vsubps            (%rax), %ymm1, %ymm2
+
+vsubsd            %xmm0, %xmm1, %xmm2
+vsubsd            (%rax), %xmm1, %xmm2
+
+vsubss            %xmm0, %xmm1, %xmm2
+vsubss            (%rax), %xmm1, %xmm2
+
+vtestpd          %xmm0, %xmm1
+vtestpd          (%rax), %xmm1
+
+vtestpd          %ymm0, %ymm1
+vtestpd          (%rax), %ymm1
+
+vtestps          %xmm0, %xmm1
+vtestps          (%rax), %xmm1
+
+vtestps          %ymm0, %ymm1
+vtestps          (%rax), %ymm1
+
+vucomisd          %xmm0, %xmm1
+vucomisd          (%rax), %xmm1
+
+vucomiss          %xmm0, %xmm1
+vucomiss          (%rax), %xmm1
+
+vunpckhpd         %xmm0, %xmm1, %xmm2
+vunpckhpd         (%rax), %xmm1, %xmm2
+
+vunpckhpd         %ymm0, %ymm1, %ymm2
+vunpckhpd         (%rax), %ymm1, %ymm2
+
+vunpckhps         %xmm0, %xmm1, %xmm2
+vunpckhps         (%rax), %xmm1, %xmm2
+
+vunpckhps         %ymm0, %ymm1, %ymm2
+vunpckhps         (%rax), %ymm1, %ymm2
+
+vunpcklpd         %xmm0, %xmm1, %xmm2
+vunpcklpd         (%rax), %xmm1, %xmm2
+
+vunpcklpd         %ymm0, %ymm1, %ymm2
+vunpcklpd         (%rax), %ymm1, %ymm2
+
+vunpcklps         %xmm0, %xmm1, %xmm2
+vunpcklps         (%rax), %xmm1, %xmm2
+
+vunpcklps         %ymm0, %ymm1, %ymm2
+vunpcklps         (%rax), %ymm1, %ymm2
+
+vxorpd            %xmm0, %xmm1, %xmm2
+vxorpd            (%rax), %xmm1, %xmm2
+
+vxorpd            %ymm0, %ymm1, %ymm2
+vxorpd            (%rax), %ymm1, %ymm2
+
+vxorps            %xmm0, %xmm1, %xmm2
+vxorps            (%rax), %xmm1, %xmm2
+
+vxorps            %ymm0, %ymm1, %ymm2
+vxorps            (%rax), %ymm1, %ymm2
+
+vzeroall
+vzeroupper
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      9     1.00                        vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00                        vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00                        vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     1.00                        vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      14    1.00    *                   vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaesimc	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     2.00                        vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     3.00                        vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     3.00    *                   vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     2.00                        vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     3.00                        vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     3.00    *                   vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT:  2      6     2.00    *                   vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT:  1      7     0.50    *                   vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT:  2      6     2.00    *                   vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  1      2     1.00                        vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     1.00                        vcomisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  2      1     1.00                        vcomiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  2      8     1.00                        vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT:  5      13    2.00    *                   vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  2      8     1.00                        vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  4      13    2.00    *                   vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT:  4      13    2.00    *                   vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  2      8     1.00                        vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT:  5      13    2.00    *                   vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  2      13    1.00                        vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      4     1.00                        vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2sdl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2sdq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2sdl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2sdq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      13    1.00                        vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  2      8     1.00                        vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT:  4      13    2.00    *                   vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  2      13    1.00                        vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  2      13    1.00                        vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  1      9     9.50                        vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     19.00                       vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      14    19.00   *                   vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      9     9.50                        vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      9     19.00                       vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      14    19.00   *                   vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      9     9.50                        vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      9     9.50                        vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  15     15    1.50                        vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  17     20    1.50    *                   vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  17     25    1.50                        vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  18     30    1.50    *                   vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  25     27    3.00                        vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  29     32    3.00    *                   vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  2      7     0.50           *            vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      11    1.00                        vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00                        vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00                        vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  3      11    1.00                        vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  8      11    2.00                        vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     16    2.00    *                   vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      2     0.50                        vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     0.50    *                   vinsertf128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vlddqu	(%rax), %xmm2
+# CHECK-NEXT:  2      5     0.50    *                   vlddqu	(%rax), %ymm2
+# CHECK-NEXT:  1      5     0.50    *      *      U     vldmxcsr	(%rax)
+# CHECK-NEXT:  1      1     1.00    *      *      U     vmaskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  1      6     1.00    *                   vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      6     2.00    *                   vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  18     6     2.00    *      *            vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  34     6     2.00    *      *            vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  1      6     1.00    *                   vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  2      6     2.00    *                   vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT:  18     6     2.00    *      *            vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT:  34     6     2.00    *      *            vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  1      2     1.00                        vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     2.00                        vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     2.00    *                   vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     1.00                        vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00                        vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovapd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovapd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovapd	%ymm0, %ymm2
+# CHECK-NEXT:  4      1     1.00           *            vmovapd	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovaps	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovaps	%ymm0, %ymm2
+# CHECK-NEXT:  4      1     1.00           *            vmovaps	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  2      10    0.50                        vmovd	%eax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovd	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        vmovd	%xmm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *            vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        vmovddup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovddup	%ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovdqa	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT:  4      1     1.00           *            vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovdqu	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT:  8      1     1.00           *            vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00           *            vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00           *            vmovhps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovlps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      10    1.00                        vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  2      10    1.00                        vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  2      10    1.00                        vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  2      10    1.00                        vmovmskps	%ymm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *            vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT:  4      2     2.00           *            vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT:  1      3     1.00           *            vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT:  4      3     2.00           *            vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT:  1      3     1.00           *            vmovntps	%xmm0, (%rax)
+# CHECK-NEXT:  4      3     2.00           *            vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    0.50                        vmovq	%rax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   vmovq	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        vmovq	%xmm0, %rcx
+# CHECK-NEXT:  1      2     1.00           *            vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  1      2     0.50                        vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            vmovss	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovupd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovupd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovupd	%ymm0, %ymm2
+# CHECK-NEXT:  8      1     1.00           *            vmovupd	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  1      1     0.50                        vmovups	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            vmovups	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   vmovups	(%rax), %xmm2
+# CHECK-NEXT:  2      2     1.00                        vmovups	%ymm0, %ymm2
+# CHECK-NEXT:  8      1     1.00           *            vmovups	%ymm0, (%rax)
+# CHECK-NEXT:  2      5     0.50    *                   vmovups	(%rax), %ymm2
+# CHECK-NEXT:  9      9     2.00                        vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  9      14    2.00    *                   vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vmulsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vmulss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vpabsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpabsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpabsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     2.00                        vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  6      13    1.00                        vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  6      17    1.00    *                   vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  27     15    4.00                        vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     20    4.50    *                   vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  27     10    4.00                        vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     15    4.50    *                   vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  7      14    1.00                        vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  8      19    1.00    *                   vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      6     1.00                        vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  9      11    1.00    *                   vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  8      4     0.50                        vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  10     8     0.50    *                   vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     2.00                        vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  2      3     3.00                        vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     3.00    *                   vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      3     2.00                        vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  2      3     3.00                        vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      8     3.00    *                   vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  2      13    1.00                        vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  2      13    1.00           *            vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  3      5     0.50                        vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     1.00                        vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  3      5     0.50                        vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     0.50                        vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT:  2      6     0.50    *                   vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      13    1.00                        vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      2     0.50                        vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     2.00                        vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    2.00    *                   vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     0.50                        vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  2      9     0.50    *                   vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      3     2.00                        vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     1.00                        vptest	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vptest	(%rax), %xmm1
+# CHECK-NEXT:  4      1     1.00                        vptest	%ymm0, %ymm1
+# CHECK-NEXT:  6      6     1.00    *                   vptest	(%rax), %ymm1
+# CHECK-NEXT:  1      2     0.50                        vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vrcpps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  2      5     2.00                        vrcpps	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  1      5     1.00                        vrcpss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  2      4     2.00                        vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  1      4     1.00                        vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      4     1.00                        vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vrsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  2      5     2.00                        vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  1      5     1.00                        vrsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      9     13.50                       vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  2      9     27.00                       vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT:  2      14    27.00   *                   vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  2      9     21.00                       vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT:  2      14    21.00   *                   vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  1      9     13.50                       vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      9     10.50                       vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     0.50    *      *      U     vstmxcsr	(%rax)
+# CHECK-NEXT:  1      5     1.00                        vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    2.00    *                   vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     1.00                        vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     1.00                        vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      1     1.00                        vtestpd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  4      1     1.00                        vtestpd	%ymm0, %ymm1
+# CHECK-NEXT:  6      6     1.00    *                   vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  2      1     1.00                        vtestps	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vtestps	(%rax), %xmm1
+# CHECK-NEXT:  4      1     1.00                        vtestps	%ymm0, %ymm1
+# CHECK-NEXT:  6      6     1.00    *                   vtestps	(%rax), %ymm1
+# CHECK-NEXT:  2      1     1.00                        vucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  2      1     1.00                        vucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      2     0.50                        vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      2     1.00                        vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      7     1.00    *                   vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  32     90    8.00    *      *      U     vzeroall
+# CHECK-NEXT:  16     46    4.00    *      *      U     vzeroupper
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 229.00 229.00  -      -      -     56.00   -      -     588.00 588.00 127.50 127.50 38.00  107.00 402.50 429.50  -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vaddsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdec	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdec	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdeclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesdeclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenc	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenc	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenclast	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesenclast	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesimc	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaesimc	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaeskeygenassist	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vaeskeygenassist	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandnps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandnps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vandps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vandps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendpd	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendpd	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendpd	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendpd	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendps	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vblendps	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendps	$11, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendps	$11, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvpd	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvpd	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvpd	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvpd	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvps	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vblendvps	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvps	%ymm3, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vblendvps	%ymm3, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vbroadcastf128	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastsd	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vbroadcastss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastss	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmppd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmppd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmppd	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmppd	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpps	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpps	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmpps	$0, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vcmpps	$0, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpsd	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpsd	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpss	$0, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcmpss	$0, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vcomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2pd	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2ps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtdq2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2dq	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtpd2psx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2ps	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtpd2psy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2dq	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2pd	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2pd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsd2ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2sdq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssl	%ecx, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssq	%rcx, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssl	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtsi2ssq	(%rax), %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttpd2dqx	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvttpd2dq	%ymm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvttpd2dqy	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvttps2dq	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvttps2dq	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     vcvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     19.00  19.00   -      -      -      -      -     2.00    -      -      -      -     vdivps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     vdivss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdppd	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdppd	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdpps	$22, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     vdpps	$22, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -     vdpps	$22, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -      -     2.00    -      -      -      -     vdpps	$22, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vextractf128	$1, %ymm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vextractf128	$1, %ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vextractps	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vextractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertf128	$1, %xmm0, %ymm1, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertf128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vinsertps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vlddqu	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vlddqu	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vldmxcsr	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmaskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovpd	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovpd	(%rax), %ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovpd	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovpd	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovps	(%rax), %xmm0, %xmm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovps	(%rax), %ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vmaskmovps	%xmm0, %xmm1, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vmaskmovps	%ymm0, %ymm1, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vmaxps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmaxss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vminps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vminss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovapd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovapd	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovapd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovapd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovaps	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovaps	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovaps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovd	%eax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovddup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovddup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovddup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqa	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -     vmovdqa	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqa	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqa	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqu	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -     1.00   1.00    -      -      -      -     vmovdqu	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovdqu	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhlps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovhpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovhps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovlpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovlps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovlps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskpd	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskpd	%ymm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskps	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovmskps	%ymm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntdq	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntdq	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovntdqa	(%rax), %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntpd	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntpd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovntps	%xmm0, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -     vmovntps	%ymm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovq	%rax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vmovq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vmovq	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovshdup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovshdup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovshdup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsldup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovsldup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovsldup	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovsldup	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovss	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovupd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovupd	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovupd	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovupd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovups	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vmovups	%ymm0, %ymm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vmovups	%ymm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vmovups	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     vmpsadbw	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     vmpsadbw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vmulss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackssdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackssdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpacksswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpacksswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackusdw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackusdw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackuswb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpackuswb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpalignr	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpalignr	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpand	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpand	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpandn	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpandn	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpavgw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpblendvb	%xmm3, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpblendvb	%xmm3, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpblendw	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpblendw	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpclmulqdq	$11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpeqw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     vpcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmpgtw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     vpcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vperm2f128	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vperm2f128	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermilps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermilps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrb	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrd	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrq	$1, %xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpextrw	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vpextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vphminposuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     vphminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrb	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrb	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrd	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrq	$1, %rax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrq	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrw	$1, %eax, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpinsrw	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmaddwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmaxuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminub	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminub	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminud	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminud	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpminuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vpmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhrsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhrsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhuw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhuw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmulhw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmulld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmulld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmullw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmullw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuludq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmuludq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsadbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsadbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpshufb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpshufb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsignw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpslldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsllw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrad	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsraw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrld	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrldq	$1, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsrlw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusb	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusb	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubusw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vptest	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vptest	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vptest	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckhwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklbw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklbw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckldq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpckldq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklqdq	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklqdq	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklwd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpunpcklwd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpxor	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpxor	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrcpps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrcpps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrcpss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundpd	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundpd	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundps	$1, %ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vroundps	$1, (%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundsd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundsd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundss	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vroundss	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vrsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     vrsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufpd	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufpd	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufpd	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufpd	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufps	$1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vshufps	$1, (%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufps	$1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vshufps	$1, (%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtpd	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     27.00  27.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtpd	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	%ymm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     21.00  21.00   -      -      -      -      -     2.00    -      -      -      -     vsqrtps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     vsqrtss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vstmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubsd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubsd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vsubss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestpd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestpd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestpd	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestpd	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestps	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vtestps	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestps	%ymm0, %ymm1
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vtestps	(%rax), %ymm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     vucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpckhps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpckhps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vunpcklps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vunpcklps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     vxorps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     1.00   1.00    -      -      -      -     1.00   1.00    -      -      -      -     vxorps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroall
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vzeroupper
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
new file mode 100644
index 0000000000000000000000000000000000000000..f1b155346b3c76ea3e55a63fdb7be9504adb6520
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-bmi1.s
@@ -0,0 +1,125 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+andn        %eax, %ebx, %ecx
+andn        (%rax), %ebx, %ecx
+
+andn        %rax, %rbx, %rcx
+andn        (%rax), %rbx, %rcx
+
+bextr       %eax, %ebx, %ecx
+bextr       %eax, (%rbx), %ecx
+
+bextr       %rax, %rbx, %rcx
+bextr       %rax, (%rbx), %rcx
+
+blsi        %eax, %ecx
+blsi        (%rax), %ecx
+
+blsi        %rax, %rcx
+blsi        (%rax), %rcx
+
+blsmsk      %eax, %ecx
+blsmsk      (%rax), %ecx
+
+blsmsk      %rax, %rcx
+blsmsk      (%rax), %rcx
+
+blsr        %eax, %ecx
+blsr        (%rax), %ecx
+
+blsr        %rax, %rcx
+blsr        (%rax), %rcx
+
+tzcnt       %eax, %ecx
+tzcnt       (%rax), %ecx
+
+tzcnt       %rax, %rcx
+tzcnt       (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        andnl	%eax, %ebx, %ecx
+# CHECK-NEXT:  1      5     0.50    *                   andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  1      1     0.50                        andnq	%rax, %rbx, %rcx
+# CHECK-NEXT:  1      5     0.50    *                   andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  2      2     0.50                        bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  2      2     0.50                        bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  2      2     0.50                        blsil	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsil	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blsiq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsiq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blsmskl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsmskl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blsmskq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsmskq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blsrl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsrl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blsrq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsrq	(%rax), %rcx
+# CHECK-NEXT:  2      2     1.00                        tzcntl	%eax, %ecx
+# CHECK-NEXT:  2      6     1.00    *                   tzcntl	(%rax), %ecx
+# CHECK-NEXT:  2      2     1.00                        tzcntq	%rax, %rcx
+# CHECK-NEXT:  2      6     1.00    *                   tzcntq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 6.00   6.00    -      -      -     14.00  14.00   -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	%eax, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnl	(%rax), %ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	%rax, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andnq	(%rax), %rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	%eax, (%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	%rax, (%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsil	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsiq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsrq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     tzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
new file mode 100644
index 0000000000000000000000000000000000000000..1f6b9ed0b3a3f552d25d40be67356827ea1f7342
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-clflushopt.s
@@ -0,0 +1,45 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+clflushopt (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *      *      U     clflushopt	(%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clflushopt	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
new file mode 100644
index 0000000000000000000000000000000000000000..93151dc1a72598fd1964723456b32d40621eb172
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmov.s
@@ -0,0 +1,335 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+cmovow    %si, %di
+cmovnow   %si, %di
+cmovbw    %si, %di
+cmovaew   %si, %di
+cmovew    %si, %di
+cmovnew   %si, %di
+cmovbew   %si, %di
+cmovaw    %si, %di
+cmovsw    %si, %di
+cmovnsw   %si, %di
+cmovpw    %si, %di
+cmovnpw   %si, %di
+cmovlw    %si, %di
+cmovgew   %si, %di
+cmovlew   %si, %di
+cmovgw    %si, %di
+
+cmovow    (%rax), %di
+cmovnow   (%rax), %di
+cmovbw    (%rax), %di
+cmovaew   (%rax), %di
+cmovew    (%rax), %di
+cmovnew   (%rax), %di
+cmovbew   (%rax), %di
+cmovaw    (%rax), %di
+cmovsw    (%rax), %di
+cmovnsw   (%rax), %di
+cmovpw    (%rax), %di
+cmovnpw   (%rax), %di
+cmovlw    (%rax), %di
+cmovgew   (%rax), %di
+cmovlew   (%rax), %di
+cmovgw    (%rax), %di
+
+cmovol    %esi, %edi
+cmovnol   %esi, %edi
+cmovbl    %esi, %edi
+cmovael   %esi, %edi
+cmovel    %esi, %edi
+cmovnel   %esi, %edi
+cmovbel   %esi, %edi
+cmoval    %esi, %edi
+cmovsl    %esi, %edi
+cmovnsl   %esi, %edi
+cmovpl    %esi, %edi
+cmovnpl   %esi, %edi
+cmovll    %esi, %edi
+cmovgel   %esi, %edi
+cmovlel   %esi, %edi
+cmovgl    %esi, %edi
+
+cmovol    (%rax), %edi
+cmovnol   (%rax), %edi
+cmovbl    (%rax), %edi
+cmovael   (%rax), %edi
+cmovel    (%rax), %edi
+cmovnel   (%rax), %edi
+cmovbel   (%rax), %edi
+cmoval    (%rax), %edi
+cmovsl    (%rax), %edi
+cmovnsl   (%rax), %edi
+cmovpl    (%rax), %edi
+cmovnpl   (%rax), %edi
+cmovll    (%rax), %edi
+cmovgel   (%rax), %edi
+cmovlel   (%rax), %edi
+cmovgl    (%rax), %edi
+
+cmovoq    %rsi, %rdi
+cmovnoq   %rsi, %rdi
+cmovbq    %rsi, %rdi
+cmovaeq   %rsi, %rdi
+cmoveq    %rsi, %rdi
+cmovneq   %rsi, %rdi
+cmovbeq   %rsi, %rdi
+cmovaq    %rsi, %rdi
+cmovsq    %rsi, %rdi
+cmovnsq   %rsi, %rdi
+cmovpq    %rsi, %rdi
+cmovnpq   %rsi, %rdi
+cmovlq    %rsi, %rdi
+cmovgeq   %rsi, %rdi
+cmovleq   %rsi, %rdi
+cmovgq    %rsi, %rdi
+
+cmovoq    (%rax), %rdi
+cmovnoq   (%rax), %rdi
+cmovbq    (%rax), %rdi
+cmovaeq   (%rax), %rdi
+cmoveq    (%rax), %rdi
+cmovneq   (%rax), %rdi
+cmovbeq   (%rax), %rdi
+cmovaq    (%rax), %rdi
+cmovsq    (%rax), %rdi
+cmovnsq   (%rax), %rdi
+cmovpq    (%rax), %rdi
+cmovnpq   (%rax), %rdi
+cmovlq    (%rax), %rdi
+cmovgeq   (%rax), %rdi
+cmovleq   (%rax), %rdi
+cmovgq    (%rax), %rdi
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        cmovow	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnow	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovbw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovaew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovbew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovaw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovsw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnsw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovpw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovnpw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovlw	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovgew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovlew	%si, %di
+# CHECK-NEXT:  1      1     0.50                        cmovgw	%si, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovow	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnow	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovbw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovaew	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovew	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovbew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovaw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovsw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnsw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovpw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   cmovnpw	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovlw	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovgew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovlew	(%rax), %di
+# CHECK-NEXT:  2      5     0.50    *                   cmovgw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        cmovol	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnol	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovbl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovael	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovbel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmoval	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovsl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnsl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovpl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovnpl	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovll	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovgel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovlel	%esi, %edi
+# CHECK-NEXT:  1      1     0.50                        cmovgl	%esi, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovol	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnol	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovbl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovael	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovel	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovbel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmoval	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovsl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnsl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovpl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnpl	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovll	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovlel	(%rax), %edi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        cmovoq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovbq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmoveq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovneq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovaq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovsq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovpq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovlq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovleq	%rsi, %rdi
+# CHECK-NEXT:  1      1     0.50                        cmovgq	%rsi, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovoq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnoq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovbq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovaeq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmoveq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovneq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovbeq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovaq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovsq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnsq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovpq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmovnpq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovlq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgeq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovleq	(%rax), %rdi
+# CHECK-NEXT:  2      5     0.50    *                   cmovgq	(%rax), %rdi
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.00  24.00   -      -      -     48.00  48.00   -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovow	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnow	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlew	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovol	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnol	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovael	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoval	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovll	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlel	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovoq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnoq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmoveq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovneq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovbeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovaq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovsq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnsq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovpq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovnpq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovlq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgeq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovleq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmovgq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
new file mode 100644
index 0000000000000000000000000000000000000000..d0ec04a5ee01aab281ad3b72ef3dead35b7e681f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-cmpxchg.s
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+cmpxchg8b  (%rax)
+cmpxchg16b (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  18     3     1.00    *      *            cmpxchg8b	(%rax)
+# CHECK-NEXT:  22     3     1.00    *      *            cmpxchg16b	(%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -     2.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg8b	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchg16b	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
new file mode 100644
index 0000000000000000000000000000000000000000..757687a4af7b93de69a9f13c1b0b7321323e7436
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-f16c.s
@@ -0,0 +1,69 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+vcvtph2ps   %xmm0, %xmm2
+vcvtph2ps   (%rax), %xmm2
+
+vcvtph2ps   %xmm0, %ymm2
+vcvtph2ps   (%rax), %ymm2
+
+vcvtps2ph   $0, %xmm0, %xmm2
+vcvtps2ph   $0, %xmm0, (%rax)
+
+vcvtps2ph   $0, %ymm0, %xmm2
+vcvtps2ph   $0, %ymm0, (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      8     1.00                        vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT:  3      13    1.00    *                   vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  4      8     2.00                        vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT:  7      13    2.00    *                   vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  2      8     1.00                        vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT:  3      4     1.00           *            vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  4      8     2.00                        vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT:  4      4     2.00           *            vcvtps2ph	$0, %ymm0, (%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 2.50   2.50    -      -      -      -      -      -     1.00   1.00    -      -      -     8.00    -     12.00   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtph2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtph2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtph2ps	%xmm0, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vcvtph2ps	(%rax), %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vcvtps2ph	$0, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2ph	$0, %ymm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -     1.00    -     2.00    -      -      -      -     vcvtps2ph	$0, %ymm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
new file mode 100644
index 0000000000000000000000000000000000000000..104b07fc5e6ff98da1aab861ed2d4d75299c87d3
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma.s
@@ -0,0 +1,713 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+vfmadd132pd %xmm0, %xmm1, %xmm2
+vfmadd132pd (%rax), %xmm1, %xmm2
+
+vfmadd132pd %ymm0, %ymm1, %ymm2
+vfmadd132pd (%rax), %ymm1, %ymm2
+
+vfmadd213pd %xmm0, %xmm1, %xmm2
+vfmadd213pd (%rax), %xmm1, %xmm2
+
+vfmadd213pd %ymm0, %ymm1, %ymm2
+vfmadd213pd (%rax), %ymm1, %ymm2
+
+vfmadd231pd %xmm0, %xmm1, %xmm2
+vfmadd231pd (%rax), %xmm1, %xmm2
+
+vfmadd231pd %ymm0, %ymm1, %ymm2
+vfmadd231pd (%rax), %ymm1, %ymm2
+
+vfmadd132ps %xmm0, %xmm1, %xmm2
+vfmadd132ps (%rax), %xmm1, %xmm2
+
+vfmadd132ps %ymm0, %ymm1, %ymm2
+vfmadd132ps (%rax), %ymm1, %ymm2
+
+vfmadd213ps %xmm0, %xmm1, %xmm2
+vfmadd213ps (%rax), %xmm1, %xmm2
+
+vfmadd213ps %ymm0, %ymm1, %ymm2
+vfmadd213ps (%rax), %ymm1, %ymm2
+
+vfmadd231ps %xmm0, %xmm1, %xmm2
+vfmadd231ps (%rax), %xmm1, %xmm2
+
+vfmadd231ps %ymm0, %ymm1, %ymm2
+vfmadd231ps (%rax), %ymm1, %ymm2
+
+vfmadd132sd %xmm0, %xmm1, %xmm2
+vfmadd132sd (%rax), %xmm1, %xmm2
+
+vfmadd213sd %xmm0, %xmm1, %xmm2
+vfmadd213sd (%rax), %xmm1, %xmm2
+
+vfmadd231sd %xmm0, %xmm1, %xmm2
+vfmadd231sd (%rax), %xmm1, %xmm2
+
+vfmadd132ss %xmm0, %xmm1, %xmm2
+vfmadd132ss (%rax), %xmm1, %xmm2
+
+vfmadd213ss %xmm0, %xmm1, %xmm2
+vfmadd213ss (%rax), %xmm1, %xmm2
+
+vfmadd231ss %xmm0, %xmm1, %xmm2
+vfmadd231ss (%rax), %xmm1, %xmm2
+
+vfmaddsub132pd %xmm0, %xmm1, %xmm2
+vfmaddsub132pd (%rax), %xmm1, %xmm2
+
+vfmaddsub132pd %ymm0, %ymm1, %ymm2
+vfmaddsub132pd (%rax), %ymm1, %ymm2
+
+vfmaddsub213pd %xmm0, %xmm1, %xmm2
+vfmaddsub213pd (%rax), %xmm1, %xmm2
+
+vfmaddsub213pd %ymm0, %ymm1, %ymm2
+vfmaddsub213pd (%rax), %ymm1, %ymm2
+
+vfmaddsub231pd %xmm0, %xmm1, %xmm2
+vfmaddsub231pd (%rax), %xmm1, %xmm2
+
+vfmaddsub231pd %ymm0, %ymm1, %ymm2
+vfmaddsub231pd (%rax), %ymm1, %ymm2
+
+vfmaddsub132ps %xmm0, %xmm1, %xmm2
+vfmaddsub132ps (%rax), %xmm1, %xmm2
+
+vfmaddsub132ps %ymm0, %ymm1, %ymm2
+vfmaddsub132ps (%rax), %ymm1, %ymm2
+
+vfmaddsub213ps %xmm0, %xmm1, %xmm2
+vfmaddsub213ps (%rax), %xmm1, %xmm2
+
+vfmaddsub213ps %ymm0, %ymm1, %ymm2
+vfmaddsub213ps (%rax), %ymm1, %ymm2
+
+vfmaddsub231ps %xmm0, %xmm1, %xmm2
+vfmaddsub231ps (%rax), %xmm1, %xmm2
+
+vfmaddsub231ps %ymm0, %ymm1, %ymm2
+vfmaddsub231ps (%rax), %ymm1, %ymm2
+
+vfmsub132pd %xmm0, %xmm1, %xmm2
+vfmsub132pd (%rax), %xmm1, %xmm2
+
+vfmsub132pd %ymm0, %ymm1, %ymm2
+vfmsub132pd (%rax), %ymm1, %ymm2
+
+vfmsub213pd %xmm0, %xmm1, %xmm2
+vfmsub213pd (%rax), %xmm1, %xmm2
+
+vfmsub213pd %ymm0, %ymm1, %ymm2
+vfmsub213pd (%rax), %ymm1, %ymm2
+
+vfmsub231pd %xmm0, %xmm1, %xmm2
+vfmsub231pd (%rax), %xmm1, %xmm2
+
+vfmsub231pd %ymm0, %ymm1, %ymm2
+vfmsub231pd (%rax), %ymm1, %ymm2
+
+vfmsub132ps %xmm0, %xmm1, %xmm2
+vfmsub132ps (%rax), %xmm1, %xmm2
+
+vfmsub132ps %ymm0, %ymm1, %ymm2
+vfmsub132ps (%rax), %ymm1, %ymm2
+
+vfmsub213ps %xmm0, %xmm1, %xmm2
+vfmsub213ps (%rax), %xmm1, %xmm2
+
+vfmsub213ps %ymm0, %ymm1, %ymm2
+vfmsub213ps (%rax), %ymm1, %ymm2
+
+vfmsub231ps %xmm0, %xmm1, %xmm2
+vfmsub231ps (%rax), %xmm1, %xmm2
+
+vfmsub231ps %ymm0, %ymm1, %ymm2
+vfmsub231ps (%rax), %ymm1, %ymm2
+
+vfmsub132sd %xmm0, %xmm1, %xmm2
+vfmsub132sd (%rax), %xmm1, %xmm2
+
+vfmsub213sd %xmm0, %xmm1, %xmm2
+vfmsub213sd (%rax), %xmm1, %xmm2
+
+vfmsub231sd %xmm0, %xmm1, %xmm2
+vfmsub231sd (%rax), %xmm1, %xmm2
+
+vfmsub132ss %xmm0, %xmm1, %xmm2
+vfmsub132ss (%rax), %xmm1, %xmm2
+
+vfmsub213ss %xmm0, %xmm1, %xmm2
+vfmsub213ss (%rax), %xmm1, %xmm2
+
+vfmsub231ss %xmm0, %xmm1, %xmm2
+vfmsub231ss (%rax), %xmm1, %xmm2
+
+vfmsubadd132pd %xmm0, %xmm1, %xmm2
+vfmsubadd132pd (%rax), %xmm1, %xmm2
+
+vfmsubadd132pd %ymm0, %ymm1, %ymm2
+vfmsubadd132pd (%rax), %ymm1, %ymm2
+
+vfmsubadd213pd %xmm0, %xmm1, %xmm2
+vfmsubadd213pd (%rax), %xmm1, %xmm2
+
+vfmsubadd213pd %ymm0, %ymm1, %ymm2
+vfmsubadd213pd (%rax), %ymm1, %ymm2
+
+vfmsubadd231pd %xmm0, %xmm1, %xmm2
+vfmsubadd231pd (%rax), %xmm1, %xmm2
+
+vfmsubadd231pd %ymm0, %ymm1, %ymm2
+vfmsubadd231pd (%rax), %ymm1, %ymm2
+
+vfmsubadd132ps %xmm0, %xmm1, %xmm2
+vfmsubadd132ps (%rax), %xmm1, %xmm2
+
+vfmsubadd132ps %ymm0, %ymm1, %ymm2
+vfmsubadd132ps (%rax), %ymm1, %ymm2
+
+vfmsubadd213ps %xmm0, %xmm1, %xmm2
+vfmsubadd213ps (%rax), %xmm1, %xmm2
+
+vfmsubadd213ps %ymm0, %ymm1, %ymm2
+vfmsubadd213ps (%rax), %ymm1, %ymm2
+
+vfmsubadd231ps %xmm0, %xmm1, %xmm2
+vfmsubadd231ps (%rax), %xmm1, %xmm2
+
+vfmsubadd231ps %ymm0, %ymm1, %ymm2
+vfmsubadd231ps (%rax), %ymm1, %ymm2
+
+vfnmadd132pd %xmm0, %xmm1, %xmm2
+vfnmadd132pd (%rax), %xmm1, %xmm2
+
+vfnmadd132pd %ymm0, %ymm1, %ymm2
+vfnmadd132pd (%rax), %ymm1, %ymm2
+
+vfnmadd213pd %xmm0, %xmm1, %xmm2
+vfnmadd213pd (%rax), %xmm1, %xmm2
+
+vfnmadd213pd %ymm0, %ymm1, %ymm2
+vfnmadd213pd (%rax), %ymm1, %ymm2
+
+vfnmadd231pd %xmm0, %xmm1, %xmm2
+vfnmadd231pd (%rax), %xmm1, %xmm2
+
+vfnmadd231pd %ymm0, %ymm1, %ymm2
+vfnmadd231pd (%rax), %ymm1, %ymm2
+
+vfnmadd132ps %xmm0, %xmm1, %xmm2
+vfnmadd132ps (%rax), %xmm1, %xmm2
+
+vfnmadd132ps %ymm0, %ymm1, %ymm2
+vfnmadd132ps (%rax), %ymm1, %ymm2
+
+vfnmadd213ps %xmm0, %xmm1, %xmm2
+vfnmadd213ps (%rax), %xmm1, %xmm2
+
+vfnmadd213ps %ymm0, %ymm1, %ymm2
+vfnmadd213ps (%rax), %ymm1, %ymm2
+
+vfnmadd231ps %xmm0, %xmm1, %xmm2
+vfnmadd231ps (%rax), %xmm1, %xmm2
+
+vfnmadd231ps %ymm0, %ymm1, %ymm2
+vfnmadd231ps (%rax), %ymm1, %ymm2
+
+vfnmadd132sd %xmm0, %xmm1, %xmm2
+vfnmadd132sd (%rax), %xmm1, %xmm2
+
+vfnmadd213sd %xmm0, %xmm1, %xmm2
+vfnmadd213sd (%rax), %xmm1, %xmm2
+
+vfnmadd231sd %xmm0, %xmm1, %xmm2
+vfnmadd231sd (%rax), %xmm1, %xmm2
+
+vfnmadd132ss %xmm0, %xmm1, %xmm2
+vfnmadd132ss (%rax), %xmm1, %xmm2
+
+vfnmadd213ss %xmm0, %xmm1, %xmm2
+vfnmadd213ss (%rax), %xmm1, %xmm2
+
+vfnmadd231ss %xmm0, %xmm1, %xmm2
+vfnmadd231ss (%rax), %xmm1, %xmm2
+
+vfnmsub132pd %xmm0, %xmm1, %xmm2
+vfnmsub132pd (%rax), %xmm1, %xmm2
+
+vfnmsub132pd %ymm0, %ymm1, %ymm2
+vfnmsub132pd (%rax), %ymm1, %ymm2
+
+vfnmsub213pd %xmm0, %xmm1, %xmm2
+vfnmsub213pd (%rax), %xmm1, %xmm2
+
+vfnmsub213pd %ymm0, %ymm1, %ymm2
+vfnmsub213pd (%rax), %ymm1, %ymm2
+
+vfnmsub231pd %xmm0, %xmm1, %xmm2
+vfnmsub231pd (%rax), %xmm1, %xmm2
+
+vfnmsub231pd %ymm0, %ymm1, %ymm2
+vfnmsub231pd (%rax), %ymm1, %ymm2
+
+vfnmsub132ps %xmm0, %xmm1, %xmm2
+vfnmsub132ps (%rax), %xmm1, %xmm2
+
+vfnmsub132ps %ymm0, %ymm1, %ymm2
+vfnmsub132ps (%rax), %ymm1, %ymm2
+
+vfnmsub213ps %xmm0, %xmm1, %xmm2
+vfnmsub213ps (%rax), %xmm1, %xmm2
+
+vfnmsub213ps %ymm0, %ymm1, %ymm2
+vfnmsub213ps (%rax), %ymm1, %ymm2
+
+vfnmsub231ps %xmm0, %xmm1, %xmm2
+vfnmsub231ps (%rax), %xmm1, %xmm2
+
+vfnmsub231ps %ymm0, %ymm1, %ymm2
+vfnmsub231ps (%rax), %ymm1, %ymm2
+
+vfnmsub132sd %xmm0, %xmm1, %xmm2
+vfnmsub132sd (%rax), %xmm1, %xmm2
+
+vfnmsub213sd %xmm0, %xmm1, %xmm2
+vfnmsub213sd (%rax), %xmm1, %xmm2
+
+vfnmsub231sd %xmm0, %xmm1, %xmm2
+vfnmsub231sd (%rax), %xmm1, %xmm2
+
+vfnmsub132ss %xmm0, %xmm1, %xmm2
+vfnmsub132ss (%rax), %xmm1, %xmm2
+
+vfnmsub213ss %xmm0, %xmm1, %xmm2
+vfnmsub213ss (%rax), %xmm1, %xmm2
+
+vfnmsub231ss %xmm0, %xmm1, %xmm2
+vfnmsub231ss (%rax), %xmm1, %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50                        vfmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmaddsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfmsubadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      5     0.50                        vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      5     0.50                        vfnmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsub231ss	(%rax), %xmm1, %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 66.00  66.00   -      -      -      -      -      -     96.00  96.00   -      -      -      -     48.00  48.00  48.00  48.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsub231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmadd231ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231pd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231sd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231sd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub132ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub213ss	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ss	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsub231ss	(%rax), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
new file mode 100644
index 0000000000000000000000000000000000000000..b45abdfd387e4fcc667794a8a1ccdc6b30ab192f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-fma4.s
@@ -0,0 +1,361 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+vfmaddpd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddpd    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddpd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddpd    %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddpd    (%rax), %ymm1, %ymm2, %ymm3
+vfmaddpd    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmaddps    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddps    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddps    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddps    %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddps    (%rax), %ymm1, %ymm2, %ymm3
+vfmaddps    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmaddsd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddsd    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddsd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddss    %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddss    (%rax), %xmm1, %xmm2, %xmm3
+vfmaddss    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddsubpd %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddsubpd (%rax), %xmm1, %xmm2, %xmm3
+vfmaddsubpd %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddsubpd %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddsubpd (%rax), %ymm1, %ymm2, %ymm3
+vfmaddsubpd %ymm0, (%rax), %ymm2, %ymm3
+
+vfmaddsubps %xmm0, %xmm1, %xmm2, %xmm3
+vfmaddsubps (%rax), %xmm1, %xmm2, %xmm3
+vfmaddsubps %xmm0, (%rax), %xmm2, %xmm3
+
+vfmaddsubps %ymm0, %ymm1, %ymm2, %ymm3
+vfmaddsubps (%rax), %ymm1, %ymm2, %ymm3
+vfmaddsubps %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubaddpd %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubaddpd (%rax), %xmm1, %xmm2, %xmm3
+vfmsubaddpd %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubaddpd %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubaddpd (%rax), %ymm1, %ymm2, %ymm3
+vfmsubaddpd %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubaddps %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubaddps (%rax), %xmm1, %xmm2, %xmm3
+vfmsubaddps %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubaddps %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubaddps (%rax), %ymm1, %ymm2, %ymm3
+vfmsubaddps %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubpd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubpd    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubpd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubpd    %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubpd    (%rax), %ymm1, %ymm2, %ymm3
+vfmsubpd    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubps    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubps    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubps    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubps    %ymm0, %ymm1, %ymm2, %ymm3
+vfmsubps    (%rax), %ymm1, %ymm2, %ymm3
+vfmsubps    %ymm0, (%rax), %ymm2, %ymm3
+
+vfmsubsd    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubsd    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubsd    %xmm0, (%rax), %xmm2, %xmm3
+
+vfmsubss    %xmm0, %xmm1, %xmm2, %xmm3
+vfmsubss    (%rax), %xmm1, %xmm2, %xmm3
+vfmsubss    %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddpd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddpd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddpd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddpd   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmaddpd   (%rax), %ymm1, %ymm2, %ymm3
+vfnmaddpd   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmaddps   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddps   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddps   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddps   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmaddps   (%rax), %ymm1, %ymm2, %ymm3
+vfnmaddps   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmaddsd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddsd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddsd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmaddss   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmaddss   (%rax), %xmm1, %xmm2, %xmm3
+vfnmaddss   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubpd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubpd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubpd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubpd   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmsubpd   (%rax), %ymm1, %ymm2, %ymm3
+vfnmsubpd   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmsubps   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubps   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubps   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubps   %ymm0, %ymm1, %ymm2, %ymm3
+vfnmsubps   (%rax), %ymm1, %ymm2, %ymm3
+vfnmsubps   %ymm0, (%rax), %ymm2, %ymm3
+
+vfnmsubsd   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubsd   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubsd   %xmm0, (%rax), %xmm2, %xmm3
+
+vfnmsubss   %xmm0, %xmm1, %xmm2, %xmm3
+vfnmsubss   (%rax), %xmm1, %xmm2, %xmm3
+vfnmsubss   %xmm0, (%rax), %xmm2, %xmm3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50                        vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  2      5     0.50                        vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      10    1.00    *                   vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  1      5     0.50                        vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    0.50    *                   vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 44.00  44.00   -      -      -      -      -      -     48.00  48.00   -      -      -      -     24.00  24.00  24.00  24.00   -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmaddsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfmsubss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmaddss	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubpd	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	(%rax), %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubps	%ymm0, (%rax), %ymm2, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubsd	%xmm0, (%rax), %xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	(%rax), %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.25   0.25   0.25   0.25    -      -     vfnmsubss	%xmm0, (%rax), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lea.s b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
new file mode 100644
index 0000000000000000000000000000000000000000..246d968a32ec5a4e0474586c5fd55cc969c41a25
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lea.s
@@ -0,0 +1,449 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+lea 0(), %cx
+lea 0(), %ecx
+lea 0(), %rcx
+lea (%eax), %cx
+lea (%eax), %ecx
+lea (%eax), %rcx
+lea (%rax), %cx
+lea (%rax), %ecx
+lea (%rax), %rcx
+lea (, %ebx), %cx
+lea (, %ebx), %ecx
+lea (, %ebx), %rcx
+lea (, %rbx), %cx
+lea (, %rbx), %ecx
+lea (, %rbx), %rcx
+lea (, %ebx, 1), %cx
+lea (, %ebx, 1), %ecx
+lea (, %ebx, 1), %rcx
+lea (, %rbx, 1), %cx
+lea (, %rbx, 1), %ecx
+lea (, %rbx, 1), %rcx
+lea (, %ebx, 2), %cx
+lea (, %ebx, 2), %ecx
+lea (, %ebx, 2), %rcx
+lea (, %rbx, 2), %cx
+lea (, %rbx, 2), %ecx
+lea (, %rbx, 2), %rcx
+lea (%eax, %ebx), %cx
+lea (%eax, %ebx), %ecx
+lea (%eax, %ebx), %rcx
+lea (%rax, %rbx), %cx
+lea (%rax, %rbx), %ecx
+lea (%rax, %rbx), %rcx
+lea (%eax, %ebx, 1), %cx
+lea (%eax, %ebx, 1), %ecx
+lea (%eax, %ebx, 1), %rcx
+lea (%rax, %rbx, 1), %cx
+lea (%rax, %rbx, 1), %ecx
+lea (%rax, %rbx, 1), %rcx
+lea (%eax, %ebx, 2), %cx
+lea (%eax, %ebx, 2), %ecx
+lea (%eax, %ebx, 2), %rcx
+lea (%rax, %rbx, 2), %cx
+lea (%rax, %rbx, 2), %ecx
+lea (%rax, %rbx, 2), %rcx
+
+lea -16(), %cx
+lea -16(), %ecx
+lea -16(), %rcx
+lea -16(%eax), %cx
+lea -16(%eax), %ecx
+lea -16(%eax), %rcx
+lea -16(%rax), %cx
+lea -16(%rax), %ecx
+lea -16(%rax), %rcx
+lea -16(, %ebx), %cx
+lea -16(, %ebx), %ecx
+lea -16(, %ebx), %rcx
+lea -16(, %rbx), %cx
+lea -16(, %rbx), %ecx
+lea -16(, %rbx), %rcx
+lea -16(, %ebx, 1), %cx
+lea -16(, %ebx, 1), %ecx
+lea -16(, %ebx, 1), %rcx
+lea -16(, %rbx, 1), %cx
+lea -16(, %rbx, 1), %ecx
+lea -16(, %rbx, 1), %rcx
+lea -16(, %ebx, 2), %cx
+lea -16(, %ebx, 2), %ecx
+lea -16(, %ebx, 2), %rcx
+lea -16(, %rbx, 2), %cx
+lea -16(, %rbx, 2), %ecx
+lea -16(, %rbx, 2), %rcx
+lea -16(%eax, %ebx), %cx
+lea -16(%eax, %ebx), %ecx
+lea -16(%eax, %ebx), %rcx
+lea -16(%rax, %rbx), %cx
+lea -16(%rax, %rbx), %ecx
+lea -16(%rax, %rbx), %rcx
+lea -16(%eax, %ebx, 1), %cx
+lea -16(%eax, %ebx, 1), %ecx
+lea -16(%eax, %ebx, 1), %rcx
+lea -16(%rax, %rbx, 1), %cx
+lea -16(%rax, %rbx, 1), %ecx
+lea -16(%rax, %rbx, 1), %rcx
+lea -16(%eax, %ebx, 2), %cx
+lea -16(%eax, %ebx, 2), %ecx
+lea -16(%eax, %ebx, 2), %rcx
+lea -16(%rax, %rbx, 2), %cx
+lea -16(%rax, %rbx, 2), %ecx
+lea -16(%rax, %rbx, 2), %rcx
+
+lea 1024(), %cx
+lea 1024(), %ecx
+lea 1024(), %rcx
+lea 1024(%eax), %cx
+lea 1024(%eax), %ecx
+lea 1024(%eax), %rcx
+lea 1024(%rax), %cx
+lea 1024(%rax), %ecx
+lea 1024(%rax), %rcx
+lea 1024(, %ebx), %cx
+lea 1024(, %ebx), %ecx
+lea 1024(, %ebx), %rcx
+lea 1024(, %rbx), %cx
+lea 1024(, %rbx), %ecx
+lea 1024(, %rbx), %rcx
+lea 1024(, %ebx, 1), %cx
+lea 1024(, %ebx, 1), %ecx
+lea 1024(, %ebx, 1), %rcx
+lea 1024(, %rbx, 1), %cx
+lea 1024(, %rbx, 1), %ecx
+lea 1024(, %rbx, 1), %rcx
+lea 1024(, %ebx, 2), %cx
+lea 1024(, %ebx, 2), %ecx
+lea 1024(, %ebx, 2), %rcx
+lea 1024(, %rbx, 2), %cx
+lea 1024(, %rbx, 2), %ecx
+lea 1024(, %rbx, 2), %rcx
+lea 1024(%eax, %ebx), %cx
+lea 1024(%eax, %ebx), %ecx
+lea 1024(%eax, %ebx), %rcx
+lea 1024(%rax, %rbx), %cx
+lea 1024(%rax, %rbx), %ecx
+lea 1024(%rax, %rbx), %rcx
+lea 1024(%eax, %ebx, 1), %cx
+lea 1024(%eax, %ebx, 1), %ecx
+lea 1024(%eax, %ebx, 1), %rcx
+lea 1024(%rax, %rbx, 1), %cx
+lea 1024(%rax, %rbx, 1), %ecx
+lea 1024(%rax, %rbx, 1), %rcx
+lea 1024(%eax, %ebx, 2), %cx
+lea 1024(%eax, %ebx, 2), %ecx
+lea 1024(%eax, %ebx, 2), %rcx
+lea 1024(%rax, %rbx, 2), %cx
+lea 1024(%rax, %rbx, 2), %ecx
+lea 1024(%rax, %rbx, 2), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      1     0.50                        leaw	0, %cx
+# CHECK-NEXT:  2      1     0.50                        leal	0, %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	0, %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16, %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16, %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16, %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024, %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024, %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024, %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  2      1     0.50                        leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  2      1     0.50                        leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  2      1     0.50                        leaq	1024(%rax,%rbx,2), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     67.50  67.50   -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	0, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	0, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	-16(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	-16(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	-16(%rax,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024, %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(,%rbx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%eax,%ebx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%eax,%ebx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%eax,%ebx,2), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaw	1024(%rax,%rbx,2), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leal	1024(%rax,%rbx,2), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	1024(%rax,%rbx,2), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
new file mode 100644
index 0000000000000000000000000000000000000000..1b2b38fb4a55e5c6b995701fe323bcad2a4e86fe
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-lzcnt.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+lzcntw      %cx, %cx
+lzcntw      (%rax), %cx
+
+lzcntl      %eax, %ecx
+lzcntl      (%rax), %ecx
+
+lzcntq      %rax, %rcx
+lzcntq      (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.50                        lzcntw	%cx, %cx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntw	(%rax), %cx
+# CHECK-NEXT:  2      2     0.50                        lzcntl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        lzcntq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   lzcntq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	%cx, %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lzcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
new file mode 100644
index 0000000000000000000000000000000000000000..3dcc8083125eb2b2825ccff84a1af60664cce3ac
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-mmx.s
@@ -0,0 +1,405 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+emms
+
+movd        %eax, %mm2
+movd        (%rax), %mm2
+
+movd        %mm0, %ecx
+movd        %mm0, (%rax)
+
+movq        %rax, %mm2
+movq        (%rax), %mm2
+
+movq        %mm0, %rcx
+movq        %mm0, (%rax)
+
+packsswb    %mm0, %mm2
+packsswb    (%rax), %mm2
+
+packssdw    %mm0, %mm2
+packssdw    (%rax), %mm2
+
+packuswb    %mm0, %mm2
+packuswb    (%rax), %mm2
+
+paddb       %mm0, %mm2
+paddb       (%rax), %mm2
+
+paddd       %mm0, %mm2
+paddd       (%rax), %mm2
+
+paddsb      %mm0, %mm2
+paddsb      (%rax), %mm2
+
+paddsw      %mm0, %mm2
+paddsw      (%rax), %mm2
+
+paddusb     %mm0, %mm2
+paddusb     (%rax), %mm2
+
+paddusw     %mm0, %mm2
+paddusw     (%rax), %mm2
+
+paddw       %mm0, %mm2
+paddw       (%rax), %mm2
+
+pand        %mm0, %mm2
+pand        (%rax), %mm2
+
+pandn       %mm0, %mm2
+pandn       (%rax), %mm2
+
+pcmpeqb     %mm0, %mm2
+pcmpeqb     (%rax), %mm2
+
+pcmpeqd     %mm0, %mm2
+pcmpeqd     (%rax), %mm2
+
+pcmpeqw     %mm0, %mm2
+pcmpeqw     (%rax), %mm2
+
+pcmpgtb     %mm0, %mm2
+pcmpgtb     (%rax), %mm2
+
+pcmpgtd     %mm0, %mm2
+pcmpgtd     (%rax), %mm2
+
+pcmpgtw     %mm0, %mm2
+pcmpgtw     (%rax), %mm2
+
+pmaddwd     %mm0, %mm2
+pmaddwd     (%rax), %mm2
+
+pmulhw      %mm0, %mm2
+pmulhw      (%rax), %mm2
+
+pmullw      %mm0, %mm2
+pmullw      (%rax), %mm2
+
+por         %mm0, %mm2
+por         (%rax), %mm2
+
+pslld       $1, %mm2
+pslld       %mm0, %mm2
+pslld       (%rax), %mm2
+
+psllq       $1, %mm2
+psllq       %mm0, %mm2
+psllq       (%rax), %mm2
+
+psllw       $1, %mm2
+psllw       %mm0, %mm2
+psllw       (%rax), %mm2
+
+psrad       $1, %mm2
+psrad       %mm0, %mm2
+psrad       (%rax), %mm2
+
+psraw       $1, %mm2
+psraw       %mm0, %mm2
+psraw       (%rax), %mm2
+
+psrld       $1, %mm2
+psrld       %mm0, %mm2
+psrld       (%rax), %mm2
+
+psrlq       $1, %mm2
+psrlq       %mm0, %mm2
+psrlq       (%rax), %mm2
+
+psrlw       $1, %mm2
+psrlw       %mm0, %mm2
+psrlw       (%rax), %mm2
+
+psubb       %mm0, %mm2
+psubb       (%rax), %mm2
+
+psubd       %mm0, %mm2
+psubd       (%rax), %mm2
+
+psubsb      %mm0, %mm2
+psubsb      (%rax), %mm2
+
+psubsw      %mm0, %mm2
+psubsw      (%rax), %mm2
+
+psubusb     %mm0, %mm2
+psubusb     (%rax), %mm2
+
+psubusw     %mm0, %mm2
+psubusw     (%rax), %mm2
+
+psubw       %mm0, %mm2
+psubw       (%rax), %mm2
+
+punpckhbw   %mm0, %mm2
+punpckhbw   (%rax), %mm2
+
+punpckhdq   %mm0, %mm2
+punpckhdq   (%rax), %mm2
+
+punpckhwd   %mm0, %mm2
+punpckhwd   (%rax), %mm2
+
+punpcklbw   %mm0, %mm2
+punpcklbw   (%rax), %mm2
+
+punpckldq   %mm0, %mm2
+punpckldq   (%rax), %mm2
+
+punpcklwd   %mm0, %mm2
+punpcklwd   (%rax), %mm2
+
+pxor        %mm0, %mm2
+pxor        (%rax), %mm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50    *      *      U     emms
+# CHECK-NEXT:  2      10    0.50                        movd	%eax, %mm2
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rax), %mm2
+# CHECK-NEXT:  1      10    1.00                        movd	%mm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *      U     movd	%mm0, (%rax)
+# CHECK-NEXT:  2      10    0.50                        movq	%rax, %mm2
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %mm2
+# CHECK-NEXT:  1      10    1.00                        movq	%mm0, %rcx
+# CHECK-NEXT:  1      2     1.00           *            movq	%mm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        packsswb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   packsswb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        packssdw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   packssdw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        packuswb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   packuswb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddsb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddusb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddusw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pand	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pand	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pandn	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pandn	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmaddwd	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmulhw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmullw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmullw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        por	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   por	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pslld	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        pslld	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   pslld	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psllq	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psllq	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psllq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psllw	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psllw	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psllw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrad	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrad	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrad	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psraw	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psraw	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psraw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrld	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrld	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrld	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrlq	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrlq	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psrlw	$1, %mm2
+# CHECK-NEXT:  1      3     0.50                        psrlw	%mm0, %mm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckhbw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckhdq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckhwd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpcklbw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpckldq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckldq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        punpcklwd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pxor	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pxor	(%rax), %mm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.00  24.00   -      -      -     2.00    -      -     2.50   2.50   46.00  46.00  6.00   2.00   55.50  49.50   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     emms
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movd	%eax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movd	%mm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movd	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movq	%rax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movq	%mm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	$1, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	(%rax), %mm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
new file mode 100644
index 0000000000000000000000000000000000000000..92367b17eef3b4ead146ac6cc19d8d89a6f97aaf
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-movbe.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+movbe  %cx, (%rax)
+movbe  (%rax), %cx
+
+movbe  %ecx, (%rax)
+movbe  (%rax), %ecx
+
+movbe  %rcx, (%rax)
+movbe  (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50           *            movbew	%cx, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movbew	(%rax), %cx
+# CHECK-NEXT:  1      1     0.50           *            movbel	%ecx, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movbel	(%rax), %ecx
+# CHECK-NEXT:  1      1     0.50           *            movbeq	%rcx, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movbeq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 3.00   3.00    -      -      -     1.50   1.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbew	%cx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbew	(%rax), %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbel	%ecx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbel	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movbeq	%rcx, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movbeq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
new file mode 100644
index 0000000000000000000000000000000000000000..81bbc40143a3aab0af413328bb496d035f506795
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-pclmul.s
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+pclmulqdq     $11, %xmm0, %xmm2
+pclmulqdq     $11, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  5      12    1.00                        pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT:  6      17    1.00    *                   pclmulqdq	$11, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     2.00    -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pclmulqdq	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pclmulqdq	$11, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
new file mode 100644
index 0000000000000000000000000000000000000000..d31ed6cc528647b0692069cf6b1a8312fe894ff7
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-popcnt.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+popcntw     %cx, %cx
+popcntw     (%rax), %cx
+
+popcntl     %eax, %ecx
+popcntl     (%rax), %ecx
+
+popcntq     %rax, %rcx
+popcntq     (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      4     0.50                        popcntw	%cx, %cx
+# CHECK-NEXT:  1      8     0.50    *                   popcntw	(%rax), %cx
+# CHECK-NEXT:  1      4     0.50                        popcntl	%eax, %ecx
+# CHECK-NEXT:  1      8     0.50    *                   popcntl	(%rax), %ecx
+# CHECK-NEXT:  1      4     0.50                        popcntq	%rax, %rcx
+# CHECK-NEXT:  1      8     0.50    *                   popcntq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.50   1.50    -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	%cx, %cx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntw	(%rax), %cx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     popcntq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
new file mode 100644
index 0000000000000000000000000000000000000000..c6973d7bb86e75ea8bdfeca636b94308853a488b
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-prefetchw.s
@@ -0,0 +1,48 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+prefetch    (%rax)
+prefetchw   (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     0.50    *      *            prefetch	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetchw	(%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetch	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchw	(%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
new file mode 100644
index 0000000000000000000000000000000000000000..85fa5d56db93e7d110b7a936295ef35ba7090e59
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse1.s
@@ -0,0 +1,473 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+addps       %xmm0, %xmm2
+addps       (%rax), %xmm2
+
+addss       %xmm0, %xmm2
+addss       (%rax), %xmm2
+
+andnps      %xmm0, %xmm2
+andnps      (%rax), %xmm2
+
+andps       %xmm0, %xmm2
+andps       (%rax), %xmm2
+
+cmpps       $0, %xmm0, %xmm2
+cmpps       $0, (%rax), %xmm2
+
+cmpss       $0, %xmm0, %xmm2
+cmpss       $0, (%rax), %xmm2
+
+comiss      %xmm0, %xmm1
+comiss      (%rax), %xmm1
+
+cvtpi2ps    %mm0, %xmm2
+cvtpi2ps    (%rax), %xmm2
+
+cvtps2pi    %xmm0, %mm2
+cvtps2pi    (%rax), %mm2
+
+cvtsi2ss    %ecx, %xmm2
+cvtsi2ss    %rcx, %xmm2
+cvtsi2ss    (%rax), %xmm2
+cvtsi2ss    (%rax), %xmm2
+
+cvtss2si    %xmm0, %ecx
+cvtss2si    %xmm0, %rcx
+cvtss2si    (%rax), %ecx
+cvtss2si    (%rax), %rcx
+
+cvttps2pi   %xmm0, %mm2
+cvttps2pi   (%rax), %mm2
+
+cvttss2si   %xmm0, %ecx
+cvttss2si   %xmm0, %rcx
+cvttss2si   (%rax), %ecx
+cvttss2si   (%rax), %rcx
+
+divps       %xmm0, %xmm2
+divps       (%rax), %xmm2
+
+divss       %xmm0, %xmm2
+divss       (%rax), %xmm2
+
+ldmxcsr     (%rax)
+
+maskmovq    %mm0, %mm1
+
+maxps       %xmm0, %xmm2
+maxps       (%rax), %xmm2
+
+maxss       %xmm0, %xmm2
+maxss       (%rax), %xmm2
+
+minps       %xmm0, %xmm2
+minps       (%rax), %xmm2
+
+minss       %xmm0, %xmm2
+minss       (%rax), %xmm2
+
+movaps      %xmm0, %xmm2
+movaps      %xmm0, (%rax)
+movaps      (%rax), %xmm2
+
+movhlps     %xmm0, %xmm2
+movlhps     %xmm0, %xmm2
+
+movhps      %xmm0, (%rax)
+movhps      (%rax), %xmm2
+
+movlps      %xmm0, (%rax)
+movlps      (%rax), %xmm2
+
+movmskps    %xmm0, %rcx
+
+movntps     %xmm0, (%rax)
+movntq      %mm0, (%rax)
+
+movss       %xmm0, %xmm2
+movss       %xmm0, (%rax)
+movss       (%rax), %xmm2
+
+movups      %xmm0, %xmm2
+movups      %xmm0, (%rax)
+movups      (%rax), %xmm2
+
+mulps       %xmm0, %xmm2
+mulps       (%rax), %xmm2
+
+mulss       %xmm0, %xmm2
+mulss       (%rax), %xmm2
+
+orps        %xmm0, %xmm2
+orps        (%rax), %xmm2
+
+pavgb       %mm0, %mm2
+pavgb       (%rax), %mm2
+
+pavgw       %mm0, %mm2
+pavgw       (%rax), %mm2
+
+pextrw      $1, %mm0, %rcx
+
+pinsrw      $1, %rax, %mm2
+pinsrw      $1, (%rax), %mm2
+
+pmaxsw      %mm0, %mm2
+pmaxsw      (%rax), %mm2
+
+pmaxub      %mm0, %mm2
+pmaxub      (%rax), %mm2
+
+pminsw      %mm0, %mm2
+pminsw      (%rax), %mm2
+
+pminub      %mm0, %mm2
+pminub      (%rax), %mm2
+
+pmovmskb    %xmm0, %rcx
+
+pmulhuw     %mm0, %mm2
+pmulhuw     (%rax), %mm2
+
+prefetcht0  (%rax)
+prefetcht1  (%rax)
+prefetcht2  (%rax)
+prefetchnta (%rax)
+
+psadbw      %mm0, %mm2
+psadbw      (%rax), %mm2
+
+pshufw      $1, %mm0, %mm2
+pshufw      $1, (%rax), %mm2
+
+rcpps       %xmm0, %xmm2
+rcpps       (%rax), %xmm2
+
+rcpss       %xmm0, %xmm2
+rcpss       (%rax), %xmm2
+
+rsqrtps     %xmm0, %xmm2
+rsqrtps     (%rax), %xmm2
+
+rsqrtss     %xmm0, %xmm2
+rsqrtss     (%rax), %xmm2
+
+sfence
+
+shufps      $1, %xmm0, %xmm2
+shufps      $1, (%rax), %xmm2
+
+sqrtps      %xmm0, %xmm2
+sqrtps      (%rax), %xmm2
+
+sqrtss      %xmm0, %xmm2
+sqrtss      (%rax), %xmm2
+
+stmxcsr     (%rax)
+
+subps       %xmm0, %xmm2
+subps       (%rax), %xmm2
+
+subss       %xmm0, %xmm2
+subss       (%rax), %xmm2
+
+ucomiss     %xmm0, %xmm1
+ucomiss     (%rax), %xmm1
+
+unpckhps    %xmm0, %xmm2
+unpckhps    (%rax), %xmm2
+
+unpcklps    %xmm0, %xmm2
+unpcklps    (%rax), %xmm2
+
+xorps       %xmm0, %xmm2
+xorps       (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        addps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addss	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andnps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andnps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        comiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   comiss	(%rax), %xmm1
+# CHECK-NEXT:  2      4     1.00                        cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  2      4     1.00                        cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvtss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvtss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  1      4     1.00                        cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  2      13    1.00                        cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvttss2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvttss2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  1      9     9.50                        divps	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divps	(%rax), %xmm2
+# CHECK-NEXT:  1      9     9.50                        divss	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divss	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *      *      U     ldmxcsr	(%rax)
+# CHECK-NEXT:  1      2     0.50    *      *      U     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  1      2     1.00                        maxps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        maxss	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxss	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minss	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movaps	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movaps	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movaps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     0.50                        movlhps	%xmm0, %xmm2
+# CHECK-NEXT:  2      2     1.00           *            movhps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movhps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movlps	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movlps	(%rax), %xmm2
+# CHECK-NEXT:  2      10    1.00                        movmskps	%xmm0, %ecx
+# CHECK-NEXT:  1      3     1.00           *            movntps	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     1.00    *      *      U     movntq	%mm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movss	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movss	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movups	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movups	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movups	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulss	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        orps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   orps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pavgb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pavgw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgw	(%rax), %mm2
+# CHECK-NEXT:  2      13    1.00                        pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  2      2     0.50                        pinsrw	$1, %eax, %mm2
+# CHECK-NEXT:  2      6     0.50    *                   pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pmaxsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pmaxub	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxub	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pminsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pminub	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pminub	(%rax), %mm2
+# CHECK-NEXT:  2      13    1.00                        pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      4     1.00                        pmulhuw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhuw	(%rax), %mm2
+# CHECK-NEXT:  1      5     0.50    *      *            prefetcht0	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetcht1	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetcht2	(%rax)
+# CHECK-NEXT:  1      5     0.50    *      *            prefetchnta	(%rax)
+# CHECK-NEXT:  2      4     0.50                        psadbw	%mm0, %mm2
+# CHECK-NEXT:  2      9     0.50    *                   psadbw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pshufw	$1, %mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  1      5     1.00                        rcpps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rcpps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        rcpss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rcpss	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        rsqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        rsqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   rsqrtss	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50    *      *      U     sfence
+# CHECK-NEXT:  1      2     0.50                        shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      9     10.50                       sqrtps	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  1      9     10.50                       sqrtss	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    10.50   *                   sqrtss	(%rax), %xmm2
+# CHECK-NEXT:  2      1     0.50    *      *      U     stmxcsr	(%rax)
+# CHECK-NEXT:  1      5     1.00                        subps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        subss	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subss	(%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        ucomiss	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        unpckhps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        unpcklps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        xorps	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   xorps	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 33.50  33.50   -      -      -     15.00   -      -     115.50 115.50 9.50   9.50   2.00   25.00  50.50  66.50   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpps	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpps	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpss	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpss	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2ps	%mm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssq	%rcx, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2ssl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttss2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     ldmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     maskmovq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movaps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movaps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movaps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhlps	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlhps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movhps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhps	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movlps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movmskps	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntps	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntq	%mm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movss	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movups	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movups	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movups	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrw	$1, %mm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrw	$1, %eax, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	(%rax), %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht0	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht1	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetcht2	(%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     prefetchnta	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufw	$1, %mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufw	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rcpss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     rsqrtss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     sfence
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     10.50  10.50   -      -      -      -      -     1.00    -      -      -      -     sqrtss	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     stmxcsr	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomiss	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomiss	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorps	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
new file mode 100644
index 0000000000000000000000000000000000000000..23be05e554a66bc131ac89d1c90327715e89169b
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse2.s
@@ -0,0 +1,961 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+addpd       %xmm0, %xmm2
+addpd       (%rax), %xmm2
+
+addsd       %xmm0, %xmm2
+addsd       (%rax), %xmm2
+
+andnpd      %xmm0, %xmm2
+andnpd      (%rax), %xmm2
+
+andpd       %xmm0, %xmm2
+andpd       (%rax), %xmm2
+
+clflush     (%rax)
+
+cmppd       $0, %xmm0, %xmm2
+cmppd       $0, (%rax), %xmm2
+
+cmpsd       $0, %xmm0, %xmm2
+cmpsd       $0, (%rax), %xmm2
+
+comisd      %xmm0, %xmm1
+comisd      (%rax), %xmm1
+
+cvtdq2pd    %xmm0, %xmm2
+cvtdq2pd    (%rax), %xmm2
+
+cvtdq2ps    %xmm0, %xmm2
+cvtdq2ps    (%rax), %xmm2
+
+cvtpd2dq    %xmm0, %xmm2
+cvtpd2dq    (%rax), %xmm2
+
+cvtpd2pi    %xmm0, %mm2
+cvtpd2pi    (%rax), %mm2
+
+cvtpd2ps    %xmm0, %xmm2
+cvtpd2ps    (%rax), %xmm2
+
+cvtpi2pd    %mm0, %xmm2
+cvtpi2pd    (%rax), %xmm2
+
+cvtps2dq    %xmm0, %xmm2
+cvtps2dq    (%rax), %xmm2
+
+cvtps2pd    %xmm0, %xmm2
+cvtps2pd    (%rax), %xmm2
+
+cvtsd2si    %xmm0, %ecx
+cvtsd2si    %xmm0, %rcx
+cvtsd2si    (%rax), %ecx
+cvtsd2si    (%rax), %rcx
+
+cvtsd2ss    %xmm0, %xmm2
+cvtsd2ss    (%rax), %xmm2
+
+cvtsi2sd    %ecx, %xmm2
+cvtsi2sd    %rcx, %xmm2
+cvtsi2sd    (%rax), %xmm2
+cvtsi2sd    (%rax), %xmm2
+
+cvtss2sd    %xmm0, %xmm2
+cvtss2sd    (%rax), %xmm2
+
+cvttpd2dq   %xmm0, %xmm2
+cvttpd2dq   (%rax), %xmm2
+
+cvttpd2pi   %xmm0, %mm2
+cvttpd2pi   (%rax), %mm2
+
+cvttps2dq   %xmm0, %xmm2
+cvttps2dq   (%rax), %xmm2
+
+cvttsd2si   %xmm0, %ecx
+cvttsd2si   %xmm0, %rcx
+cvttsd2si   (%rax), %ecx
+cvttsd2si   (%rax), %rcx
+
+divpd       %xmm0, %xmm2
+divpd       (%rax), %xmm2
+
+divsd       %xmm0, %xmm2
+divsd       (%rax), %xmm2
+
+lfence
+
+maskmovdqu  %xmm0, %xmm1
+
+maxpd       %xmm0, %xmm2
+maxpd       (%rax), %xmm2
+
+maxsd       %xmm0, %xmm2
+maxsd       (%rax), %xmm2
+
+minpd       %xmm0, %xmm2
+minpd       (%rax), %xmm2
+
+minsd       %xmm0, %xmm2
+minsd       (%rax), %xmm2
+
+movapd      %xmm0, %xmm2
+movapd      %xmm0, (%rax)
+movapd      (%rax), %xmm2
+
+movd        %eax, %xmm2
+movd        (%rax), %xmm2
+
+movd        %xmm0, %ecx
+movd        %xmm0, (%rax)
+
+movdqa      %xmm0, %xmm2
+movdqa      %xmm0, (%rax)
+movdqa      (%rax), %xmm2
+
+movdqu      %xmm0, %xmm2
+movdqu      %xmm0, (%rax)
+movdqu      (%rax), %xmm2
+
+movdq2q     %xmm0, %mm2
+
+movhpd      %xmm0, (%rax)
+movhpd      (%rax), %xmm2
+
+movlpd      %xmm0, (%rax)
+movlpd      (%rax), %xmm2
+
+movmskpd    %xmm0, %rcx
+
+movntil     %eax, (%rax)
+movntiq     %rax, (%rax)
+
+movntdq     %xmm0, (%rax)
+movntpd     %xmm0, (%rax)
+
+movq        %xmm0, %xmm2
+
+movq        %rax, %xmm2
+movq        (%rax), %xmm2
+
+movq        %xmm0, %rcx
+movq        %xmm0, (%rax)
+
+movq2dq     %mm0, %xmm2
+
+movsd       %xmm0, %xmm2
+movsd       %xmm0, (%rax)
+movsd       (%rax), %xmm2
+
+movupd      %xmm0, %xmm2
+movupd      %xmm0, (%rax)
+movupd      (%rax), %xmm2
+
+mulpd       %xmm0, %xmm2
+mulpd       (%rax), %xmm2
+
+mulsd       %xmm0, %xmm2
+mulsd       (%rax), %xmm2
+
+orpd        %xmm0, %xmm2
+orpd        (%rax), %xmm2
+
+packssdw    %xmm0, %xmm2
+packssdw    (%rax), %xmm2
+
+packsswb    %xmm0, %xmm2
+packsswb    (%rax), %xmm2
+
+packuswb    %xmm0, %xmm2
+packuswb    (%rax), %xmm2
+
+paddb       %xmm0, %xmm2
+paddb       (%rax), %xmm2
+
+paddd       %xmm0, %xmm2
+paddd       (%rax), %xmm2
+
+paddq       %mm0, %mm2
+paddq       (%rax), %mm2
+
+paddq       %xmm0, %xmm2
+paddq       (%rax), %xmm2
+
+paddsb      %xmm0, %xmm2
+paddsb      (%rax), %xmm2
+
+paddsw      %xmm0, %xmm2
+paddsw      (%rax), %xmm2
+
+paddusb     %xmm0, %xmm2
+paddusb     (%rax), %xmm2
+
+paddusw     %xmm0, %xmm2
+paddusw     (%rax), %xmm2
+
+paddw       %xmm0, %xmm2
+paddw       (%rax), %xmm2
+
+pand        %xmm0, %xmm2
+pand        (%rax), %xmm2
+
+pandn       %xmm0, %xmm2
+pandn       (%rax), %xmm2
+
+pavgb       %xmm0, %xmm2
+pavgb       (%rax), %xmm2
+
+pavgw       %xmm0, %xmm2
+pavgw       (%rax), %xmm2
+
+pcmpeqb     %xmm0, %xmm2
+pcmpeqb     (%rax), %xmm2
+
+pcmpeqd     %xmm0, %xmm2
+pcmpeqd     (%rax), %xmm2
+
+pcmpeqw     %xmm0, %xmm2
+pcmpeqw     (%rax), %xmm2
+
+pcmpgtb     %xmm0, %xmm2
+pcmpgtb     (%rax), %xmm2
+
+pcmpgtd     %xmm0, %xmm2
+pcmpgtd     (%rax), %xmm2
+
+pcmpgtw     %xmm0, %xmm2
+pcmpgtw     (%rax), %xmm2
+
+pextrw      $1, %xmm0, %rcx
+
+pmaddwd     %xmm0, %xmm2
+pmaddwd     (%rax), %xmm2
+
+pmaxsw      %xmm0, %xmm2
+pmaxsw      (%rax), %xmm2
+
+pmaxub      %xmm0, %xmm2
+pmaxub      (%rax), %xmm2
+
+pminsw      %xmm0, %xmm2
+pminsw      (%rax), %xmm2
+
+pminub      %xmm0, %xmm2
+pminub      (%rax), %xmm2
+
+pmovmskb    %xmm0, %rcx
+
+pmulhuw     %xmm0, %xmm2
+pmulhuw     (%rax), %xmm2
+
+pmulhw      %xmm0, %xmm2
+pmulhw      (%rax), %xmm2
+
+pmullw      %xmm0, %xmm2
+pmullw      (%rax), %xmm2
+
+pmuludq     %mm0, %mm2
+pmuludq     (%rax), %mm2
+
+pmuludq     %xmm0, %xmm2
+pmuludq     (%rax), %xmm2
+
+por         %xmm0, %xmm2
+por         (%rax), %xmm2
+
+psadbw      %xmm0, %xmm2
+psadbw      (%rax), %xmm2
+
+pshufd      $1, %xmm0, %xmm2
+pshufd      $1, (%rax), %xmm2
+
+pshufhw     $1, %xmm0, %xmm2
+pshufhw     $1, (%rax), %xmm2
+
+pshuflw     $1, %xmm0, %xmm2
+pshuflw     $1, (%rax), %xmm2
+
+pslld       $1, %xmm2
+pslld       %xmm0, %xmm2
+pslld       (%rax), %xmm2
+
+pslldq      $1, %xmm2
+
+psllq       $1, %xmm2
+psllq       %xmm0, %xmm2
+psllq       (%rax), %xmm2
+
+psllw       $1, %xmm2
+psllw       %xmm0, %xmm2
+psllw       (%rax), %xmm2
+
+psrad       $1, %xmm2
+psrad       %xmm0, %xmm2
+psrad       (%rax), %xmm2
+
+psraw       $1, %xmm2
+psraw       %xmm0, %xmm2
+psraw       (%rax), %xmm2
+
+psrld       $1, %xmm2
+psrld       %xmm0, %xmm2
+psrld       (%rax), %xmm2
+
+psrldq      $1, %xmm2
+
+psrlq       $1, %xmm2
+psrlq       %xmm0, %xmm2
+psrlq       (%rax), %xmm2
+
+psrlw       $1, %xmm2
+psrlw       %xmm0, %xmm2
+psrlw       (%rax), %xmm2
+
+psubb       %xmm0, %xmm2
+psubb       (%rax), %xmm2
+
+psubd       %xmm0, %xmm2
+psubd       (%rax), %xmm2
+
+psubq       %mm0, %mm2
+psubq       (%rax), %mm2
+
+psubq       %xmm0, %xmm2
+psubq       (%rax), %xmm2
+
+psubsb      %xmm0, %xmm2
+psubsb      (%rax), %xmm2
+
+psubsw      %xmm0, %xmm2
+psubsw      (%rax), %xmm2
+
+psubusb     %xmm0, %xmm2
+psubusb     (%rax), %xmm2
+
+psubusw     %xmm0, %xmm2
+psubusw     (%rax), %xmm2
+
+psubw       %xmm0, %xmm2
+psubw       (%rax), %xmm2
+
+punpckhbw   %xmm0, %xmm2
+punpckhbw   (%rax), %xmm2
+
+punpckhdq   %xmm0, %xmm2
+punpckhdq   (%rax), %xmm2
+
+punpckhqdq  %xmm0, %xmm2
+punpckhqdq  (%rax), %xmm2
+
+punpckhwd   %xmm0, %xmm2
+punpckhwd   (%rax), %xmm2
+
+punpcklbw   %xmm0, %xmm2
+punpcklbw   (%rax), %xmm2
+
+punpckldq   %xmm0, %xmm2
+punpckldq   (%rax), %xmm2
+
+punpcklqdq  %xmm0, %xmm2
+punpcklqdq  (%rax), %xmm2
+
+punpcklwd   %xmm0, %xmm2
+punpcklwd   (%rax), %xmm2
+
+pxor        %xmm0, %xmm2
+pxor        (%rax), %xmm2
+
+shufpd      $1, %xmm0, %xmm2
+shufpd      $1, (%rax), %xmm2
+
+sqrtpd      %xmm0, %xmm2
+sqrtpd      (%rax), %xmm2
+
+sqrtsd      %xmm0, %xmm2
+sqrtsd      (%rax), %xmm2
+
+subpd       %xmm0, %xmm2
+subpd       (%rax), %xmm2
+
+subsd       %xmm0, %xmm2
+subsd       (%rax), %xmm2
+
+ucomisd     %xmm0, %xmm1
+ucomisd     (%rax), %xmm1
+
+unpckhpd    %xmm0, %xmm2
+unpckhpd    (%rax), %xmm2
+
+unpcklpd    %xmm0, %xmm2
+unpcklpd    (%rax), %xmm2
+
+xorpd       %xmm0, %xmm2
+xorpd       (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        addpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andnpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andnpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        andpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   andpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *      *      U     clflush	(%rax)
+# CHECK-NEXT:  1      2     1.00                        cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        comisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   comisd	(%rax), %xmm1
+# CHECK-NEXT:  2      8     1.00                        cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      6     1.00                        cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  2      8     1.00                        cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  2      6     1.00                        cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvtsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      4     1.00                        cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  2      4     1.00                        cvtsi2sdl	%ecx, %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  2      8     1.00                        cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT:  2      13    1.00    *                   cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      6     1.00                        cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT:  2      13    1.00    *                   cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00                        cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT:  2      18    1.00    *                   cvttsd2si	(%rax), %ecx
+# CHECK-NEXT:  2      18    1.00    *                   cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  1      9     9.50                        divpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divpd	(%rax), %xmm2
+# CHECK-NEXT:  1      9     9.50                        divsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    9.50    *                   divsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50    *      *      U     lfence
+# CHECK-NEXT:  1      1     1.00    *      *      U     maskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  1      2     1.00                        maxpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        maxsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   maxsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00                        minsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   minsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movapd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movapd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movapd	(%rax), %xmm2
+# CHECK-NEXT:  2      10    0.50                        movd	%eax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movd	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        movd	%xmm0, %ecx
+# CHECK-NEXT:  1      2     1.00           *            movd	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movdqa	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movdqa	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movdqa	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movdqu	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movdqu	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movdqu	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movdq2q	%xmm0, %mm2
+# CHECK-NEXT:  2      2     1.00           *            movhpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movhpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movlpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      7     0.50    *                   movlpd	(%rax), %xmm2
+# CHECK-NEXT:  2      10    1.00                        movmskpd	%xmm0, %ecx
+# CHECK-NEXT:  1      1     0.50           *            movntil	%eax, (%rax)
+# CHECK-NEXT:  1      1     0.50           *            movntiq	%rax, (%rax)
+# CHECK-NEXT:  1      2     1.00           *            movntdq	%xmm0, (%rax)
+# CHECK-NEXT:  1      3     1.00           *            movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movq	%xmm0, %xmm2
+# CHECK-NEXT:  2      10    0.50                        movq	%rax, %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movq	(%rax), %xmm2
+# CHECK-NEXT:  1      10    1.00                        movq	%xmm0, %rcx
+# CHECK-NEXT:  1      2     1.00           *            movq	%xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  1      2     0.50                        movsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      2     1.00           *            movsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movsd	(%rax), %xmm2
+# CHECK-NEXT:  1      1     0.50                        movupd	%xmm0, %xmm2
+# CHECK-NEXT:  1      1     1.00           *            movupd	%xmm0, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   movupd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        mulsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   mulsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        orpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   orpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packssdw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packssdw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packsswb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packsswb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packuswb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packuswb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   paddq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        paddq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddusb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddusw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddusw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        paddw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   paddw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pand	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pand	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pandn	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pandn	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pavgb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pavgw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pavgw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  1      4     1.00                        pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxub	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminub	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminub	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  1      4     1.00                        pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmulhw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmullw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmullw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmuludq	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmuludq	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmuludq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        por	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   por	(%rax), %xmm2
+# CHECK-NEXT:  2      4     0.50                        psadbw	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     0.50    *                   psadbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pslld	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        pslld	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   pslld	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pslldq	$1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psllq	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psllq	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psllq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psllw	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psllw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psllw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrad	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrad	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrad	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psraw	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psraw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psraw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrld	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrld	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrld	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrldq	$1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrlq	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrlq	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psrlw	$1, %xmm2
+# CHECK-NEXT:  1      3     0.50                        psrlw	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     0.50    *                   psrlw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubq	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psubq	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psubq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubusb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubusw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psubw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpckldq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pxor	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pxor	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      9     13.50                       sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  1      9     13.50                       sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      14    13.50   *                   sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        subpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        subsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   subsd	(%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        ucomisd	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        xorpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   xorpd	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 66.00  66.00   -      -      -     17.00   -      -     124.50 124.50 66.50  66.50  12.00  50.00  119.50 140.50  -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andnpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     andpd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     clflush	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmppd	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmppd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpsd	$0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     cmpsd	$0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     comisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtdq2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2ps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpd2ps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2pd	%mm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtpi2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtps2pd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2ss	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsd2ss	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	%ecx, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdq	%rcx, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtsi2sdl	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtss2sd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvtss2sd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2pi	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttpd2pi	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2dq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     cvttps2dq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	(%rax), %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -     1.00    -     1.00    -      -      -      -     cvttsd2si	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     divsd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     lfence
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     maskmovdqu	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     maxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     minsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movapd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movapd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movapd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movd	%eax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqa	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movdqa	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqu	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movdqu	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movdq2q	%xmm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movhpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movhpd	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movlpd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movlpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movmskpd	%xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movntil	%eax, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     movntiq	%rax, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntdq	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntpd	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movq	%rax, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     movq	%xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movq	%xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movq2dq	%mm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movupd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movupd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movupd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     mulsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     orpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packssdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packsswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packuswb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     paddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pand	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pandn	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pavgw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrw	$1, %xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminub	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pmovmskb	%xmm0, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmullw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuludq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     por	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psadbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufhw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshufhw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshuflw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pshuflw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pslldq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psllw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrad	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psraw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrldq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	$1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psrlw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubusw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhqdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckhwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpckldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklqdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklqdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     punpcklwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pxor	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     shufpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     13.50  13.50   -      -      -      -      -     1.00    -      -      -      -     sqrtsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     subsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomisd	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ucomisd	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpckhpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     unpcklpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     xorpd	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
new file mode 100644
index 0000000000000000000000000000000000000000..ce08757f7de53a1f4d4ce59a2b4b4260ece9d455
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse3.s
@@ -0,0 +1,108 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+addsubpd  %xmm0, %xmm2
+addsubpd  (%rax),  %xmm2
+
+addsubps  %xmm0, %xmm2
+addsubps  (%rax), %xmm2
+
+haddpd    %xmm0, %xmm2
+haddpd    (%rax), %xmm2
+
+haddps    %xmm0, %xmm2
+haddps    (%rax), %xmm2
+
+hsubpd    %xmm0, %xmm2
+hsubpd    (%rax), %xmm2
+
+hsubps    %xmm0, %xmm2
+hsubps    (%rax), %xmm2
+
+lddqu     (%rax), %xmm2
+
+movddup   %xmm0, %xmm2
+movddup   (%rax), %xmm2
+
+movshdup  %xmm0, %xmm2
+movshdup  (%rax), %xmm2
+
+movsldup  %xmm0, %xmm2
+movsldup  (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        addsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  1      5     1.00                        addsubps	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    1.00    *                   addsubps	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        haddpd	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   haddpd	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        haddps	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   haddps	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        hsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  3      11    1.00                        hsubps	%xmm0, %xmm2
+# CHECK-NEXT:  4      16    1.00    *                   hsubps	(%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   lddqu	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movddup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   movddup	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movshdup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   movshdup	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        movsldup	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   movsldup	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 5.00   5.00    -      -      -      -      -      -     9.00   9.00   0.50   0.50    -      -     15.50  3.50    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     addsubps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     haddps	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubpd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubps	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     hsubps	(%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     lddqu	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movddup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movddup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movshdup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movshdup	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsldup	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     movsldup	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
new file mode 100644
index 0000000000000000000000000000000000000000..d1b36d10b2f62fba9d99e83ce38e3fe27ab93fe9
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse41.s
@@ -0,0 +1,378 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+blendpd     $11, %xmm0, %xmm2
+blendpd     $11, (%rax), %xmm2
+
+blendps     $11, %xmm0, %xmm2
+blendps     $11, (%rax), %xmm2
+
+blendvpd    %xmm0, %xmm2
+blendvpd    (%rax), %xmm2
+
+blendvps    %xmm0, %xmm2
+blendvps    (%rax), %xmm2
+
+dppd        $22, %xmm0, %xmm2
+dppd        $22, (%rax), %xmm2
+
+dpps        $22, %xmm0, %xmm2
+dpps        $22, (%rax), %xmm2
+
+extractps   $1, %xmm0, %rcx
+extractps   $1, %xmm0, (%rax)
+
+insertps    $1, %xmm0, %xmm2
+insertps    $1, (%rax), %xmm2
+
+movntdqa    (%rax), %xmm2
+
+mpsadbw     $1, %xmm0, %xmm2
+mpsadbw     $1, (%rax), %xmm2
+
+packusdw    %xmm0, %xmm2
+packusdw    (%rax), %xmm2
+
+pblendvb    %xmm0, %xmm2
+pblendvb    (%rax), %xmm2
+
+pblendw     $11, %xmm0, %xmm2
+pblendw     $11, (%rax), %xmm2
+
+pcmpeqq     %xmm0, %xmm2
+pcmpeqq     (%rax), %xmm2
+
+pextrb      $1, %xmm0, %ecx
+pextrb      $1, %xmm0, (%rax)
+
+pextrd      $1, %xmm0, %ecx
+pextrd      $1, %xmm0, (%rax)
+
+pextrq      $1, %xmm0, %rcx
+pextrq      $1, %xmm0, (%rax)
+
+pextrw      $1, %xmm0, (%rax)
+
+phminposuw  %xmm0, %xmm2
+phminposuw  (%rax), %xmm2
+
+pinsrb      $1, %eax, %xmm1
+pinsrb      $1, (%rax), %xmm1
+
+pinsrd      $1, %eax, %xmm1
+pinsrd      $1, (%rax), %xmm1
+
+pinsrq      $1, %rax, %xmm1
+pinsrq      $1, (%rax), %xmm1
+
+pmaxsb      %xmm0, %xmm2
+pmaxsb      (%rax), %xmm2
+
+pmaxsd      %xmm0, %xmm2
+pmaxsd      (%rax), %xmm2
+
+pmaxud      %xmm0, %xmm2
+pmaxud      (%rax), %xmm2
+
+pmaxuw      %xmm0, %xmm2
+pmaxuw      (%rax), %xmm2
+
+pminsb      %xmm0, %xmm2
+pminsb      (%rax), %xmm2
+
+pminsd      %xmm0, %xmm2
+pminsd      (%rax), %xmm2
+
+pminud      %xmm0, %xmm2
+pminud      (%rax), %xmm2
+
+pminuw      %xmm0, %xmm2
+pminuw      (%rax), %xmm2
+
+pmovsxbd    %xmm0, %xmm2
+pmovsxbd    (%rax), %xmm2
+
+pmovsxbq    %xmm0, %xmm2
+pmovsxbq    (%rax), %xmm2
+
+pmovsxbw    %xmm0, %xmm2
+pmovsxbw    (%rax), %xmm2
+
+pmovsxdq    %xmm0, %xmm2
+pmovsxdq    (%rax), %xmm2
+
+pmovsxwd    %xmm0, %xmm2
+pmovsxwd    (%rax), %xmm2
+
+pmovsxwq    %xmm0, %xmm2
+pmovsxwq    (%rax), %xmm2
+
+pmovzxbd    %xmm0, %xmm2
+pmovzxbd    (%rax), %xmm2
+
+pmovzxbq    %xmm0, %xmm2
+pmovzxbq    (%rax), %xmm2
+
+pmovzxbw    %xmm0, %xmm2
+pmovzxbw    (%rax), %xmm2
+
+pmovzxdq    %xmm0, %xmm2
+pmovzxdq    (%rax), %xmm2
+
+pmovzxwd    %xmm0, %xmm2
+pmovzxwd    (%rax), %xmm2
+
+pmovzxwq    %xmm0, %xmm2
+pmovzxwq    (%rax), %xmm2
+
+pmuldq      %xmm0, %xmm2
+pmuldq      (%rax), %xmm2
+
+pmulld      %xmm0, %xmm2
+pmulld      (%rax), %xmm2
+
+ptest       %xmm0, %xmm1
+ptest       (%rax), %xmm1
+
+roundpd     $1, %xmm0, %xmm2
+roundpd     $1, (%rax), %xmm2
+
+roundps     $1, %xmm0, %xmm2
+roundps     $1, (%rax), %xmm2
+
+roundsd     $1, %xmm0, %xmm2
+roundsd     $1, (%rax), %xmm2
+
+roundss     $1, %xmm0, %xmm2
+roundss     $1, (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50                        blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      2     2.00                        blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     2.00                        blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  15     15    1.50                        dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT:  17     20    1.50    *                   dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  16     25    1.50                        dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT:  18     30    1.50    *                   dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        extractps	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  1      2     0.50                        insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   insertps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      5     0.50    *                   movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  9      9     2.00                        mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT:  9      14    2.00    *                   mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        packusdw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   packusdw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     2.00                        pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     2.00    *                   pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  2      13    1.00                        pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT:  2      13    1.00           *            pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00                        pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT:  2      13    1.00           *            pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      13    1.00           *            pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  2      4     1.00                        phminposuw	%xmm0, %xmm2
+# CHECK-NEXT:  2      9     1.00    *                   phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  2      2     0.50                        pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT:  2      6     0.50    *                   pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  2      2     0.50                        pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT:  2      6     0.50    *                   pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  2      2     0.50                        pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT:  2      6     0.50    *                   pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  1      2     0.50                        pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxud	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminud	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminud	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pminuw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pminuw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmuldq	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  1      5     2.00                        pmulld	%xmm0, %xmm2
+# CHECK-NEXT:  1      10    2.00    *                   pmulld	(%rax), %xmm2
+# CHECK-NEXT:  2      1     1.00                        ptest	%xmm0, %xmm1
+# CHECK-NEXT:  2      6     1.00    *                   ptest	(%rax), %xmm1
+# CHECK-NEXT:  1      4     1.00                        roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   roundss	$1, (%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.50  24.50   -      -      -     6.00    -      -     20.00  20.00  32.50  32.50  10.00  13.00  49.50  50.50   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendpd	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendpd	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendps	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     blendps	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvpd	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvpd	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvps	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     blendvps	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dppd	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dppd	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dpps	$22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     1.50   1.50    -      -      -      -      -     1.00    -      -      -      -     dpps	$22, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     extractps	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     extractps	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     insertps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     0.50   0.50    -      -      -      -     insertps	$1, (%rax), %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     movntdqa	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     mpsadbw	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     2.00    -     1.00    -      -      -      -      -     mpsadbw	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packusdw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     packusdw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pblendvb	%xmm0, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pblendvb	%xmm0, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pblendw	$11, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pblendw	$11, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpeqq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrb	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrb	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrd	$1, %xmm0, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrd	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     pextrq	$1, %xmm0, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrq	$1, %xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     pextrw	$1, %xmm0, (%rax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     phminposuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     1.00    -      -      -      -      -     phminposuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrb	$1, %eax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrb	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrd	$1, %eax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrd	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrq	$1, %rax, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pinsrq	$1, (%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxud	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmaxuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminud	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminud	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminuw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pminuw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovsxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxbw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxdq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxdq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pmovzxwq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuldq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmuldq	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     pmulld	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     pmulld	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ptest	%xmm0, %xmm1
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ptest	(%rax), %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundpd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundpd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundps	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundps	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundsd	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundsd	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundss	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     roundss	$1, (%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
new file mode 100644
index 0000000000000000000000000000000000000000..2d3a0ef4049b5162d4fb737ddc3af17e5809a893
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse42.s
@@ -0,0 +1,111 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+crc32b      %al, %ecx
+crc32b      (%rax), %ecx
+
+crc32l      %eax, %ecx
+crc32l      (%rax), %ecx
+
+crc32w      %ax, %ecx
+crc32w      (%rax), %ecx
+
+crc32b      %al, %rcx
+crc32b      (%rax), %rcx
+
+crc32q      %rax, %rcx
+crc32q      (%rax), %rcx
+
+pcmpestri   $1, %xmm0, %xmm2
+pcmpestri   $1, (%rax), %xmm2
+
+pcmpestrm   $1, %xmm0, %xmm2
+pcmpestrm   $1, (%rax), %xmm2
+
+pcmpistri   $1, %xmm0, %xmm2
+pcmpistri   $1, (%rax), %xmm2
+
+pcmpistrm   $1, %xmm0, %xmm2
+pcmpistrm   $1, (%rax), %xmm2
+
+pcmpgtq     %xmm0, %xmm2
+pcmpgtq     (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      3     2.00                        crc32b	%al, %ecx
+# CHECK-NEXT:  3      7     2.00    *                   crc32b	(%rax), %ecx
+# CHECK-NEXT:  7      6     2.00                        crc32l	%eax, %ecx
+# CHECK-NEXT:  3      7     2.00    *                   crc32l	(%rax), %ecx
+# CHECK-NEXT:  5      5     2.00                        crc32w	%ax, %ecx
+# CHECK-NEXT:  3      7     2.00    *                   crc32w	(%rax), %ecx
+# CHECK-NEXT:  3      3     2.00                        crc32b	%al, %rcx
+# CHECK-NEXT:  3      7     2.00    *                   crc32b	(%rax), %rcx
+# CHECK-NEXT:  11     10    2.00                        crc32q	%rax, %rcx
+# CHECK-NEXT:  3      7     2.00    *                   crc32q	(%rax), %rcx
+# CHECK-NEXT:  27     15    4.00                        pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     20    4.50    *                   pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT:  27     10    4.00                        pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  28     15    4.50    *                   pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      14    1.00                        pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT:  8      19    1.00    *                   pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  7      6     1.00                        pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT:  9      11    1.00    *                   pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pcmpgtq	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 21.00  21.00   -      -      -     28.00  20.00   -     6.00   6.00   9.00   9.00    -      -     1.00   9.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	%eax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32l	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	%ax, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32w	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	%al, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32b	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	%rax, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     crc32q	(%rax), %rcx
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestri	$1, (%rax), %xmm2
+# CHECK-NEXT: 4.00   4.00    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 4.50   4.50    -      -      -     1.00    -      -     0.50   0.50   2.00   2.00    -      -      -     1.00    -      -      -      -     pcmpestrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistri	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistri	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistrm	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     1.00   1.00    -      -      -      -      -     1.00    -      -      -      -     pcmpistrm	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtq	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pcmpgtq	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
new file mode 100644
index 0000000000000000000000000000000000000000..55347137df458251e76aba5da1c993fb9594626c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-sse4a.s
@@ -0,0 +1,62 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+extrq       %xmm0, %xmm2
+extrq       $22, $2, %xmm2
+
+insertq     %xmm0, %xmm2
+insertq     $22, $22, %xmm0, %xmm2
+
+movntsd     %xmm0, (%rax)
+movntss     %xmm0, (%rax)
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      3     0.50                        extrq	%xmm0, %xmm2
+# CHECK-NEXT:  1      3     0.50                        extrq	$22, $2, %xmm2
+# CHECK-NEXT:  1      3     2.00                        insertq	%xmm0, %xmm2
+# CHECK-NEXT:  1      3     2.00                        insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT:  1      3     1.00           *            movntsd	%xmm0, (%rax)
+# CHECK-NEXT:  1      3     1.00           *            movntss	%xmm0, (%rax)
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     5.00   5.00    -     2.00   2.00   4.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     extrq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     extrq	$22, $2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     insertq	%xmm0, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     insertq	$22, $22, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntsd	%xmm0, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     movntss	%xmm0, (%rax)
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
new file mode 100644
index 0000000000000000000000000000000000000000..c89ef2976295c55d6898d953af2eada893780c44
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-ssse3.s
@@ -0,0 +1,265 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+pabsb       %mm0, %mm2
+pabsb       (%rax), %mm2
+
+pabsb       %xmm0, %xmm2
+pabsb       (%rax), %xmm2
+
+pabsd       %mm0, %mm2
+pabsd       (%rax), %mm2
+
+pabsd       %xmm0, %xmm2
+pabsd       (%rax), %xmm2
+
+pabsw       %mm0, %mm2
+pabsw       (%rax), %mm2
+
+pabsw       %xmm0, %xmm2
+pabsw       (%rax), %xmm2
+
+palignr     $1, %mm0, %mm2
+palignr     $1, (%rax), %mm2
+
+palignr     $1, %xmm0, %xmm2
+palignr     $1, (%rax), %xmm2
+
+phaddd      %mm0, %mm2
+phaddd      (%rax), %mm2
+
+phaddd      %xmm0, %xmm2
+phaddd      (%rax), %xmm2
+
+phaddsw     %mm0, %mm2
+phaddsw     (%rax), %mm2
+
+phaddsw     %xmm0, %xmm2
+phaddsw     (%rax), %xmm2
+
+phaddw      %mm0, %mm2
+phaddw      (%rax), %mm2
+
+phaddw      %xmm0, %xmm2
+phaddw      (%rax), %xmm2
+
+phsubd      %mm0, %mm2
+phsubd      (%rax), %mm2
+
+phsubd      %xmm0, %xmm2
+phsubd      (%rax), %xmm2
+
+phsubsw     %mm0, %mm2
+phsubsw     (%rax), %mm2
+
+phsubsw     %xmm0, %xmm2
+phsubsw     (%rax), %xmm2
+
+phsubw      %mm0, %mm2
+phsubw      (%rax), %mm2
+
+phsubw      %xmm0, %xmm2
+phsubw      (%rax), %xmm2
+
+pmaddubsw   %mm0, %mm2
+pmaddubsw   (%rax), %mm2
+
+pmaddubsw   %xmm0, %xmm2
+pmaddubsw   (%rax), %xmm2
+
+pmulhrsw    %mm0, %mm2
+pmulhrsw    (%rax), %mm2
+
+pmulhrsw    %xmm0, %xmm2
+pmulhrsw    (%rax), %xmm2
+
+pshufb      %mm0, %mm2
+pshufb      (%rax), %mm2
+
+pshufb      %xmm0, %xmm2
+pshufb      (%rax), %xmm2
+
+psignb      %mm0, %mm2
+psignb      (%rax), %mm2
+
+psignb      %xmm0, %xmm2
+psignb      (%rax), %xmm2
+
+psignd      %mm0, %mm2
+psignd      (%rax), %mm2
+
+psignd      %xmm0, %xmm2
+psignd      (%rax), %xmm2
+
+psignw      %mm0, %mm2
+psignw      (%rax), %mm2
+
+psignw      %xmm0, %xmm2
+psignw      (%rax), %xmm2
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      2     0.50                        pabsb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pabsb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pabsd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pabsd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        pabsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        pabsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   pabsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        palignr	$1, %mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phaddd	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddd	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phaddd	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddd	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phaddsw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddsw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phaddsw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phaddw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phaddw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phaddw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phsubd	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubd	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phsubd	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubd	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phsubsw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubsw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phsubsw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  3      5     0.50                        phsubw	%mm0, %mm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubw	(%rax), %mm2
+# CHECK-NEXT:  3      5     0.50                        phsubw	%xmm0, %xmm2
+# CHECK-NEXT:  4      10    0.50    *                   phsubw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmaddubsw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        pmulhrsw	%mm0, %mm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  1      4     1.00                        pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  1      3     2.00                        pshufb	%mm0, %mm2
+# CHECK-NEXT:  1      8     2.00    *                   pshufb	(%rax), %mm2
+# CHECK-NEXT:  1      3     2.00                        pshufb	%xmm0, %xmm2
+# CHECK-NEXT:  1      8     2.00    *                   pshufb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psignb	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psignb	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psignb	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psignb	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psignd	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psignd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psignd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psignd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        psignw	%mm0, %mm2
+# CHECK-NEXT:  1      7     0.50    *                   psignw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        psignw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     0.50    *                   psignw	(%rax), %xmm2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 16.00  16.00   -      -      -      -      -      -      -      -     34.00  34.00  8.00    -     36.00  28.00   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     pabsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, %mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, (%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, %xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     palignr	$1, (%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phaddw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     phsubw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmaddubsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     pmulhrsw	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     pshufb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignb	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignd	(%rax), %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	%mm0, %mm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	(%rax), %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	%xmm0, %xmm2
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     psignw	(%rax), %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
new file mode 100644
index 0000000000000000000000000000000000000000..0287d973171f632c35a5cd59538625b5eb8641dc
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-tbm.s
@@ -0,0 +1,181 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+bextr        $8192, %ebx, %ecx
+bextr        $8192, (%rbx), %ecx
+
+bextr        $16384, %rbx, %rcx
+bextr        $16384, (%rbx), %rcx
+
+blcfill      %eax, %ecx
+blcfill      (%rax), %ecx
+
+blcfill      %rax, %rcx
+blcfill      (%rax), %rcx
+
+blci         %eax, %ecx
+blci         (%rax), %ecx
+
+blci         %rax, %rcx
+blci         (%rax), %rcx
+
+blcic        %eax, %ecx
+blcic        (%rax), %ecx
+
+blcic        %rax, %rcx
+blcic        (%rax), %rcx
+
+blcmsk       %eax, %ecx
+blcmsk       (%rax), %ecx
+
+blcmsk       %rax, %rcx
+blcmsk       (%rax), %rcx
+
+blcs         %eax, %ecx
+blcs         (%rax), %ecx
+
+blcs         %rax, %rcx
+blcs         (%rax), %rcx
+
+blsfill      %eax, %ecx
+blsfill      (%rax), %ecx
+
+blsfill      %rax, %rcx
+blsfill      (%rax), %rcx
+
+blsic        %eax, %ecx
+blsic        (%rax), %ecx
+
+blsic        %rax, %rcx
+blsic        (%rax), %rcx
+
+t1mskc       %eax, %ecx
+t1mskc       (%rax), %ecx
+
+t1mskc       %rax, %rcx
+t1mskc       (%rax), %rcx
+
+tzmsk        %eax, %ecx
+tzmsk        (%rax), %ecx
+
+tzmsk        %rax, %rcx
+tzmsk        (%rax), %rcx
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      2     0.50                        bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  2      2     0.50                        bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  2      2     0.50                        blcfilll	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcfilll	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blcfillq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcfillq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blcil	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcil	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blciq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blciq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blcicl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcicl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blcicq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcicq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blcmskl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcmskl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blcmskq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcmskq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blcsl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blcsl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blcsq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blcsq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blsfilll	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsfilll	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blsfillq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsfillq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        blsicl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   blsicl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        blsicq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   blsicq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        t1mskcl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   t1mskcl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        t1mskcq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   t1mskcq	(%rax), %rcx
+# CHECK-NEXT:  2      2     0.50                        tzmskl	%eax, %ecx
+# CHECK-NEXT:  2      6     0.50    *                   tzmskl	(%rax), %ecx
+# CHECK-NEXT:  2      2     0.50                        tzmskq	%rax, %rcx
+# CHECK-NEXT:  2      6     0.50    *                   tzmskq	(%rax), %rcx
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 1.00   1.00    -      -      -     20.00  20.00   -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, %ebx, %ecx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrl	$8192, (%rbx), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, %rbx, %rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bextrq	$16384, (%rbx), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcil	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blciq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcicq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcmskq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blcsq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfilll	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsfillq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     blsicq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     t1mskcq	(%rax), %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	%eax, %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskl	(%rax), %ecx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	%rax, %rcx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     tzmskq	(%rax), %rcx
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
new file mode 100644
index 0000000000000000000000000000000000000000..5a6ee53713cda7c48ac591c0c0e05450a4af33c3
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_32.s
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=i686-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+aaa
+
+aad
+aad $7
+
+aam
+aam $7
+
+aas
+
+bound %bx, (%eax)
+bound %ebx, (%eax)
+
+daa
+
+das
+
+into
+
+leave
+
+salc
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.50                        aaa
+# CHECK-NEXT:  1      100   0.50                        aad
+# CHECK-NEXT:  1      100   0.50                        aad	$7
+# CHECK-NEXT:  1      100   0.50                        aam
+# CHECK-NEXT:  1      100   0.50                        aam	$7
+# CHECK-NEXT:  1      100   0.50                        aas
+# CHECK-NEXT:  1      100   0.50                  U     bound	%bx, (%eax)
+# CHECK-NEXT:  1      100   0.50                  U     bound	%ebx, (%eax)
+# CHECK-NEXT:  1      100   0.50                        daa
+# CHECK-NEXT:  1      100   0.50                        das
+# CHECK-NEXT:  1      100   0.50                  U     into
+# CHECK-NEXT:  1      1     0.50    *                   leave
+# CHECK-NEXT:  1      1     0.50                  U     salc
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     6.50   6.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aaa
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aad
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aad	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aam
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aam	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     aas
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%bx, (%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     bound	%ebx, (%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     daa
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     das
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     into
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leave
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     salc
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
new file mode 100644
index 0000000000000000000000000000000000000000..b72522411b82efef7b8a90940c7b4cfe729c7742
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x86_64.s
@@ -0,0 +1,2384 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+adcb $7, %al
+adcb $7, %dil
+adcb $7, (%rax)
+adcb %sil, %dil
+adcb %sil, (%rax)
+adcb (%rax), %dil
+
+adcw $511, %ax
+adcw $511, %di
+adcw $511, (%rax)
+adcw $7, %di
+adcw $7, (%rax)
+adcw %si, %di
+adcw %si, (%rax)
+adcw (%rax), %di
+
+adcl $665536, %eax
+adcl $665536, %edi
+adcl $665536, (%rax)
+adcl $7, %edi
+adcl $7, (%rax)
+adcl %esi, %edi
+adcl %esi, (%rax)
+adcl (%rax), %edi
+
+adcq $665536, %rax
+adcq $665536, %rdi
+adcq $665536, (%rax)
+adcq $7, %rdi
+adcq $7, (%rax)
+adcq %rsi, %rdi
+adcq %rsi, (%rax)
+adcq (%rax), %rdi
+
+addb $7, %al
+addb $7, %dil
+addb $7, (%rax)
+addb %sil, %dil
+addb %sil, (%rax)
+addb (%rax), %dil
+
+addw $511, %ax
+addw $511, %di
+addw $511, (%rax)
+addw $7, %di
+addw $7, (%rax)
+addw %si, %di
+addw %si, (%rax)
+addw (%rax), %di
+
+addl $665536, %eax
+addl $665536, %edi
+addl $665536, (%rax)
+addl $7, %edi
+addl $7, (%rax)
+addl %esi, %edi
+addl %esi, (%rax)
+addl (%rax), %edi
+
+addq $665536, %rax
+addq $665536, %rdi
+addq $665536, (%rax)
+addq $7, %rdi
+addq $7, (%rax)
+addq %rsi, %rdi
+addq %rsi, (%rax)
+addq (%rax), %rdi
+
+andb $7, %al
+andb $7, %dil
+andb $7, (%rax)
+andb %sil, %dil
+andb %sil, (%rax)
+andb (%rax), %dil
+
+andw $511, %ax
+andw $511, %di
+andw $511, (%rax)
+andw $7, %di
+andw $7, (%rax)
+andw %si, %di
+andw %si, (%rax)
+andw (%rax), %di
+
+andl $665536, %eax
+andl $665536, %edi
+andl $665536, (%rax)
+andl $7, %edi
+andl $7, (%rax)
+andl %esi, %edi
+andl %esi, (%rax)
+andl (%rax), %edi
+
+andq $665536, %rax
+andq $665536, %rdi
+andq $665536, (%rax)
+andq $7, %rdi
+andq $7, (%rax)
+andq %rsi, %rdi
+andq %rsi, (%rax)
+andq (%rax), %rdi
+
+bsfw %si, %di
+bsrw %si, %di
+bsfw (%rax), %di
+bsrw (%rax), %di
+
+bsfl %esi, %edi
+bsrl %esi, %edi
+bsfl (%rax), %edi
+bsrl (%rax), %edi
+
+bsfq %rsi, %rdi
+bsrq %rsi, %rdi
+bsfq (%rax), %rdi
+bsrq (%rax), %rdi
+
+bswap %eax
+bswap %rax
+
+btw  %si, %di
+btcw %si, %di
+btrw %si, %di
+btsw %si, %di
+btw  %si, (%rax)
+btcw %si, (%rax)
+btrw %si, (%rax)
+btsw %si, (%rax)
+btw  $7, %di
+btcw $7, %di
+btrw $7, %di
+btsw $7, %di
+btw  $7, (%rax)
+btcw $7, (%rax)
+btrw $7, (%rax)
+btsw $7, (%rax)
+
+btl  %esi, %edi
+btcl %esi, %edi
+btrl %esi, %edi
+btsl %esi, %edi
+btl  %esi, (%rax)
+btcl %esi, (%rax)
+btrl %esi, (%rax)
+btsl %esi, (%rax)
+btl  $7, %edi
+btcl $7, %edi
+btrl $7, %edi
+btsl $7, %edi
+btl  $7, (%rax)
+btcl $7, (%rax)
+btrl $7, (%rax)
+btsl $7, (%rax)
+
+btq  %rsi, %rdi
+btcq %rsi, %rdi
+btrq %rsi, %rdi
+btsq %rsi, %rdi
+btq  %rsi, (%rax)
+btcq %rsi, (%rax)
+btrq %rsi, (%rax)
+btsq %rsi, (%rax)
+btq  $7, %rdi
+btcq $7, %rdi
+btrq $7, %rdi
+btsq $7, %rdi
+btq  $7, (%rax)
+btcq $7, (%rax)
+btrq $7, (%rax)
+btsq $7, (%rax)
+
+cbw
+cwde
+cdqe
+cwd
+cdq
+cqo
+
+clc
+cld
+cmc
+
+cmpb $7, %al
+cmpb $7, %dil
+cmpb $7, (%rax)
+cmpb %sil, %dil
+cmpb %sil, (%rax)
+cmpb (%rax), %dil
+
+cmpw $511, %ax
+cmpw $511, %di
+cmpw $511, (%rax)
+cmpw $7, %di
+cmpw $7, (%rax)
+cmpw %si, %di
+cmpw %si, (%rax)
+cmpw (%rax), %di
+
+cmpl $665536, %eax
+cmpl $665536, %edi
+cmpl $665536, (%rax)
+cmpl $7, %edi
+cmpl $7, (%rax)
+cmpl %esi, %edi
+cmpl %esi, (%rax)
+cmpl (%rax), %edi
+
+cmpq $665536, %rax
+cmpq $665536, %rdi
+cmpq $665536, (%rax)
+cmpq $7, %rdi
+cmpq $7, (%rax)
+cmpq %rsi, %rdi
+cmpq %rsi, (%rax)
+cmpq (%rax), %rdi
+
+cmpsb
+cmpsw
+cmpsl
+cmpsq
+
+cmpxchgb %cl, %bl
+cmpxchgb %cl, (%rbx)
+
+cmpxchgw %cx, %bx
+cmpxchgw %cx, (%rbx)
+
+cmpxchgl %ecx, %ebx
+cmpxchgl %ecx, (%rbx)
+
+cmpxchgq %rcx, %rbx
+cmpxchgq %rcx, (%rbx)
+
+cpuid
+
+decb %dil
+decb (%rax)
+decw %di
+decw (%rax)
+decl %edi
+decl (%rax)
+decq %rdi
+decq (%rax)
+
+divb %dil
+divb (%rax)
+divw %si
+divw (%rax)
+divl %edx
+divl (%rax)
+divq %rcx
+divq (%rax)
+
+idivb %dil
+idivb (%rax)
+idivw %si
+idivw (%rax)
+idivl %edx
+idivl (%rax)
+idivq %rcx
+idivq (%rax)
+
+imulb %dil
+imulb (%rax)
+
+imulw %di
+imulw (%rax)
+imulw %si, %di
+imulw (%rax), %di
+imulw $511, %si, %di
+imulw $511, (%rax), %di
+imulw $7, %si, %di
+imulw $7, (%rax), %di
+
+imull %edi
+imull (%rax)
+imull %esi, %edi
+imull (%rax), %edi
+imull $665536, %esi, %edi
+imull $665536, (%rax), %edi
+imull $7, %esi, %edi
+imull $7, (%rax), %edi
+
+imulq %rdi
+imulq (%rax)
+imulq %rsi, %rdi
+imulq (%rax), %rdi
+imulq $665536, %rsi, %rdi
+imulq $665536, (%rax), %rdi
+imulq $7, %rsi, %rdi
+imulq $7, (%rax), %rdi
+
+inb $7,  %al
+inb %dx, %al
+inw $7,  %ax
+inw %dx, %ax
+inl $7,  %eax
+inl %dx, %eax
+
+incb %dil
+incb (%rax)
+incw %di
+incw (%rax)
+incl %edi
+incl (%rax)
+incq %rdi
+incq (%rax)
+
+insb
+insw
+insl
+
+int $7
+
+lahf
+
+lodsb
+lodsw
+lodsl
+lodsq
+
+movsb
+movsw
+movsl
+movsq
+
+movsbw %al, %di
+movzbw %al, %di
+movsbw (%rax), %di
+movzbw (%rax), %di
+movsbl %al, %edi
+movzbl %al, %edi
+movsbl (%rax), %edi
+movzbl (%rax), %edi
+movsbq %al, %rdi
+movzbq %al, %rdi
+movsbq (%rax), %rdi
+movzbq (%rax), %rdi
+
+movswl %ax, %edi
+movzwl %ax, %edi
+movswl (%rax), %edi
+movzwl (%rax), %edi
+movswq %ax, %rdi
+movzwq %ax, %rdi
+movswq (%rax), %rdi
+movzwq (%rax), %rdi
+
+movslq %eax, %rdi
+movslq (%rax), %rdi
+
+mulb %dil
+mulb (%rax)
+mulw %si
+mulw (%rax)
+mull %edx
+mull (%rax)
+mulq %rcx
+mulq (%rax)
+
+negb %dil
+negb (%r8)
+negw %si
+negw (%r9)
+negl %edx
+negl (%rax)
+negq %rcx
+negq (%r10)
+
+nop
+nopw %di
+nopw (%rcx)
+nopl %esi
+nopl (%r8)
+nopq %rdx
+nopq (%r9)
+
+notb %dil
+notb (%r8)
+notw %si
+notw (%r9)
+notl %edx
+notl (%rax)
+notq %rcx
+notq (%r10)
+
+orb $7, %al
+orb $7, %dil
+orb $7, (%rax)
+orb %sil, %dil
+orb %sil, (%rax)
+orb (%rax), %dil
+
+orw $511, %ax
+orw $511, %di
+orw $511, (%rax)
+orw $7, %di
+orw $7, (%rax)
+orw %si, %di
+orw %si, (%rax)
+orw (%rax), %di
+
+orl $665536, %eax
+orl $665536, %edi
+orl $665536, (%rax)
+orl $7, %edi
+orl $7, (%rax)
+orl %esi, %edi
+orl %esi, (%rax)
+orl (%rax), %edi
+
+orq $665536, %rax
+orq $665536, %rdi
+orq $665536, (%rax)
+orq $7, %rdi
+orq $7, (%rax)
+orq %rsi, %rdi
+orq %rsi, (%rax)
+orq (%rax), %rdi
+
+outb %al,  $7
+outb %al,  %dx
+outw %ax,  $7
+outw %ax,  %dx
+outl %eax, $7
+outl %eax, %dx
+
+outsb
+outsw
+outsl
+
+pause
+
+rclb %dil
+rcrb %dil
+rclb (%rax)
+rcrb (%rax)
+rclb $7, %dil
+rcrb $7, %dil
+rclb $7, (%rax)
+rcrb $7, (%rax)
+rclb %cl, %dil
+rcrb %cl, %dil
+rclb %cl, (%rax)
+rcrb %cl, (%rax)
+
+rclw %di
+rcrw %di
+rclw (%rax)
+rcrw (%rax)
+rclw $7, %di
+rcrw $7, %di
+rclw $7, (%rax)
+rcrw $7, (%rax)
+rclw %cl, %di
+rcrw %cl, %di
+rclw %cl, (%rax)
+rcrw %cl, (%rax)
+
+rcll %edi
+rcrl %edi
+rcll (%rax)
+rcrl (%rax)
+rcll $7, %edi
+rcrl $7, %edi
+rcll $7, (%rax)
+rcrl $7, (%rax)
+rcll %cl, %edi
+rcrl %cl, %edi
+rcll %cl, (%rax)
+rcrl %cl, (%rax)
+
+rclq %rdi
+rcrq %rdi
+rclq (%rax)
+rcrq (%rax)
+rclq $7, %rdi
+rcrq $7, %rdi
+rclq $7, (%rax)
+rcrq $7, (%rax)
+rclq %cl, %rdi
+rcrq %cl, %rdi
+rclq %cl, (%rax)
+rcrq %cl, (%rax)
+
+rolb %dil
+rorb %dil
+rolb (%rax)
+rorb (%rax)
+rolb $7, %dil
+rorb $7, %dil
+rolb $7, (%rax)
+rorb $7, (%rax)
+rolb %cl, %dil
+rorb %cl, %dil
+rolb %cl, (%rax)
+rorb %cl, (%rax)
+
+rolw %di
+rorw %di
+rolw (%rax)
+rorw (%rax)
+rolw $7, %di
+rorw $7, %di
+rolw $7, (%rax)
+rorw $7, (%rax)
+rolw %cl, %di
+rorw %cl, %di
+rolw %cl, (%rax)
+rorw %cl, (%rax)
+
+roll %edi
+rorl %edi
+roll (%rax)
+rorl (%rax)
+roll $7, %edi
+rorl $7, %edi
+roll $7, (%rax)
+rorl $7, (%rax)
+roll %cl, %edi
+rorl %cl, %edi
+roll %cl, (%rax)
+rorl %cl, (%rax)
+
+rolq %rdi
+rorq %rdi
+rolq (%rax)
+rorq (%rax)
+rolq $7, %rdi
+rorq $7, %rdi
+rolq $7, (%rax)
+rorq $7, (%rax)
+rolq %cl, %rdi
+rorq %cl, %rdi
+rolq %cl, (%rax)
+rorq %cl, (%rax)
+
+sahf
+
+sarb %dil
+shlb %dil
+shrb %dil
+sarb (%rax)
+shlb (%rax)
+shrb (%rax)
+sarb $7, %dil
+shlb $7, %dil
+shrb $7, %dil
+sarb $7, (%rax)
+shlb $7, (%rax)
+shrb $7, (%rax)
+sarb %cl, %dil
+shlb %cl, %dil
+shrb %cl, %dil
+sarb %cl, (%rax)
+shlb %cl, (%rax)
+shrb %cl, (%rax)
+
+sarw %di
+shlw %di
+shrw %di
+sarw (%rax)
+shlw (%rax)
+shrw (%rax)
+sarw $7, %di
+shlw $7, %di
+shrw $7, %di
+sarw $7, (%rax)
+shlw $7, (%rax)
+shrw $7, (%rax)
+sarw %cl, %di
+shlw %cl, %di
+shrw %cl, %di
+sarw %cl, (%rax)
+shlw %cl, (%rax)
+shrw %cl, (%rax)
+
+sarl %edi
+shll %edi
+shrl %edi
+sarl (%rax)
+shll (%rax)
+shrl (%rax)
+sarl $7, %edi
+shll $7, %edi
+shrl $7, %edi
+sarl $7, (%rax)
+shll $7, (%rax)
+shrl $7, (%rax)
+sarl %cl, %edi
+shll %cl, %edi
+shrl %cl, %edi
+sarl %cl, (%rax)
+shll %cl, (%rax)
+shrl %cl, (%rax)
+
+sarq %rdi
+shlq %rdi
+shrq %rdi
+sarq (%rax)
+shlq (%rax)
+shrq (%rax)
+sarq $7, %rdi
+shlq $7, %rdi
+shrq $7, %rdi
+sarq $7, (%rax)
+shlq $7, (%rax)
+shrq $7, (%rax)
+sarq %cl, %rdi
+shlq %cl, %rdi
+shrq %cl, %rdi
+sarq %cl, (%rax)
+shlq %cl, (%rax)
+shrq %cl, (%rax)
+
+sbbb $7, %al
+sbbb $7, %dil
+sbbb $7, (%rax)
+sbbb %sil, %dil
+sbbb %sil, (%rax)
+sbbb (%rax), %dil
+
+sbbw $511, %ax
+sbbw $511, %di
+sbbw $511, (%rax)
+sbbw $7, %di
+sbbw $7, (%rax)
+sbbw %si, %di
+sbbw %si, (%rax)
+sbbw (%rax), %di
+
+sbbl $665536, %eax
+sbbl $665536, %edi
+sbbl $665536, (%rax)
+sbbl $7, %edi
+sbbl $7, (%rax)
+sbbl %esi, %edi
+sbbl %esi, (%rax)
+sbbl (%rax), %edi
+
+sbbq $665536, %rax
+sbbq $665536, %rdi
+sbbq $665536, (%rax)
+sbbq $7, %rdi
+sbbq $7, (%rax)
+sbbq %rsi, %rdi
+sbbq %rsi, (%rax)
+sbbq (%rax), %rdi
+
+scasb
+scasw
+scasl
+scasq
+
+seto  %al
+seto  (%rax)
+setno %al
+setno (%rax)
+setb  %al
+setb  (%rax)
+setnb %al
+setnb (%rax)
+setz  %al
+setz  (%rax)
+setnz %al
+setnz (%rax)
+seta  %al
+seta  (%rax)
+setna %al
+setna (%rax)
+sets  %al
+sets  (%rax)
+setns %al
+setns (%rax)
+setp  %al
+setp  (%rax)
+setnp %al
+setnp (%rax)
+setl  %al
+setl  (%rax)
+setnl %al
+setnl (%rax)
+setg  %al
+setg  (%rax)
+setng %al
+setng (%rax)
+
+shldw %cl, %si, %di
+shrdw %cl, %si, %di
+shldw %cl, %si, (%rax)
+shrdw %cl, %si, (%rax)
+shldw $7, %si, %di
+shrdw $7, %si, %di
+shldw $7, %si, (%rax)
+shrdw $7, %si, (%rax)
+
+shldl %cl, %esi, %edi
+shrdl %cl, %esi, %edi
+shldl %cl, %esi, (%rax)
+shrdl %cl, %esi, (%rax)
+shldl $7, %esi, %edi
+shrdl $7, %esi, %edi
+shldl $7, %esi, (%rax)
+shrdl $7, %esi, (%rax)
+
+shldq %cl, %rsi, %rdi
+shrdq %cl, %rsi, %rdi
+shldq %cl, %rsi, (%rax)
+shrdq %cl, %rsi, (%rax)
+shldq $7, %rsi, %rdi
+shrdq $7, %rsi, %rdi
+shldq $7, %rsi, (%rax)
+shrdq $7, %rsi, (%rax)
+
+stc
+std
+
+stosb
+stosw
+stosl
+stosq
+
+subb $7, %al
+subb $7, %dil
+subb $7, (%rax)
+subb %sil, %dil
+subb %sil, (%rax)
+subb (%rax), %dil
+
+subw $511, %ax
+subw $511, %di
+subw $511, (%rax)
+subw $7, %di
+subw $7, (%rax)
+subw %si, %di
+subw %si, (%rax)
+subw (%rax), %di
+
+subl $665536, %eax
+subl $665536, %edi
+subl $665536, (%rax)
+subl $7, %edi
+subl $7, (%rax)
+subl %esi, %edi
+subl %esi, (%rax)
+subl (%rax), %edi
+
+subq $665536, %rax
+subq $665536, %rdi
+subq $665536, (%rax)
+subq $7, %rdi
+subq $7, (%rax)
+subq %rsi, %rdi
+subq %rsi, (%rax)
+subq (%rax), %rdi
+
+testb $7, %al
+testb $7, %dil
+testb $7, (%rax)
+testb %sil, %dil
+testb %sil, (%rax)
+
+testw $511, %ax
+testw $511, %di
+testw $511, (%rax)
+testw $7, %di
+testw $7, (%rax)
+testw %si, %di
+testw %si, (%rax)
+
+testl $665536, %eax
+testl $665536, %edi
+testl $665536, (%rax)
+testl $7, %edi
+testl $7, (%rax)
+testl %esi, %edi
+testl %esi, (%rax)
+
+testq $665536, %rax
+testq $665536, %rdi
+testq $665536, (%rax)
+testq $7, %rdi
+testq $7, (%rax)
+testq %rsi, %rdi
+testq %rsi, (%rax)
+
+ud2
+
+xaddb %bl, %cl
+xaddb %bl, (%rcx)
+
+xaddw %bx, %cx
+xaddw %ax, (%rbx)
+
+xaddl %ebx, %ecx
+xaddl %eax, (%rbx)
+
+xaddq %rbx, %rcx
+xaddq %rax, (%rbx)
+
+xchgb %bl, %cl
+xchgb %bl, (%rbx)
+
+xchgw %ax, %bx
+xchgw %bx, %cx
+xchgw %ax, (%rbx)
+
+xchgl %eax, %ebx
+xchgl %ebx, %ecx
+xchgl %eax, (%rbx)
+
+xchgq %rax, %rbx
+xchgq %rbx, %rcx
+xchgq %rax, (%rbx)
+
+xlatb
+
+xorb $7, %al
+xorb $7, %dil
+xorb $7, (%rax)
+xorb %sil, %dil
+xorb %sil, (%rax)
+xorb (%rax), %dil
+
+xorw $511, %ax
+xorw $511, %di
+xorw $511, (%rax)
+xorw $7, %di
+xorw $7, (%rax)
+xorw %si, %di
+xorw %si, (%rax)
+xorw (%rax), %di
+
+xorl $665536, %eax
+xorl $665536, %edi
+xorl $665536, (%rax)
+xorl $7, %edi
+xorl $7, (%rax)
+xorl %esi, %edi
+xorl %esi, (%rax)
+xorl (%rax), %edi
+
+xorq $665536, %rax
+xorq $665536, %rdi
+xorq $665536, (%rax)
+xorq $7, %rdi
+xorq $7, (%rax)
+xorq %rsi, %rdi
+xorq %rsi, (%rax)
+xorq (%rax), %rdi
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     1.00                        adcb	$7, %al
+# CHECK-NEXT:  1      1     1.00                        adcb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            adcb	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            adcb	%sil, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcb	(%rax), %dil
+# CHECK-NEXT:  1      1     1.00                        adcw	$511, %ax
+# CHECK-NEXT:  1      1     1.00                        adcw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            adcw	$511, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            adcw	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            adcw	%si, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcw	(%rax), %di
+# CHECK-NEXT:  1      1     1.00                        adcl	$665536, %eax
+# CHECK-NEXT:  1      1     1.00                        adcl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            adcl	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            adcl	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            adcl	%esi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcl	(%rax), %edi
+# CHECK-NEXT:  1      1     1.00                        adcq	$665536, %rax
+# CHECK-NEXT:  1      1     1.00                        adcq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            adcq	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            adcq	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        adcq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            adcq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   adcq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        addb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        addb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            addb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            addb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        addw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        addw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            addw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            addw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            addw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        addl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        addl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            addl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            addl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            addl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        addq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        addq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            addq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            addq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        addq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            addq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   addq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        andb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        andb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            andb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            andb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        andw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        andw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            andw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            andw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            andw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        andl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        andl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            andl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            andl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            andl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        andq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        andq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            andq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            andq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        andq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            andq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   andq	(%rax), %rdi
+# CHECK-NEXT:  6      3     2.00                        bsfw	%si, %di
+# CHECK-NEXT:  7      4     2.00                        bsrw	%si, %di
+# CHECK-NEXT:  8      7     2.00    *                   bsfw	(%rax), %di
+# CHECK-NEXT:  9      8     2.00    *                   bsrw	(%rax), %di
+# CHECK-NEXT:  6      3     2.00                        bsfl	%esi, %edi
+# CHECK-NEXT:  7      4     2.00                        bsrl	%esi, %edi
+# CHECK-NEXT:  8      7     2.00    *                   bsfl	(%rax), %edi
+# CHECK-NEXT:  9      8     2.00    *                   bsrl	(%rax), %edi
+# CHECK-NEXT:  6      3     2.00                        bsfq	%rsi, %rdi
+# CHECK-NEXT:  7      4     2.00                        bsrq	%rsi, %rdi
+# CHECK-NEXT:  8      7     2.00    *                   bsfq	(%rax), %rdi
+# CHECK-NEXT:  9      8     2.00    *                   bsrq	(%rax), %rdi
+# CHECK-NEXT:  1      1     1.00                        bswapl	%eax
+# CHECK-NEXT:  1      1     1.00                        bswapq	%rax
+# CHECK-NEXT:  1      1     0.50                        btw	%si, %di
+# CHECK-NEXT:  2      2     0.50                        btcw	%si, %di
+# CHECK-NEXT:  2      2     0.50                        btrw	%si, %di
+# CHECK-NEXT:  2      2     0.50                        btsw	%si, %di
+# CHECK-NEXT:  7      5     0.50    *                   btw	%si, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btcw	%si, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btrw	%si, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btsw	%si, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btw	$7, %di
+# CHECK-NEXT:  2      2     0.50                        btcw	$7, %di
+# CHECK-NEXT:  2      2     0.50                        btrw	$7, %di
+# CHECK-NEXT:  2      2     0.50                        btsw	$7, %di
+# CHECK-NEXT:  1      5     0.50    *                   btw	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btcw	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btrw	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btsw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btl	%esi, %edi
+# CHECK-NEXT:  2      2     0.50                        btcl	%esi, %edi
+# CHECK-NEXT:  2      2     0.50                        btrl	%esi, %edi
+# CHECK-NEXT:  2      2     0.50                        btsl	%esi, %edi
+# CHECK-NEXT:  7      5     0.50    *                   btl	%esi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btcl	%esi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btrl	%esi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btsl	%esi, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btl	$7, %edi
+# CHECK-NEXT:  2      2     0.50                        btcl	$7, %edi
+# CHECK-NEXT:  2      2     0.50                        btrl	$7, %edi
+# CHECK-NEXT:  2      2     0.50                        btsl	$7, %edi
+# CHECK-NEXT:  1      5     0.50    *                   btl	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btcl	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btrl	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btsl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.50                        btcq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.50                        btrq	%rsi, %rdi
+# CHECK-NEXT:  2      2     0.50                        btsq	%rsi, %rdi
+# CHECK-NEXT:  7      5     0.50    *                   btq	%rsi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btcq	%rsi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btrq	%rsi, (%rax)
+# CHECK-NEXT:  11     7     1.00    *      *            btsq	%rsi, (%rax)
+# CHECK-NEXT:  1      1     0.50                        btq	$7, %rdi
+# CHECK-NEXT:  2      2     0.50                        btcq	$7, %rdi
+# CHECK-NEXT:  2      2     0.50                        btrq	$7, %rdi
+# CHECK-NEXT:  2      2     0.50                        btsq	$7, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   btq	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btcq	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btrq	$7, (%rax)
+# CHECK-NEXT:  5      7     1.00    *      *            btsq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cbtw
+# CHECK-NEXT:  1      1     0.50                        cwtl
+# CHECK-NEXT:  1      1     0.50                        cltq
+# CHECK-NEXT:  1      1     0.50                        cwtd
+# CHECK-NEXT:  1      1     0.50                        cltd
+# CHECK-NEXT:  1      1     0.50                        cqto
+# CHECK-NEXT:  1      1     0.50                  U     clc
+# CHECK-NEXT:  1      1     0.50                  U     cld
+# CHECK-NEXT:  1      1     0.50                  U     cmc
+# CHECK-NEXT:  1      1     0.50                        cmpb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        cmpb	$7, %dil
+# CHECK-NEXT:  1      5     0.50    *                   cmpb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpb	%sil, %dil
+# CHECK-NEXT:  1      5     0.50    *                   cmpb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        cmpw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        cmpw	$511, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpw	$7, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpw	%si, %di
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        cmpl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        cmpl	$665536, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpl	$7, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpl	%esi, %edi
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        cmpq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        cmpq	$665536, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpq	$7, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        cmpq	%rsi, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   cmpq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  1      100   0.50                  U     cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  1      100   0.50                  U     cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  1      100   0.50                  U     cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  3      3     1.00                        cmpxchgb	%cl, %bl
+# CHECK-NEXT:  5      3     1.00    *      *            cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  5      3     1.00                        cmpxchgw	%cx, %bx
+# CHECK-NEXT:  6      3     1.00    *      *            cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  5      3     1.00                        cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  6      3     1.00    *      *            cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  5      3     1.00                        cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  6      3     1.00    *      *            cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  1      100   0.50                  U     cpuid
+# CHECK-NEXT:  1      1     0.50                        decb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            decb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        decw	%di
+# CHECK-NEXT:  2      6     1.00    *      *            decw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        decl	%edi
+# CHECK-NEXT:  2      6     1.00    *      *            decl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        decq	%rdi
+# CHECK-NEXT:  2      6     1.00    *      *            decq	(%rax)
+# CHECK-NEXT:  1      12    12.00                 U     divb	%dil
+# CHECK-NEXT:  1      16    12.00   *             U     divb	(%rax)
+# CHECK-NEXT:  2      15    15.00                 U     divw	%si
+# CHECK-NEXT:  2      19    15.00   *             U     divw	(%rax)
+# CHECK-NEXT:  2      14    14.00                 U     divl	%edx
+# CHECK-NEXT:  2      18    14.00   *             U     divl	(%rax)
+# CHECK-NEXT:  2      14    14.00                 U     divq	%rcx
+# CHECK-NEXT:  2      18    14.00   *             U     divq	(%rax)
+# CHECK-NEXT:  1      12    12.00                 U     idivb	%dil
+# CHECK-NEXT:  1      16    12.00   *             U     idivb	(%rax)
+# CHECK-NEXT:  2      15    17.00                 U     idivw	%si
+# CHECK-NEXT:  2      19    17.00   *             U     idivw	(%rax)
+# CHECK-NEXT:  2      14    25.00                 U     idivl	%edx
+# CHECK-NEXT:  2      18    25.00   *             U     idivl	(%rax)
+# CHECK-NEXT:  2      14    14.00                 U     idivq	%rcx
+# CHECK-NEXT:  2      18    14.00   *             U     idivq	(%rax)
+# CHECK-NEXT:  1      4     1.00                        imulb	%dil
+# CHECK-NEXT:  1      8     1.00    *                   imulb	(%rax)
+# CHECK-NEXT:  2      4     1.00                        imulw	%di
+# CHECK-NEXT:  2      8     1.00    *                   imulw	(%rax)
+# CHECK-NEXT:  1      4     1.00                        imulw	%si, %di
+# CHECK-NEXT:  1      8     1.00    *                   imulw	(%rax), %di
+# CHECK-NEXT:  2      5     1.00                        imulw	$511, %si, %di
+# CHECK-NEXT:  2      9     1.00    *                   imulw	$511, (%rax), %di
+# CHECK-NEXT:  2      5     1.00                        imulw	$7, %si, %di
+# CHECK-NEXT:  2      9     1.00    *                   imulw	$7, (%rax), %di
+# CHECK-NEXT:  1      4     1.00                        imull	%edi
+# CHECK-NEXT:  1      8     1.00    *                   imull	(%rax)
+# CHECK-NEXT:  1      4     1.00                        imull	%esi, %edi
+# CHECK-NEXT:  1      8     1.00    *                   imull	(%rax), %edi
+# CHECK-NEXT:  1      4     1.00                        imull	$665536, %esi, %edi
+# CHECK-NEXT:  2      8     1.00    *                   imull	$665536, (%rax), %edi
+# CHECK-NEXT:  1      4     1.00                        imull	$7, %esi, %edi
+# CHECK-NEXT:  2      8     1.00    *                   imull	$7, (%rax), %edi
+# CHECK-NEXT:  1      6     4.00                        imulq	%rdi
+# CHECK-NEXT:  1      10    4.00    *                   imulq	(%rax)
+# CHECK-NEXT:  1      6     4.00                        imulq	%rsi, %rdi
+# CHECK-NEXT:  1      10    4.00    *                   imulq	(%rax), %rdi
+# CHECK-NEXT:  1      6     4.00                        imulq	$665536, %rsi, %rdi
+# CHECK-NEXT:  2      10    4.00    *                   imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  1      6     4.00                        imulq	$7, %rsi, %rdi
+# CHECK-NEXT:  2      10    4.00    *                   imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     inb	$7, %al
+# CHECK-NEXT:  1      100   0.50                  U     inb	%dx, %al
+# CHECK-NEXT:  1      100   0.50                  U     inw	$7, %ax
+# CHECK-NEXT:  1      100   0.50                  U     inw	%dx, %ax
+# CHECK-NEXT:  1      100   0.50                  U     inl	$7, %eax
+# CHECK-NEXT:  1      100   0.50                  U     inl	%dx, %eax
+# CHECK-NEXT:  1      1     0.50                        incb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            incb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        incw	%di
+# CHECK-NEXT:  2      6     1.00    *      *            incw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        incl	%edi
+# CHECK-NEXT:  2      6     1.00    *      *            incl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        incq	%rdi
+# CHECK-NEXT:  2      6     1.00    *      *            incq	(%rax)
+# CHECK-NEXT:  1      100   0.50                  U     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50    *      *      U     int	$7
+# CHECK-NEXT:  4      2     0.50                        lahf
+# CHECK-NEXT:  1      100   0.50                  U     lodsb	(%rsi), %al
+# CHECK-NEXT:  1      100   0.50                  U     lodsw	(%rsi), %ax
+# CHECK-NEXT:  1      100   0.50                  U     lodsl	(%rsi), %eax
+# CHECK-NEXT:  1      100   0.50                  U     lodsq	(%rsi), %rax
+# CHECK-NEXT:  1      100   0.50                  U     movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  1      1     0.50                        movsbw	%al, %di
+# CHECK-NEXT:  1      1     0.50                        movzbw	%al, %di
+# CHECK-NEXT:  1      5     0.50    *                   movsbw	(%rax), %di
+# CHECK-NEXT:  1      5     0.50    *                   movzbw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        movsbl	%al, %edi
+# CHECK-NEXT:  1      1     0.50                        movzbl	%al, %edi
+# CHECK-NEXT:  1      5     0.50    *                   movsbl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   movzbl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        movsbq	%al, %rdi
+# CHECK-NEXT:  1      1     0.50                        movzbq	%al, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movsbq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movzbq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        movswl	%ax, %edi
+# CHECK-NEXT:  1      1     0.50                        movzwl	%ax, %edi
+# CHECK-NEXT:  1      5     0.50    *                   movswl	(%rax), %edi
+# CHECK-NEXT:  1      5     0.50    *                   movzwl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        movswq	%ax, %rdi
+# CHECK-NEXT:  1      1     0.50                        movzwq	%ax, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movswq	(%rax), %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movzwq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        movslq	%eax, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   movslq	(%rax), %rdi
+# CHECK-NEXT:  1      4     1.00                        mulb	%dil
+# CHECK-NEXT:  1      8     1.00    *                   mulb	(%rax)
+# CHECK-NEXT:  2      4     1.00                        mulw	%si
+# CHECK-NEXT:  2      8     1.00    *                   mulw	(%rax)
+# CHECK-NEXT:  1      4     1.00                        mull	%edx
+# CHECK-NEXT:  1      8     1.00    *                   mull	(%rax)
+# CHECK-NEXT:  1      6     4.00                        mulq	%rcx
+# CHECK-NEXT:  1      10    4.00    *                   mulq	(%rax)
+# CHECK-NEXT:  1      1     0.50                        negb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            negb	(%r8)
+# CHECK-NEXT:  1      1     0.50                        negw	%si
+# CHECK-NEXT:  2      6     1.00    *      *            negw	(%r9)
+# CHECK-NEXT:  1      1     0.50                        negl	%edx
+# CHECK-NEXT:  2      6     1.00    *      *            negl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        negq	%rcx
+# CHECK-NEXT:  2      6     1.00    *      *            negq	(%r10)
+# CHECK-NEXT:  1      1     0.50                        nop
+# CHECK-NEXT:  1      1     0.50                        nopw	%di
+# CHECK-NEXT:  1      1     0.50                        nopw	(%rcx)
+# CHECK-NEXT:  1      1     0.50                        nopl	%esi
+# CHECK-NEXT:  1      1     0.50                        nopl	(%r8)
+# CHECK-NEXT:  1      1     0.50                        nopq	%rdx
+# CHECK-NEXT:  1      1     0.50                        nopq	(%r9)
+# CHECK-NEXT:  1      1     0.50                        notb	%dil
+# CHECK-NEXT:  2      6     1.00    *      *            notb	(%r8)
+# CHECK-NEXT:  1      1     0.50                        notw	%si
+# CHECK-NEXT:  2      6     1.00    *      *            notw	(%r9)
+# CHECK-NEXT:  1      1     0.50                        notl	%edx
+# CHECK-NEXT:  2      6     1.00    *      *            notl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        notq	%rcx
+# CHECK-NEXT:  2      6     1.00    *      *            notq	(%r10)
+# CHECK-NEXT:  1      1     0.50                        orb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        orb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            orb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            orb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        orw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        orw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            orw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            orw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            orw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        orl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        orl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            orl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            orl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            orl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        orq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        orq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            orq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            orq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        orq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            orq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   orq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     outb	%al, $7
+# CHECK-NEXT:  1      100   0.50                  U     outb	%al, %dx
+# CHECK-NEXT:  1      100   0.50                  U     outw	%ax, $7
+# CHECK-NEXT:  1      100   0.50                  U     outw	%ax, %dx
+# CHECK-NEXT:  1      100   0.50                  U     outl	%eax, $7
+# CHECK-NEXT:  1      100   0.50                  U     outl	%eax, %dx
+# CHECK-NEXT:  1      100   0.50                  U     outsb	(%rsi), %dx
+# CHECK-NEXT:  1      100   0.50                  U     outsw	(%rsi), %dx
+# CHECK-NEXT:  1      100   0.50                  U     outsl	(%rsi), %dx
+# CHECK-NEXT:  1      1     0.50    *      *      U     pause
+# CHECK-NEXT:  1      1     0.50                        rclb	%dil
+# CHECK-NEXT:  1      1     0.50                        rcrb	%dil
+# CHECK-NEXT:  2      5     1.00           *            rclb	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrb	(%rax)
+# CHECK-NEXT:  25     13    0.50                        rclb	$7, %dil
+# CHECK-NEXT:  23     12    0.50                        rcrb	$7, %dil
+# CHECK-NEXT:  2      5     1.00           *            rclb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrb	$7, (%rax)
+# CHECK-NEXT:  26     12    0.50                        rclb	%cl, %dil
+# CHECK-NEXT:  24     11    0.50                        rcrb	%cl, %dil
+# CHECK-NEXT:  2      5     1.00           *            rclb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrb	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rclw	%di
+# CHECK-NEXT:  1      1     0.50                        rcrw	%di
+# CHECK-NEXT:  2      5     1.00           *            rclw	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrw	(%rax)
+# CHECK-NEXT:  21     11    0.50                        rclw	$7, %di
+# CHECK-NEXT:  19     10    0.50                        rcrw	$7, %di
+# CHECK-NEXT:  2      5     1.00           *            rclw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrw	$7, (%rax)
+# CHECK-NEXT:  22     10    0.50                        rclw	%cl, %di
+# CHECK-NEXT:  20     9     0.50                        rcrw	%cl, %di
+# CHECK-NEXT:  2      5     1.00           *            rclw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrw	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rcll	%edi
+# CHECK-NEXT:  1      1     0.50                        rcrl	%edi
+# CHECK-NEXT:  2      5     1.00           *            rcll	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrl	(%rax)
+# CHECK-NEXT:  16     8     0.50                        rcll	$7, %edi
+# CHECK-NEXT:  15     7     0.50                        rcrl	$7, %edi
+# CHECK-NEXT:  2      5     1.00           *            rcll	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrl	$7, (%rax)
+# CHECK-NEXT:  17     7     0.50                        rcll	%cl, %edi
+# CHECK-NEXT:  16     7     0.50                        rcrl	%cl, %edi
+# CHECK-NEXT:  2      5     1.00           *            rcll	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrl	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rclq	%rdi
+# CHECK-NEXT:  1      1     0.50                        rcrq	%rdi
+# CHECK-NEXT:  2      5     1.00           *            rclq	(%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrq	(%rax)
+# CHECK-NEXT:  16     8     0.50                        rclq	$7, %rdi
+# CHECK-NEXT:  15     7     0.50                        rcrq	$7, %rdi
+# CHECK-NEXT:  2      5     1.00           *            rclq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrq	$7, (%rax)
+# CHECK-NEXT:  17     7     0.50                        rclq	%cl, %rdi
+# CHECK-NEXT:  16     7     0.50                        rcrq	%cl, %rdi
+# CHECK-NEXT:  2      5     1.00           *            rclq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00           *            rcrq	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolb	%dil
+# CHECK-NEXT:  1      1     0.50                        rorb	%dil
+# CHECK-NEXT:  2      5     1.00    *      *            rolb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        rolb	$7, %dil
+# CHECK-NEXT:  1      1     0.50                        rorb	$7, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            rolb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolb	%cl, %dil
+# CHECK-NEXT:  1      1     0.50                        rorb	%cl, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            rolb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorb	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolw	%di
+# CHECK-NEXT:  1      1     0.50                        rorw	%di
+# CHECK-NEXT:  2      5     1.00    *      *            rolw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        rolw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        rorw	$7, %di
+# CHECK-NEXT:  2      5     1.00    *      *            rolw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolw	%cl, %di
+# CHECK-NEXT:  1      1     0.50                        rorw	%cl, %di
+# CHECK-NEXT:  2      5     1.00    *      *            rolw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorw	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        roll	%edi
+# CHECK-NEXT:  1      1     0.50                        rorl	%edi
+# CHECK-NEXT:  2      5     1.00    *      *            roll	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        roll	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        rorl	$7, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            roll	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        roll	%cl, %edi
+# CHECK-NEXT:  1      1     0.50                        rorl	%cl, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            roll	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorl	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolq	%rdi
+# CHECK-NEXT:  1      1     0.50                        rorq	%rdi
+# CHECK-NEXT:  2      5     1.00    *      *            rolq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorq	(%rax)
+# CHECK-NEXT:  1      1     0.50                        rolq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        rorq	$7, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            rolq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        rolq	%cl, %rdi
+# CHECK-NEXT:  1      1     0.50                        rorq	%cl, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            rolq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            rorq	%cl, (%rax)
+# CHECK-NEXT:  2      2     0.50                        sahf
+# CHECK-NEXT:  1      1     0.50                        sarb	%dil
+# CHECK-NEXT:  1      1     0.50                        shlb	%dil
+# CHECK-NEXT:  1      1     0.50                        shrb	%dil
+# CHECK-NEXT:  2      5     1.00    *      *            sarb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlb	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarb	$7, %dil
+# CHECK-NEXT:  1      1     0.50                        shlb	$7, %dil
+# CHECK-NEXT:  1      1     0.50                        shrb	$7, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            sarb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlb	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarb	%cl, %dil
+# CHECK-NEXT:  1      1     0.50                        shlb	%cl, %dil
+# CHECK-NEXT:  1      1     0.50                        shrb	%cl, %dil
+# CHECK-NEXT:  2      5     1.00    *      *            sarb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlb	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrb	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarw	%di
+# CHECK-NEXT:  1      1     0.50                        shlw	%di
+# CHECK-NEXT:  1      1     0.50                        shrw	%di
+# CHECK-NEXT:  2      5     1.00    *      *            sarw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlw	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrw	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        shlw	$7, %di
+# CHECK-NEXT:  1      1     0.50                        shrw	$7, %di
+# CHECK-NEXT:  2      5     1.00    *      *            sarw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlw	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarw	%cl, %di
+# CHECK-NEXT:  1      1     0.50                        shlw	%cl, %di
+# CHECK-NEXT:  1      1     0.50                        shrw	%cl, %di
+# CHECK-NEXT:  2      5     1.00    *      *            sarw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlw	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrw	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarl	%edi
+# CHECK-NEXT:  1      1     0.50                        shll	%edi
+# CHECK-NEXT:  1      1     0.50                        shrl	%edi
+# CHECK-NEXT:  2      5     1.00    *      *            sarl	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shll	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarl	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        shll	$7, %edi
+# CHECK-NEXT:  1      1     0.50                        shrl	$7, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            sarl	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shll	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarl	%cl, %edi
+# CHECK-NEXT:  1      1     0.50                        shll	%cl, %edi
+# CHECK-NEXT:  1      1     0.50                        shrl	%cl, %edi
+# CHECK-NEXT:  2      5     1.00    *      *            sarl	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shll	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrl	%cl, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarq	%rdi
+# CHECK-NEXT:  1      1     0.50                        shlq	%rdi
+# CHECK-NEXT:  1      1     0.50                        shrq	%rdi
+# CHECK-NEXT:  2      5     1.00    *      *            sarq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlq	(%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrq	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sarq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        shlq	$7, %rdi
+# CHECK-NEXT:  1      1     0.50                        shrq	$7, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            sarq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlq	$7, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        sarq	%cl, %rdi
+# CHECK-NEXT:  1      1     0.50                        shlq	%cl, %rdi
+# CHECK-NEXT:  1      1     0.50                        shrq	%cl, %rdi
+# CHECK-NEXT:  2      5     1.00    *      *            sarq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shlq	%cl, (%rax)
+# CHECK-NEXT:  2      5     1.00    *      *            shrq	%cl, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbb	$7, %al
+# CHECK-NEXT:  1      1     1.00                        sbbb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            sbbb	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            sbbb	%sil, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbb	(%rax), %dil
+# CHECK-NEXT:  1      1     1.00                        sbbw	$511, %ax
+# CHECK-NEXT:  1      1     1.00                        sbbw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            sbbw	$511, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            sbbw	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            sbbw	%si, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbw	(%rax), %di
+# CHECK-NEXT:  1      1     1.00                        sbbl	$665536, %eax
+# CHECK-NEXT:  1      1     1.00                        sbbl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbl	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbl	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbl	%esi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbl	(%rax), %edi
+# CHECK-NEXT:  1      1     1.00                        sbbq	$665536, %rax
+# CHECK-NEXT:  1      1     1.00                        sbbq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbq	$665536, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbq	$7, (%rax)
+# CHECK-NEXT:  1      1     1.00                        sbbq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            sbbq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     1.00    *                   sbbq	(%rax), %rdi
+# CHECK-NEXT:  1      100   0.50                  U     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  1      100   0.50                  U     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  1      100   0.50                  U     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  1      100   0.50                  U     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  1      1     0.50                        seto	%al
+# CHECK-NEXT:  1      1     0.50           *            seto	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setno	%al
+# CHECK-NEXT:  1      1     0.50           *            setno	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setb	%al
+# CHECK-NEXT:  1      1     0.50           *            setb	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setae	%al
+# CHECK-NEXT:  1      1     0.50           *            setae	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sete	%al
+# CHECK-NEXT:  1      1     0.50           *            sete	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setne	%al
+# CHECK-NEXT:  1      1     0.50           *            setne	(%rax)
+# CHECK-NEXT:  1      1     0.50                        seta	%al
+# CHECK-NEXT:  1      1     0.50           *            seta	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setbe	%al
+# CHECK-NEXT:  1      1     0.50           *            setbe	(%rax)
+# CHECK-NEXT:  1      1     0.50                        sets	%al
+# CHECK-NEXT:  1      1     0.50           *            sets	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setns	%al
+# CHECK-NEXT:  1      1     0.50           *            setns	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setp	%al
+# CHECK-NEXT:  1      1     0.50           *            setp	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setnp	%al
+# CHECK-NEXT:  1      1     0.50           *            setnp	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setl	%al
+# CHECK-NEXT:  2      1     1.00           *            setl	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setge	%al
+# CHECK-NEXT:  2      1     1.00           *            setge	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setg	%al
+# CHECK-NEXT:  2      1     1.00           *            setg	(%rax)
+# CHECK-NEXT:  1      1     0.50                        setle	%al
+# CHECK-NEXT:  2      1     1.00           *            setle	(%rax)
+# CHECK-NEXT:  7      4     4.00                        shldw	%cl, %si, %di
+# CHECK-NEXT:  7      4     4.00                        shrdw	%cl, %si, %di
+# CHECK-NEXT:  8      4     11.00   *      *            shldw	%cl, %si, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  6      4     3.00                        shldw	$7, %si, %di
+# CHECK-NEXT:  6      3     3.00                        shrdw	$7, %si, %di
+# CHECK-NEXT:  8      4     11.00   *      *            shldw	$7, %si, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  7      4     4.00                        shldl	%cl, %esi, %edi
+# CHECK-NEXT:  7      4     4.00                        shrdl	%cl, %esi, %edi
+# CHECK-NEXT:  8      4     11.00   *      *            shldl	%cl, %esi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  6      3     3.00                        shldl	$7, %esi, %edi
+# CHECK-NEXT:  6      4     3.00                        shrdl	$7, %esi, %edi
+# CHECK-NEXT:  8      4     11.00   *      *            shldl	$7, %esi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  7      4     4.00                        shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  7      4     4.00                        shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT:  8      4     11.00   *      *            shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  6      4     3.00                        shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  6      4     3.00                        shrdq	$7, %rsi, %rdi
+# CHECK-NEXT:  8      4     11.00   *      *            shldq	$7, %rsi, (%rax)
+# CHECK-NEXT:  8      4     11.00   *      *            shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  1      1     0.50                  U     stc
+# CHECK-NEXT:  1      1     0.50                  U     std
+# CHECK-NEXT:  1      100   0.50                  U     stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  1      100   0.50                  U     stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  1      1     0.50                        subb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        subb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            subb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            subb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        subw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        subw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            subw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            subw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            subw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        subl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        subl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            subl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            subl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            subl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        subq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        subq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            subq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            subq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        subq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            subq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   subq	(%rax), %rdi
+# CHECK-NEXT:  1      1     0.50                        testb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        testb	$7, %dil
+# CHECK-NEXT:  1      5     0.50    *                   testb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testb	%sil, %dil
+# CHECK-NEXT:  1      5     0.50    *                   testb	%sil, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        testw	$511, %di
+# CHECK-NEXT:  1      5     0.50    *                   testw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testw	$7, %di
+# CHECK-NEXT:  1      5     0.50    *                   testw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testw	%si, %di
+# CHECK-NEXT:  1      5     0.50    *                   testw	%si, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        testl	$665536, %edi
+# CHECK-NEXT:  1      5     0.50    *                   testl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testl	$7, %edi
+# CHECK-NEXT:  1      5     0.50    *                   testl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testl	%esi, %edi
+# CHECK-NEXT:  1      5     0.50    *                   testl	%esi, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        testq	$665536, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   testq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testq	$7, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   testq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        testq	%rsi, %rdi
+# CHECK-NEXT:  1      5     0.50    *                   testq	%rsi, (%rax)
+# CHECK-NEXT:  1      100   0.50    *             U     ud2
+# CHECK-NEXT:  4      2     1.00                        xaddb	%bl, %cl
+# CHECK-NEXT:  4      6     1.00    *      *            xaddb	%bl, (%rcx)
+# CHECK-NEXT:  4      2     1.00                        xaddw	%bx, %cx
+# CHECK-NEXT:  4      6     1.00    *      *            xaddw	%ax, (%rbx)
+# CHECK-NEXT:  4      2     1.00                        xaddl	%ebx, %ecx
+# CHECK-NEXT:  4      6     1.00    *      *            xaddl	%eax, (%rbx)
+# CHECK-NEXT:  4      2     1.00                        xaddq	%rbx, %rcx
+# CHECK-NEXT:  4      6     1.00    *      *            xaddq	%rax, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgb	%bl, %cl
+# CHECK-NEXT:  2      5     1.00    *      *            xchgb	%bl, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgw	%bx, %ax
+# CHECK-NEXT:  2      2     1.00                        xchgw	%bx, %cx
+# CHECK-NEXT:  2      5     1.00    *      *            xchgw	%ax, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgl	%ebx, %eax
+# CHECK-NEXT:  2      1     1.00                        xchgl	%ebx, %ecx
+# CHECK-NEXT:  2      5     1.00    *      *            xchgl	%eax, (%rbx)
+# CHECK-NEXT:  2      1     1.00                        xchgq	%rbx, %rax
+# CHECK-NEXT:  2      1     1.00                        xchgq	%rbx, %rcx
+# CHECK-NEXT:  2      5     1.00    *      *            xchgq	%rax, (%rbx)
+# CHECK-NEXT:  1      6     0.50    *                   xlatb
+# CHECK-NEXT:  1      1     0.50                        xorb	$7, %al
+# CHECK-NEXT:  1      1     0.50                        xorb	$7, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            xorb	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorb	%sil, %dil
+# CHECK-NEXT:  2      6     1.00    *      *            xorb	%sil, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorb	(%rax), %dil
+# CHECK-NEXT:  1      1     0.50                        xorw	$511, %ax
+# CHECK-NEXT:  1      1     0.50                        xorw	$511, %di
+# CHECK-NEXT:  2      6     1.00    *      *            xorw	$511, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorw	$7, %di
+# CHECK-NEXT:  2      6     1.00    *      *            xorw	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorw	%si, %di
+# CHECK-NEXT:  2      6     1.00    *      *            xorw	%si, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorw	(%rax), %di
+# CHECK-NEXT:  1      1     0.50                        xorl	$665536, %eax
+# CHECK-NEXT:  1      1     0.50                        xorl	$665536, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            xorl	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorl	$7, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            xorl	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorl	%esi, %edi
+# CHECK-NEXT:  2      6     1.00    *      *            xorl	%esi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorl	(%rax), %edi
+# CHECK-NEXT:  1      1     0.50                        xorq	$665536, %rax
+# CHECK-NEXT:  1      1     0.50                        xorq	$665536, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            xorq	$665536, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorq	$7, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            xorq	$7, (%rax)
+# CHECK-NEXT:  1      1     0.50                        xorq	%rsi, %rdi
+# CHECK-NEXT:  2      6     1.00    *      *            xorq	%rsi, (%rax)
+# CHECK-NEXT:  1      5     0.50    *                   xorq	(%rax), %rdi
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 262.00 262.00  -      -     246.00 547.50 622.50  -      -      -      -      -      -      -      -      -      -      -      -     64.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     adcq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     andq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsfq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bsrq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bswapl	%eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     bswapq	%rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	%si, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	%si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	%esi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	%esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	%rsi, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	%rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btcq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btrq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     btsq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cbtw
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cwtl
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cltq
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cwtd
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cltd
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cqto
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     clc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cld
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsb	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsw	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsl	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpsq	%es:(%rdi), (%rsi)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, %bl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgb	%cl, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, %bx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgw	%cx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, %ebx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgl	%ecx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, %rbx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     cmpxchgq	%rcx, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     cpuid
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     decq	(%rax)
+# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divb	(%rax)
+# CHECK-NEXT:  -      -      -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divw	%si
+# CHECK-NEXT: 0.50   0.50    -      -     15.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divw	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divl	%edx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divl	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     divq	(%rax)
+# CHECK-NEXT:  -      -      -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -     12.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivb	(%rax)
+# CHECK-NEXT:  -      -      -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	%si
+# CHECK-NEXT: 0.50   0.50    -      -     17.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivw	(%rax)
+# CHECK-NEXT:  -      -      -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	%edx
+# CHECK-NEXT: 0.50   0.50    -      -     25.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivl	(%rax)
+# CHECK-NEXT:  -      -      -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -     14.00   -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     idivq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$511, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$511, (%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$7, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imulw	$7, (%rax), %di
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$665536, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$665536, (%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$7, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   imull	$7, (%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	%rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$665536, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$665536, (%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$7, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   imulq	$7, (%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inb	%dx, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inw	$7, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inw	%dx, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inl	$7, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     inl	%dx, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     incq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insb	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insw	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     insl	%dx, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     int	$7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lahf
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsb	(%rsi), %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsw	(%rsi), %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsl	(%rsi), %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     lodsq	(%rsi), %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsb	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsw	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsl	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsq	(%rsi), %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	%al, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	%al, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbw	(%rax), %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	%al, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	%al, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	%al, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	%al, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movsbq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	%ax, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	%ax, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswl	(%rax), %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	%ax, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	%ax, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movswq	(%rax), %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movzwq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	%eax, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     movslq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulb	%dil
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulw	%si
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mulw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mull	%edx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     1.00   mull	(%rax)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   mulq	%rcx
+# CHECK-NEXT: 0.50   0.50    -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -     4.00   mulq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negb	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negw	%si
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negw	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negl	%edx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negq	%rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     negq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nop
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopw	(%rcx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	%esi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopl	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	%rdx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     nopq	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notb	(%r8)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notw	%si
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notw	(%r9)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notl	%edx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notq	%rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     notq	(%r10)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     orq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outb	%al, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outw	%ax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, $7
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outl	%eax, %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsb	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsw	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     outsl	(%rsi), %dx
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     pause
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rclq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rcrq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     roll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rolq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     rorq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sahf
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlb	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrb	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlw	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrw	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarl	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shll	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrl	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	(%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	$7, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sarq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shlq	%cl, (%rax)
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     shrq	%cl, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     sbbq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasb	%es:(%rdi), %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasw	%es:(%rdi), %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasl	%es:(%rdi), %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     scasq	%es:(%rdi), %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seto	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seto	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setno	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setno	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setb	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setb	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setae	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setae	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sete	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sete	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setne	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setne	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seta	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     seta	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setbe	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sets	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     sets	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setns	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setns	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setp	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setp	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	%al
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setnp	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setl	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setl	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setge	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setge	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setg	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setg	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     setle	%al
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     setle	(%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, %di
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	%cl, %si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	%cl, %si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, %di
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldw	$7, %si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdw	$7, %si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	%cl, %esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	%cl, %esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, %edi
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldl	$7, %esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdl	$7, %esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     4.00   4.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	%cl, %rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	%cl, %rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, %rdi
+# CHECK-NEXT:  -      -      -      -      -     3.00   3.00    -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shldq	$7, %rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     11.00  11.00   -      -      -      -      -      -      -      -      -      -      -      -      -     shrdq	$7, %rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stc
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     std
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosb	%al, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosw	%ax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosl	%eax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     stosq	%rax, %es:(%rdi)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     subq	(%rax), %rdi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, %dil
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testb	%sil, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, %di
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testw	%si, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, %edi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testl	%esi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, %rdi
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     testq	%rsi, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     ud2
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, %cl
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddb	%bl, (%rcx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%bx, %cx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddw	%ax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddl	%eax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xaddq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, %cl
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgb	%bl, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %ax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%bx, %cx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgw	%ax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %eax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%ebx, %ecx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgl	%eax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rax
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rbx, %rcx
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xchgq	%rax, (%rbx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xlatb
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %al
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, %dil
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	%sil, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorb	(%rax), %dil
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %ax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$511, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, %di
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	%si, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorw	(%rax), %di
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %eax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, %edi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%esi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	(%rax), %edi
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rax
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$665536, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	$7, (%rax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, %rdi
+# CHECK-NEXT: 1.00   1.00    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rsi, (%rax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	(%rax), %rdi
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-x87.s b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
new file mode 100644
index 0000000000000000000000000000000000000000..f64944cb11285efa56a66ea97b7bd1f7bc4dcd66
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-x87.s
@@ -0,0 +1,533 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+f2xm1
+
+fabs
+
+fadd %st(0), %st(1)
+fadd %st(2)
+fadds (%ecx)
+faddl (%ecx)
+faddp %st(1)
+faddp %st(2)
+fiadds (%ecx)
+fiaddl (%ecx)
+
+fbld (%ecx)
+fbstp (%eax)
+
+fchs
+
+fnclex
+
+fcmovb %st(1), %st(0)
+fcmovbe %st(1), %st(0)
+fcmove %st(1), %st(0)
+fcmovnb %st(1), %st(0)
+fcmovnbe %st(1), %st(0)
+fcmovne %st(1), %st(0)
+fcmovnu %st(1), %st(0)
+fcmovu %st(1), %st(0)
+
+fcom %st(1)
+fcom %st(3)
+fcoms (%ecx)
+fcoml (%eax)
+fcomp %st(1)
+fcomp %st(3)
+fcomps (%ecx)
+fcompl (%eax)
+fcompp
+
+fcomi %st(3)
+fcompi %st(3)
+
+fcos
+
+fdecstp
+
+fdiv %st(0), %st(1)
+fdiv %st(2)
+fdivs (%ecx)
+fdivl (%eax)
+fdivp %st(1)
+fdivp %st(2)
+fidivs (%ecx)
+fidivl (%eax)
+
+fdivr %st(0), %st(1)
+fdivr %st(2)
+fdivrs (%ecx)
+fdivrl (%eax)
+fdivrp %st(1)
+fdivrp %st(2)
+fidivrs (%ecx)
+fidivrl (%eax)
+
+ffree %st(0)
+
+ficoms (%ecx)
+ficoml (%eax)
+ficomps (%ecx)
+ficompl (%eax)
+
+filds (%edx)
+fildl (%ecx)
+fildll (%eax)
+
+fincstp
+
+fninit
+
+fists (%edx)
+fistl (%ecx)
+fistps (%edx)
+fistpl (%ecx)
+fistpll (%eax)
+
+fisttps (%edx)
+fisttpl (%ecx)
+fisttpll (%eax)
+
+fld %st(0)
+flds (%edx)
+fldl (%ecx)
+fldt (%eax)
+
+fldcw (%eax)
+fldenv (%eax)
+
+fld1
+fldl2e
+fldl2t
+fldlg2
+fldln2
+fldpi
+fldz
+
+fmul %st(0), %st(1)
+fmul %st(2)
+fmuls (%ecx)
+fmull (%eax)
+fmulp %st(1)
+fmulp %st(2)
+fimuls (%ecx)
+fimull (%eax)
+
+fnop
+
+fpatan
+
+fprem
+fprem1
+
+fptan
+
+frndint
+
+frstor (%eax)
+
+fnsave (%eax)
+
+fscale
+
+fsin
+
+fsincos
+
+fsqrt
+
+fst %st(0)
+fsts (%edx)
+fstl (%ecx)
+fstp %st(0)
+fstpl (%edx)
+fstpl (%ecx)
+fstpt (%eax)
+
+fnstcw (%eax)
+fnstenv (%eax)
+fnstsw (%eax)
+
+frstor (%eax)
+fsave (%eax)
+
+fsub %st(0), %st(1)
+fsub %st(2)
+fsubs (%ecx)
+fsubl (%eax)
+fsubp %st(1)
+fsubp %st(2)
+fisubs (%ecx)
+fisubl (%eax)
+
+fsubr %st(0), %st(1)
+fsubr %st(2)
+fsubrs (%ecx)
+fsubrl (%eax)
+fsubrp %st(1)
+fsubrp %st(2)
+fisubrs (%ecx)
+fisubrl (%eax)
+
+ftst
+
+fucom %st(1)
+fucom %st(3)
+fucomp %st(1)
+fucomp %st(3)
+fucompp
+
+fucomi %st(3)
+fucompi %st(3)
+
+fwait
+
+fxam
+
+fxch %st(1)
+fxch %st(3)
+
+fxrstor (%eax)
+fxsave (%eax)
+
+fxtract
+
+fyl2x
+fyl2xp1
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      100   0.50                  U     f2xm1
+# CHECK-NEXT:  1      1     1.00                  U     fabs
+# CHECK-NEXT:  1      5     1.00                  U     fadd	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fadd	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fadds	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     faddl	(%ecx)
+# CHECK-NEXT:  1      5     1.00                  U     faddp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     faddp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fiadds	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fiaddl	(%ecx)
+# CHECK-NEXT:  1      100   0.50                  U     fbld	(%ecx)
+# CHECK-NEXT:  1      100   0.50                  U     fbstp	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     fchs
+# CHECK-NEXT:  1      100   0.50                  U     fnclex
+# CHECK-NEXT:  1      1     1.00                  U     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  1      1     1.00                  U     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  2      1     1.00                  U     fcom	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fcom	%st(3)
+# CHECK-NEXT:  1      6     1.00                  U     fcoms	(%ecx)
+# CHECK-NEXT:  1      6     1.00                  U     fcoml	(%eax)
+# CHECK-NEXT:  2      1     1.00                  U     fcomp	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fcomp	%st(3)
+# CHECK-NEXT:  1      6     1.00                  U     fcomps	(%ecx)
+# CHECK-NEXT:  1      6     1.00                  U     fcompl	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fcompp
+# CHECK-NEXT:  2      1     1.00                  U     fcomi	%st(3)
+# CHECK-NEXT:  2      1     1.00                  U     fcompi	%st(3)
+# CHECK-NEXT:  1      100   0.50                  U     fcos
+# CHECK-NEXT:  1      100   0.50                  U     fdecstp
+# CHECK-NEXT:  1      9     9.50                  U     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdiv	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivl	(%eax)
+# CHECK-NEXT:  1      9     9.50                  U     fdivp	%st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdivp	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivl	(%eax)
+# CHECK-NEXT:  1      9     9.50                  U     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdivr	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivrs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fdivrl	(%eax)
+# CHECK-NEXT:  1      9     9.50                  U     fdivrp	%st(1)
+# CHECK-NEXT:  1      9     9.50                  U     fdivrp	%st(2)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivrs	(%ecx)
+# CHECK-NEXT:  1      14    9.50    *             U     fidivrl	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     ffree	%st(0)
+# CHECK-NEXT:  2      6     1.00                  U     ficoms	(%ecx)
+# CHECK-NEXT:  2      6     1.00                  U     ficoml	(%eax)
+# CHECK-NEXT:  2      6     1.00                  U     ficomps	(%ecx)
+# CHECK-NEXT:  2      6     1.00                  U     ficompl	(%eax)
+# CHECK-NEXT:  1      5     0.50    *             U     filds	(%edx)
+# CHECK-NEXT:  1      5     0.50    *             U     fildl	(%ecx)
+# CHECK-NEXT:  1      5     0.50    *             U     fildll	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fincstp
+# CHECK-NEXT:  1      100   0.50                  U     fninit
+# CHECK-NEXT:  1      1     0.50           *      U     fists	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistps	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistpl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fistpll	(%eax)
+# CHECK-NEXT:  1      1     0.50           *      U     fisttps	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fisttpl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fisttpll	(%eax)
+# CHECK-NEXT:  1      1     0.50                  U     fld	%st(0)
+# CHECK-NEXT:  1      5     0.50    *             U     flds	(%edx)
+# CHECK-NEXT:  1      5     0.50    *             U     fldl	(%ecx)
+# CHECK-NEXT:  1      5     0.50    *             U     fldt	(%eax)
+# CHECK-NEXT:  1      5     0.50    *             U     fldcw	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fldenv	(%eax)
+# CHECK-NEXT:  1      3     1.00                  U     fld1
+# CHECK-NEXT:  1      3     1.00                  U     fldl2e
+# CHECK-NEXT:  1      3     1.00                  U     fldl2t
+# CHECK-NEXT:  1      3     1.00                  U     fldlg2
+# CHECK-NEXT:  1      3     1.00                  U     fldln2
+# CHECK-NEXT:  1      3     1.00                  U     fldpi
+# CHECK-NEXT:  1      3     1.00                  U     fldz
+# CHECK-NEXT:  1      5     1.00                  U     fmul	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fmul	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fmuls	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fmull	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fmulp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fmulp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fimuls	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fimull	(%eax)
+# CHECK-NEXT:  1      1     0.50                  U     fnop
+# CHECK-NEXT:  1      100   0.50                  U     fpatan
+# CHECK-NEXT:  1      100   0.50                  U     fprem
+# CHECK-NEXT:  1      100   0.50                  U     fprem1
+# CHECK-NEXT:  1      100   0.50                  U     fptan
+# CHECK-NEXT:  1      100   0.50                  U     frndint
+# CHECK-NEXT:  1      100   0.50                  U     frstor	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fnsave	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fscale
+# CHECK-NEXT:  1      100   0.50                  U     fsin
+# CHECK-NEXT:  1      100   0.50                  U     fsincos
+# CHECK-NEXT:  1      1     17.50                 U     fsqrt
+# CHECK-NEXT:  1      1     0.50                  U     fst	%st(0)
+# CHECK-NEXT:  1      1     0.50           *      U     fsts	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fstl	(%ecx)
+# CHECK-NEXT:  1      1     0.50                  U     fstp	%st(0)
+# CHECK-NEXT:  1      1     0.50           *      U     fstpl	(%edx)
+# CHECK-NEXT:  1      1     0.50           *      U     fstpl	(%ecx)
+# CHECK-NEXT:  1      1     0.50           *      U     fstpt	(%eax)
+# CHECK-NEXT:  1      1     0.50           *      U     fnstcw	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fnstenv	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fnstsw	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     frstor	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     wait
+# CHECK-NEXT:  1      100   0.50                  U     fnsave	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsub	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsub	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubl	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsubp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsubp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubl	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsubr	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubrs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fsubrl	(%eax)
+# CHECK-NEXT:  1      5     1.00                  U     fsubrp	%st(1)
+# CHECK-NEXT:  1      5     1.00                  U     fsubrp	%st(2)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubrs	(%ecx)
+# CHECK-NEXT:  1      10    1.00    *             U     fisubrl	(%eax)
+# CHECK-NEXT:  1      1     1.00                  U     ftst
+# CHECK-NEXT:  2      1     1.00                  U     fucom	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fucom	%st(3)
+# CHECK-NEXT:  2      1     1.00                  U     fucomp	%st(1)
+# CHECK-NEXT:  2      1     1.00                  U     fucomp	%st(3)
+# CHECK-NEXT:  1      1     1.00                  U     fucompp
+# CHECK-NEXT:  2      1     1.00                  U     fucomi	%st(3)
+# CHECK-NEXT:  2      1     1.00                  U     fucompi	%st(3)
+# CHECK-NEXT:  1      100   0.50                  U     wait
+# CHECK-NEXT:  1      100   0.50                  U     fxam
+# CHECK-NEXT:  1      1     0.50                  U     fxch	%st(1)
+# CHECK-NEXT:  1      1     0.50                  U     fxch	%st(3)
+# CHECK-NEXT:  1      100   0.50    *      *      U     fxrstor	(%eax)
+# CHECK-NEXT:  1      100   0.50    *      *      U     fxsave	(%eax)
+# CHECK-NEXT:  1      100   0.50                  U     fxtract
+# CHECK-NEXT:  1      100   0.50                  U     fyl2x
+# CHECK-NEXT:  1      100   0.50                  U     fyl2xp1
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 24.00  24.00   -      -      -     36.00  20.00   -     201.50 201.50  -      -      -     7.00   48.00  40.00   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     f2xm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fabs
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadd	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadd	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fadds	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     faddp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fiadds	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fiaddl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fbld	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fbstp	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fchs
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnclex
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovb	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmove	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnb	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnbe	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovne	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovnu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcmovu	%st(1), %st(0)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcom	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcom	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcoms	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcoml	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomp	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcomps	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fcompl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fcompp
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcomi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fcompi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fcos
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fdecstp
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdiv	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdiv	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivr	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fdivrp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     9.50   9.50    -      -      -      -      -     1.00    -      -      -      -     fidivrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     ffree	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficoms	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficoml	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficomps	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     ficompl	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     filds	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fildl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fildll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fincstp
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fninit
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fists	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistps	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fistpll	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttps	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fisttpll	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fld	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     flds	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldt	(%eax)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fldcw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fldenv	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fld1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldl2e
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldl2t
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldlg2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldln2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldpi
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     fldz
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmul	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmul	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmuls	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmulp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fmulp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fimuls	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fimull	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnop
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fpatan
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fprem
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fprem1
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fptan
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frndint
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fscale
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fsin
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fsincos
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     17.50  17.50   -      -      -      -      -     1.00    -      -      -      -     fsqrt
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fst	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fsts	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstl	(%ecx)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fstp	%st(0)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpl	(%edx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpl	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     fstpt	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstcw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstenv	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnstsw	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     frstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     wait
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fnsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsub	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsub	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubr	%st(0), %st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubr	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fsubrp	%st(2)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubrs	(%ecx)
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fisubrl	(%eax)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     ftst
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucom	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucom	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomp	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomp	%st(3)
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     1.00    -      -      -      -     fucompp
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucomi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     0.50   0.50    -      -      -      -     1.00    -      -      -      -      -     fucompi	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     wait
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxam
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(1)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxch	%st(3)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxrstor	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxsave	(%eax)
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fxtract
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2x
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     fyl2xp1
diff --git a/test/tools/llvm-mca/X86/BdVer2/resources-xop.s b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
new file mode 100644
index 0000000000000000000000000000000000000000..306917defb10257b21bf562b2c4c64caa23ef003
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/resources-xop.s
@@ -0,0 +1,546 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -instruction-tables < %s | FileCheck %s
+
+vfrczpd %xmm0, %xmm3
+vfrczpd (%rax), %xmm3
+
+vfrczpd %ymm0, %ymm3
+vfrczpd (%rax), %ymm3
+
+vfrczps %xmm0, %xmm3
+vfrczps (%rax), %xmm3
+
+vfrczps %ymm0, %ymm3
+vfrczps (%rax), %ymm3
+
+vfrczsd %xmm0, %xmm3
+vfrczsd (%rax), %xmm3
+
+vfrczss %xmm0, %xmm3
+vfrczss (%rax), %xmm3
+
+vpcmov %xmm0, %xmm1, %xmm2, %xmm3
+vpcmov (%rax), %xmm0, %xmm1, %xmm3
+vpcmov %xmm0, (%rax), %xmm1, %xmm3
+
+vpcmov %ymm0, %ymm1, %ymm2, %ymm3
+vpcmov (%rax), %ymm0, %ymm1, %ymm3
+vpcmov %ymm0, (%rax), %ymm1, %ymm3
+
+vpcomb $0, %xmm0, %xmm1, %xmm3
+vpcomb $0, (%rax), %xmm0, %xmm3
+
+vpcomd $0, %xmm0, %xmm1, %xmm3
+vpcomd $0, (%rax), %xmm0, %xmm3
+
+vpcomq $0, %xmm0, %xmm1, %xmm3
+vpcomq $0, (%rax), %xmm0, %xmm3
+
+vpcomub $0, %xmm0, %xmm1, %xmm3
+vpcomub $0, (%rax), %xmm0, %xmm3
+
+vpcomud $0, %xmm0, %xmm1, %xmm3
+vpcomud $0, (%rax), %xmm0, %xmm3
+
+vpcomuq $0, %xmm0, %xmm1, %xmm3
+vpcomuq $0, (%rax), %xmm0, %xmm3
+
+vpcomuw $0, %xmm0, %xmm1, %xmm3
+vpcomuw $0, (%rax), %xmm0, %xmm3
+
+vpcomw $0, %xmm0, %xmm1, %xmm3
+vpcomw $0, (%rax), %xmm0, %xmm3
+
+vpermil2pd $0, %xmm0, %xmm1, %xmm2, %xmm3
+vpermil2pd $0, (%rax), %xmm0, %xmm1, %xmm3
+vpermil2pd $0, %xmm0, (%rax), %xmm1, %xmm3
+
+vpermil2pd $0, %ymm0, %ymm1, %ymm2, %ymm3
+vpermil2pd $0, (%rax), %ymm0, %ymm1, %ymm3
+vpermil2pd $0, %ymm0, (%rax), %ymm1, %ymm3
+
+vpermil2ps $0, %xmm0, %xmm1, %xmm2, %xmm3
+vpermil2ps $0, (%rax), %xmm0, %xmm1, %xmm3
+vpermil2ps $0, %xmm0, (%rax), %xmm1, %xmm3
+
+vpermil2ps $0, %ymm0, %ymm1, %ymm2, %ymm3
+vpermil2ps $0, (%rax), %ymm0, %ymm1, %ymm3
+vpermil2ps $0, %ymm0, (%rax), %ymm1, %ymm3
+
+vphaddbd %xmm0, %xmm3
+vphaddbd (%rax), %xmm3
+
+vphaddbq %xmm0, %xmm3
+vphaddbq (%rax), %xmm3
+
+vphaddbw %xmm0, %xmm3
+vphaddbw (%rax), %xmm3
+
+vphadddq %xmm0, %xmm3
+vphadddq (%rax), %xmm3
+
+vphaddubd %xmm0, %xmm3
+vphaddubd (%rax), %xmm3
+
+vphaddubq %xmm0, %xmm3
+vphaddubq (%rax), %xmm3
+
+vphaddubw %xmm0, %xmm3
+vphaddubw (%rax), %xmm3
+
+vphaddudq %xmm0, %xmm3
+vphaddudq (%rax), %xmm3
+
+vphadduwd %xmm0, %xmm3
+vphadduwd (%rax), %xmm3
+
+vphadduwq %xmm0, %xmm3
+vphadduwq (%rax), %xmm3
+
+vphaddwd %xmm0, %xmm3
+vphaddwd (%rax), %xmm3
+
+vphaddwq %xmm0, %xmm3
+vphaddwq (%rax), %xmm3
+
+vphsubbw %xmm0, %xmm3
+vphsubbw (%rax), %xmm3
+
+vphsubdq %xmm0, %xmm3
+vphsubdq (%rax), %xmm3
+
+vphsubwd %xmm0, %xmm3
+vphsubwd (%rax), %xmm3
+
+vpmacsdd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsdd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsdqh %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsdqh %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsdql %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsdql %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssdd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssdd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssdqh %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssdqh %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssdql %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssdql %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacssww %xmm0, %xmm1, %xmm2, %xmm3
+vpmacssww %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmacswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmacsww %xmm0, %xmm1, %xmm2, %xmm3
+vpmacsww %xmm0, (%rax), %xmm1, %xmm3
+
+vpmadcsswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmadcsswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpmadcswd %xmm0, %xmm1, %xmm2, %xmm3
+vpmadcswd %xmm0, (%rax), %xmm1, %xmm3
+
+vpperm %xmm0, %xmm1, %xmm2, %xmm3
+vpperm (%rax), %xmm0, %xmm1, %xmm3
+vpperm %xmm0, (%rax), %xmm1, %xmm3
+
+vprotb %xmm0, %xmm1, %xmm3
+vprotb (%rax), %xmm0, %xmm3
+vprotb %xmm0, (%rax), %xmm3
+
+vprotb $0, %xmm0, %xmm3
+vprotb $0, (%rax), %xmm3
+
+vprotd %xmm0, %xmm1, %xmm3
+vprotd (%rax), %xmm0, %xmm3
+vprotd %xmm0, (%rax), %xmm3
+
+vprotd $0, %xmm0, %xmm3
+vprotd $0, (%rax), %xmm3
+
+vprotq %xmm0, %xmm1, %xmm3
+vprotq (%rax), %xmm0, %xmm3
+vprotq %xmm0, (%rax), %xmm3
+
+vprotq $0, %xmm0, %xmm3
+vprotq $0, (%rax), %xmm3
+
+vprotw %xmm0, %xmm1, %xmm3
+vprotw (%rax), %xmm0, %xmm3
+vprotw %xmm0, (%rax), %xmm3
+
+vprotw $0, %xmm0, %xmm3
+vprotw $0, (%rax), %xmm3
+
+vpshab %xmm0, %xmm1, %xmm3
+vpshab (%rax), %xmm0, %xmm3
+vpshab %xmm0, (%rax), %xmm3
+
+vpshad %xmm0, %xmm1, %xmm3
+vpshad (%rax), %xmm0, %xmm3
+vpshad %xmm0, (%rax), %xmm3
+
+vpshaq %xmm0, %xmm1, %xmm3
+vpshaq (%rax), %xmm0, %xmm3
+vpshaq %xmm0, (%rax), %xmm3
+
+vpshaw %xmm0, %xmm1, %xmm3
+vpshaw (%rax), %xmm0, %xmm3
+vpshaw %xmm0, (%rax), %xmm3
+
+vpshlb %xmm0, %xmm1, %xmm3
+vpshlb (%rax), %xmm0, %xmm3
+vpshlb %xmm0, (%rax), %xmm3
+
+vpshld %xmm0, %xmm1, %xmm3
+vpshld (%rax), %xmm0, %xmm3
+vpshld %xmm0, (%rax), %xmm3
+
+vpshlq %xmm0, %xmm1, %xmm3
+vpshlq (%rax), %xmm0, %xmm3
+vpshlq %xmm0, (%rax), %xmm3
+
+vpshlw %xmm0, %xmm1, %xmm3
+vpshlw (%rax), %xmm0, %xmm3
+vpshlw %xmm0, (%rax), %xmm3
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      10    1.00                        vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  4      10    2.00                        vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  8      15    2.00    *                   vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  2      10    1.00                        vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  4      10    2.00                        vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  8      15    2.00    *                   vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  2      10    1.00                        vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  2      10    1.00                        vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  2      15    1.00    *                   vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  2      2     0.50                        vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      7     1.00    *                   vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      7     1.00    *                   vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      2     0.50                        vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      3     2.00                        vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  2      3     3.00                        vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      3     2.00                        vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  2      3     3.00                        vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT:  2      8     3.00    *                   vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  1      2     0.50                        vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphadddq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  1      5     2.00                        vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      5     2.00                        vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     2.00                        vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      10    2.00    *                   vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      9     1.00    *                   vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      3     2.00                        vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     2.00    *                   vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      2     0.50                        vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT:  1      7     0.50    *                   vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  1      3     0.50                        vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT:  1      8     0.50    *                   vpshlw	%xmm0, (%rax), %xmm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT: 41.50  41.50   -      -      -      -      -      -     30.00  30.00  60.00  60.00  36.00  12.00  100.50 80.50   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczpd	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczpd	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczps	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczps	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczps	%ymm0, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     2.00    -      -      -      -     vfrczps	(%rax), %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczsd	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczsd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczss	%xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczss	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	(%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcmov	%ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomb	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomb	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomd	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomd	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomub	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomub	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomud	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomud	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuq	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuq	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomuw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomw	$0, %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpcomw	$0, (%rax), %xmm0, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2pd	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2pd	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, %xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, (%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.50   0.50    -      -      -      -     vpermil2ps	$0, %xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, %ymm0, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, (%rax), %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 1.00   1.00    -      -      -      -      -      -     3.00   3.00    -      -      -      -     1.00   1.00    -      -      -      -     vpermil2ps	$0, %ymm0, (%rax), %ymm1, %ymm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadddq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadddq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddubw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddudq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddudq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphadduwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphaddwq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubbw	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubbw	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubdq	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubdq	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubwd	%xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vphsubwd	(%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacsdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdqh	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdqh	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdql	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50   2.00    -     2.50   0.50    -      -      -      -     vpmacssdql	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacssww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacssww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsww	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmacsww	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcsswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -      -     vpmadcswd	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	%xmm0, %xmm1, %xmm2, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	(%rax), %xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     2.00   2.00    -      -     0.50   0.50    -      -      -      -     vpperm	%xmm0, (%rax), %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotb	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotd	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotq	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	$0, %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vprotw	$0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshab	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshad	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshaw	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlb	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshld	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlq	%xmm0, (%rax), %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	%xmm0, %xmm1, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	(%rax), %xmm0, %xmm3
+# CHECK-NEXT: 0.50   0.50    -      -      -      -      -      -      -      -     0.50   0.50    -      -     0.50   0.50    -      -      -      -     vpshlw	%xmm0, (%rax), %xmm3
diff --git a/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
new file mode 100644
index 0000000000000000000000000000000000000000..f1a7a47b47a4a7c2739fa966d2955f5e37202790
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/scheduler-queue-usage.s
@@ -0,0 +1,75 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -scheduler-stats < %s | FileCheck %s
+
+vmulps (%rsi), %xmm0, %xmm0
+add  %rsi, %rsi
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      13
+# CHECK-NEXT: Total uOps:        2
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.15
+# CHECK-NEXT: IPC:               0.15
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      10    1.00    *                   vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        addq	%rsi, %rsi
+
+# CHECK:      Schedulers - number of cycles where we saw N instructions issued:
+# CHECK-NEXT: [# issued], [# cycles]
+# CHECK-NEXT:  0,          12  (92.3%)
+# CHECK-NEXT:  2,          1  (7.7%)
+
+# CHECK:      Scheduler's queue usage:
+# CHECK-NEXT: [1] Resource name.
+# CHECK-NEXT: [2] Average number of used buffer entries.
+# CHECK-NEXT: [3] Maximum number of used buffer entries.
+# CHECK-NEXT: [4] Total number of buffer entries.
+
+# CHECK:       [1]            [2]        [3]        [4]
+# CHECK-NEXT: PdEX             0          2          40
+# CHECK-NEXT: PdFPU            0          1          64
+# CHECK-NEXT: PdLoad           0          1          40
+# CHECK-NEXT: PdStore          0          1          24
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     1.00    -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vmulps	(%rsi), %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     addq	%rsi, %rsi
diff --git a/test/tools/llvm-mca/X86/BdVer2/simple-test.s b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
new file mode 100644
index 0000000000000000000000000000000000000000..562bfbb0c07148a3a3c5b5cbe53e9eb3d9bae90c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/simple-test.s
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=100 < %s | FileCheck %s
+
+add %edi, %eax
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      100
+# CHECK-NEXT: Total Cycles:      103
+# CHECK-NEXT: Total uOps:        100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.97
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      1     0.50                        addl	%edi, %eax
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     addl	%edi, %eax
diff --git a/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
new file mode 100644
index 0000000000000000000000000000000000000000..9ab4ab0baeb4d29bdd243c1a0674dad698900b09
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/vbroadcast-operand-latency.s
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+leaq 8(%rsp, %rdi, 2), %rax
+vbroadcastss (%rax), %ymm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      206
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      1     0.50                        leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT:  2      6     2.00    *                   vbroadcastss	(%rax), %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -     1.00    -      -      -     0.50   0.50    -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT:  -     1.00    -      -      -      -      -      -     2.00   2.00    -      -      -      -     1.00   1.00    -      -      -      -     vbroadcastss	(%rax), %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    . .   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [0,1]     DeeeeeeER . .   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [1,0]     .DeE----R . .   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [1,1]     .DeeeeeeER. .   vbroadcastss	(%rax), %ymm0
+# CHECK-NEXT: [2,0]     . DeE----R. .   leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: [2,1]     . D==eeeeeeER   vbroadcastss	(%rax), %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.0    1.0    2.7       leaq	8(%rsp,%rdi,2), %rax
+# CHECK-NEXT: 1.     3     1.7    0.7    0.0       vbroadcastss	(%rax), %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..70868928d17424f11737a7787ef9823bd94e402f
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-1.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vaddps %xmm0, %xmm0, %xmm1
+vandps (%rdi), %xmm1, %xmm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total uOps:        2
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.20
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      5     1.00                        vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT:  1      7     0.50    *                   vandps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER .   vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DeeeeeeeER   vandps	(%rdi), %xmm1, %xmm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vandps	(%rdi), %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..ef72be2cbdefbd30f215a317a1458d69ad5e8685
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/vec-logic-read-after-ld-2.s
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s
+
+vaddps %ymm0, %ymm0, %ymm1
+vandps (%rdi), %ymm1, %ymm2
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      2
+# CHECK-NEXT: Total Cycles:      10
+# CHECK-NEXT: Total uOps:        4
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      7     1.00    *                   vandps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeeeeeeER   vandps	(%rdi), %ymm1, %ymm2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       vandps	(%rdi), %ymm1, %ymm2
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
new file mode 100644
index 0000000000000000000000000000000000000000..678e6938bcebc1ed86a116a20010ccd2566d277e
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-1.s
@@ -0,0 +1,101 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+  vmulps  %ymm0, %ymm1, %ymm2
+  vfrczpd %xmm1, %xmm2
+  vmulps  %ymm2, %ymm3, %ymm4
+  vaddps  %ymm4, %ymm5, %ymm6
+  vmulps  %ymm6, %ymm3, %ymm4
+  vaddps  %ymm4, %ymm5, %ymm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      717
+# CHECK-NEXT: Total uOps:        1200
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               0.84
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      10    1.00                        vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.04   4.96    -      -      -     1.00   4.00   7.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -     1.00    -     1.00    -      -      -      -     vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.02   0.98    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    .   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     D==eeeeeeeeeeER.    .    .    .    .    .   vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: [0,2]     .D===========eeeeeER.    .    .    .    .   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [0,3]     .D================eeeeeER.    .    .    .   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [0,4]     . D====================eeeeeER.    .    .   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [0,5]     . D=========================eeeeeER.    .   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [1,0]     .  D=============================eeeeeER.   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .  DeeeeeeeeeeE------------------------R.   vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .   D==========eeeeeE------------------R.   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [1,3]     .   D===============eeeeeE-------------R.   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [1,4]     .    D===================eeeeeE---------R   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [1,5]     .    D========================eeeeeE----R   vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     15.5   0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     2     2.0    2.0    12.0      vfrczpd	%xmm1, %xmm2
+# CHECK-NEXT: 2.     2     11.5   0.5    9.0       vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: 3.     2     16.5   0.0    6.5       vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: 4.     2     20.5   0.0    4.5       vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: 5.     2     25.5   0.0    2.0       vaddps	%ymm4, %ymm5, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..c864c545f998753dcf7ed8dbcab704da4a42b120
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/xop-super-registers-2.s
@@ -0,0 +1,101 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=2 < %s | FileCheck %s
+
+  vmulps     %ymm0, %ymm1, %ymm2
+  vpermil2pd $16, %xmm3, %xmm5, %xmm1, %xmm2
+  vmulps     %ymm2, %ymm3, %ymm4
+  vaddps     %ymm4, %ymm5, %ymm6
+  vmulps     %ymm6, %ymm3, %ymm4
+  vaddps     %ymm4, %ymm5, %ymm0
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      653
+# CHECK-NEXT: Total uOps:        1100
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.68
+# CHECK-NEXT: IPC:               0.92
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  1      3     2.00                        vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  2      5     2.00                        vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     4.54   4.46    -      -      -      -     4.99   6.01    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -     0.99   0.01    -      -      -      -     vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -     2.00    -      -      -      -     vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeeE--R  .    .    .    .    . .   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: [0,2]     .D==eeeeeER    .    .    .    . .   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [0,3]     .D=======eeeeeER    .    .    . .   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [0,4]     . D============eeeeeER   .    . .   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [0,5]     . D=================eeeeeER   . .   vaddps	%ymm4, %ymm5, %ymm0
+# CHECK-NEXT: [1,0]     .  D=====================eeeeeER.   vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .  D=eeeE----------------------R.   vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: [1,2]     .   D=====eeeeeE---------------R.   vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: [1,3]     .   D===========eeeeeE---------R.   vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: [1,4]     .    D===============eeeeeE-----R   vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: [1,5]     .    D====================eeeeeER   vaddps	%ymm4, %ymm5, %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       vmulps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     2     1.5    1.5    12.0      vpermil2pd	$16, %xmm3, %xmm5, %xmm1, %xmm2
+# CHECK-NEXT: 2.     2     4.5    1.0    7.5       vmulps	%ymm2, %ymm3, %ymm4
+# CHECK-NEXT: 3.     2     10.0   0.5    4.5       vaddps	%ymm4, %ymm5, %ymm6
+# CHECK-NEXT: 4.     2     14.5   0.5    2.5       vmulps	%ymm6, %ymm3, %ymm4
+# CHECK-NEXT: 5.     2     19.5   0.0    0.0       vaddps	%ymm4, %ymm5, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
new file mode 100644
index 0000000000000000000000000000000000000000..b98f36f325888195c1572bd65797abb2df288336
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms-avx-256.s
@@ -0,0 +1,429 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -timeline-max-iterations=3 < %s | FileCheck %s
+
+# TODO: Fix the processor resource usage for zero-idiom YMM XOR instructions.
+#       Those vector XOR instructions should only consume 1cy of JFPU1 (instead
+#       of 2cy).
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-1
+
+vaddps %ymm0, %ymm0, %ymm1
+vxorps %ymm1, %ymm1, %ymm1
+vblendps $2, %ymm1, %ymm2, %ymm3
+
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-2
+
+vaddpd %ymm0, %ymm0, %ymm1
+vxorpd %ymm1, %ymm1, %ymm1
+vblendpd $2, %ymm1, %ymm2, %ymm3
+
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-3
+vaddps %ymm0, %ymm1, %ymm2
+vandnps %ymm2, %ymm2, %ymm3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-4
+vaddps %ymm0, %ymm1, %ymm2
+vandnps %ymm2, %ymm2, %ymm3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ZERO-IDIOM-5
+vperm2f128 $136, %ymm0, %ymm0, %ymm1
+vaddps  %ymm1, %ymm1, %ymm0
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - ZERO-IDIOM-1
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      305
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.97
+# CHECK-NEXT: IPC:               0.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.52   2.48    -      -      -      -     3.00   3.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -     vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.02   0.98    -      -      -      -     0.02   1.98    -      -      -      -     vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  . .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeE---R  . .   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2]     .D=eeE-R  . .   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER. .   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     . D==eeE-R. .   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2]     . D====eeER .   vblendps	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .  D=eeeeeER.   vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .  D===eeE-R.   vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2]     .   D====eeER   vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddps	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     2.7    2.7    1.7       vxorps	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2.     3     4.0    0.0    0.3       vblendps	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      [1] Code Region - ZERO-IDIOM-2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      300
+# CHECK-NEXT: Total Cycles:      305
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.97
+# CHECK-NEXT: IPC:               0.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  2      2     1.00                        vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     2.52   2.48    -      -      -      -     3.00   3.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -     0.98   1.02    -      -      -      -     vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.02   0.98    -      -      -      -     0.02   1.98    -      -      -      -     vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  . .   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     DeeE---R  . .   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [0,2]     .D=eeE-R  . .   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER. .   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     . D==eeE-R. .   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [1,2]     . D====eeER .   vblendpd	$2, %ymm1, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     .  D=eeeeeER.   vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .  D===eeE-R.   vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: [2,2]     .   D====eeER   vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     1.7    1.7    0.0       vaddpd	%ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     2.7    2.7    1.7       vxorpd	%ymm1, %ymm1, %ymm1
+# CHECK-NEXT: 2.     3     4.0    0.0    0.3       vblendpd	$2, %ymm1, %ymm2, %ymm3
+
+# CHECK:      [2] Code Region - ZERO-IDIOM-3
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      206
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      2     1.00                        vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   2.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -     2.00    -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeE---R  ..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .D=eeE---R..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     . D==eeeeeER   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [2,1]     . D==eeE---R   vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     2.0    2.0    0.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     3     2.0    2.0    3.0       vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      [3] Code Region - ZERO-IDIOM-4
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      206
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.94
+# CHECK-NEXT: IPC:               0.97
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      2     1.00                        vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.50   1.50    -      -      -      -     2.00   2.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -     2.00    -      -      -      -     vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [0,1]     DeeE---R  ..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [1,0]     .D=eeeeeER..   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [1,1]     .D=eeE---R..   vandnps	%ymm2, %ymm2, %ymm3
+# CHECK-NEXT: [2,0]     . D==eeeeeER   vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: [2,1]     . D==eeE---R   vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     2.0    2.0    0.0       vaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1.     3     2.0    2.0    3.0       vandnps	%ymm2, %ymm2, %ymm3
+
+# CHECK:      [4] Code Region - ZERO-IDIOM-5
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      200
+# CHECK-NEXT: Total Cycles:      903
+# CHECK-NEXT: Total uOps:        1000
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    1.11
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  8      4     0.50                        vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  2      5     2.00                        vaddps	%ymm1, %ymm1, %ymm0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00    -      -      -      -     2.00   1.00    -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -     1.00    -      -      -      -     vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     2.00    -      -      -      -      -     vaddps	%ymm1, %ymm1, %ymm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [0,1]     . D==eeeeeER   .    .    .   .   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [1,0]     .  D======eeeeER    .    .   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [1,1]     .    D========eeeeeER    .   .   vaddps	%ymm1, %ymm1, %ymm0
+# CHECK-NEXT: [2,0]     .    .D============eeeeER.   .   vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: [2,1]     .    .  D==============eeeeeER   vaddps	%ymm1, %ymm1, %ymm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     7.0    0.3    0.0       vperm2f128	$136, %ymm0, %ymm0, %ymm1
+# CHECK-NEXT: 1.     3     9.0    0.0    0.0       vaddps	%ymm1, %ymm1, %ymm0
diff --git a/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
new file mode 100644
index 0000000000000000000000000000000000000000..3f9c4dbb8f5977895fb82bb5bb7d564a9610337c
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BdVer2/zero-idioms.s
@@ -0,0 +1,449 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s
+
+subl  %eax, %eax
+subq  %rax, %rax
+xorl  %eax, %eax
+xorq  %rax, %rax
+
+pcmpgtb   %mm2, %mm2
+pcmpgtd   %mm2, %mm2
+# pcmpgtq   %mm2, %mm2 # invalid operand for instruction
+pcmpgtw   %mm2, %mm2
+
+pcmpgtb   %xmm2, %xmm2
+pcmpgtd   %xmm2, %xmm2
+pcmpgtq   %xmm2, %xmm2
+pcmpgtw   %xmm2, %xmm2
+
+vpcmpgtb  %xmm3, %xmm3, %xmm3
+vpcmpgtd  %xmm3, %xmm3, %xmm3
+vpcmpgtq  %xmm3, %xmm3, %xmm3
+vpcmpgtw  %xmm3, %xmm3, %xmm3
+
+vpcmpgtb  %xmm3, %xmm3, %xmm5
+vpcmpgtd  %xmm3, %xmm3, %xmm5
+vpcmpgtq  %xmm3, %xmm3, %xmm5
+vpcmpgtw  %xmm3, %xmm3, %xmm5
+
+psubb   %mm2, %mm2
+psubd   %mm2, %mm2
+psubq   %mm2, %mm2
+psubw   %mm2, %mm2
+psubb   %xmm2, %xmm2
+psubd   %xmm2, %xmm2
+psubq   %xmm2, %xmm2
+psubw   %xmm2, %xmm2
+vpsubb  %xmm3, %xmm3, %xmm3
+vpsubd  %xmm3, %xmm3, %xmm3
+vpsubq  %xmm3, %xmm3, %xmm3
+vpsubw  %xmm3, %xmm3, %xmm3
+
+vpsubb  %xmm3, %xmm3, %xmm5
+vpsubd  %xmm3, %xmm3, %xmm5
+vpsubq  %xmm3, %xmm3, %xmm5
+vpsubw  %xmm3, %xmm3, %xmm5
+
+psubsb   %mm2, %mm2
+psubsw   %mm2, %mm2
+psubsb   %xmm2, %xmm2
+psubsw   %xmm2, %xmm2
+vpsubsb  %xmm3, %xmm3, %xmm3
+vpsubsw  %xmm3, %xmm3, %xmm3
+
+vpsubsb  %xmm3, %xmm3, %xmm5
+vpsubsw  %xmm3, %xmm3, %xmm5
+
+psubusb   %mm2, %mm2
+psubusw   %mm2, %mm2
+psubusb   %xmm2, %xmm2
+psubusw   %xmm2, %xmm2
+vpsubusb  %xmm3, %xmm3, %xmm3
+vpsubusw  %xmm3, %xmm3, %xmm3
+
+vpsubsb  %xmm3, %xmm3, %xmm5
+vpsubsw  %xmm3, %xmm3, %xmm5
+
+andnps  %xmm0, %xmm0
+andnpd  %xmm1, %xmm1
+vandnps %xmm2, %xmm2, %xmm2
+vandnpd %xmm1, %xmm1, %xmm1
+pandn   %mm2, %mm2
+pandn   %xmm2, %xmm2
+vpandn  %xmm3, %xmm3, %xmm3
+
+vandnps %xmm2, %xmm2, %xmm5
+vandnpd %xmm1, %xmm1, %xmm5
+vpandn  %xmm3, %xmm3, %xmm5
+
+xorps  %xmm0, %xmm0
+xorpd  %xmm1, %xmm1
+vxorps %xmm2, %xmm2, %xmm2
+vxorpd %xmm1, %xmm1, %xmm1
+pxor   %mm2, %mm2
+pxor   %xmm2, %xmm2
+vpxor  %xmm3, %xmm3, %xmm3
+
+vxorps %xmm4, %xmm4, %xmm5
+vxorpd %xmm1, %xmm1, %xmm3
+vpxor  %xmm3, %xmm3, %xmm5
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      71
+# CHECK-NEXT: Total Cycles:      26
+# CHECK-NEXT: Total uOps:        71
+
+# CHECK:      Dispatch Width:    4
+# CHECK-NEXT: uOps Per Cycle:    2.73
+# CHECK-NEXT: IPC:               2.73
+# CHECK-NEXT: Block RThroughput: 17.8
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.25                        subl	%eax, %eax
+# CHECK-NEXT:  1      0     0.25                        subq	%rax, %rax
+# CHECK-NEXT:  1      0     0.25                        xorl	%eax, %eax
+# CHECK-NEXT:  1      0     0.25                        xorq	%rax, %rax
+# CHECK-NEXT:  1      0     0.25                        pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        psubb	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubd	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubq	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubw	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        psubb	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        psubd	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        psubq	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        psubw	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        psubsb	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        psubusb	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%mm2, %mm2
+# CHECK-NEXT:  1      2     0.50                        psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      2     0.50                        vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      2     0.50                        vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        andnps	%xmm0, %xmm0
+# CHECK-NEXT:  1      0     0.25                        andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        pandn	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pandn	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  1      0     0.25                        xorps	%xmm0, %xmm0
+# CHECK-NEXT:  1      0     0.25                        xorpd	%xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  1      0     0.25                        pxor	%mm2, %mm2
+# CHECK-NEXT:  1      0     0.25                        pxor	%xmm2, %xmm2
+# CHECK-NEXT:  1      0     0.25                        vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT:  1      0     0.25                        vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT:  1      0     0.25                        vpxor	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    1
+# CHECK-NEXT: Max number of mappings used:         1
+
+# CHECK:      *  Register File #1 -- PdFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     160
+# CHECK-NEXT:    Total number of mappings created: 1
+# CHECK-NEXT:    Max number of mappings used:      1
+
+# CHECK:      *  Register File #2 -- PdIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     96
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0.0] - PdAGLU01
+# CHECK-NEXT: [0.1] - PdAGLU01
+# CHECK-NEXT: [1]   - PdBranch
+# CHECK-NEXT: [2]   - PdCount
+# CHECK-NEXT: [3]   - PdDiv
+# CHECK-NEXT: [4]   - PdEX0
+# CHECK-NEXT: [5]   - PdEX1
+# CHECK-NEXT: [6]   - PdFPCVT
+# CHECK-NEXT: [7.0] - PdFPFMA
+# CHECK-NEXT: [7.1] - PdFPFMA
+# CHECK-NEXT: [8.0] - PdFPMAL
+# CHECK-NEXT: [8.1] - PdFPMAL
+# CHECK-NEXT: [9]   - PdFPMMA
+# CHECK-NEXT: [10]  - PdFPSTO
+# CHECK-NEXT: [11]  - PdFPU0
+# CHECK-NEXT: [12]  - PdFPU1
+# CHECK-NEXT: [13]  - PdFPU2
+# CHECK-NEXT: [14]  - PdFPU3
+# CHECK-NEXT: [15]  - PdFPXBR
+# CHECK-NEXT: [16]  - PdMul
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     8.00   11.00   -      -     9.00   10.00   -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0.0]  [0.1]  [1]    [2]    [3]    [4]    [5]    [6]    [7.0]  [7.1]  [8.0]  [8.1]  [9]    [10]   [11]   [12]   [13]   [14]   [15]   [16]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     subq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubd	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubq	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     psubw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     psubsb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     psubsw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     psubsb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     psubsw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     psubusb	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     psubusw	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     psubusb	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     psubusw	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -     vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -     vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -     1.00    -      -      -      -      -     vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -     1.00    -      -      -      -     vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     andnpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pandn	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorpd	%xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm2, %mm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%xmm2, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpxor	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DR   .    .    .    .    .   subl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .    .    .    .    .   subq	%rax, %rax
+# CHECK-NEXT: [0,2]     DR   .    .    .    .    .   xorl	%eax, %eax
+# CHECK-NEXT: [0,3]     DR   .    .    .    .    .   xorq	%rax, %rax
+# CHECK-NEXT: [0,4]     .DR  .    .    .    .    .   pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: [0,5]     .DR  .    .    .    .    .   pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: [0,6]     .DR  .    .    .    .    .   pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: [0,7]     .DR  .    .    .    .    .   pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: [0,8]     . DR .    .    .    .    .   pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: [0,9]     . DeeER   .    .    .    .   pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: [0,10]    . D---R   .    .    .    .   pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: [0,11]    . D---R   .    .    .    .   vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,12]    .  D--R   .    .    .    .   vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,13]    .  DeeER  .    .    .    .   vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,14]    .  D---R  .    .    .    .   vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,15]    .  D---R  .    .    .    .   vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,16]    .   D--R  .    .    .    .   vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,17]    .   DeeER .    .    .    .   vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,18]    .   D---R .    .    .    .   vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,19]    .   D---R .    .    .    .   psubb	%mm2, %mm2
+# CHECK-NEXT: [0,20]    .    D--R .    .    .    .   psubd	%mm2, %mm2
+# CHECK-NEXT: [0,21]    .    D---R.    .    .    .   psubq	%mm2, %mm2
+# CHECK-NEXT: [0,22]    .    D---R.    .    .    .   psubw	%mm2, %mm2
+# CHECK-NEXT: [0,23]    .    D---R.    .    .    .   psubb	%xmm2, %xmm2
+# CHECK-NEXT: [0,24]    .    .D--R.    .    .    .   psubd	%xmm2, %xmm2
+# CHECK-NEXT: [0,25]    .    .D---R    .    .    .   psubq	%xmm2, %xmm2
+# CHECK-NEXT: [0,26]    .    .D---R    .    .    .   psubw	%xmm2, %xmm2
+# CHECK-NEXT: [0,27]    .    .D---R    .    .    .   vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,28]    .    . D--R    .    .    .   vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,29]    .    . D---R   .    .    .   vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,30]    .    . D---R   .    .    .   vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,31]    .    . D---R   .    .    .   vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,32]    .    .  D--R   .    .    .   vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,33]    .    .  D---R  .    .    .   vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,34]    .    .  D---R  .    .    .   vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,35]    .    .  DeeER  .    .    .   psubsb	%mm2, %mm2
+# CHECK-NEXT: [0,36]    .    .   DeeER .    .    .   psubsw	%mm2, %mm2
+# CHECK-NEXT: [0,37]    .    .   DeeER .    .    .   psubsb	%xmm2, %xmm2
+# CHECK-NEXT: [0,38]    .    .   D=eeER.    .    .   psubsw	%xmm2, %xmm2
+# CHECK-NEXT: [0,39]    .    .   D=eeER.    .    .   vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,40]    .    .    D=eeER    .    .   vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,41]    .    .    D=eeER    .    .   vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,42]    .    .    D==eeER   .    .   vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,43]    .    .    D==eeER   .    .   psubusb	%mm2, %mm2
+# CHECK-NEXT: [0,44]    .    .    .D==eeER  .    .   psubusw	%mm2, %mm2
+# CHECK-NEXT: [0,45]    .    .    .D==eeER  .    .   psubusb	%xmm2, %xmm2
+# CHECK-NEXT: [0,46]    .    .    .D===eeER .    .   psubusw	%xmm2, %xmm2
+# CHECK-NEXT: [0,47]    .    .    .D===eeER .    .   vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,48]    .    .    . D===eeER.    .   vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,49]    .    .    . D===eeER.    .   vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,50]    .    .    . D====eeER    .   vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,51]    .    .    . D-------R    .   andnps	%xmm0, %xmm0
+# CHECK-NEXT: [0,52]    .    .    .  D------R    .   andnpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,53]    .    .    .  D------R    .   vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,54]    .    .    .  D-------R   .   vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,55]    .    .    .  D-------R   .   pandn	%mm2, %mm2
+# CHECK-NEXT: [0,56]    .    .    .   D------R   .   pandn	%xmm2, %xmm2
+# CHECK-NEXT: [0,57]    .    .    .   D------R   .   vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,58]    .    .    .   D-------R  .   vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: [0,59]    .    .    .   D-------R  .   vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: [0,60]    .    .    .    D------R  .   vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,61]    .    .    .    D------R  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,62]    .    .    .    D-------R .   xorpd	%xmm1, %xmm1
+# CHECK-NEXT: [0,63]    .    .    .    D-------R .   vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: [0,64]    .    .    .    .D------R .   vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: [0,65]    .    .    .    .D------R .   pxor	%mm2, %mm2
+# CHECK-NEXT: [0,66]    .    .    .    .D-------R.   pxor	%xmm2, %xmm2
+# CHECK-NEXT: [0,67]    .    .    .    .D-------R.   vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,68]    .    .    .    . D------R.   vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: [0,69]    .    .    .    . D------R.   vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: [0,70]    .    .    .    . D-------R   vpxor	%xmm3, %xmm3, %xmm5
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     0.0    0.0    0.0       subl	%eax, %eax
+# CHECK-NEXT: 1.     1     0.0    0.0    0.0       subq	%rax, %rax
+# CHECK-NEXT: 2.     1     0.0    0.0    0.0       xorl	%eax, %eax
+# CHECK-NEXT: 3.     1     0.0    0.0    0.0       xorq	%rax, %rax
+# CHECK-NEXT: 4.     1     0.0    0.0    0.0       pcmpgtb	%mm2, %mm2
+# CHECK-NEXT: 5.     1     0.0    0.0    0.0       pcmpgtd	%mm2, %mm2
+# CHECK-NEXT: 6.     1     0.0    0.0    0.0       pcmpgtw	%mm2, %mm2
+# CHECK-NEXT: 7.     1     0.0    0.0    0.0       pcmpgtb	%xmm2, %xmm2
+# CHECK-NEXT: 8.     1     0.0    0.0    0.0       pcmpgtd	%xmm2, %xmm2
+# CHECK-NEXT: 9.     1     1.0    1.0    0.0       pcmpgtq	%xmm2, %xmm2
+# CHECK-NEXT: 10.    1     0.0    0.0    3.0       pcmpgtw	%xmm2, %xmm2
+# CHECK-NEXT: 11.    1     0.0    0.0    3.0       vpcmpgtb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 12.    1     0.0    0.0    2.0       vpcmpgtd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 13.    1     1.0    1.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 14.    1     0.0    0.0    3.0       vpcmpgtw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 15.    1     0.0    0.0    3.0       vpcmpgtb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 16.    1     0.0    0.0    2.0       vpcmpgtd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 17.    1     1.0    1.0    0.0       vpcmpgtq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 18.    1     0.0    0.0    3.0       vpcmpgtw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 19.    1     0.0    0.0    3.0       psubb	%mm2, %mm2
+# CHECK-NEXT: 20.    1     0.0    0.0    2.0       psubd	%mm2, %mm2
+# CHECK-NEXT: 21.    1     0.0    0.0    3.0       psubq	%mm2, %mm2
+# CHECK-NEXT: 22.    1     0.0    0.0    3.0       psubw	%mm2, %mm2
+# CHECK-NEXT: 23.    1     0.0    0.0    3.0       psubb	%xmm2, %xmm2
+# CHECK-NEXT: 24.    1     0.0    0.0    2.0       psubd	%xmm2, %xmm2
+# CHECK-NEXT: 25.    1     0.0    0.0    3.0       psubq	%xmm2, %xmm2
+# CHECK-NEXT: 26.    1     0.0    0.0    3.0       psubw	%xmm2, %xmm2
+# CHECK-NEXT: 27.    1     0.0    0.0    3.0       vpsubb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 28.    1     0.0    0.0    2.0       vpsubd	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 29.    1     0.0    0.0    3.0       vpsubq	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 30.    1     0.0    0.0    3.0       vpsubw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 31.    1     0.0    0.0    3.0       vpsubb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 32.    1     0.0    0.0    2.0       vpsubd	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 33.    1     0.0    0.0    3.0       vpsubq	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 34.    1     0.0    0.0    3.0       vpsubw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 35.    1     1.0    1.0    0.0       psubsb	%mm2, %mm2
+# CHECK-NEXT: 36.    1     1.0    1.0    0.0       psubsw	%mm2, %mm2
+# CHECK-NEXT: 37.    1     1.0    1.0    0.0       psubsb	%xmm2, %xmm2
+# CHECK-NEXT: 38.    1     2.0    2.0    0.0       psubsw	%xmm2, %xmm2
+# CHECK-NEXT: 39.    1     2.0    2.0    0.0       vpsubsb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 40.    1     2.0    2.0    0.0       vpsubsw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 41.    1     2.0    2.0    0.0       vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 42.    1     3.0    3.0    0.0       vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 43.    1     3.0    3.0    0.0       psubusb	%mm2, %mm2
+# CHECK-NEXT: 44.    1     3.0    3.0    0.0       psubusw	%mm2, %mm2
+# CHECK-NEXT: 45.    1     3.0    3.0    0.0       psubusb	%xmm2, %xmm2
+# CHECK-NEXT: 46.    1     4.0    4.0    0.0       psubusw	%xmm2, %xmm2
+# CHECK-NEXT: 47.    1     4.0    4.0    0.0       vpsubusb	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 48.    1     4.0    4.0    0.0       vpsubusw	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 49.    1     4.0    4.0    0.0       vpsubsb	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 50.    1     5.0    5.0    0.0       vpsubsw	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 51.    1     0.0    0.0    7.0       andnps	%xmm0, %xmm0
+# CHECK-NEXT: 52.    1     0.0    0.0    6.0       andnpd	%xmm1, %xmm1
+# CHECK-NEXT: 53.    1     0.0    0.0    6.0       vandnps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 54.    1     0.0    0.0    7.0       vandnpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 55.    1     0.0    0.0    7.0       pandn	%mm2, %mm2
+# CHECK-NEXT: 56.    1     0.0    0.0    6.0       pandn	%xmm2, %xmm2
+# CHECK-NEXT: 57.    1     0.0    0.0    6.0       vpandn	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 58.    1     0.0    0.0    7.0       vandnps	%xmm2, %xmm2, %xmm5
+# CHECK-NEXT: 59.    1     0.0    0.0    7.0       vandnpd	%xmm1, %xmm1, %xmm5
+# CHECK-NEXT: 60.    1     0.0    0.0    6.0       vpandn	%xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 61.    1     0.0    0.0    6.0       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 62.    1     0.0    0.0    7.0       xorpd	%xmm1, %xmm1
+# CHECK-NEXT: 63.    1     0.0    0.0    7.0       vxorps	%xmm2, %xmm2, %xmm2
+# CHECK-NEXT: 64.    1     0.0    0.0    6.0       vxorpd	%xmm1, %xmm1, %xmm1
+# CHECK-NEXT: 65.    1     0.0    0.0    6.0       pxor	%mm2, %mm2
+# CHECK-NEXT: 66.    1     0.0    0.0    7.0       pxor	%xmm2, %xmm2
+# CHECK-NEXT: 67.    1     0.0    0.0    7.0       vpxor	%xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 68.    1     0.0    0.0    6.0       vxorps	%xmm4, %xmm4, %xmm5
+# CHECK-NEXT: 69.    1     0.0    0.0    6.0       vxorpd	%xmm1, %xmm1, %xmm3
+# CHECK-NEXT: 70.    1     0.0    0.0    7.0       vpxor	%xmm3, %xmm3, %xmm5
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
index d2588bef30e2ff8af96b768e69f082c9472e920c..0c27d2cdac3d743f029339cf8ef1deb9a8c6f54e 100644
--- a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-1.s
@@ -32,13 +32,17 @@ vaddps %xmm1, %xmm1, %xmm2
 # CHECK-NEXT:  1      3     1.00                        vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Register File statistics:
-# CHECK-NEXT: Total number of mappings created:    6
-# CHECK-NEXT: Max number of mappings used:         5
+# CHECK-NEXT: Total number of mappings created:    3
+# CHECK-NEXT: Max number of mappings used:         3
 
 # CHECK:      *  Register File #1 -- JFpuPRF:
 # CHECK-NEXT:    Number of physical registers:     72
-# CHECK-NEXT:    Total number of mappings created: 6
-# CHECK-NEXT:    Max number of mappings used:      5
+# CHECK-NEXT:    Total number of mappings created: 3
+# CHECK-NEXT:    Max number of mappings used:      3
+# CHECK-NEXT:    Number of optimizable moves:      3
+# CHECK-NEXT:    Number of moves eliminated:       3  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             3  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   1
 
 # CHECK:      *  Register File #2 -- JIntegerPRF:
 # CHECK-NEXT:    Number of physical registers:     64
@@ -63,25 +67,25 @@ vaddps %xmm1, %xmm1, %xmm2
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
-# CHECK-NEXT:  -      -      -     1.00   1.00   1.00   1.00    -      -      -      -      -      -      -
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT:  -      -      -      -     1.00    -     1.00    -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
 # CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -      -      -      -      -     vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DR   .   .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [0,1]     DeER .   .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,1]     DR   .   .   vmovaps	%xmm0, %xmm1
 # CHECK-NEXT: [0,2]     .DeeeER  .   vaddps	%xmm1, %xmm1, %xmm2
 # CHECK-NEXT: [1,0]     .D----R  .   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [1,1]     . DeE--R .   vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: [1,2]     . D=eeeER.   vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     . D----R .   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     . DeeeER .   vaddps	%xmm1, %xmm1, %xmm2
 # CHECK-NEXT: [2,0]     .  D----R.   vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: [2,1]     .  DeE---R   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,1]     .  D----R.   vmovaps	%xmm0, %xmm1
 # CHECK-NEXT: [2,2]     .   DeeeER   vaddps	%xmm1, %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
@@ -92,5 +96,5 @@ vaddps %xmm1, %xmm1, %xmm2
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     3     0.0    0.0    2.7       vxorps	%xmm0, %xmm0, %xmm0
-# CHECK-NEXT: 1.     3     1.0    1.0    1.7       vmovaps	%xmm0, %xmm1
-# CHECK-NEXT: 2.     3     1.3    0.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 1.     3     0.0    0.0    2.7       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     1.0    1.0    0.0       vaddps	%xmm1, %xmm1, %xmm2
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
new file mode 100644
index 0000000000000000000000000000000000000000..08465f907eec043b32aab6a1c206eb2edd7200e0
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-2.s
@@ -0,0 +1,141 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+pxor %mm0, %mm0
+movq %mm0, %mm1
+
+xorps %xmm0, %xmm0
+movaps %xmm0, %xmm1
+movups %xmm1, %xmm2
+movapd %xmm2, %xmm3
+movupd %xmm3, %xmm4
+movdqa %xmm4, %xmm5
+movdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      27
+# CHECK-NEXT: Total Cycles:      15
+# CHECK-NEXT: Total uOps:        27
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.80
+# CHECK-NEXT: IPC:               1.80
+# CHECK-NEXT: Block RThroughput: 4.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        pxor	%mm0, %mm0
+# CHECK-NEXT:  1      1     0.50                        movq	%mm0, %mm1
+# CHECK-NEXT:  1      0     0.50                        xorps	%xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        movaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        movups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        movapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        movupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      1     0.50                        movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      1     0.50                        movdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      21
+# CHECK-NEXT:    Number of moves eliminated:       21  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             21  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     pxor	%mm0, %mm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%mm0, %mm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorps	%xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    .   .   pxor	%mm0, %mm0
+# CHECK-NEXT: [0,1]     DR   .    .   .   movq	%mm0, %mm1
+# CHECK-NEXT: [0,2]     .DR  .    .   .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [0,3]     .DR  .    .   .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,4]     . DR .    .   .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [0,5]     . DR .    .   .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,6]     .  DR.    .   .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,7]     .  DR.    .   .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,8]     .   DR    .   .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .   DR    .   .   pxor	%mm0, %mm0
+# CHECK-NEXT: [1,1]     .    DR   .   .   movq	%mm0, %mm1
+# CHECK-NEXT: [1,2]     .    DR   .   .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [1,3]     .    .DR  .   .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,4]     .    .DR  .   .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [1,5]     .    . DR .   .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,6]     .    . DR .   .   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,7]     .    .  DR.   .   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,8]     .    .  DR.   .   movdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .    .   DR   .   pxor	%mm0, %mm0
+# CHECK-NEXT: [2,1]     .    .   DR   .   movq	%mm0, %mm1
+# CHECK-NEXT: [2,2]     .    .    DR  .   xorps	%xmm0, %xmm0
+# CHECK-NEXT: [2,3]     .    .    DR  .   movaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,4]     .    .    .DR .   movups	%xmm1, %xmm2
+# CHECK-NEXT: [2,5]     .    .    .DR .   movapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,6]     .    .    . DR.   movupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,7]     .    .    . DR.   movdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,8]     .    .    .  DR   movdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       pxor	%mm0, %mm0
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       movq	%mm0, %mm1
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       xorps	%xmm0, %xmm0
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       movaps	%xmm0, %xmm1
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       movups	%xmm1, %xmm2
+# CHECK-NEXT: 5.     3     0.0    0.0    0.0       movapd	%xmm2, %xmm3
+# CHECK-NEXT: 6.     3     0.0    0.0    0.0       movupd	%xmm3, %xmm4
+# CHECK-NEXT: 7.     3     0.0    0.0    0.0       movdqa	%xmm4, %xmm5
+# CHECK-NEXT: 8.     3     0.0    0.0    0.0       movdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
new file mode 100644
index 0000000000000000000000000000000000000000..f3d850fc90aa3ff88787d7738c696442a24482a5
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-3.s
@@ -0,0 +1,126 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+vxorps  %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vmovups %xmm1, %xmm2
+vmovapd %xmm2, %xmm3
+vmovupd %xmm3, %xmm4
+vmovdqa %xmm4, %xmm5
+vmovdqu %xmm5, %xmm0
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      21
+# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total uOps:        21
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.75
+# CHECK-NEXT: IPC:               1.75
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  1      1     0.50                        vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  1      1     0.50                        vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  1      1     0.50                        vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  1      1     0.50                        vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  1      1     0.50                        vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  1      1     0.50                        vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      18
+# CHECK-NEXT:    Number of moves eliminated:       18  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             18  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovaps	%xmm0, %xmm1
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovups	%xmm1, %xmm2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovapd	%xmm2, %xmm3
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovupd	%xmm3, %xmm4
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DR   .    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1]     DR   .    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [0,2]     .DR  .    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [0,3]     .DR  .    ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [0,4]     . DR .    ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [0,5]     . DR .    ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [0,6]     .  DR.    ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [1,0]     .  DR.    ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1]     .   DR    ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [1,2]     .   DR    ..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [1,3]     .    DR   ..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [1,4]     .    DR   ..   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [1,5]     .    .DR  ..   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [1,6]     .    .DR  ..   vmovdqu	%xmm5, %xmm0
+# CHECK-NEXT: [2,0]     .    . DR ..   vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1]     .    . DR ..   vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: [2,2]     .    .  DR..   vmovups	%xmm1, %xmm2
+# CHECK-NEXT: [2,3]     .    .  DR..   vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: [2,4]     .    .   DR.   vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: [2,5]     .    .   DR.   vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: [2,6]     .    .    DR   vmovdqu	%xmm5, %xmm0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       vxorps	%xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       vmovaps	%xmm0, %xmm1
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       vmovups	%xmm1, %xmm2
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       vmovapd	%xmm2, %xmm3
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       vmovupd	%xmm3, %xmm4
+# CHECK-NEXT: 5.     3     0.0    0.0    0.0       vmovdqa	%xmm4, %xmm5
+# CHECK-NEXT: 6.     3     0.0    0.0    0.0       vmovdqu	%xmm5, %xmm0
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
new file mode 100644
index 0000000000000000000000000000000000000000..c2df1baf5c03c579ef901291f9eb89bcb86c7e70
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-4.s
@@ -0,0 +1,111 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %eax, %eax
+mov %eax, %ebx
+mov %ebx, %ecx
+mov %ecx, %edx
+mov %edx, %eax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               1.67
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        xorl	%eax, %eax
+# CHECK-NEXT:  1      1     0.50                        movl	%eax, %ebx
+# CHECK-NEXT:  1      1     0.50                        movl	%ebx, %ecx
+# CHECK-NEXT:  1      1     0.50                        movl	%ecx, %edx
+# CHECK-NEXT:  1      1     0.50                        movl	%edx, %eax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      12
+# CHECK-NEXT:    Number of moves eliminated:       12  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             12  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorl	%eax, %eax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%eax, %ebx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ebx, %ecx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%ecx, %edx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movl	%edx, %eax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DR   .  .   xorl	%eax, %eax
+# CHECK-NEXT: [0,1]     DR   .  .   movl	%eax, %ebx
+# CHECK-NEXT: [0,2]     .DR  .  .   movl	%ebx, %ecx
+# CHECK-NEXT: [0,3]     .DR  .  .   movl	%ecx, %edx
+# CHECK-NEXT: [0,4]     . DR .  .   movl	%edx, %eax
+# CHECK-NEXT: [1,0]     . DR .  .   xorl	%eax, %eax
+# CHECK-NEXT: [1,1]     .  DR.  .   movl	%eax, %ebx
+# CHECK-NEXT: [1,2]     .  DR.  .   movl	%ebx, %ecx
+# CHECK-NEXT: [1,3]     .   DR  .   movl	%ecx, %edx
+# CHECK-NEXT: [1,4]     .   DR  .   movl	%edx, %eax
+# CHECK-NEXT: [2,0]     .    DR .   xorl	%eax, %eax
+# CHECK-NEXT: [2,1]     .    DR .   movl	%eax, %ebx
+# CHECK-NEXT: [2,2]     .    .DR.   movl	%ebx, %ecx
+# CHECK-NEXT: [2,3]     .    .DR.   movl	%ecx, %edx
+# CHECK-NEXT: [2,4]     .    . DR   movl	%edx, %eax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       xorl	%eax, %eax
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       movl	%eax, %ebx
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       movl	%ebx, %ecx
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       movl	%ecx, %edx
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       movl	%edx, %eax
diff --git a/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
new file mode 100644
index 0000000000000000000000000000000000000000..277293e429ba33032c48e414bc1060b2e3afeedc
--- /dev/null
+++ b/test/tools/llvm-mca/X86/BtVer2/reg-move-elimination-5.s
@@ -0,0 +1,111 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
+xor %rax, %rax
+mov %rax, %rbx
+mov %rbx, %rcx
+mov %rcx, %rdx
+mov %rdx, %rax
+
+# CHECK:      Iterations:        3
+# CHECK-NEXT: Instructions:      15
+# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    2
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               1.67
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  1      0     0.50                        xorq	%rax, %rax
+# CHECK-NEXT:  1      1     0.50                        movq	%rax, %rbx
+# CHECK-NEXT:  1      1     0.50                        movq	%rbx, %rcx
+# CHECK-NEXT:  1      1     0.50                        movq	%rcx, %rdx
+# CHECK-NEXT:  1      1     0.50                        movq	%rdx, %rax
+
+# CHECK:      Register File statistics:
+# CHECK-NEXT: Total number of mappings created:    0
+# CHECK-NEXT: Max number of mappings used:         0
+
+# CHECK:      *  Register File #1 -- JFpuPRF:
+# CHECK-NEXT:    Number of physical registers:     72
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+
+# CHECK:      *  Register File #2 -- JIntegerPRF:
+# CHECK-NEXT:    Number of physical registers:     64
+# CHECK-NEXT:    Total number of mappings created: 0
+# CHECK-NEXT:    Max number of mappings used:      0
+# CHECK-NEXT:    Number of optimizable moves:      12
+# CHECK-NEXT:    Number of moves eliminated:       12  (100.0%)
+# CHECK-NEXT:    Number of zero moves:             12  (100.0%)
+# CHECK-NEXT:    Max moves eliminated per cycle:   2
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - JALU0
+# CHECK-NEXT: [1]   - JALU1
+# CHECK-NEXT: [2]   - JDiv
+# CHECK-NEXT: [3]   - JFPA
+# CHECK-NEXT: [4]   - JFPM
+# CHECK-NEXT: [5]   - JFPU0
+# CHECK-NEXT: [6]   - JFPU1
+# CHECK-NEXT: [7]   - JLAGU
+# CHECK-NEXT: [8]   - JMul
+# CHECK-NEXT: [9]   - JSAGU
+# CHECK-NEXT: [10]  - JSTC
+# CHECK-NEXT: [11]  - JVALU0
+# CHECK-NEXT: [12]  - JVALU1
+# CHECK-NEXT: [13]  - JVIMUL
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12]   [13]   Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     xorq	%rax, %rax
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rax, %rbx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rbx, %rcx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rcx, %rdx
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -     movq	%rdx, %rax
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DR   .  .   xorq	%rax, %rax
+# CHECK-NEXT: [0,1]     DR   .  .   movq	%rax, %rbx
+# CHECK-NEXT: [0,2]     .DR  .  .   movq	%rbx, %rcx
+# CHECK-NEXT: [0,3]     .DR  .  .   movq	%rcx, %rdx
+# CHECK-NEXT: [0,4]     . DR .  .   movq	%rdx, %rax
+# CHECK-NEXT: [1,0]     . DR .  .   xorq	%rax, %rax
+# CHECK-NEXT: [1,1]     .  DR.  .   movq	%rax, %rbx
+# CHECK-NEXT: [1,2]     .  DR.  .   movq	%rbx, %rcx
+# CHECK-NEXT: [1,3]     .   DR  .   movq	%rcx, %rdx
+# CHECK-NEXT: [1,4]     .   DR  .   movq	%rdx, %rax
+# CHECK-NEXT: [2,0]     .    DR .   xorq	%rax, %rax
+# CHECK-NEXT: [2,1]     .    DR .   movq	%rax, %rbx
+# CHECK-NEXT: [2,2]     .    .DR.   movq	%rbx, %rcx
+# CHECK-NEXT: [2,3]     .    .DR.   movq	%rcx, %rdx
+# CHECK-NEXT: [2,4]     .    . DR   movq	%rdx, %rax
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     3     0.0    0.0    0.0       xorq	%rax, %rax
+# CHECK-NEXT: 1.     3     0.0    0.0    0.0       movq	%rax, %rbx
+# CHECK-NEXT: 2.     3     0.0    0.0    0.0       movq	%rbx, %rcx
+# CHECK-NEXT: 3.     3     0.0    0.0    0.0       movq	%rcx, %rdx
+# CHECK-NEXT: 4.     3     0.0    0.0    0.0       movq	%rdx, %rax
diff --git a/test/tools/llvm-mca/X86/bextr-read-after-ld.s b/test/tools/llvm-mca/X86/bextr-read-after-ld.s
index 4e4e23231b926e5eb7f60ed52391058a71fdda79..4c1c38f2d10e7dcfc5edeb8ba8c1e1cf3d8b3ee8 100644
--- a/test/tools/llvm-mca/X86/bextr-read-after-ld.s
+++ b/test/tools/llvm-mca/X86/bextr-read-after-ld.s
@@ -2,6 +2,7 @@
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 
@@ -11,6 +12,9 @@ bextrl	%esi, (%rdi), %eax
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      9
+# BDVER2-NEXT:  Total uOps:        3
+
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
 
@@ -26,6 +30,11 @@ bextrl	%esi, (%rdi), %eax
 # ZNVER1-NEXT:  Total Cycles:      8
 # ZNVER1-NEXT:  Total uOps:        3
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.33
+# BDVER2-NEXT:  IPC:               0.22
+# BDVER2-NEXT:  Block RThroughput: 1.0
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
 # BDWELL-NEXT:  IPC:               0.20
@@ -61,6 +70,9 @@ bextrl	%esi, (%rdi), %eax
 
 # ALL:          [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 
+# BDVER2-NEXT:   1      1     0.50                        addl	%edi, %esi
+# BDVER2-NEXT:   2      6     0.50    *                   bextrl	%esi, (%rdi), %eax
+
 # BDWELL-NEXT:   1      1     0.25                        addl	%edi, %esi
 # BDWELL-NEXT:   3      7     0.50    *                   bextrl	%esi, (%rdi), %eax
 
@@ -78,12 +90,16 @@ bextrl	%esi, (%rdi), %eax
 
 # ALL:          Timeline view:
 
+# BDVER2-NEXT:  Index     012345678
 # BDWELL-NEXT:  Index     0123456789
 # BTVER2-NEXT:  Index     0123456
 # HASWELL-NEXT: Index     0123456789
 # SKYLAKE-NEXT: Index     0123456789
 # ZNVER1-NEXT:  Index     01234567
 
+# BDVER2:       [0,0]     DeER .  .   addl	%edi, %esi
+# BDVER2-NEXT:  [0,1]     DeeeeeeER   bextrl	%esi, (%rdi), %eax
+
 # BDWELL:       [0,0]     DeER .   .   addl	%edi, %esi
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   bextrl	%esi, (%rdi), %eax
 
diff --git a/test/tools/llvm-mca/X86/cpus.s b/test/tools/llvm-mca/X86/cpus.s
index 47e1e83c543d426a22f5f38f81279ead333b1318..49169f1a6fe23244ff6184f44571f18052fedcf0 100644
--- a/test/tools/llvm-mca/X86/cpus.s
+++ b/test/tools/llvm-mca/X86/cpus.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -resource-pressure=false -instruction-info=false < %s | FileCheck --check-prefix=ALL --check-prefix=SANDYBRIDGE %s
@@ -17,6 +18,11 @@ add %edi, %eax
 # ALL-NEXT:         Total Cycles:      103
 # ALL-NEXT:         Total uOps:        100
 
+# BDVER2:           Dispatch Width:    4
+# BDVER2-NEXT:      uOps Per Cycle:    0.97
+# BDVER2-NEXT:      IPC:               0.97
+# BDVER2-NEXT:      Block RThroughput: 0.5
+
 # BROADWELL:        Dispatch Width:    4
 # BROADWELL-NEXT:   uOps Per Cycle:    0.97
 # BROADWELL-NEXT:   IPC:               0.97
diff --git a/test/tools/llvm-mca/X86/intel-syntax.s b/test/tools/llvm-mca/X86/intel-syntax.s
index 1aaa3902866c502f88c5dd522215b9d3429f68e3..786d06ba0d16843072f7d1af76106c516c5a8743 100644
--- a/test/tools/llvm-mca/X86/intel-syntax.s
+++ b/test/tools/llvm-mca/X86/intel-syntax.s
@@ -5,7 +5,7 @@
 
   .intel_syntax noprefix
   mov	eax, 1
-  mov	ebx, 0ffh
+  mov	ebx, 0xff
   imul	esi, edi
   lea	eax, [rsi + rdi]
 
diff --git a/test/tools/llvm-mca/X86/read-after-ld-1.s b/test/tools/llvm-mca/X86/read-after-ld-1.s
index 1478eba77dea34d5cfa59882ed9c299fee57bfbb..0820fbc7c9b79ed33b876dcb8499a4f47c12c3ad 100644
--- a/test/tools/llvm-mca/X86/read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/read-after-ld-1.s
@@ -3,6 +3,7 @@
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=broadwell -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDWELL
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -resource-pressure=false -instruction-info=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 
@@ -12,6 +13,9 @@ vaddps  (%rax), %xmm1, %xmm1
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      17
+# BDVER2-NEXT:  Total uOps:        2
+
 # BDWELL-NEXT:  Total Cycles:      17
 # BDWELL-NEXT:  Total uOps:        3
 
@@ -30,6 +34,11 @@ vaddps  (%rax), %xmm1, %xmm1
 # ZNVER1-NEXT:  Total Cycles:      20
 # ZNVER1-NEXT:  Total uOps:        2
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.12
+# BDVER2-NEXT:  IPC:               0.12
+# BDVER2-NEXT:  Block RThroughput: 10.0
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.18
 # BDWELL-NEXT:  IPC:               0.12
@@ -62,6 +71,9 @@ vaddps  (%rax), %xmm1, %xmm1
 
 # ALL:          Timeline view:
 
+# BDVER2-NEXT:                      0123456
+# BDVER2-NEXT:  Index     0123456789
+
 # BDWELL-NEXT:                      0123456
 # BDWELL-NEXT:  Index     0123456789
 
@@ -80,6 +92,9 @@ vaddps  (%rax), %xmm1, %xmm1
 # ZNVER1-NEXT:                      0123456789
 # ZNVER1-NEXT:  Index     0123456789
 
+# BDVER2:       [0,0]     DeeeeeeeeeER   ..   vdivps	%xmm0, %xmm1, %xmm1
+# BDVER2-NEXT:  [0,1]     D====eeeeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
+
 # BDWELL:       [0,0]     DeeeeeeeeeeeER ..   vdivps	%xmm0, %xmm1, %xmm1
 # BDWELL-NEXT:  [0,1]     D======eeeeeeeeER   vaddps	(%rax), %xmm1, %xmm1
 
@@ -107,6 +122,7 @@ vaddps  (%rax), %xmm1, %xmm1
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vdivps	%xmm0, %xmm1, %xmm1
 
+# BDVER2-NEXT:  1.     1     5.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # BDWELL-NEXT:  1.     1     7.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # BTVER2-NEXT:  1.     1     15.0   0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
 # HASWELL-NEXT: 1.     1     8.0    0.0    0.0       vaddps	(%rax), %xmm1, %xmm1
diff --git a/test/tools/llvm-mca/X86/read-after-ld-2.s b/test/tools/llvm-mca/X86/read-after-ld-2.s
index ee39b645d5a0be78daa08eb313787a643098d1a2..7d549b395957236ff95d1d31c0e080ed4b3c0c6e 100644
--- a/test/tools/llvm-mca/X86/read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/read-after-ld-2.s
@@ -23,7 +23,7 @@ cmp     %edi, %edx
 # HASWELL-NEXT: Total Cycles:      143
 # HASWELL-NEXT: Total uOps:        500
 
-# SKYLAKE-NEXT: Total Cycles:      803
+# SKYLAKE-NEXT: Total Cycles:      110
 # SKYLAKE-NEXT: Total uOps:        500
 
 # ZNVER1-NEXT:  Total Cycles:      110
@@ -40,8 +40,8 @@ cmp     %edi, %edx
 # HASWELL-NEXT: Block RThroughput: 1.3
 
 # SKYLAKE:      Dispatch Width:    6
-# SKYLAKE-NEXT: uOps Per Cycle:    0.62
-# SKYLAKE-NEXT: IPC:               0.50
+# SKYLAKE-NEXT: uOps Per Cycle:    4.55
+# SKYLAKE-NEXT: IPC:               3.64
 # SKYLAKE-NEXT: Block RThroughput: 0.8
 
 # ZNVER1:       Dispatch Width:    4
@@ -57,8 +57,8 @@ cmp     %edi, %edx
 # HASWELL-NEXT:                     0123456789
 # HASWELL-NEXT: Index     0123456789          012
 
-# SKYLAKE-NEXT:                     0123456789          0123456789          0123456789          01234
-# SKYLAKE-NEXT: Index     0123456789          0123456789          0123456789          0123456789
+# SKYLAKE-NEXT:                     0123456789
+# SKYLAKE-NEXT: Index     0123456789
 
 # ZNVER1-NEXT:                      0123456789
 # ZNVER1-NEXT:  Index     0123456789
@@ -145,43 +145,46 @@ cmp     %edi, %edx
 # HASWELL-NEXT: [9,2]     .    .    . DeE-------R   addq	$32, %r8
 # HASWELL-NEXT: [9,3]     .    .    .  DeE------R   cmpl	%edi, %edx
 
-# SKYLAKE:      [0,0]     DeER .    .    .    .    .    .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [0,1]     DeeeeeeeeER    .    .    .    .    .    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [0,2]     DeE-------R    .    .    .    .    .    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [0,3]     D=eE------R    .    .    .    .    .    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [1,0]     D=eE------R    .    .    .    .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [1,1]     .D=======eeeeeeeeER .    .    .    .    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [1,2]     .DeE--------------R .    .    .    .    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [1,3]     .D=eE-------------R .    .    .    .    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [2,0]     .D=eE-------------R .    .    .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [2,1]     . D==============eeeeeeeeER   .    .    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [2,2]     . DeE---------------------R   .    .    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [2,3]     . D=eE--------------------R   .    .    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [3,0]     . D=eE--------------------R   .    .    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [3,1]     .  D=====================eeeeeeeeER.    .    .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [3,2]     .  DeE----------------------------R.    .    .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [3,3]     .  D=eE---------------------------R.    .    .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [4,0]     .  D=eE---------------------------R.    .    .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [4,1]     .   D============================eeeeeeeeER  .    .    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [4,2]     .   DeE-----------------------------------R  .    .    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [4,3]     .   D=eE----------------------------------R  .    .    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [5,0]     .   D=eE----------------------------------R  .    .    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [5,1]     .    D===================================eeeeeeeeER    .    .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [5,2]     .    DeE------------------------------------------R    .    .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [5,3]     .    D=eE-----------------------------------------R    .    .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [6,0]     .    D=eE-----------------------------------------R    .    .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [6,1]     .    .D==========================================eeeeeeeeER .    .    .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [6,2]     .    .DeE-------------------------------------------------R .    .    .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [6,3]     .    .D=eE------------------------------------------------R .    .    .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [7,0]     .    .D=eE------------------------------------------------R .    .    .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [7,1]     .    . D=================================================eeeeeeeeER   .   .   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [7,2]     .    . DeE--------------------------------------------------------R   .   .   addq	$32, %r8
-# SKYLAKE-NEXT: [7,3]     .    . D=eE-------------------------------------------------------R   .   .   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [8,0]     .    . D=eE-------------------------------------------------------R   .   .   addl	$1, %edx
-# SKYLAKE-NEXT: [8,1]     .    .  D========================================================eeeeeeeeER   vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: [8,2]     .    .  DeE---------------------------------------------------------------R   addq	$32, %r8
-# SKYLAKE-NEXT: [8,3]     .    .  D=eE--------------------------------------------------------------R   cmpl	%edi, %edx
-# SKYLAKE-NEXT: [9,0]     .    .  D=eE--------------------------------------------------------------R   addl	$1, %edx
+# SKYLAKE:      [0,0]     DeER .    .    .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [0,1]     DeeeeeeeeER    .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [0,2]     DeE-------R    .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [0,3]     D=eE------R    .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [1,0]     D=eE------R    .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [1,1]     .DeeeeeeeeER   .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [1,2]     .DeE-------R   .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [1,3]     .D=eE------R   .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [2,0]     .D=eE------R   .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [2,1]     . DeeeeeeeeER  .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [2,2]     . DeE-------R  .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [2,3]     . D=eE------R  .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [3,0]     . D=eE------R  .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [3,1]     .  DeeeeeeeeER .   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [3,2]     .  DeE-------R .   .   addq	$32, %r8
+# SKYLAKE-NEXT: [3,3]     .  D=eE------R .   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [4,0]     .  D=eE------R .   .   addl	$1, %edx
+# SKYLAKE-NEXT: [4,1]     .   DeeeeeeeeER.   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [4,2]     .   DeE-------R.   .   addq	$32, %r8
+# SKYLAKE-NEXT: [4,3]     .   D=eE------R.   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [5,0]     .   D=eE------R.   .   addl	$1, %edx
+# SKYLAKE-NEXT: [5,1]     .    DeeeeeeeeER   .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [5,2]     .    DeE-------R   .   addq	$32, %r8
+# SKYLAKE-NEXT: [5,3]     .    D=eE------R   .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [6,0]     .    D=eE------R   .   addl	$1, %edx
+# SKYLAKE-NEXT: [6,1]     .    .DeeeeeeeeER  .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [6,2]     .    .DeE-------R  .   addq	$32, %r8
+# SKYLAKE-NEXT: [6,3]     .    .D=eE------R  .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [7,0]     .    .D=eE------R  .   addl	$1, %edx
+# SKYLAKE-NEXT: [7,1]     .    . DeeeeeeeeER .   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [7,2]     .    . DeE-------R .   addq	$32, %r8
+# SKYLAKE-NEXT: [7,3]     .    . D=eE------R .   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [8,0]     .    . D=eE------R .   addl	$1, %edx
+# SKYLAKE-NEXT: [8,1]     .    .  DeeeeeeeeER.   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [8,2]     .    .  DeE-------R.   addq	$32, %r8
+# SKYLAKE-NEXT: [8,3]     .    .  D=eE------R.   cmpl	%edi, %edx
+# SKYLAKE-NEXT: [9,0]     .    .  D=eE------R.   addl	$1, %edx
+# SKYLAKE-NEXT: [9,1]     .    .   DeeeeeeeeER   vpaddd	(%r8), %ymm0, %ymm0
+# SKYLAKE-NEXT: [9,2]     .    .   DeE-------R   addq	$32, %r8
+# SKYLAKE-NEXT: [9,3]     .    .   D=eE------R   cmpl	%edi, %edx
 
 # ZNVER1:       [0,0]     DeER .    .    .   .   addl	$1, %edx
 # ZNVER1-NEXT:  [0,1]     DeeeeeeeeER    .   .   vpaddd	(%r8), %ymm0, %ymm0
@@ -233,21 +236,20 @@ cmp     %edi, %edx
 # ALL:                [0]    [1]    [2]    [3]
 
 # BDWELL-NEXT:  0.     10    1.0    0.4    4.5       addl	$1, %edx
-# BDWELL-NEXT:  1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
+# HASWELL-NEXT: 0.     10    1.0    0.4    5.4       addl	$1, %edx
+# SKYLAKE-NEXT: 0.     10    1.9    0.1    5.4       addl	$1, %edx
+# ZNVER1-NEXT:  0.     10    1.0    0.1    5.4       addl	$1, %edx
+
+# ALL-NEXT:     1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
+
 # BDWELL-NEXT:  2.     10    1.0    0.4    5.7       addq	$32, %r8
 # BDWELL-NEXT:  3.     10    1.0    0.0    5.3       cmpl	%edi, %edx
 
-# HASWELL-NEXT: 0.     10    1.0    0.4    5.4       addl	$1, %edx
-# HASWELL-NEXT: 1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
 # HASWELL-NEXT: 2.     10    1.0    0.4    6.7       addq	$32, %r8
 # HASWELL-NEXT: 3.     10    1.0    0.0    6.3       cmpl	%edi, %edx
 
-# SKYLAKE-NEXT: 0.     10    1.9    0.1    30.6      addl	$1, %edx
-# SKYLAKE-NEXT: 1.     10    32.5   0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
-# SKYLAKE-NEXT: 2.     10    1.0    0.1    38.5      addq	$32, %r8
-# SKYLAKE-NEXT: 3.     10    2.0    0.0    37.5      cmpl	%edi, %edx
+# SKYLAKE-NEXT: 2.     10    1.0    0.1    7.0       addq	$32, %r8
+# SKYLAKE-NEXT: 3.     10    2.0    0.0    6.0       cmpl	%edi, %edx
 
-# ZNVER1-NEXT:  0.     10    1.0    0.1    5.4       addl	$1, %edx
-# ZNVER1-NEXT:  1.     10    1.0    0.1    0.0       vpaddd	(%r8), %ymm0, %ymm0
 # ZNVER1-NEXT:  2.     10    1.0    0.1    7.0       addq	$32, %r8
 # ZNVER1-NEXT:  3.     10    2.0    0.0    6.0       cmpl	%edi, %edx
diff --git a/test/tools/llvm-mca/X86/register-file-statistics.s b/test/tools/llvm-mca/X86/register-file-statistics.s
index e605ea94f4a9fb120421c11357d018023784f908..914eeaa82ddf0a218d55f9e3cbf2e18b6bd9c174 100644
--- a/test/tools/llvm-mca/X86/register-file-statistics.s
+++ b/test/tools/llvm-mca/X86/register-file-statistics.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL,ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -all-stats=false -all-views=false -register-file-stats < %s | FileCheck --check-prefixes=ALL %s
diff --git a/test/tools/llvm-mca/X86/scheduler-queue-usage.s b/test/tools/llvm-mca/X86/scheduler-queue-usage.s
index e22f4a51887137e6a546005f2f180e9d991bb18a..a1854a2821919daca031861f1a71fee6d3d68222 100644
--- a/test/tools/llvm-mca/X86/scheduler-queue-usage.s
+++ b/test/tools/llvm-mca/X86/scheduler-queue-usage.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BDVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,BTVER2 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,ZNVER1 %s
 # RUN: llvm-mca %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -iterations=1 -all-stats=false -all-views=false -scheduler-stats < %s | FileCheck --check-prefixes=ALL,SNB %s
@@ -17,6 +18,12 @@ xor %eax, %ebx
 # ALL-NEXT:         0,          3  (75.0%)
 # ALL-NEXT:         1,          1  (25.0%)
 
+# BDVER2:          Scheduler's queue usage:
+# BDVER2-NEXT:     [1] Resource name.
+# BDVER2-NEXT:     [2] Average number of used buffer entries.
+# BDVER2-NEXT:     [3] Maximum number of used buffer entries.
+# BDVER2-NEXT:     [4] Total number of buffer entries.
+
 # BDW:             Scheduler's queue usage:
 # BDW-NEXT:        [1] Resource name.
 # BDW-NEXT:        [2] Average number of used buffer entries.
@@ -74,6 +81,12 @@ xor %eax, %ebx
 # ZNVER1-NEXT:     [3] Maximum number of used buffer entries.
 # ZNVER1-NEXT:     [4] Total number of buffer entries.
 
+# BDVER2:           [1]            [2]        [3]        [4]
+# BDVER2-NEXT:     PdEX             0          1          40
+# BDVER2-NEXT:     PdFPU            0          0          64
+# BDVER2-NEXT:     PdLoad           0          0          40
+# BDVER2-NEXT:     PdStore          0          0          24
+
 # BDW:              [1]            [2]        [3]        [4]
 # BDW-NEXT:        BWPortAny        0          1          60
 
diff --git a/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s b/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
index 66b87e72df21408b18d270e96dc01bce56512614..aa1bc886c106ef18ede4e772402dbfd84b349ab1 100644
--- a/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
+++ b/test/tools/llvm-mca/X86/sqrt-rsqrt-rcp-memop.s
@@ -1,4 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=haswell -iterations=1 -all-views=false -timeline < %s | FileCheck %s -check-prefix=ALL -check-prefix=HASWELL
@@ -29,6 +30,9 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        01234567
+# BDVER2-NEXT:    Index     0123456789
+
 # BROADWELL-NEXT:                     0123456789
 # BROADWELL-NEXT: Index     0123456789
 
@@ -44,6 +48,9 @@ rcpss (%rax), %xmm1
 # ZNVER1-NEXT:                        0123456789          0
 # ZNVER1-NEXT:    Index     0123456789          0123456789
 
+# BDVER2:         [0,0]     DeER .    .    . .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeeeeeER   sqrtss	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .    .   .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeeeeeeeER   sqrtss	(%rax), %xmm1
 
@@ -68,6 +75,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       sqrtss	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       sqrtss	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       sqrtss	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       sqrtss	(%rax), %xmm1
@@ -78,6 +86,9 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        01234567
+# BDVER2-NEXT:    Index     0123456789
+
 # BROADWELL-NEXT:                     0123456789
 # BROADWELL-NEXT: Index     0123456789          01234
 
@@ -93,6 +104,9 @@ rcpss (%rax), %xmm1
 # ZNVER1-NEXT:                        0123456789          0
 # ZNVER1-NEXT:    Index     0123456789          0123456789
 
+# BDVER2:         [0,0]     DeER .    .    . .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeeeeeER   sqrtsd	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .    .    .   .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeeeeeeeeeeeeER   sqrtsd	(%rax), %xmm1
 
@@ -117,6 +131,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       sqrtsd	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       sqrtsd	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       sqrtsd	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       sqrtsd	(%rax), %xmm1
@@ -127,6 +142,7 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        0123
 # BROADWELL-NEXT:                     0123
 # BTVER2-NEXT:                        01
 # HASWELL-NEXT:                       0123
@@ -135,6 +151,9 @@ rcpss (%rax), %xmm1
 
 # ALL-NEXT:       Index     0123456789
 
+# BDVER2:         [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeER   rsqrtss	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeER   rsqrtss	(%rax), %xmm1
 
@@ -159,6 +178,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       rsqrtss	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       rsqrtss	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       rsqrtss	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       rsqrtss	(%rax), %xmm1
@@ -169,6 +189,7 @@ rcpss (%rax), %xmm1
 
 # ALL:            Timeline view:
 
+# BDVER2-NEXT:                        0123
 # BROADWELL-NEXT:                     0123
 # BTVER2-NEXT:                        01
 # HASWELL-NEXT:                       0123
@@ -177,6 +198,9 @@ rcpss (%rax), %xmm1
 
 # ALL-NEXT:       Index     0123456789
 
+# BDVER2:         [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
+# BDVER2-NEXT:    [0,1]     D=eeeeeeeeeeER   rcpss	(%rax), %xmm1
+
 # BROADWELL:      [0,0]     DeER .    .  .   leaq	8(%rsp,%rdi,2), %rax
 # BROADWELL-NEXT: [0,1]     D=eeeeeeeeeeER   rcpss	(%rax), %xmm1
 
@@ -201,6 +225,7 @@ rcpss (%rax), %xmm1
 # ALL:                  [0]    [1]    [2]    [3]
 # ALL-NEXT:       0.     1     1.0    1.0    0.0       leaq	8(%rsp,%rdi,2), %rax
 
+# BDVER2-NEXT:    1.     1     2.0    0.0    0.0       rcpss	(%rax), %xmm1
 # BROADWELL-NEXT: 1.     1     2.0    0.0    0.0       rcpss	(%rax), %xmm1
 # BTVER2-NEXT:    1.     1     3.0    0.0    0.0       rcpss	(%rax), %xmm1
 # HASWELL-NEXT:   1.     1     2.0    0.0    0.0       rcpss	(%rax), %xmm1
diff --git a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
index 3a2f4d260f23979dcfc90c47df025100c28f6011..2a6022c93674ad662b58d6373dd2072a7e2e2a68 100644
--- a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
+++ b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-1.s
@@ -9,6 +9,8 @@
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
 
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
@@ -19,6 +21,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      10
+# BDVER2-NEXT:  Total uOps:        2
+
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
 
@@ -40,6 +45,11 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Total Cycles:      11
 # ZNVER1-NEXT:  Total uOps:        2
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.20
+# BDVER2-NEXT:  IPC:               0.20
+# BDVER2-NEXT:  Block RThroughput: 2.5
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
 # BDWELL-NEXT:  IPC:               0.20
@@ -75,6 +85,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  IPC:               0.18
 # ZNVER1-NEXT:  Block RThroughput: 1.0
 
+# BDVER2:       Timeline view:
+# BDVER2-NEXT:  Index     0123456789
+
 # BDWELL:       Timeline view:
 # BDWELL-NEXT:  Index     0123456789
 
@@ -102,6 +115,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:                      0
 # ZNVER1-NEXT:  Index     0123456789
 
+# BDVER2:       [0,0]     DeeeeeER .   vaddps	%xmm0, %xmm0, %xmm1
+# BDVER2-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
+
 # BDWELL:       [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm1
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 
@@ -132,6 +148,7 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm1
 
+# BDVER2-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BDWELL-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BTVER2-NEXT:  1.     1     1.0    1.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # HASWELL-NEXT: 1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
diff --git a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
index fd581e0debf689faaeb12732ed100a0658c4a421..e4bc9048eb8abe01f9c76895f49684547217f0ad 100644
--- a/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
+++ b/test/tools/llvm-mca/X86/variable-blend-read-after-ld-2.s
@@ -9,6 +9,8 @@
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=SKYLAKE
 
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BDVER2
+
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=BTVER2
 
 # RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver1 -iterations=1 -timeline -instruction-info=false -resource-pressure=false < %s | FileCheck %s -check-prefix=ALL -check-prefix=ZNVER1
@@ -19,6 +21,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:          Iterations:        1
 # ALL-NEXT:     Instructions:      2
 
+# BDVER2-NEXT:  Total Cycles:      10
+# BDVER2-NEXT:  Total uOps:        2
+
 # BDWELL-NEXT:  Total Cycles:      10
 # BDWELL-NEXT:  Total uOps:        4
 
@@ -40,6 +45,11 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  Total Cycles:      11
 # ZNVER1-NEXT:  Total uOps:        2
 
+# BDVER2:       Dispatch Width:    4
+# BDVER2-NEXT:  uOps Per Cycle:    0.20
+# BDVER2-NEXT:  IPC:               0.20
+# BDVER2-NEXT:  Block RThroughput: 2.5
+
 # BDWELL:       Dispatch Width:    4
 # BDWELL-NEXT:  uOps Per Cycle:    0.40
 # BDWELL-NEXT:  IPC:               0.20
@@ -75,6 +85,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:  IPC:               0.18
 # ZNVER1-NEXT:  Block RThroughput: 1.0
 
+# BDVER2:       Timeline view:
+# BDVER2-NEXT:  Index     0123456789
+
 # BDWELL:       Timeline view:
 # BDWELL-NEXT:  Index     0123456789
 
@@ -102,6 +115,9 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ZNVER1-NEXT:                      0
 # ZNVER1-NEXT:  Index     0123456789
 
+# BDVER2:       [0,0]     DeeeeeER .   vaddps	%xmm0, %xmm0, %xmm2
+# BDVER2-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
+
 # BDWELL:       [0,0]     DeeeER   .   vaddps	%xmm0, %xmm0, %xmm2
 # BDWELL-NEXT:  [0,1]     DeeeeeeeER   vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 
@@ -132,6 +148,7 @@ vblendvps %xmm1, (%rdi), %xmm2, %xmm3
 # ALL:                [0]    [1]    [2]    [3]
 # ALL-NEXT:     0.     1     1.0    1.0    0.0       vaddps	%xmm0, %xmm0, %xmm2
 
+# BDVER2-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BDWELL-NEXT:  1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # BTVER2-NEXT:  1.     1     1.0    1.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
 # HASWELL-NEXT: 1.     1     1.0    0.0    0.0       vblendvps	%xmm1, (%rdi), %xmm2, %xmm3
diff --git a/test/tools/llvm-nm/X86/response-file.test b/test/tools/llvm-nm/X86/response-file.test
new file mode 100644
index 0000000000000000000000000000000000000000..5c53960056c6d3e1c0fcf33afa7af98eb6ebe454
--- /dev/null
+++ b/test/tools/llvm-nm/X86/response-file.test
@@ -0,0 +1,5 @@
+# RUN: echo "-P %p/Inputs/hello.obj.elf-x86_64" > %t-response
+# RUN: llvm-nm @%t-response | FileCheck %s
+
+CHECK: main T 0 0
+CHECK: puts U 0 0
diff --git a/test/tools/llvm-nm/libtool-response-file.test b/test/tools/llvm-nm/libtool-response-file.test
new file mode 100644
index 0000000000000000000000000000000000000000..5d4af74e316244fac538b3d357607cebe4f5b872
--- /dev/null
+++ b/test/tools/llvm-nm/libtool-response-file.test
@@ -0,0 +1,4 @@
+RUN: llvm-nm --help | FileCheck %s
+Check that the output of llvm-nm --help contains the literal text @FILE; this
+indicates to libtool that llvm-nm does support response files.
+CHECK: @FILE
diff --git a/test/tools/llvm-objcopy/basic-keep.test b/test/tools/llvm-objcopy/basic-keep.test
index 2ea4ea35577132f6c6c9853438d49468644fffa4..8f4acb0c971ed94b7486bb650cab08dfd62d87ce 100644
--- a/test/tools/llvm-objcopy/basic-keep.test
+++ b/test/tools/llvm-objcopy/basic-keep.test
@@ -1,6 +1,8 @@
 # RUN: yaml2obj %s > %t
 # RUN: llvm-objcopy -strip-non-alloc -keep=.test %t %t2
+# RUN: llvm-strip --strip-all -keep=.test %t -o %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
+# RUN: cmp %t2 %t3
 
 !ELF
 FileHeader:
diff --git a/test/tools/llvm-objcopy/deterministic-archive.test b/test/tools/llvm-objcopy/deterministic-archive.test
new file mode 100644
index 0000000000000000000000000000000000000000..fd520fb9ed7f2169f38c6dd519edfaf9bec423e5
--- /dev/null
+++ b/test/tools/llvm-objcopy/deterministic-archive.test
@@ -0,0 +1,65 @@
+# RUN: yaml2obj %s > %t.o
+
+# Create an archive, specifying U so that timestamps/etc. are preserved.
+# We only test timestamps as a proxy for full deterministic writing; i.e. we
+# assume UID/GIDs are preserved if timestamps are preserved.
+# RUN: touch -t 199505050555.55 %t.o
+# RUN: rm -f %t.a
+# RUN: llvm-ar crsU %t.a %t.o
+
+# Test short flags.
+# RUN: llvm-objcopy -D %t.a %t.2D.a
+# RUN: env TZ=GMT llvm-ar tv %t.2D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-objcopy -U %t.a %t.2U.a
+# RUN: env TZ=GMT llvm-ar tv %t.2U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# RUN: llvm-strip -D %t.a -o %t.3D.a
+# RUN: env TZ=GMT llvm-ar tv %t.3D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip -U %t.a -o %t.3U.a
+# RUN: env TZ=GMT llvm-ar tv %t.3U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# Test long flags.
+# RUN: llvm-objcopy --enable-deterministic-archives %t.a %t.4D.a
+# RUN: env TZ=GMT llvm-ar tv %t.4D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-objcopy --disable-deterministic-archives %t.a %t.4U.a
+# RUN: env TZ=GMT llvm-ar tv %t.4U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# RUN: llvm-strip --enable-deterministic-archives %t.a -o %t.5D.a
+# RUN: env TZ=GMT llvm-ar tv %t.5D.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip --disable-deterministic-archives %t.a -o %t.5U.a
+# RUN: env TZ=GMT llvm-ar tv %t.5U.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+
+# If unspecified, verify that deterministic is the default.
+# RUN: llvm-objcopy %t.a %t.6.a
+# RUN: env TZ=GMT llvm-ar tv %t.6.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip %t.a -o %t.7.a
+# RUN: env TZ=GMT llvm-ar tv %t.7.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+
+# If both are specified, last one wins.
+# RUN: llvm-objcopy -U -D %t.a %t.8.a
+# RUN: env TZ=GMT llvm-ar tv %t.8.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-objcopy -D -U %t.a %t.9.a
+# RUN: env TZ=GMT llvm-ar tv %t.9.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+# RUN: llvm-objcopy -D -U -D -U --enable-deterministic-archives %t.a %t.10.a
+# RUN: env TZ=GMT llvm-ar tv %t.10.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+
+# RUN: llvm-strip -U -D %t.a -o %t.11.a
+# RUN: env TZ=GMT llvm-ar tv %t.11.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+# RUN: llvm-strip -D -U %t.a -o %t.12.a
+# RUN: env TZ=GMT llvm-ar tv %t.12.a | FileCheck %s --check-prefix=CHECK-NONDETERMINISTIC
+# RUN: llvm-strip -D -U -D -U --enable-deterministic-archives %t.a -o %t.13.a
+# RUN: env TZ=GMT llvm-ar tv %t.13.a | FileCheck %s --check-prefix=CHECK-DETERMINISTIC
+
+# CHECK-DETERMINISTIC: {{[[:space:]]1970[[:space:]]}}
+# CHECK-NONDETERMINISTIC:  {{[[:space:]]1995[[:space:]]}}
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
diff --git a/test/tools/llvm-objcopy/globalize.test b/test/tools/llvm-objcopy/globalize.test
index 5c9d62e6c083b40fe00deb6467ad8231e20a91c4..4941cf12e921431fe626e44b95348b9666fe709f 100644
--- a/test/tools/llvm-objcopy/globalize.test
+++ b/test/tools/llvm-objcopy/globalize.test
@@ -1,5 +1,8 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy --globalize-symbol Global --globalize-symbol Local --globalize-symbol Weak %t %t2
+# RUN: llvm-objcopy --globalize-symbol Global \
+# RUN:   --globalize-symbol Local \
+# RUN:   --globalize-symbol Weak \
+# RUN:   --globalize-symbol WeakUndef %t %t2
 # RUN: llvm-readobj -symbols %t2 | FileCheck %s
 
 !ELF
@@ -28,6 +31,7 @@ Symbols:
       Size:     8
       Section:  .text
       Value:    0x1008
+    - Name:     WeakUndef
   Global:
     - Name:     Global
       Type:     STT_FUNC
@@ -72,4 +76,13 @@ Symbols:
 #CHECK-NEXT:    Other: 0
 #CHECK-NEXT:    Section: .text
 #CHECK-NEXT:  }
+#CHECK-NEXT:  Symbol {
+#CHECK-NEXT:    Name: WeakUndef
+#CHECK-NEXT:    Value: 0x0
+#CHECK-NEXT:    Size: 0
+#CHECK-NEXT:    Binding: Weak
+#CHECK-NEXT:    Type: None
+#CHECK-NEXT:    Other: 0
+#CHECK-NEXT:    Section: Undefined
+#CHECK-NEXT:  }
 #CHECK-NEXT:]
diff --git a/test/tools/llvm-objcopy/input-output-target.test b/test/tools/llvm-objcopy/input-output-target.test
new file mode 100644
index 0000000000000000000000000000000000000000..7a7df9fd50397c8cfa0ad8853d6ad6d8a074240e
--- /dev/null
+++ b/test/tools/llvm-objcopy/input-output-target.test
@@ -0,0 +1,22 @@
+# RUN: echo abcd > %t.txt
+
+# Preserve input to verify it is not modified
+# RUN: cp %t.txt %t-copy.txt
+
+# -F <target> is equivalent to -I <target> -O <target>
+# RUN: llvm-objcopy -F binary -B i386:x86-64 %t.txt %t.2.txt
+# RUN: cmp %t-copy.txt %t.2.txt
+
+# --target <target> is equivalent to --input-target <target> --output-target <target>
+# RUN: llvm-objcopy --target binary -B i386:x86-64 %t.txt %t.3.txt
+# RUN: cmp %t-copy.txt %t.3.txt
+
+# --target is incompatibile with --input-target/--output-target
+# RUN: not llvm-objcopy --target binary --input-target binary -B i386:x86-64 \
+# RUN:     %t.txt %t.4.txt 2>&1 \
+# RUN:     | FileCheck %s --check-prefix=BAD-FLAG
+# RUN: not llvm-objcopy --target binary --output-target binary -B i386:x86-64 \
+# RUN:     %t.txt %t.4.txt 2>&1 \
+# RUN:     | FileCheck %s --check-prefix=BAD-FLAG
+
+# BAD-FLAG: --target cannot be used with --input-target or --output-target.
diff --git a/test/tools/llvm-objcopy/keep-global-symbols.test b/test/tools/llvm-objcopy/keep-global-symbols.test
index 4f580b4ae8abf3709165fcc0612e6f2cb32eccfc..8ce1d7f3a2afa8afd90326e7d28e635125bc776e 100644
--- a/test/tools/llvm-objcopy/keep-global-symbols.test
+++ b/test/tools/llvm-objcopy/keep-global-symbols.test
@@ -18,6 +18,8 @@
 # "Global5 Global6": Global, because it appears in %t-globals2.txt, but we only
 #     trim leading and trailing whitespace. We don't just take the first chunk
 #     that looks like a symbol.
+# Global7: Global, because even though it doesn't appear as any -G flags, does
+#     not get demoted since it's undefined.
 
 # RUN: echo Global2 > %t-globals1.txt
 # RUN: echo "  Global3  " > %t-globals2.txt
@@ -77,8 +79,9 @@ Symbols:
       Section:     .text
     - Name:        "Global5 Global6"
       Section:     .text
+    - Name:        Global7
 
-# CHECK:      Symbol table '.symtab' contains 13 entries:
+# CHECK:      Symbol table '.symtab' contains 14 entries:
 # CHECK-NEXT:    Num: Value Size Type Bind Vis Ndx Name
 # CHECK-NEXT:      0: {{.*}}  LOCAL  {{.*}}
 # CHECK-NEXT:      1: {{.*}}  LOCAL  {{.*}} Local1
@@ -91,5 +94,6 @@ Symbols:
 # CHECK-NEXT:      8: {{.*}}  GLOBAL {{.*}} Global3
 # CHECK-NEXT:      9: {{.*}}  GLOBAL {{.*}} Global4
 # CHECK-NEXT:     10: {{.*}}  GLOBAL {{.*}} Global5 Global6
-# CHECK-NEXT:     11: {{.*}}  WEAK   {{.*}} Weak1
-# CHECK-NEXT:     12: {{.*}}  GLOBAL {{.*}} Weak2
+# CHECK-NEXT:     11: {{.*}}  GLOBAL {{.*}} UND Global7
+# CHECK-NEXT:     12: {{.*}}  WEAK   {{.*}} Weak1
+# CHECK-NEXT:     13: {{.*}}  GLOBAL {{.*}} Weak2
diff --git a/test/tools/llvm-objcopy/localize-hidden.test b/test/tools/llvm-objcopy/localize-hidden.test
index 92577075f077ea41130c60c8481c34bf1f098bab..05d747b800bc6d587aefc224adfe4dcc18df6567 100644
--- a/test/tools/llvm-objcopy/localize-hidden.test
+++ b/test/tools/llvm-objcopy/localize-hidden.test
@@ -55,6 +55,12 @@ Symbols:
       Value:    0x2006
       Size:     2
       Visibility: STV_HIDDEN
+    - Name:     hiddenGlobalCommon
+      Type:     STT_OBJECT
+      Index:    SHN_COMMON
+      Value:    0x2006
+      Size:     2
+      Visibility: STV_HIDDEN
     - Name:     undefGlobal
       Type:     STT_FUNC
       Size:     8
@@ -142,6 +148,17 @@ Symbols:
 #CHECK-NEXT:    Section: .text
 #CHECK-NEXT:  }
 #CHECK-NEXT:  Symbol {
+#CHECK-NEXT:    Name: hiddenGlobalCommon
+#CHECK-NEXT:    Value: 0x2006
+#CHECK-NEXT:    Size: 2
+#CHECK-NEXT:    Binding: Global
+#CHECK-NEXT:    Type: Object
+#CHECK-NEXT:    Other [
+#CHECK-NEXT:      STV_HIDDEN
+#CHECK-NEXT:    ]
+#CHECK-NEXT:    Section: Common (0xF
+#CHECK-NEXT:  }
+#CHECK-NEXT:  Symbol {
 #CHECK-NEXT:    Name: undefGlobal
 #CHECK-NEXT:    Value: 0x0
 #CHECK-NEXT:    Size: 8
diff --git a/test/tools/llvm-objcopy/localize.test b/test/tools/llvm-objcopy/localize.test
index d52852ac673b1623410d997f3406262570b09431..2e2d6ccd6bf5be45c126d9878d194d0b8ef134f3 100644
--- a/test/tools/llvm-objcopy/localize.test
+++ b/test/tools/llvm-objcopy/localize.test
@@ -1,5 +1,10 @@
 # RUN: yaml2obj %s > %t
-# RUN: llvm-objcopy --localize-symbol Global -L Local -L Weak %t %t2
+# RUN: llvm-objcopy \
+# RUN:     --localize-symbol Global \
+# RUN:     -L Local \
+# RUN:     -L Weak \
+# RUN:     -L GlobalCommon \
+# RUN:     %t %t2
 # RUN: llvm-readobj -symbols %t2 | FileCheck %s
 
 !ELF
@@ -40,6 +45,11 @@ Symbols:
       Size:     8
       Section:  .text
       Value:    0x1010
+    - Name:     GlobalCommon
+      Type:     STT_OBJECT
+      Index:    SHN_COMMON
+      Value:    0x2006
+      Size:     2
 
 #CHECK: Symbols [
 #CHECK-NEXT:  Symbol {
@@ -78,4 +88,13 @@ Symbols:
 #CHECK-NEXT:    Other: 0
 #CHECK-NEXT:    Section: .text
 #CHECK-NEXT:  }
+#CHECK-NEXT:  Symbol {
+#CHECK-NEXT:    Name: GlobalCommon
+#CHECK-NEXT:    Value: 0x2006
+#CHECK-NEXT:    Size: 2
+#CHECK-NEXT:    Binding: Global
+#CHECK-NEXT:    Type: Object
+#CHECK-NEXT:    Other: 0
+#CHECK-NEXT:    Section: Common (0xF
+#CHECK-NEXT:  }
 #CHECK-NEXT:]
diff --git a/test/tools/llvm-objcopy/strip-all-gnu.test b/test/tools/llvm-objcopy/strip-all-gnu.test
index 15e200525b264081fd5e01c486343ac42eb372f2..f6dbcc70cf409dc596f61553789b89926ff42268 100644
--- a/test/tools/llvm-objcopy/strip-all-gnu.test
+++ b/test/tools/llvm-objcopy/strip-all-gnu.test
@@ -1,7 +1,9 @@
 # RUN: yaml2obj %s > %t
 # RUN: cp %t %t1
 # RUN: llvm-objcopy --strip-all-gnu %t %t2
+# RUN: llvm-strip --strip-all-gnu %t -o %t3
 # RUN: llvm-readobj -file-headers -sections %t2 | FileCheck %s
+# RUN: cmp %t2 %t3
 
 !ELF
 FileHeader:
diff --git a/test/tools/llvm-objcopy/strip-all.test b/test/tools/llvm-objcopy/strip-all.test
index 8c0f7489134b621c4b40b852104b7ba3b9f3997a..5c5b6fd374ff74bb3d3df87de7da2ae09b2833ac 100644
--- a/test/tools/llvm-objcopy/strip-all.test
+++ b/test/tools/llvm-objcopy/strip-all.test
@@ -39,12 +39,16 @@
 # RUN: llvm-objcopy -S %t9 %t9
 # RUN: cmp %t2 %t9
 
+# RUN: cp %t %t10
+# RUN: llvm-strip -s %t10
+# RUN: cmp %t2 %t10
+
 # Verify that a non-existent symbol table (after first call to llvm-strip)
 # can be handled correctly.
-# RUN: cp %t %t9
-# RUN: llvm-strip --strip-all -keep=unavailable_symbol %t9
-# RUN: llvm-strip --strip-all -keep=unavailable_symbol %t9
-# RUN: cmp %t2 %t9
+# RUN: cp %t %t11
+# RUN: llvm-strip --strip-all --keep-symbol=unavailable_symbol %t11
+# RUN: llvm-strip --strip-all --keep-symbol=unavailable_symbol %t11
+# RUN: cmp %t2 %t11
 
 !ELF
 FileHeader:
diff --git a/test/tools/llvm-objdump/Inputs/trivial.obj.wasm b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm
index 2aa042d54dc5bf56ae1abfd89144ab15ef5b832e..8652d67f69222ab57610df1b13d3972e031460a1 100644
Binary files a/test/tools/llvm-objdump/Inputs/trivial.obj.wasm and b/test/tools/llvm-objdump/Inputs/trivial.obj.wasm differ
diff --git a/test/tools/llvm-objdump/WebAssembly/symbol-table.test b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
index 43c52873c9f0fa2d92f18d03bb28a972fc602b4f..fff4c9fe52ca56473e5f02905b179bdffc5e1c20 100644
--- a/test/tools/llvm-objdump/WebAssembly/symbol-table.test
+++ b/test/tools/llvm-objdump/WebAssembly/symbol-table.test
@@ -4,5 +4,6 @@ CHECK:      SYMBOL TABLE:
 CHECK-NEXT: 00000002 g     F CODE	main
 CHECK-NEXT: 00000000 l       DATA	.L.str
 CHECK-NEXT: 00000000 g     F *UND*	puts
+CHECK-NEXT: 00000003 l     F CODE	.LSomeOtherFunction_bitcast
 CHECK-NEXT: 00000000 g     F *UND*	SomeOtherFunction
 CHECK-NEXT: 00000010 g       DATA	var
diff --git a/test/tools/llvm-objdump/file-headers-coff.test b/test/tools/llvm-objdump/file-headers-coff.test
index 784b0124a2d8001005e90cb7406915e4ff1dff40..144532d6fd9e6d8d56cf8d5b6d0df03f474875c6 100644
--- a/test/tools/llvm-objdump/file-headers-coff.test
+++ b/test/tools/llvm-objdump/file-headers-coff.test
@@ -10,4 +10,4 @@ sections:
 symbols:
 
 # CHECK: architecture: i386
-# CHECK: start address: 0x0000
+# CHECK: start address: 0x00000000
diff --git a/test/tools/llvm-objdump/file-headers-elf.test b/test/tools/llvm-objdump/file-headers-elf.test
index ade59cf05da6adcfed2d52c951063555ec3dc8d6..397b9035bd42cf7ed8cf2d6d0c3500fa74eba0f1 100644
--- a/test/tools/llvm-objdump/file-headers-elf.test
+++ b/test/tools/llvm-objdump/file-headers-elf.test
@@ -8,7 +8,7 @@ FileHeader:
   Data:            ELFDATA2LSB
   Type:            ET_REL
   Machine:         EM_X86_64
-  Entry:           0x123456
+  Entry:           0x123456789abcde
 
 # CHECK: architecture: x86_64
-# CHECK: start address: 0x00123456
+# CHECK: start address: 0x00123456789abcde
diff --git a/test/tools/llvm-objdump/file-headers-pe.test b/test/tools/llvm-objdump/file-headers-pe.test
index 1e2fb2c4c3d41ac61ed7a82c2b3cb6292d5f71fc..68c086163bb04fba6fec2935d68393e47ea07680 100644
--- a/test/tools/llvm-objdump/file-headers-pe.test
+++ b/test/tools/llvm-objdump/file-headers-pe.test
@@ -7,7 +7,7 @@ header: !Header
   Machine: IMAGE_FILE_MACHINE_I386
   Characteristics: [ IMAGE_FILE_DEBUG_STRIPPED ]
 OptionalHeader:
-  AddressOfEntryPoint: 0x1234
+  AddressOfEntryPoint: 0x123456
 # Unfortunately, all these flags are mandatory to set AddressOfEntryPoint.
 # All the values are randomly picked. They can't interfere in what
 # we are testing here.
@@ -30,4 +30,4 @@ sections:
 symbols:
 
 # CHECK: architecture: i386
-# CHECK: start address: 0x1234
+# CHECK: start address: 0x00123456
diff --git a/test/tools/llvm-objdump/full-contents.test b/test/tools/llvm-objdump/full-contents.test
new file mode 100644
index 0000000000000000000000000000000000000000..de0d584df324b60736e832e299411a3d22361805
--- /dev/null
+++ b/test/tools/llvm-objdump/full-contents.test
@@ -0,0 +1,47 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump --full-contents %t > %t.out1
+# RUN: llvm-objdump -s %t > %t.out2
+# RUN: cmp %t.out1 %t.out2
+# RUN: FileCheck %s --input-file=%t.out1
+
+# CHECK:      .bss
+# CHECK-NEXT: <skipping contents of bss section at [0000, 0040)>
+# CHECK:      .text
+# CHECK-NEXT:  0000 01234567                             .#Eg
+# CHECK:      .user-defined
+# CHECK-NEXT:  0000 76543210                             vT2.
+# CHECK:      .empty-section
+# CHECK-NEXT: <skipping contents of bss section at [0000, 0020)>
+# CHECK:      .symtab
+# CHECK:      .strtab
+# CHECK:      .shstrtab
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            64
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000010
+    Content:         "01234567"
+    Size:            4
+  - Name:            .user-defined
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Content:         "76543210"
+    Size:            4
+  - Name:            .empty-section
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            32
diff --git a/test/tools/llvm-objdump/non-archive-object.test b/test/tools/llvm-objdump/non-archive-object.test
new file mode 100644
index 0000000000000000000000000000000000000000..b1884102c02e6f35415d07cb132abb429f3dc1b6
--- /dev/null
+++ b/test/tools/llvm-objdump/non-archive-object.test
@@ -0,0 +1,25 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump -a %t | FileCheck %s
+
+# If this test has not crashed, then this test passed.
+# CHECK: file format ELF64-x86-64
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            64
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000010
+    Content:         "01234567"
+    Size:            4
+
diff --git a/test/tools/llvm-objdump/relocations-elf.test b/test/tools/llvm-objdump/relocations-elf.test
new file mode 100644
index 0000000000000000000000000000000000000000..a29b3e6a6fbadd64186c0b68c3a5f4325055e764
--- /dev/null
+++ b/test/tools/llvm-objdump/relocations-elf.test
@@ -0,0 +1,73 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump --reloc %t | FileCheck %s
+# RUN: llvm-objdump -r      %t | FileCheck %s
+
+# CHECK: RELOCATION RECORDS FOR [.rel.text]:
+# CHECK: 0000000000000001 R_X86_64_32 glob1
+# CHECK: 0000000000000001 R_X86_64_32S glob2
+# CHECK: 0000000000000002 R_X86_64_64 loc1
+
+# CHECK: RELOCATION RECORDS FOR [.rela.text]:
+# CHECK: 0000000000000001 R_X86_64_32 glob1+1
+# CHECK: 0000000000000001 R_X86_64_32S glob2+2
+# CHECK: 0000000000000002 R_X86_64_64 loc1+3
+
+!ELF
+FileHeader: !FileHeader
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_REL
+  Machine: EM_X86_64
+
+Sections:
+- Name: .text
+  Type: SHT_PROGBITS
+  Content: "0000000000000000"
+  AddressAlign: 16
+  Flags: [SHF_ALLOC]
+
+- Name: .rel.text
+  Type: SHT_REL
+  Info: .text
+  AddressAlign: 4
+  Relocations:
+    - Offset: 0x1
+      Symbol: glob1
+      Type: R_X86_64_32
+    - Offset: 0x1
+      Symbol: glob2
+      Type: R_X86_64_32S
+    - Offset: 0x2
+      Symbol: loc1
+      Type: R_X86_64_64
+
+- Name: .rela.text
+  Type: SHT_RELA
+  Link: .symtab
+  Info: .text
+  AddressAlign: 4
+  Relocations:
+    - Offset: 0x1
+      Addend: 1
+      Symbol: glob1
+      Type: R_X86_64_32
+    - Offset: 0x1
+      Addend: 2
+      Symbol: glob2
+      Type: R_X86_64_32S
+    - Offset: 0x2
+      Addend: 3
+      Symbol: loc1
+      Type: R_X86_64_64
+
+Symbols:
+  Local:
+    - Name: loc1
+    - Name: loc2
+  Global:
+    - Name: glob1
+      Section: .text
+      Value: 0x0
+      Size: 4
+    - Name: glob2
+
diff --git a/test/tools/llvm-objdump/symbol-table-elf.test b/test/tools/llvm-objdump/symbol-table-elf.test
new file mode 100644
index 0000000000000000000000000000000000000000..fc1eccdffb74d9c75437a026b3d2a43aeb7e1c89
--- /dev/null
+++ b/test/tools/llvm-objdump/symbol-table-elf.test
@@ -0,0 +1,47 @@
+# RUN: yaml2obj %s > %t
+# RUN: llvm-objdump --syms %t | FileCheck %s
+# RUN: llvm-objdump -t     %t | FileCheck %s
+
+# CHECK:      SYMBOL TABLE:
+# CHECK-NEXT: 0000000000000000         *UND*     00000000
+# CHECK-NEXT: 0000000000001004 l     F .text     00000000 lfoo
+# CHECK-NEXT: 0000000000001008 l       .text     00000000 lbar
+# CHECK-NEXT: 0000000000001004 g     F .text     00000000 foo
+# CHECK-NEXT: 0000000000001008 g       .text     00000000 bar
+
+!ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .bss
+    Type:            SHT_NOBITS
+    Flags:           [ SHF_ALLOC ]
+    AddressAlign:    0x0000000000000010
+    Size:            64
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000010
+    Content:         "00000000"
+Symbols:
+   Global:
+     - Name:     foo
+       Type:     STT_FUNC
+       Section:  .text
+       Value:    0x1004
+     - Name:     bar
+       Type:     STT_OBJECT
+       Section:  .text
+       Value:    0x1008
+   Local:
+     - Name:     lfoo
+       Type:     STT_FUNC
+       Section:  .text
+       Value:    0x1004
+     - Name:     lbar
+       Type:     STT_OBJECT
+       Section:  .text
+       Value:    0x1008
diff --git a/test/tools/llvm-objdump/wasm.txt b/test/tools/llvm-objdump/wasm.txt
index d24db89188ee5ae80f9cc8fc3b54a605fd17bf0d..93517fed6d418f7b04608fac70a20ec035bfdd40 100644
--- a/test/tools/llvm-objdump/wasm.txt
+++ b/test/tools/llvm-objdump/wasm.txt
@@ -2,13 +2,13 @@
 
 # CHECK:      Sections:
 # CHECK-NEXT: Idx Name          Size      Address          Type
-# CHECK-NEXT:  0 TYPE          0000000e 0000000000000000
+# CHECK-NEXT:  0 TYPE          00000011 0000000000000000
 # CHECK-NEXT:  1 IMPORT        0000005d 0000000000000000
-# CHECK-NEXT:  2 FUNCTION      00000002 0000000000000000
-# CHECK-NEXT:  3 CODE          00000019 0000000000000000 TEXT
+# CHECK-NEXT:  2 FUNCTION      00000003 0000000000000000
+# CHECK-NEXT:  3 CODE          00000024 0000000000000000 TEXT
 # CHECK-NEXT:  4 DATA          0000001c 0000000000000000 DATA
-# CHECK-NEXT:  5 linking       00000051 0000000000000000
-# CHECK-NEXT:  6 reloc.CODE    0000000c 0000000000000000
+# CHECK-NEXT:  5 linking       0000006d 0000000000000000
+# CHECK-NEXT:  6 reloc.CODE    0000000f 0000000000000000
 
 # RUN: llvm-objdump -p %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-HEADER
 
@@ -18,5 +18,6 @@
 # RUN: llvm-objdump -s --section=CODE %p/Inputs/trivial.obj.wasm | FileCheck %s -check-prefix CHECK-SECTIONS
 
 # CHECK-SECTIONS: Contents of section CODE:
-# CHECK-SECTIONS: 0000 01170041 80808080 00108080 8080001a  ...A............
-# CHECK-SECTIONS: 0010 10818080 80004100 0b                 ......A..
+# CHECK-SECTIONS: 0000 02170041 80808080 00108080 8080001a  ...A............
+# CHECK-SECTIONS: 0010 10838080 80004100 0b0a0041 00108180  ......A....A....
+# CHECK-SECTIONS: 0020 8080000b                             ....
diff --git a/test/tools/llvm-opt-fuzzer/command-line.ll b/test/tools/llvm-opt-fuzzer/command-line.ll
index f747bba431bcba2e31a2c6f2c8fd6423832a3eac..8c3f6b60154b3ec23b1d0d098f499ff7e7bda4d0 100644
--- a/test/tools/llvm-opt-fuzzer/command-line.ll
+++ b/test/tools/llvm-opt-fuzzer/command-line.ll
@@ -13,7 +13,7 @@
 
 ; Don't start with incorrect passes specified
 ; RUN: not llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes no-pass 2>&1 | FileCheck -check-prefix=PIPELINE %s
-; PIPELINE: can't parse pass pipeline
+; PIPELINE: unknown pass name 'no-pass'
 
 ; Correct command line
 ; RUN: llvm-opt-fuzzer %t -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine 2>&1 | FileCheck -check-prefix=CORRECT %s
diff --git a/test/tools/llvm-pdbdump/Inputs/Stripped.pdb b/test/tools/llvm-pdbdump/Inputs/Stripped.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..c0988c2c3bb7e963da50f6e6ab1bf0412d717faa
Binary files /dev/null and b/test/tools/llvm-pdbdump/Inputs/Stripped.pdb differ
diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..403ada1771318ba12294577c4ff529db8eef5d47
--- /dev/null
+++ b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.cpp
@@ -0,0 +1,11 @@
+// Compile with "cl /c /Zi /GR- UsingNamespaceTest.cpp"
+// Link with "link UsingNamespaceTest.obj /debug /nodefaultlib /entry:main"
+
+namespace NS {
+  int foo() { return 1; }
+}
+
+using namespace NS;
+int main(int argc, char **argv) {
+  return foo();
+}
diff --git a/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb
new file mode 100644
index 0000000000000000000000000000000000000000..ce5211e3fc8dca5ced3ac638943aac0aff735046
Binary files /dev/null and b/test/tools/llvm-pdbdump/Inputs/UsingNamespaceTest.pdb differ
diff --git a/test/tools/llvm-pdbdump/checksum-string.test b/test/tools/llvm-pdbdump/checksum-string.test
index c3ecc265e9ba70c7e938cec668c52daae1664bc7..6925329a5904398e97c2eab325f7e8851939f14e 100644
--- a/test/tools/llvm-pdbdump/checksum-string.test
+++ b/test/tools/llvm-pdbdump/checksum-string.test
@@ -1,3 +1,4 @@
+; REQUIRES: diasdk
 ; RUN: llvm-pdbutil pretty -lines %p/Inputs/PrettyFuncDumperTest.pdb > %t
 
 ; CHECK: ---COMPILANDS---
diff --git a/test/tools/llvm-pdbdump/class-layout.test b/test/tools/llvm-pdbdump/class-layout.test
index 1b7e909dcb7bffbbabc56868da0906830b4b6ed7..cb70dab0dc0a3e0d9b75d2381da10338efa21b03 100644
--- a/test/tools/llvm-pdbdump/class-layout.test
+++ b/test/tools/llvm-pdbdump/class-layout.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -all -class-recurse-depth=1 \
 ; RUN:   %p/Inputs/ClassLayoutTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_TEST
diff --git a/test/tools/llvm-pdbdump/complex-padding-graphical.test b/test/tools/llvm-pdbdump/complex-padding-graphical.test
index 9373c1ec6c2f013ae1e4abc48ad09fee523aa68c..42511db95ffc5ba800bfd19514a0a8cf437d7960 100644
--- a/test/tools/llvm-pdbdump/complex-padding-graphical.test
+++ b/test/tools/llvm-pdbdump/complex-padding-graphical.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -classes -class-definitions=layout \
 ; RUN:     -include-types=Test %p/Inputs/ComplexPaddingTest.pdb > %t
 
diff --git a/test/tools/llvm-pdbdump/enum-layout.test b/test/tools/llvm-pdbdump/enum-layout.test
index 5813321f000d8a76d5d8a49e928939303149c027..57006d182bc514bae319b8aaa837979266461def 100644
--- a/test/tools/llvm-pdbdump/enum-layout.test
+++ b/test/tools/llvm-pdbdump/enum-layout.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -types %p/Inputs/ClassLayoutTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBAL_ENUM
 ; RUN: FileCheck -input-file=%t %s -check-prefix=MEMBER_ENUM
diff --git a/test/tools/llvm-pdbdump/explain-dbi-stream.test b/test/tools/llvm-pdbdump/explain-dbi-stream.test
index f393f976caaac427b6023e8d82113f129d8d220f..030e51f8f05061142521ab9a60c892ab65bca9d6 100644
--- a/test/tools/llvm-pdbdump/explain-dbi-stream.test
+++ b/test/tools/llvm-pdbdump/explain-dbi-stream.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil explain \
 ; RUN: -offset=0xF000 \
 ; RUN: -offset=0xF004 \
diff --git a/test/tools/llvm-pdbdump/explain-pdb-stream.test b/test/tools/llvm-pdbdump/explain-pdb-stream.test
index 10efb5b6459e72893f21398131158e1f23165163..32ec800f8cafb80f09302609cbb08de98d4a3d87 100644
--- a/test/tools/llvm-pdbdump/explain-pdb-stream.test
+++ b/test/tools/llvm-pdbdump/explain-pdb-stream.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil explain \
 ; RUN: -offset=0x11000 \
 ; RUN: -offset=0x11004 \
diff --git a/test/tools/llvm-pdbdump/explain.test b/test/tools/llvm-pdbdump/explain.test
index d76e86add2d84682e764b834ea8799cbb453deb8..1179fe5aad7543e54f7762355786377749f8cd7e 100644
--- a/test/tools/llvm-pdbdump/explain.test
+++ b/test/tools/llvm-pdbdump/explain.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil explain -offset=0 %p/Inputs/InjectedSource.pdb \
 ; RUN:  | FileCheck --check-prefix=ZERO %s
 ; RUN: llvm-pdbutil explain -offset=40 %p/Inputs/InjectedSource.pdb \
diff --git a/test/tools/llvm-pdbdump/injected-sources.test b/test/tools/llvm-pdbdump/injected-sources.test
index c04422e2a101380aa7ec5762f43d3e8542755cd9..9d2d1b91a05c55df5dea8500d8cbd4ef07210bb0 100644
--- a/test/tools/llvm-pdbdump/injected-sources.test
+++ b/test/tools/llvm-pdbdump/injected-sources.test
@@ -1,6 +1,8 @@
 ; The PDB committed to the repo does not seem to be recognized by older
 ; versions of DIA SDK, so we xfail the test temporarily until we can
 ; figure out how to get a PDB that makes all versions of MSVC happy.
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -injected-sources -injected-source-content \
 ; RUN:   %p/Inputs/InjectedSource.pdb | FileCheck %s
 ; RUN: llvm-pdbutil pretty -injected-sources -injected-source-content \
diff --git a/test/tools/llvm-pdbdump/lit.local.cfg b/test/tools/llvm-pdbdump/lit.local.cfg
index 28a895f51148d81f629d904057122bf921bc4ef2..5f1f826babb2bd99818d4fbd26cebe4fbd3584e6 100644
--- a/test/tools/llvm-pdbdump/lit.local.cfg
+++ b/test/tools/llvm-pdbdump/lit.local.cfg
@@ -1 +1,2 @@
-config.unsupported = not config.have_dia_sdk
+if config.have_dia_sdk:
+  config.available_features.add("diasdk")
diff --git a/test/tools/llvm-pdbdump/load-address.test b/test/tools/llvm-pdbdump/load-address.test
index 4402790d71f4dc412075ffa148b695901728c8dd..46b3a074e1c9d5d6264a7792189374478af666f9 100644
--- a/test/tools/llvm-pdbdump/load-address.test
+++ b/test/tools/llvm-pdbdump/load-address.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -externals %p/Inputs/LoadAddressTest.pdb \
 ; RUN:    | FileCheck --check-prefix=RVA %s
 ; RUN: llvm-pdbutil pretty -externals -load-address=0x40000000 \
diff --git a/test/tools/llvm-pdbdump/pretty-func-dumper.test b/test/tools/llvm-pdbdump/pretty-func-dumper.test
index 5e4dc8d998bf37de75117db3e303be801d84bec5..40bbcda258809e3c66349c7ba36bb05a227cd895 100644
--- a/test/tools/llvm-pdbdump/pretty-func-dumper.test
+++ b/test/tools/llvm-pdbdump/pretty-func-dumper.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -all -class-recurse-depth=1 \
 ; RUN:   %p/Inputs/PrettyFuncDumperTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_FUNC
diff --git a/test/tools/llvm-pdbdump/regex-filter.test b/test/tools/llvm-pdbdump/regex-filter.test
index 1c49009bf36ea6ee6b5081773959b3ebbde6ab68..7eed0963a5cdc60c394d45c2e8c148d9fb801ec6 100644
--- a/test/tools/llvm-pdbdump/regex-filter.test
+++ b/test/tools/llvm-pdbdump/regex-filter.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -module-syms -globals -types %p/Inputs/FilterTest.pdb \
 ; RUN:    | FileCheck --check-prefix=NO_FILTER %s
 
diff --git a/test/tools/llvm-pdbdump/simple-padding-graphical.test b/test/tools/llvm-pdbdump/simple-padding-graphical.test
index 91da534ca0101f4d1c74c49bb30f9474da01ddd5..00bae75429701fbe1cc23cab0014ea74baa8c49c 100644
--- a/test/tools/llvm-pdbdump/simple-padding-graphical.test
+++ b/test/tools/llvm-pdbdump/simple-padding-graphical.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -classes -class-definitions=layout \
 ; RUN:     -include-types=SimplePad %p/Inputs/SimplePaddingTest.pdb > %t
 
diff --git a/test/tools/llvm-pdbdump/stripped.test b/test/tools/llvm-pdbdump/stripped.test
new file mode 100644
index 0000000000000000000000000000000000000000..1d12c9ecfa2d3d59d4cec5844547ac043277abe6
--- /dev/null
+++ b/test/tools/llvm-pdbdump/stripped.test
@@ -0,0 +1,109 @@
+; RUN: llvm-pdbutil dump -all %p/Inputs/Stripped.pdb > %t
+; RUN: FileCheck -input-file=%t %s
+
+; CHECK: Summary
+; CHECK-NEXT: ============================================================
+; CHECK-NEXT:  Block Size: 4096
+; CHECK-NEXT:  Number of blocks: 17
+; CHECK-NEXT:  Number of streams: 12
+; CHECK-NEXT:  Signature: 1541179274
+; CHECK-NEXT:  Age: 2
+; CHECK-NEXT:  GUID: {FF4F9B62-D99A-4647-97A7-22C702B1E053}
+; CHECK-NEXT:  Features: 0x1
+; CHECK-NEXT:  Has Debug Info: true
+; CHECK-NEXT:  Has Types: true
+; CHECK-NEXT:  Has IDs: true
+; CHECK-NEXT:  Has Globals: true
+; CHECK-NEXT:  Has Publics: true
+; CHECK-NEXT:  Is incrementally linked: false
+; CHECK-NEXT:  Has conflicting types: false
+; CHECK-NEXT:  Is stripped: true
+
+; CHECK: Streams
+; CHECK-NEXT: ============================================================
+; CHECK-NEXT:  Stream  0 (  88 bytes): [Old MSF Directory]
+; CHECK-NEXT:             Blocks: [4]
+; CHECK-NEXT:  Stream  1 (  78 bytes): [PDB Stream]
+; CHECK-NEXT:             Blocks: [14]
+; CHECK-NEXT:  Stream  2 (  56 bytes): [TPI Stream]
+; CHECK-NEXT:             Blocks: [13]
+; CHECK-NEXT:  Stream  3 (1355 bytes): [DBI Stream]
+; CHECK-NEXT:             Blocks: [7]
+; CHECK-NEXT:  Stream  4 (  56 bytes): [IPI Stream]
+; CHECK-NEXT:             Blocks: [6]
+; CHECK-NEXT:  Stream  5 (   0 bytes): [Named Stream "/LinkInfo"]
+; CHECK-NEXT:             Blocks: []
+; CHECK-NEXT:  Stream  6 ( 200 bytes): [Section Header Data]
+; CHECK-NEXT:             Blocks: [8]
+; CHECK-NEXT:  Stream  7 (  16 bytes): [Global Symbol Hash]
+; CHECK-NEXT:             Blocks: [9]
+; CHECK-NEXT:  Stream  8 ( 928 bytes): [Public Symbol Hash]
+; CHECK-NEXT:             Blocks: [11]
+; CHECK-NEXT:  Stream  9 ( 716 bytes): [Symbol Records]
+; CHECK-NEXT:             Blocks: [10]
+; CHECK-NEXT:  Stream 10 (   0 bytes): [TPI Hash]
+; CHECK-NEXT:             Blocks: []
+; CHECK-NEXT:  Stream 11 (   0 bytes): [IPI Hash]
+; CHECK-NEXT:             Blocks: []
+
+; CHECK: Module Stats
+; CHECK-NEXT: ============================================================
+
+; CHECK: S_UDT Record Stats
+; CHECK-NEXT: ============================================================
+
+; CHECK: String Table
+; CHECK-NEXT: ============================================================
+
+; CHECK: Modules
+; CHECK-NEXT: ============================================================
+
+; CHECK: Files
+; CHECK-NEXT: ============================================================
+
+; CHECK: Lines
+; CHECK-NEXT: ============================================================
+
+; CHECK: Inlinee Lines
+; CHECK-NEXT: ============================================================
+
+; CHECK: Cross Module Imports
+; CHECK-NEXT: ============================================================
+
+; CHECK: Cross Module Exports
+; CHECK-NEXT: ============================================================
+
+; CHECK: Old FPO Data
+; CHECK-NEXT: ============================================================
+
+; CHECK: New FPO Data
+; CHECK-NEXT: ============================================================
+
+; CHECK: Types (TPI Stream)
+; CHECK-NEXT: ============================================================
+
+; CHECK: Types (IPI Stream)
+; CHECK-NEXT: ============================================================
+
+; CHECK: Global Symbols
+; CHECK-NEXT: ============================================================
+
+; CHECK: Public Symbols
+; CHECK-NEXT: ============================================================
+
+; CHECK: Symbols
+; CHECK-NEXT: ============================================================
+
+; CHECK: Section Headers
+; CHECK-NEXT: ============================================================
+
+; CHECK: Original Section Headers
+; CHECK-NEXT: ============================================================
+
+; CHECK: Section Contributions
+; CHECK-NEXT: ============================================================
+
+; CHECK: Section Map
+; CHECK-NEXT: ============================================================
+
+
diff --git a/test/tools/llvm-pdbdump/symbol-filters.test b/test/tools/llvm-pdbdump/symbol-filters.test
index 80c24baf17ca0d957c5b4ca5ae5d5c58b517e8bc..4091d1d65c874d60522cbb2ea16fc0d5e14a143f 100644
--- a/test/tools/llvm-pdbdump/symbol-filters.test
+++ b/test/tools/llvm-pdbdump/symbol-filters.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -globals -module-syms -sym-types=data %p/Inputs/FilterTest.pdb \
 ; RUN:    | FileCheck --check-prefix=ONLY_DATA %s
 
diff --git a/test/tools/llvm-pdbdump/type-qualifiers.test b/test/tools/llvm-pdbdump/type-qualifiers.test
index 0969c15873c8741481f9d231b44bea020560606a..9c8827cc5da3ca0764ef4717fe49e5ab8d2c543d 100644
--- a/test/tools/llvm-pdbdump/type-qualifiers.test
+++ b/test/tools/llvm-pdbdump/type-qualifiers.test
@@ -1,3 +1,5 @@
+; REQUIRES: diasdk
+
 ; RUN: llvm-pdbutil pretty -all -class-recurse-depth=1 \
 ; RUN:   %p/Inputs/TypeQualifiersTest.pdb > %t
 ; RUN: FileCheck -input-file=%t %s -check-prefix=GLOBALS_FUNC
diff --git a/test/tools/llvm-pdbdump/usingnamespace.test b/test/tools/llvm-pdbdump/usingnamespace.test
new file mode 100644
index 0000000000000000000000000000000000000000..d44b0cbf9e7693a602980e51e6f9e36e6953f92d
--- /dev/null
+++ b/test/tools/llvm-pdbdump/usingnamespace.test
@@ -0,0 +1,8 @@
+; REQUIRES: diasdk
+
+; RUN: llvm-pdbutil pretty -module-syms %p/Inputs/UsingNamespaceTest.pdb > %t
+; RUN: FileCheck -input-file=%t %s
+
+; CHECK: ---SYMBOLS---
+; CHECK-NEXT: {{.*}}UsingNamespaceTest.obj
+; CHECK-DAG: using namespace NS
diff --git a/test/tools/llvm-readobj/Inputs/arm64-win1.obj b/test/tools/llvm-readobj/Inputs/arm64-win1.obj
new file mode 100755
index 0000000000000000000000000000000000000000..025e1db6cce4a2efdc398d5149dab52ce132c250
Binary files /dev/null and b/test/tools/llvm-readobj/Inputs/arm64-win1.obj differ
diff --git a/test/tools/llvm-readobj/Inputs/arm64-win2.obj b/test/tools/llvm-readobj/Inputs/arm64-win2.obj
new file mode 100755
index 0000000000000000000000000000000000000000..7e506eedda6549fdd4d576972a182fcca358b453
Binary files /dev/null and b/test/tools/llvm-readobj/Inputs/arm64-win2.obj differ
diff --git a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm
index 0e3efb66a7feecfc91fb60c605285947ae0f7244..2f99d3446123f06927724b5d2c733d32cdef044d 100644
Binary files a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm and b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm differ
diff --git a/test/tools/llvm-readobj/arm64-win-error1.s b/test/tools/llvm-readobj/arm64-win-error1.s
new file mode 100644
index 0000000000000000000000000000000000000000..cd449efb55004a314a2c7934eb195f09f3df5d9c
--- /dev/null
+++ b/test/tools/llvm-readobj/arm64-win-error1.s
@@ -0,0 +1,54 @@
+## Check that error handling for bad opcodes works.
+## .xdata below contains the bad opcode 0xdf in the 4th word of .xdata.
+
+// REQUIRES: aarch64-registered-target
+// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \
+// RUN:   | llvm-readobj -unwind - | FileCheck %s
+
+// CHECK:     Prologue [
+// CHECK:        0xdf                ; Bad opcode!
+// CHECK:        0xff                ; Bad opcode!
+// CHECK:        0xd600              ; stp x19, lr, [sp, #0]
+// CHECK:        0x01                ; sub sp, #16
+// CHECK:        0xe4                ; end
+// CHECK:     ]
+
+	.text
+	.globl	"?func@@YAHXZ"
+	.p2align	3
+"?func@@YAHXZ":
+	sub     sp,sp,#0x10
+	stp     x19,lr,[sp]
+	sub     sp,sp,#0x1F0
+	mov     w19,w0
+	bl	"?func2@@YAXXZ"
+	cmp     w19,#2
+	ble     .LBB0_1
+	bl      "?func2@@YAHXZ"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+.LBB0_1:
+	mov      x0,sp
+	bl       "?func3@@YAHPEAH@Z"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+
+
+.section .pdata,"dr"
+	.long "?func@@YAHXZ"@IMGREL
+        .long "$unwind$func@@YAHXZ"@IMGREL
+
+
+.section	.xdata,"dr"
+"$unwind$func@@YAHXZ":
+        .p2align	3
+	.long		0x10800012
+	.long 		0x8
+	.long 		0xe
+	.long 		0x00d6ffdf
+	.long 		0xe3e3e401
+
diff --git a/test/tools/llvm-readobj/arm64-win-error2.s b/test/tools/llvm-readobj/arm64-win-error2.s
new file mode 100644
index 0000000000000000000000000000000000000000..93c461de8ee3428d6b35b78abf570d6a1b09d90d
--- /dev/null
+++ b/test/tools/llvm-readobj/arm64-win-error2.s
@@ -0,0 +1,50 @@
+## Check that the sanity check for an inconsistent header works.
+## The first word contains the bad value for CodeWords, 0xf, which indicates
+## that we need 0x11110 << 2 =  120 bytes of space for the unwind codes.
+## It follows that the .xdata section is badly formed as only 8 bytes are
+## allocated for the unwind codes.
+
+// REQUIRES: aarch64-registered-target
+// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \
+// RUN:   | not llvm-readobj -unwind - 2>&1 | FileCheck %s
+
+// CHECK: LLVM ERROR: Malformed unwind data
+
+	.text
+	.globl	"?func@@YAHXZ"
+	.p2align	3
+"?func@@YAHXZ":
+	sub     sp,sp,#0x10
+	stp     x19,lr,[sp]
+	sub     sp,sp,#0x1F0
+	mov     w19,w0
+	bl	"?func2@@YAXXZ"
+	cmp     w19,#2
+	ble     .LBB0_1
+	bl      "?func2@@YAHXZ"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+.LBB0_1:
+	mov      x0,sp
+	bl       "?func3@@YAHPEAH@Z"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+
+.section .pdata,"dr"
+	.long "?func@@YAHXZ"@IMGREL
+        .long "$unwind$func@@YAHXZ"@IMGREL
+
+
+.section	.xdata,"dr"
+"$unwind$func@@YAHXZ":
+        .p2align	3
+	.long		0xf0800012
+	.long 		0x8
+	.long 		0xe
+	.long 		0x100d61f
+	.long 		0xe3e3e3e4
+
diff --git a/test/tools/llvm-readobj/arm64-win-error3.s b/test/tools/llvm-readobj/arm64-win-error3.s
new file mode 100644
index 0000000000000000000000000000000000000000..5cbc3d7c585c191fe454c4739a749d8a1525fa88
--- /dev/null
+++ b/test/tools/llvm-readobj/arm64-win-error3.s
@@ -0,0 +1,51 @@
+## Check that error handling for going past the unwind data works.
+## .xdata below contains bad opcodes in the last word.  The last byte, 0xe0,
+## indicates that we have come across alloc_l, which requires 4 bytes. In this
+## case, unwind code processing will go past the allocated unwind data.
+
+// REQUIRES: aarch64-registered-target
+// RUN: llvm-mc -filetype=obj -triple aarch64-windows %s -o - \
+// RUN:   | llvm-readobj -unwind - | FileCheck %s
+
+// CHECK: Prologue [
+// CHECK:   Opcode 0xe0 goes past the unwind data
+
+	.text
+	.globl	"?func@@YAHXZ"
+	.p2align	3
+"?func@@YAHXZ":
+	sub     sp,sp,#0x10
+	stp     x19,lr,[sp]
+	sub     sp,sp,#0x1F0
+	mov     w19,w0
+	bl	"?func2@@YAXXZ"
+	cmp     w19,#2
+	ble     .LBB0_1
+	bl      "?func2@@YAHXZ"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+.LBB0_1:
+	mov      x0,sp
+	bl       "?func3@@YAHPEAH@Z"
+	add      sp,sp,#0x1F0
+	ldp      x19,lr,[sp]
+	add      sp,sp,#0x10
+	ret
+
+
+.section .pdata,"dr"
+	.long "?func@@YAHXZ"@IMGREL
+        .long "$unwind$func@@YAHXZ"@IMGREL
+
+
+.section	.xdata,"dr"
+"$unwind$func@@YAHXZ":
+        .p2align	3
+	.long		0x10800012
+	.long 		0x8
+	.long 		0xe
+	.long 		0x100d61f
+	.long 		0xe0000000
+
diff --git a/test/tools/llvm-readobj/gnu-file-headers.test b/test/tools/llvm-readobj/gnu-file-headers.test
index 4b74d0948a3e5d2c474474c12580df5556a556d2..e246a3d717bc0f590e7b39bd9730377d81b8666f 100644
--- a/test/tools/llvm-readobj/gnu-file-headers.test
+++ b/test/tools/llvm-readobj/gnu-file-headers.test
@@ -2,6 +2,8 @@ RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-i386 --elf-output-style=GNU \
 RUN:   | FileCheck %s -check-prefix ELF32
 RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-x86-64 --elf-output-style=GNU \
 RUN:   | FileCheck %s -check-prefix ELF64
+RUN: llvm-readobj -h %p/Inputs/trivial.obj.elf-mipsel --elf-output-style=GNU \
+RUN:   | FileCheck %s -check-prefix MIPSEL
 
 ELF32:      ELF Header:
 ELF32-NEXT:  Magic:   7f 45 4c 46 01 01 01 03 00 00 00 00 00 00 00 00
@@ -44,3 +46,24 @@ ELF64-NEXT:  Number of program headers:          0
 ELF64-NEXT:  Size of section headers:           64 (bytes)
 ELF64-NEXT:  Number of section headers:         10
 ELF64-NEXT:  Section header string table index: 7
+
+MIPSEL:     ELF Header:
+MIPSEL-NEXT:  Magic:   7f 45 4c 46 01 01 01 03 00 00 00 00 00 00 00 00
+MIPSEL-NEXT:  Class:                             ELF32
+MIPSEL-NEXT:  Data:                              2's complement, little endian
+MIPSEL-NEXT:  Version:                           1 (current)
+MIPSEL-NEXT:  OS/ABI:                            UNIX - GNU
+MIPSEL-NEXT:  ABI Version:                       0x0
+MIPSEL-NEXT:  Type:                              REL (Relocatable file)
+MIPSEL-NEXT:  Machine:                           MIPS R3000
+MIPSEL-NEXT:  Version:                           0x1
+MIPSEL-NEXT:  Entry point address:               0x0
+MIPSEL-NEXT:  Start of program headers:          0 (bytes into file)
+MIPSEL-NEXT:  Start of section headers:          172 (bytes into file)
+MIPSEL-NEXT:  Flags:                             0x50001000, o32, mips32
+MIPSEL-NEXT:  Size of this header:               52 (bytes)
+MIPSEL-NEXT:  Size of program headers:           0 (bytes)
+MIPSEL-NEXT:  Number of program headers:         0
+MIPSEL-NEXT:  Size of section headers:           40 (bytes)
+MIPSEL-NEXT:  Number of section headers:         9
+MIPSEL-NEXT:  Section header string table index: 6
diff --git a/test/tools/llvm-readobj/gnu-notes.test b/test/tools/llvm-readobj/gnu-notes.test
index 1a9c7e304b111336baf310339626b1e6fd568438..21078231f19f83104eadc74df220961b672da8cb 100644
--- a/test/tools/llvm-readobj/gnu-notes.test
+++ b/test/tools/llvm-readobj/gnu-notes.test
@@ -1,15 +1,55 @@
 # RUN: yaml2obj %s > %t.so
-# RUN: llvm-readobj -elf-output-style GNU --notes %t.so | FileCheck %s
+# RUN: llvm-readobj -elf-output-style GNU --notes %t.so | FileCheck %s --check-prefix=GNU
+# RUN: llvm-readobj -elf-output-style LLVM --notes %t.so | FileCheck %s --check-prefix=LLVM
 
-# CHECK: Displaying notes found at file offset 0x00000300 with length 0x00000020:
-# CHECK:   Owner                 Data size       Description
-# CHECK:   GNU                  0x00000010       NT_GNU_BUILD_ID (unique build ID bitstring)
-# CHECK:     Build ID: 4fcb712aa6387724a9f465a32cd8c14b
+# GNU:      Displaying notes found at file offset 0x00000340 with length 0x00000020:
+# GNU-NEXT:   Owner                 Data size       Description
+# GNU-NEXT:   GNU                   0x00000010      NT_GNU_ABI_TAG (ABI version tag)
+# GNU-NEXT:     OS: Linux, ABI: 2.6.32
 
-# CHECK: Displaying notes found at file offset 0x0000036c with length 0x0000001c:
-# CHECK:   Owner                 Data size       Description
-# CHECK:   GNU                  0x00000009       NT_GNU_GOLD_VERSION (gold version)
-# CHECK:     Version: gold 1.11
+# GNU:      Displaying notes found at file offset 0x00000360 with length 0x00000020:
+# GNU-NEXT:   Owner                 Data size       Description
+# GNU-NEXT:   GNU                  0x00000010       NT_GNU_BUILD_ID (unique build ID bitstring)
+# GNU-NEXT:     Build ID: 4fcb712aa6387724a9f465a32cd8c14b
+
+# GNU:      Displaying notes found at file offset 0x000003cc with length 0x0000001c:
+# GNU-NEXT:   Owner                 Data size       Description
+# GNU-NEXT:   GNU                  0x00000009       NT_GNU_GOLD_VERSION (gold version)
+# GNU-NEXT:     Version: gold 1.11
+
+# LLVM:      Notes [
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset: 0x340
+# LLVM-NEXT:     Size: 0x20
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x10
+# LLVM-NEXT:       Type: NT_GNU_ABI_TAG (ABI version tag)
+# LLVM-NEXT:       OS: Linux
+# LLVM-NEXT:       ABI: 2.6.32
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset: 0x360
+# LLVM-NEXT:     Size: 0x20
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x10
+# LLVM-NEXT:       Type: NT_GNU_BUILD_ID (unique build ID bitstring)
+# LLVM-NEXT:       Build ID: 4fcb712aa6387724a9f465a32cd8c14b
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT:   NoteSection {
+# LLVM-NEXT:     Offset: 0x3CC
+# LLVM-NEXT:     Size: 0x1C
+# LLVM-NEXT:     Note {
+# LLVM-NEXT:       Owner: GNU
+# LLVM-NEXT:       Data size: 0x9
+# LLVM-NEXT:       Type: NT_GNU_GOLD_VERSION (gold version)
+# LLVM-NEXT:       Version: gold 1.11
+# LLVM-NEXT:     }
+# LLVM-NEXT:   }
+# LLVM-NEXT: ]
 
 --- !ELF
 FileHeader:
@@ -18,6 +58,10 @@ FileHeader:
   Type:            ET_EXEC
   Machine:         EM_X86_64
 Sections:
+  - Name:            .note.ABI-tag
+    Type:            SHT_NOTE
+    AddressAlign:    0x0000000000000004
+    Content:         040000001000000001000000474E550000000000020000000600000020000000
   - Name:            .note.gnu.build-id
     Type:            SHT_NOTE
     Flags:           [ SHF_ALLOC ]
diff --git a/test/tools/llvm-readobj/note-gnu-property.s b/test/tools/llvm-readobj/note-gnu-property.s
index f0a9b131ed5b2d45ba3be659be5e2b279aa8a2a7..d513a3e460c69414c4fd31bd0a63f8bf852b3340 100644
--- a/test/tools/llvm-readobj/note-gnu-property.s
+++ b/test/tools/llvm-readobj/note-gnu-property.s
@@ -1,23 +1,51 @@
 // REQUIRES: x86-registered-target
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t
-// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s
+// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s --check-prefix=GNU
+// RUN: llvm-readobj -elf-output-style LLVM --notes %t | FileCheck %s --check-prefix=LLVM
 
-// CHECK:      Displaying notes found at file offset 0x00000040 with length 0x000000b8:
-// CHECK-NEXT:   Owner                 Data size       Description
-// CHECK-NEXT:   GNU                   0x000000a8      NT_GNU_PROPERTY_TYPE_0 (property note)
-// CHECK-NEXT:     Properties:  stack size: 0x100
-// CHECK-NEXT:     stack size: 0x100
-// CHECK-NEXT:     no copy on protected
-// CHECK-NEXT:     X86 features: SHSTK
-// CHECK-NEXT:     X86 features: IBT, SHSTK
-// CHECK-NEXT:     X86 features: none
-// CHECK-NEXT:     <application-specific type 0xfefefefe>
-// CHECK-NEXT:     stack size: <corrupt length: 0x0>
-// CHECK-NEXT:     stack size: <corrupt length: 0x4> 
-// CHECK-NEXT:     no copy on protected <corrupt length: 0x1>
-// CHECK-NEXT:     X86 features: <corrupt length: 0x0>
-// CHECK-NEXT:     X86 features: IBT, <unknown flags: 0xf000f000f000f000>
-// CHECK-NEXT:     <corrupt type (0x2) datasz: 0x1>
+// GNU:      Displaying notes found at file offset 0x00000040 with length 0x000000b8:
+// GNU-NEXT:   Owner                 Data size       Description
+// GNU-NEXT:   GNU                   0x000000a8      NT_GNU_PROPERTY_TYPE_0 (property note)
+// GNU-NEXT:     Properties:  stack size: 0x100
+// GNU-NEXT:     stack size: 0x100
+// GNU-NEXT:     no copy on protected
+// GNU-NEXT:     X86 features: SHSTK
+// GNU-NEXT:     X86 features: IBT, SHSTK
+// GNU-NEXT:     X86 features: none
+// GNU-NEXT:     <application-specific type 0xfefefefe>
+// GNU-NEXT:     stack size: <corrupt length: 0x0>
+// GNU-NEXT:     stack size: <corrupt length: 0x4>
+// GNU-NEXT:     no copy on protected <corrupt length: 0x1>
+// GNU-NEXT:     X86 features: <corrupt length: 0x0>
+// GNU-NEXT:     X86 features: IBT, <unknown flags: 0xf000f000f000f000>
+// GNU-NEXT:     <corrupt type (0x2) datasz: 0x1>
+
+// LLVM:      Notes [
+// LLVM-NEXT:   NoteSection {
+// LLVM-NEXT:     Offset: 0x40
+// LLVM-NEXT:     Size: 0xB8
+// LLVM-NEXT:     Note {
+// LLVM-NEXT:       Owner: GNU
+// LLVM-NEXT:       Data size: 0xA8
+// LLVM-NEXT:       Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+// LLVM-NEXT:       Property [
+// LLVM-NEXT:         stack size: 0x100
+// LLVM-NEXT:         stack size: 0x100
+// LLVM-NEXT:         no copy on protected
+// LLVM-NEXT:         X86 features: SHSTK
+// LLVM-NEXT:         X86 features: IBT, SHSTK
+// LLVM-NEXT:         X86 features: none
+// LLVM-NEXT:         <application-specific type 0xfefefefe>
+// LLVM-NEXT:         stack size: <corrupt length: 0x0>
+// LLVM-NEXT:         stack size: <corrupt length: 0x4>
+// LLVM-NEXT:         no copy on protected <corrupt length: 0x1>
+// LLVM-NEXT:         X86 features: <corrupt length: 0x0>
+// LLVM-NEXT:         X86 features: IBT, <unknown flags: 0xf000f000f000f000>
+// LLVM-NEXT:         <corrupt type (0x2) datasz: 0x1>
+// LLVM-NEXT:       ]
+// LLVM-NEXT:     }
+// LLVM-NEXT:   }
+// LLVM-NEXT: ]
 
 .section ".note.gnu.property", "a"
 .align 4 
diff --git a/test/tools/llvm-readobj/note-gnu-property2.s b/test/tools/llvm-readobj/note-gnu-property2.s
index a7eca87eb3c2616d912e21db9534f363a031faec..473e0a24a7e778c65bc5c45419931cb66428916c 100644
--- a/test/tools/llvm-readobj/note-gnu-property2.s
+++ b/test/tools/llvm-readobj/note-gnu-property2.s
@@ -1,11 +1,27 @@
 // REQUIRES: x86-registered-target
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t
-// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s
+// RUN: llvm-readobj -elf-output-style GNU --notes %t | FileCheck %s --check-prefix=GNU
+// RUN: llvm-readobj -elf-output-style LLVM --notes %t | FileCheck %s --check-prefix=LLVM
 
-// CHECK:      Displaying notes found at file offset 0x00000040 with length 0x00000014:
-// CHECK-NEXT:   Owner                 Data size       Description
-// CHECK-NEXT:   GNU                   0x00000004      NT_GNU_PROPERTY_TYPE_0 (property note)
-// CHECK-NEXT:     Properties:  <corrupted GNU_PROPERTY_TYPE_0>
+// GNU:      Displaying notes found at file offset 0x00000040 with length 0x00000014:
+// GNU-NEXT:   Owner                 Data size       Description
+// GNU-NEXT:   GNU                   0x00000004      NT_GNU_PROPERTY_TYPE_0 (property note)
+// GNU-NEXT:     Properties:  <corrupted GNU_PROPERTY_TYPE_0>
+
+// LLVM:      Notes [
+// LLVM-NEXT:   NoteSection {
+// LLVM-NEXT:     Offset: 0x40
+// LLVM-NEXT:     Size: 0x14
+// LLVM-NEXT:     Note {
+// LLVM-NEXT:       Owner: GNU
+// LLVM-NEXT:       Data size: 0x4
+// LLVM-NEXT:       Type: NT_GNU_PROPERTY_TYPE_0 (property note)
+// LLVM-NEXT:       Property [
+// LLVM-NEXT:         <corrupted GNU_PROPERTY_TYPE_0>
+// LLVM-NEXT:       ]
+// LLVM-NEXT:     }
+// LLVM-NEXT:   }
+// LLVM-NEXT: ]
 
 // Section below is broken, check we report that.
 
diff --git a/test/tools/llvm-readobj/print-hex.test b/test/tools/llvm-readobj/print-hex.test
index c220eb3739d2abfc9b9a795825be89c47d196353..71e561e479e99344320b2ec8b525d7ef2265914f 100644
--- a/test/tools/llvm-readobj/print-hex.test
+++ b/test/tools/llvm-readobj/print-hex.test
@@ -22,4 +22,5 @@ MACHO: 0x00000010 000031c0 5ac3                       ..1.Z.
 RUN: llvm-readobj -x 1 %p/Inputs/trivial.obj.wasm \
 RUN:     | FileCheck %s --check-prefix WASM
 
-WASM: 0x00000000 03600001 7f60017f 017f6001 7f00 .`...`....`...
+WASM: 0x00000000 04600001 7f60017f 017f6000 0060017f .`...`....`..`..
+WASM: 0x00000010 00                                  .
diff --git a/test/tools/llvm-readobj/relocations.test b/test/tools/llvm-readobj/relocations.test
index be2981304083657d3c5252855dd39b42c0af5cfb..4a7dfa5eba0e389e2a5b44f0120c11ec6e420198 100644
--- a/test/tools/llvm-readobj/relocations.test
+++ b/test/tools/llvm-readobj/relocations.test
@@ -302,6 +302,11 @@ WASM-NEXT:     }
 WASM-NEXT:     Relocation {
 WASM-NEXT:       Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0)
 WASM-NEXT:       Offset: 0x11
+WASM-NEXT:       Symbol: .LSomeOtherFunction_bitcast
+WASM-NEXT:     }
+WASM-NEXT:     Relocation {
+WASM-NEXT:       Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB (0)
+WASM-NEXT:       Offset: 0x1E
 WASM-NEXT:       Symbol: SomeOtherFunction
 WASM-NEXT:     }
 WASM-NEXT:   }
diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test
index 4900c4f57b66517739d9bf7fb7915fe60f78ea78..c371f4bb644817ba3a23807adb7a3999f9e20ae2 100644
--- a/test/tools/llvm-readobj/sections.test
+++ b/test/tools/llvm-readobj/sections.test
@@ -496,28 +496,28 @@ MACHO-ARM-NEXT:]
 WASM:      Sections [
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: TYPE (0x1)
-WASM-NEXT:     Size: 14
+WASM-NEXT:     Size: 17
 WASM-NEXT:     Offset: 8
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: IMPORT (0x2)
 WASM-NEXT:     Size: 93
-WASM-NEXT:     Offset: 28
+WASM-NEXT:     Offset: 31
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: FUNCTION (0x3)
-WASM-NEXT:     Size: 2
-WASM-NEXT:     Offset: 127
+WASM-NEXT:     Size: 3
+WASM-NEXT:     Offset: 130
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CODE (0xA)
-WASM-NEXT:     Size: 25
-WASM-NEXT:     Offset: 135
+WASM-NEXT:     Size: 36
+WASM-NEXT:     Offset: 139
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: DATA (0xB)
 WASM-NEXT:     Size: 19
-WASM-NEXT:     Offset: 166
+WASM-NEXT:     Offset: 181
 WASM-NEXT:     Segments [
 WASM-NEXT:       Segment {
 WASM-NEXT:         Name: .rodata..L.str
@@ -528,14 +528,14 @@ WASM-NEXT:     ]
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CUSTOM (0x0)
-WASM-NEXT:     Size: 61
-WASM-NEXT:     Offset: 191
+WASM-NEXT:     Size: 89
+WASM-NEXT:     Offset: 206
 WASM-NEXT:     Name: linking
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CUSTOM (0x0)
-WASM-NEXT:     Size: 12
-WASM-NEXT:     Offset: 266
+WASM-NEXT:     Size: 15
+WASM-NEXT:     Offset: 309
 WASM-NEXT:     Name: reloc.CODE
 WASM-NEXT:   }
 WASM-NEXT: ]
diff --git a/test/tools/llvm-readobj/symbols.test b/test/tools/llvm-readobj/symbols.test
index d6bb870942da30ae04b4e12b1efc0bd9f7fe311a..1a0cacdeccdd44cf023c2355835f08e511408c71 100644
--- a/test/tools/llvm-readobj/symbols.test
+++ b/test/tools/llvm-readobj/symbols.test
@@ -88,6 +88,11 @@ WASM-NEXT:     Type: FUNCTION (0x0)
 WASM-NEXT:     Flags: 0x10
 WASM-NEXT:   }
 WASM-NEXT:   Symbol {
+WASM-NEXT:     Name: .LSomeOtherFunction_bitcast
+WASM-NEXT:     Type: FUNCTION (0x0)
+WASM-NEXT:     Flags: 0x2
+WASM-NEXT:   }
+WASM-NEXT:   Symbol {
 WASM-NEXT:     Name: SomeOtherFunction
 WASM-NEXT:     Type: FUNCTION (0x0)
 WASM-NEXT:     Flags: 0x10
diff --git a/test/tools/llvm-readobj/unwind-arm64-windows.test b/test/tools/llvm-readobj/unwind-arm64-windows.test
new file mode 100644
index 0000000000000000000000000000000000000000..879afe27efba386b3a67a5b5736146f9a47a39e5
--- /dev/null
+++ b/test/tools/llvm-readobj/unwind-arm64-windows.test
@@ -0,0 +1,69 @@
+RUN: llvm-readobj -unwind %p/Inputs/arm64-win1.obj | FileCheck %s -check-prefix=UNWIND1
+RUN: llvm-readobj -unwind %p/Inputs/arm64-win2.obj | FileCheck %s -check-prefix=UNWIND2
+
+UNWIND1:         ExceptionData {
+UNWIND1-NEXT:      FunctionLength: 340
+UNWIND1-NEXT:      Version: 0
+UNWIND1-NEXT:      ExceptionData: No
+UNWIND1-NEXT:      EpiloguePacked: Yes
+UNWIND1-NEXT:      EpilogueOffset: 15
+UNWIND1-NEXT:      ByteCodeLength: 28
+UNWIND1-NEXT:      Prologue [
+UNWIND1-NEXT:        0xe002dac8          ; sub sp, #2993280
+UNWIND1-NEXT:        0xe3                ; nop
+UNWIND1-NEXT:        0xe3                ; nop
+UNWIND1-NEXT:        0xe3                ; nop
+UNWIND1-NEXT:        0xd885              ; stp d10, d11, [sp, #40]
+UNWIND1-NEXT:        0xd803              ; stp d8, d9, [sp, #24]
+UNWIND1-NEXT:        0xd2c2              ; str x30, [sp, #16]
+UNWIND1-NEXT:        0x28                ; stp x19, x20, [sp, #-64]!
+UNWIND1-NEXT:        0xe4                ; end
+UNWIND1-NEXT:      ]
+UNWIND1-NEXT:      Epilogue [
+UNWIND1-NEXT:        0xe002dac8          ; add sp, #2993280
+UNWIND1-NEXT:        0xd885              ; ldp d10, d11, [sp, #40]
+UNWIND1-NEXT:        0xd803              ; ldp d8, d9, [sp, #24]
+UNWIND1-NEXT:        0xd2c2              ; ldr x30, [sp, #16]
+UNWIND1-NEXT:        0x28                ; ldp x19, x20, [sp], #64
+UNWIND1-NEXT:        0xe4                ; end
+UNWIND1-NEXT:      ]
+UNWIND1_NEXT:    }
+
+
+UNWIND2:         ExceptionData {
+UNWIND2-NEXT:      FunctionLength: 72
+UNWIND2-NEXT:      Version: 0
+UNWIND2-NEXT:      ExceptionData: No
+UNWIND2-NEXT:      EpiloguePacked: No
+UNWIND2-NEXT:      EpilogueScopes: 2
+UNWIND2-NEXT:      ByteCodeLength: 8
+UNWIND2-NEXT:      Prologue [
+UNWIND2-NEXT:        0x1f                ; sub sp, #496
+UNWIND2-NEXT:        0xd600              ; stp x19, lr, [sp, #0]
+UNWIND2-NEXT:        0x01                ; sub sp, #16
+UNWIND2-NEXT:        0xe4                ; end
+UNWIND2-NEXT:      ]
+UNWIND2-NEXT:      EpilogueScopes [
+UNWIND2-NEXT:        EpilogueScope {
+UNWIND2-NEXT:          StartOffset: 8
+UNWIND2-NEXT:          EpilogueStartIndex: 0
+UNWIND2-NEXT:          Opcodes [
+UNWIND2-NEXT:            0x1f                ; add sp, #496
+UNWIND2-NEXT:            0xd600              ; ldp x19, lr, [sp, #0]
+UNWIND2-NEXT:            0x01                ; add sp, #16
+UNWIND2-NEXT:            0xe4                ; end
+UNWIND2-NEXT:          ]
+UNWIND2-NEXT:        }
+UNWIND2-NEXT:        EpilogueScope {
+UNWIND2-NEXT:          StartOffset: 14
+UNWIND2-NEXT:          EpilogueStartIndex: 0
+UNWIND2-NEXT:          Opcodes [
+UNWIND2-NEXT:            0x1f                ; add sp, #496
+UNWIND2-NEXT:            0xd600              ; ldp x19, lr, [sp, #0]
+UNWIND2-NEXT:            0x01                ; add sp, #16
+UNWIND2-NEXT:            0xe4                ; end
+UNWIND2-NEXT:          ]
+UNWIND2-NEXT:        }
+UNWIND2-NEXT:      ]
+UNWIND2-NEXT:    }
+
diff --git a/test/tools/llvm-strings/negative-char.test b/test/tools/llvm-strings/negative-char.test
new file mode 100644
index 0000000000000000000000000000000000000000..331dde47078927295994e08f0cb755375a12c4fb
--- /dev/null
+++ b/test/tools/llvm-strings/negative-char.test
@@ -0,0 +1,3 @@
+# RUN: echo -e "z\0\x80\0a\0" | llvm-strings --bytes 1 - | FileCheck %s
+# CHECK: z{{$}}
+# CHECK-NEXT: {{^}} a
diff --git a/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt b/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt
index 88a9dc2e58c77b0bfe1b78b31cf3aa852c1fff44..52ec12550a3d3604a39f01503123d86b95ab3935 100644
--- a/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-basic-arg1-to-yaml.txt
@@ -8,8 +8,8 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3500000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-enter, tsc: 22555670288232728 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288334784 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', args: [ 1 ], cpu: 17, thread: 8715, kind: function-enter-arg, tsc: 22555670288335768 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288365224 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-enter, tsc: 22555670288232728, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288334784, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', args: [ 1 ], cpu: 17, thread: 8715, kind: function-enter-arg, tsc: 22555670288335768, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 17, thread: 8715, kind: function-exit, tsc: 22555670288365224, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt
index 65232b79ba41cc0720e5e37314f1440649fb7d90..84c757c2b26398b49e9a33cccc044c0ba4e9d13e 100644
--- a/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-basic-log-arg1-version3-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033303630902004 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033403115246844 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033490200702516 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033504122687120 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 0, thread: 2590, process: 2590, kind: function-enter-arg, tsc: 2033505343905936 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033505343936752 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033303630902004, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033403115246844, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-enter, tsc: 2033490200702516, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033504122687120, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 0, thread: 2590, process: 2590, kind: function-enter-arg, tsc: 2033505343905936, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 0, thread: 2590, process: 2590, kind: function-exit, tsc: 2033505343936752, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt
index 21a3b7e4a0f1be8b181fcdc596092be319f128d9..d2af2fc09c2ebff7884d29840a8d8fe898b1200f 100644
--- a/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-basic-log-version3-to-yaml.txt
@@ -8,12 +8,12 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070767347414784 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070767347496472 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768324320264 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768324344100 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768921602152 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768921625968 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070769627174140 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070769627197624 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070767347414784, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070767347496472, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768324320264, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768324344100, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070768921602152, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070768921625968, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-enter, tsc: 2070769627174140, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 7, thread: 25518, process: 25518, kind: function-exit, tsc: 2070769627197624, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt
index 06b5eb8904e706a57a5c762517e5d07b8fa227eb..592796434bd83bb0f27630be7ed92e52adfefacf 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-arg1-to-yaml.txt
@@ -8,6 +8,6 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3500000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 1 ], cpu: 49, thread: 14648, kind: function-enter-arg, tsc: 18828908666543318 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 49, thread: 14648, kind: function-exit, tsc: 18828908666595604 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 1 ], cpu: 49, thread: 14648, kind: function-enter-arg, tsc: 18828908666543318, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 49, thread: 14648, kind: function-exit, tsc: 18828908666595604, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt
index a3a3ed6d22b2452f5c1ad81c74ddda91fed63733..afeac68fa3dacecbd6dd7f4ee7ab60e937279ef6 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-log-arg1-version3-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034042117104344 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034042117199088 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034043145686378 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034043145762200 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 6, thread: 2631, process: 2631, kind: function-enter-arg, tsc: 2034049739853430 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034049739878154 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034042117104344, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034042117199088, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-enter, tsc: 2034043145686378, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034043145762200, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', args: [ 67 ], cpu: 6, thread: 2631, process: 2631, kind: function-enter-arg, tsc: 2034049739853430, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 6, thread: 2631, process: 2631, kind: function-exit, tsc: 2034049739878154, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt
index 46287b2572cf5914baffe52bb8434ccbc5c1f722..fc70015c41e879f558b888b8efdc5e6f92018c59 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-log-version3-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 3900000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069294857657498 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069294857707502 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069295590705912 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069295590734308 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069296377598128 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069296377627032 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069294857657498, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069294857707502, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069295590705912, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069295590734308, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-enter, tsc: 2069296377598128, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 4, thread: 25190, process: 25190, kind: function-exit, tsc: 2069296377627032, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
index 731ab3083d283ab00b6158823c69369752f02410..99bc7e11b97b8719ad030cb9368da80d94363212 100644
--- a/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
@@ -8,17 +8,17 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 5678
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407340 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407346 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407347 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407387 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407437 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407467 }
-; CHECK-NEXT:   - { type: 0, func-id: 4, function: '4', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407492 }
-; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407517 }
-; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-tail-exit, tsc: 7238225556407542 }
-; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407552 }
-; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407562 }
-; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-enter, tsc: 7238225556407682 }
-; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-exit, tsc: 7238225556407755 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407340, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407346, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407347, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407387, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407437, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407467, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 4, function: '4', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407492, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407517, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-tail-exit, tsc: 7238225556407542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407552, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407562, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-enter, tsc: 7238225556407682, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-exit, tsc: 7238225556407755, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-roundtrip.yaml b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
index 4c5dfd181488deff85fd448d8fbe4689a1b68e77..bbebd67e576116199c5b3a23d3b3f104fde5b2d0 100644
--- a/test/tools/llvm-xray/X86/convert-roundtrip.yaml
+++ b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
@@ -19,6 +19,6 @@ records:
 #CHECK-NEXT:    nonstop-tsc: true
 #CHECK-NEXT:    cycle-frequency: 2601000000
 #CHECK-NEXT:  records:
-#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
-#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
+#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter, tsc: 10001, data: '' }
+#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit, tsc: 10100, data: '' }
 #CHECK-NEXT:  ...
diff --git a/test/tools/llvm-xray/X86/convert-to-yaml.txt b/test/tools/llvm-xray/X86/convert-to-yaml.txt
index 66a5618e12f6eaa011990a6979288d3b679bc38a..f807fae3a64c5d4c7c3fbffad5dfa73682f9e39c 100644
--- a/test/tools/llvm-xray/X86/convert-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-to-yaml.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
index 76cee99d4b51ed52771783c72286cf54bee8c0ce..dbb98e3d3cf053048b760c2a3e5ff192d1da8e6b 100644
--- a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
+++ b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
index 700fa38ed38c61037a2f16b967cac82105a08192..9a1218256565e49d73fab0367f49918d46598be9 100644
--- a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
+++ b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
index 6837072a1fc5f692b042e58934ffdb2ec0c76cb5..1efcb3572bad8fdd28bbfca010039365f94c3e34 100644
--- a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
+++ b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
@@ -8,10 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802, data: '' }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828, data: '' }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt b/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt
index 35303016af9a09f809635ad2d0dc89668986a531..ccb8a1b0538b7486fd8ba493a5f2f80f7d9a6d7c 100644
--- a/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt
+++ b/test/tools/llvm-xray/X86/fdr-dump-arg1-version-3.txt
@@ -12,14 +12,14 @@
 ; CHECK-NEXT:  <CPU: id = 6, tsc = 2034042117104344>
 ; CHECK-NEXT:  <TSC Wrap: base = 2034042117104344>
 ; CHECK-EMPTY:
-; CHECK-NEXT: -  <Function Enter: #3 delta = +3>
-; CHECK-NEXT: -  <Function Exit: #3 delta = +3>
-; CHECK-NEXT: -  <Function Enter: #2 delta = +2>
-; CHECK-NEXT: -  <Function Exit: #2 delta = +2>
+; CHECK-NEXT: -  <Function Enter: #3 delta = +0>
+; CHECK-NEXT: -  <Function Exit: #3 delta = +94744>
+; CHECK-NEXT: -  <Function Enter: #2 delta = +1028487290>
+; CHECK-NEXT: -  <Function Exit: #2 delta = +75822>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Metadata: <TSC Wrap: base = 2034049739853430>
 ; CHECK-EMPTY:
-; CHECK-NEXT: -  <Function Enter: #1 delta = +1>
+; CHECK-NEXT: -  <Function Enter: #1 delta = +0>
 ; CHECK-NEXT:  : <Call Argument: data = 67 (hex = 0x43)>
-; CHECK-NEXT: -  <Function Exit: #1 delta = +1>
+; CHECK-NEXT: -  <Function Exit: #1 delta = +24724>
 
diff --git a/test/tools/llvm-xray/X86/fdr-dump-arg1.txt b/test/tools/llvm-xray/X86/fdr-dump-arg1.txt
index df39f6ddd5fe46019af4ab4b6cf5b6f832c6f634..8fb381a170c32d2e13e380ddc0639cea25c310c7 100644
--- a/test/tools/llvm-xray/X86/fdr-dump-arg1.txt
+++ b/test/tools/llvm-xray/X86/fdr-dump-arg1.txt
@@ -9,8 +9,8 @@
 ; CHECK-NEXT:  <CPU: id = 49, tsc = 18828908666540172>
 ; CHECK-NEXT:  <TSC Wrap: base = 18828908666540172>
 ; CHECK-EMPTY:
-; CHECK-NEXT: -  <Function Enter: #1 delta = +1>
+; CHECK-NEXT: -  <Function Enter: #1 delta = +3146>
 ; CHECK-NEXT:  : <Call Argument: data = 1 (hex = 0x1)>
-; CHECK-NEXT: -  <Function Exit: #1 delta = +1>
+; CHECK-NEXT: -  <Function Exit: #1 delta = +52286>
 ; CHECK-NEXT:  *** <End of Buffer>
 
diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index e973bfef4dc60eed372949860bc3ef78e0462263..a50ff4c255bdabe4fd9e9b0f281d3d85a0e8b7aa 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp
@@ -409,7 +409,7 @@ bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
         for (BasicBlock *Succ : successors(&BB))
           Succ->removePredecessor(&BB);
 
-        TerminatorInst *BBTerm = BB.getTerminator();
+        Instruction *BBTerm = BB.getTerminator();
         if (BBTerm->isEHPad() || BBTerm->getType()->isTokenTy())
           continue;
         if (!BBTerm->getType()->isVoidTy())
diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
index c0e6d505941fd3c896f417710e6407401fb1d542..5fe40678ca9acd20af8d17905f51d47491461ed7 100644
--- a/tools/dsymutil/dsymutil.cpp
+++ b/tools/dsymutil/dsymutil.cpp
@@ -7,9 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This program is a utility that aims to be a dropin replacement for
-// Darwin's dsymutil.
-//
+// This program is a utility that aims to be a dropin replacement for Darwin's
+// dsymutil.
 //===----------------------------------------------------------------------===//
 
 #include "dsymutil.h"
@@ -165,20 +164,18 @@ static opt<bool>
                        desc("Embed warnings in the linked DWARF debug info."),
                        cat(DsymCategory));
 
-static bool createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
+static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
   if (NoOutput)
-    return true;
+    return Error::success();
 
   // Create plist file to write to.
   llvm::SmallString<128> InfoPlist(BundleRoot);
   llvm::sys::path::append(InfoPlist, "Contents/Info.plist");
   std::error_code EC;
   llvm::raw_fd_ostream PL(InfoPlist, EC, llvm::sys::fs::F_Text);
-  if (EC) {
-    WithColor::error() << "cannot create plist file " << InfoPlist << ": "
-                       << EC.message() << '\n';
-    return false;
-  }
+  if (EC)
+    return make_error<StringError>(
+        "cannot create Plist: " + toString(errorCodeToError(EC)), EC);
 
   CFBundleInfo BI = getBundleInfo(Bin);
 
@@ -230,22 +227,21 @@ static bool createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
      << "</plist>\n";
 
   PL.close();
-  return true;
+  return Error::success();
 }
 
-static bool createBundleDir(llvm::StringRef BundleBase) {
+static Error createBundleDir(llvm::StringRef BundleBase) {
   if (NoOutput)
-    return true;
+    return Error::success();
 
   llvm::SmallString<128> Bundle(BundleBase);
   llvm::sys::path::append(Bundle, "Contents", "Resources", "DWARF");
-  if (std::error_code EC = create_directories(Bundle.str(), true,
-                                              llvm::sys::fs::perms::all_all)) {
-    WithColor::error() << "cannot create directory " << Bundle << ": "
-                       << EC.message() << "\n";
-    return false;
-  }
-  return true;
+  if (std::error_code EC =
+          create_directories(Bundle.str(), true, llvm::sys::fs::perms::all_all))
+    return make_error<StringError>(
+        "cannot create bundle: " + toString(errorCodeToError(EC)), EC);
+
+  return Error::success();
 }
 
 static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
@@ -257,7 +253,7 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
 
   Expected<OwningBinary<Binary>> BinOrErr = createBinary(OutputFile);
   if (!BinOrErr) {
-    errs() << OutputFile << ": " << toString(BinOrErr.takeError());
+    WithColor::error() << OutputFile << ": " << toString(BinOrErr.takeError());
     return false;
   }
 
@@ -276,7 +272,7 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
   return false;
 }
 
-static std::string getOutputFileName(llvm::StringRef InputFile) {
+static Expected<std::string> getOutputFileName(llvm::StringRef InputFile) {
   // When updating, do in place replacement.
   if (OutputFileOpt.empty() && Update)
     return InputFile;
@@ -305,8 +301,10 @@ static std::string getOutputFileName(llvm::StringRef InputFile) {
   llvm::SmallString<128> BundleDir(OutputFileOpt);
   if (BundleDir.empty())
     BundleDir = DwarfFile + ".dSYM";
-  if (!createBundleDir(BundleDir) || !createPlistFile(DwarfFile, BundleDir))
-    return "";
+  if (auto E = createBundleDir(BundleDir))
+    return std::move(E);
+  if (auto E = createPlistFile(DwarfFile, BundleDir))
+    return std::move(E);
 
   llvm::sys::path::append(BundleDir, "Contents", "Resources", "DWARF",
                           llvm::sys::path::filename(DwarfFile));
@@ -521,13 +519,20 @@ int main(int argc, char **argv) {
       // Using a std::shared_ptr rather than std::unique_ptr because move-only
       // types don't work with std::bind in the ThreadPool implementation.
       std::shared_ptr<raw_fd_ostream> OS;
-      std::string OutputFile = getOutputFileName(InputFile);
+
+      Expected<std::string> OutputFileOrErr = getOutputFileName(InputFile);
+      if (!OutputFileOrErr) {
+        WithColor::error() << toString(OutputFileOrErr.takeError());
+        return 1;
+      }
+
+      std::string OutputFile = *OutputFileOrErr;
       if (NeedsTempFiles) {
         TempFiles.emplace_back(Map->getTriple().getArchName().str());
 
         auto E = TempFiles.back().createTempFile();
         if (E) {
-          errs() << toString(std::move(E));
+          WithColor::error() << toString(std::move(E));
           return 1;
         }
 
@@ -540,7 +545,7 @@ int main(int argc, char **argv) {
         OS = std::make_shared<raw_fd_ostream>(NoOutput ? "-" : OutputFile, EC,
                                               sys::fs::F_None);
         if (EC) {
-          errs() << OutputFile << ": " << EC.message();
+          WithColor::error() << OutputFile << ": " << EC.message();
           return 1;
         }
       }
@@ -567,10 +572,16 @@ int main(int argc, char **argv) {
     if (!AllOK)
       return 1;
 
-    if (NeedsTempFiles &&
-        !MachOUtils::generateUniversalBinary(
-            TempFiles, getOutputFileName(InputFile), *OptionsOrErr, SDKPath))
-      return 1;
+    if (NeedsTempFiles) {
+      Expected<std::string> OutputFileOrErr = getOutputFileName(InputFile);
+      if (!OutputFileOrErr) {
+        WithColor::error() << toString(OutputFileOrErr.takeError());
+        return 1;
+      }
+      if (!MachOUtils::generateUniversalBinary(TempFiles, *OutputFileOrErr,
+                                               *OptionsOrErr, SDKPath))
+        return 1;
+    }
   }
 
   return 0;
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index 239460d972d4c22999a1af4c2a8d19e8d9f556ac..71e5b72a40c802e6a29a1ca19f8a5e2edfd3f170 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -447,8 +447,8 @@ static void diagnosticHandler(const DiagnosticInfo &DI) {
   ld_plugin_level Level;
   switch (DI.getSeverity()) {
   case DS_Error:
-    message(LDPL_FATAL, "LLVM gold plugin has failed to create LTO module: %s",
-            ErrStorage.c_str());
+    Level = LDPL_FATAL;
+    break;
   case DS_Warning:
     Level = LDPL_WARNING;
     break;
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 4794fe532a5254a685819f87bb5f05cf30b177fb..c3c57e2cdeed3cbfb2a22f463d9587fa2bd7c484 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -115,6 +115,11 @@ namespace {
                "rather than individual functions"),
       cl::init(false));
 
+  cl::list<std::string>
+      JITDylibs("jd",
+                cl::desc("Specifies the JITDylib to be used for any subsequent "
+                         "-extra-module arguments."));
+
   // The MCJIT supports building for a target address space separate from
   // the JIT compilation process. Use a forked process and a copying
   // memory manager with IPC to execute using this functionality.
@@ -696,7 +701,7 @@ int main(int argc, char **argv, char * const *envp) {
   return Result;
 }
 
-static orc::IRTransformLayer2::TransformFunction createDebugDumper() {
+static orc::IRTransformLayer::TransformFunction createDebugDumper() {
   switch (OrcDumpKind) {
   case DumpKind::NoDump:
     return [](orc::ThreadSafeModule TSM,
@@ -749,6 +754,8 @@ static orc::IRTransformLayer2::TransformFunction createDebugDumper() {
   llvm_unreachable("Unknown DumpKind");
 }
 
+static void exitOnLazyCallThroughFailure() { exit(1); }
+
 int runOrcLazyJIT(const char *ProgName) {
   // Start setting up the JIT environment.
 
@@ -778,10 +785,14 @@ int runOrcLazyJIT(const char *ProgName) {
                         : None);
 
   DataLayout DL = ExitOnErr(JTMB.getDefaultDataLayoutForTarget());
-  auto J = ExitOnErr(orc::LLLazyJIT::Create(std::move(JTMB), DL, LazyJITCompileThreads));
+
+  auto J = ExitOnErr(orc::LLLazyJIT::Create(
+      std::move(JTMB), DL,
+      pointerToJITTargetAddress(exitOnLazyCallThroughFailure),
+      LazyJITCompileThreads));
 
   if (PerModuleLazy)
-    J->setPartitionFunction(orc::CompileOnDemandLayer2::compileWholeModule);
+    J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule);
 
   auto Dump = createDebugDumper();
 
@@ -793,23 +804,42 @@ int runOrcLazyJIT(const char *ProgName) {
     }
     return Dump(std::move(TSM), R);
   });
-  J->getMainJITDylib().setFallbackDefinitionGenerator(ExitOnErr(
-      orc::DynamicLibraryFallbackGenerator::CreateForCurrentProcess(DL)));
+  J->getMainJITDylib().setGenerator(
+      ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(DL)));
 
   orc::MangleAndInterner Mangle(J->getExecutionSession(), DL);
-  orc::LocalCXXRuntimeOverrides2 CXXRuntimeOverrides;
+  orc::LocalCXXRuntimeOverrides CXXRuntimeOverrides;
   ExitOnErr(CXXRuntimeOverrides.enable(J->getMainJITDylib(), Mangle));
 
   // Add the main module.
   ExitOnErr(J->addLazyIRModule(std::move(MainModule)));
 
-  // Add any extra modules.
-  for (auto &ModulePath : ExtraModules) {
-    auto M = parseIRFile(ModulePath, Err, *TSCtx.getContext());
-    if (!M)
-      reportError(Err, ProgName);
+  // Create JITDylibs and add any extra modules.
+  {
+    // Create JITDylibs, keep a map from argument index to dylib. We will use
+    // -extra-module argument indexes to determine what dylib to use for each
+    // -extra-module.
+    std::map<unsigned, orc::JITDylib *> IdxToDylib;
+    IdxToDylib[0] = &J->getMainJITDylib();
+    for (auto JDItr = JITDylibs.begin(), JDEnd = JITDylibs.end();
+         JDItr != JDEnd; ++JDItr) {
+      IdxToDylib[JITDylibs.getPosition(JDItr - JITDylibs.begin())] =
+          &J->createJITDylib(*JDItr);
+    }
 
-    ExitOnErr(J->addLazyIRModule(orc::ThreadSafeModule(std::move(M), TSCtx)));
+    for (auto EMItr = ExtraModules.begin(), EMEnd = ExtraModules.end();
+         EMItr != EMEnd; ++EMItr) {
+      auto M = parseIRFile(*EMItr, Err, *TSCtx.getContext());
+      if (!M)
+        reportError(Err, ProgName);
+
+      auto EMIdx = ExtraModules.getPosition(EMItr - ExtraModules.begin());
+      assert(EMIdx != 0 && "ExtraModule should have index > 0");
+      auto JDItr = std::prev(IdxToDylib.lower_bound(EMIdx));
+      auto &JD = *JDItr->second;
+      ExitOnErr(
+          J->addLazyIRModule(JD, orc::ThreadSafeModule(std::move(M), TSCtx)));
+    }
   }
 
   // Add the objects.
@@ -837,6 +867,8 @@ int runOrcLazyJIT(const char *ProgName) {
     AltEntryThreads.push_back(std::thread([EntryPoint]() { EntryPoint(); }));
   }
 
+  J->getExecutionSession().dump(llvm::dbgs());
+
   // Run main.
   auto MainSym = ExitOnErr(J->lookup("main"));
   typedef int (*MainFnPtr)(int, const char *[]);
diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index 2970a59beee22ad88bd23616806003b5f3be7d9c..191c684d5245eb90c79c4994a3e2b92d78f2c2fb 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  BinaryFormat
   Core
   DlltoolDriver
   LibDriver
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 454b3971d2857f397ccbb43e893d2246ebbf7ac4..5ab8ae13d3e0eeb5d70fec29272ffc7600beef35 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -96,6 +96,7 @@ MODIFIERS:
   [D] - use zero for timestamps and uids/gids (default)
   [i] - put [files] before [relpos] (same as [b])
   [l] - ignored for compatibility
+  [L] - add archive's contents
   [o] - preserve original dates
   [s] - create an archive index (cf. ranlib)
   [S] - do not build a symbol table
@@ -156,14 +157,14 @@ static std::string Options;
 // This enumeration delineates the kinds of operations on an archive
 // that are permitted.
 enum ArchiveOperation {
-  Print,            ///< Print the contents of the archive
-  Delete,           ///< Delete the specified members
-  Move,             ///< Move members to end or as given by {a,b,i} modifiers
-  QuickAppend,      ///< Quickly append to end of archive
-  ReplaceOrInsert,  ///< Replace or Insert members
-  DisplayTable,     ///< Display the table of contents
-  Extract,          ///< Extract files back to file system
-  CreateSymTab      ///< Create a symbol table in an existing archive
+  Print,           ///< Print the contents of the archive
+  Delete,          ///< Delete the specified members
+  Move,            ///< Move members to end or as given by {a,b,i} modifiers
+  QuickAppend,     ///< Quickly append to end of archive
+  ReplaceOrInsert, ///< Replace or Insert members
+  DisplayTable,    ///< Display the table of contents
+  Extract,         ///< Extract files back to file system
+  CreateSymTab     ///< Create a symbol table in an existing archive
 };
 
 // Modifiers to follow operation to vary behavior
@@ -176,6 +177,7 @@ static bool Verbose = false;       ///< 'v' modifier
 static bool Symtab = true;         ///< 's' modifier
 static bool Deterministic = true;  ///< 'D' and 'U' modifiers
 static bool Thin = false;          ///< 'T' modifier
+static bool AddLibrary = false;    ///< 'L' modifier
 
 // Relative Positional Argument (for insert/move). This variable holds
 // the name of the archive member to which the 'a', 'b' or 'i' modifier
@@ -214,6 +216,21 @@ static void getMembers() {
     Members.push_back(Arg);
 }
 
+std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
+std::vector<std::unique_ptr<object::Archive>> Archives;
+
+static object::Archive &readLibrary(const Twine &Library) {
+  auto BufOrErr = MemoryBuffer::getFile(Library, -1, false);
+  failIfError(BufOrErr.getError(), "Could not open library");
+  ArchiveBuffers.push_back(std::move(*BufOrErr));
+  auto LibOrErr =
+      object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
+  failIfError(errorToErrorCode(LibOrErr.takeError()),
+              "Could not parse library");
+  Archives.push_back(std::move(*LibOrErr));
+  return *Archives.back();
+}
+
 static void runMRIScript();
 
 // Parse the command line options as presented and return the operation
@@ -239,18 +256,44 @@ static ArchiveOperation parseCommandLine() {
 
   bool MaybeJustCreateSymTab = false;
 
-  for(unsigned i=0; i<Options.size(); ++i) {
-    switch(Options[i]) {
-    case 'd': ++NumOperations; Operation = Delete; break;
-    case 'm': ++NumOperations; Operation = Move ; break;
-    case 'p': ++NumOperations; Operation = Print; break;
-    case 'q': ++NumOperations; Operation = QuickAppend; break;
-    case 'r': ++NumOperations; Operation = ReplaceOrInsert; break;
-    case 't': ++NumOperations; Operation = DisplayTable; break;
-    case 'x': ++NumOperations; Operation = Extract; break;
-    case 'c': Create = true; break;
-    case 'l': /* accepted but unused */ break;
-    case 'o': OriginalDates = true; break;
+  for (unsigned i = 0; i < Options.size(); ++i) {
+    switch (Options[i]) {
+    case 'd':
+      ++NumOperations;
+      Operation = Delete;
+      break;
+    case 'm':
+      ++NumOperations;
+      Operation = Move;
+      break;
+    case 'p':
+      ++NumOperations;
+      Operation = Print;
+      break;
+    case 'q':
+      ++NumOperations;
+      Operation = QuickAppend;
+      break;
+    case 'r':
+      ++NumOperations;
+      Operation = ReplaceOrInsert;
+      break;
+    case 't':
+      ++NumOperations;
+      Operation = DisplayTable;
+      break;
+    case 'x':
+      ++NumOperations;
+      Operation = Extract;
+      break;
+    case 'c':
+      Create = true;
+      break;
+    case 'l': /* accepted but unused */
+      break;
+    case 'o':
+      OriginalDates = true;
+      break;
     case 's':
       Symtab = true;
       MaybeJustCreateSymTab = true;
@@ -258,8 +301,12 @@ static ArchiveOperation parseCommandLine() {
     case 'S':
       Symtab = false;
       break;
-    case 'u': OnlyUpdate = true; break;
-    case 'v': Verbose = true; break;
+    case 'u':
+      OnlyUpdate = true;
+      break;
+    case 'v':
+      Verbose = true;
+      break;
     case 'a':
       getRelPos();
       AddAfter = true;
@@ -284,6 +331,9 @@ static ArchiveOperation parseCommandLine() {
     case 'T':
       Thin = true;
       break;
+    case 'L':
+      AddLibrary = true;
+      break;
     default:
       fail(std::string("unknown option ") + Options[i]);
     }
@@ -296,7 +346,7 @@ static ArchiveOperation parseCommandLine() {
   // Everything on the command line at this point is a member.
   getMembers();
 
- if (NumOperations == 0 && MaybeJustCreateSymTab) {
+  if (NumOperations == 0 && MaybeJustCreateSymTab) {
     NumOperations = 1;
     Operation = CreateSymTab;
     if (!Members.empty())
@@ -320,6 +370,8 @@ static ArchiveOperation parseCommandLine() {
     fail("The 'o' modifier is only applicable to the 'x' operation");
   if (OnlyUpdate && Operation != ReplaceOrInsert)
     fail("The 'u' modifier is only applicable to the 'r' operation");
+  if (AddLibrary && Operation != QuickAppend)
+    fail("The 'L' modifier is only applicable to the 'q' operation");
 
   // Return the parsed operation to the caller
   return Operation;
@@ -512,6 +564,26 @@ static void addMember(std::vector<NewArchiveMember> &Members,
     Members[Pos] = std::move(*NMOrErr);
 }
 
+static void addLibMember(std::vector<NewArchiveMember> &Members,
+                         StringRef FileName) {
+  Expected<NewArchiveMember> NMOrErr =
+      NewArchiveMember::getFile(FileName, Deterministic);
+  failIfError(NMOrErr.takeError(), FileName);
+  if (identify_magic(NMOrErr->Buf->getBuffer()) == file_magic::archive) {
+    object::Archive &Lib = readLibrary(FileName);
+    Error Err = Error::success();
+
+    for (auto &Child : Lib.children(Err))
+      addMember(Members, Child);
+
+    failIfError(std::move(Err));
+  } else {
+    // Use the basename of the object path for the member name.
+    NMOrErr->MemberName = sys::path::filename(NMOrErr->MemberName);
+    Members.push_back(std::move(*NMOrErr));
+  }
+}
+
 enum InsertAction {
   IA_AddOldMember,
   IA_AddNewMember,
@@ -634,6 +706,13 @@ computeNewArchiveMembers(ArchiveOperation Operation,
     ++Pos;
   }
 
+  if (AddLibrary) {
+    assert(Operation == QuickAppend);
+    for (auto &Member : Members)
+      addLibMember(Ret, Member);
+    return Ret;
+  }
+
   for (unsigned I = 0; I != Members.size(); ++I)
     Ret.insert(Ret.begin() + InsertPos, NewArchiveMember());
   Pos = InsertPos;
@@ -665,11 +744,10 @@ static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
   return getDefaultForHost();
 }
 
-static void
-performWriteOperation(ArchiveOperation Operation,
-                      object::Archive *OldArchive,
-                      std::unique_ptr<MemoryBuffer> OldArchiveBuf,
-                      std::vector<NewArchiveMember> *NewMembersP) {
+static void performWriteOperation(ArchiveOperation Operation,
+                                  object::Archive *OldArchive,
+                                  std::unique_ptr<MemoryBuffer> OldArchiveBuf,
+                                  std::vector<NewArchiveMember> *NewMembersP) {
   std::vector<NewArchiveMember> NewMembers;
   if (!NewMembersP)
     NewMembers = computeNewArchiveMembers(Operation, OldArchive);
@@ -791,8 +869,6 @@ static void runMRIScript() {
   const MemoryBuffer &Ref = *Buf.get();
   bool Saved = false;
   std::vector<NewArchiveMember> NewMembers;
-  std::vector<std::unique_ptr<MemoryBuffer>> ArchiveBuffers;
-  std::vector<std::unique_ptr<object::Archive>> Archives;
 
   for (line_iterator I(Ref, /*SkipBlanks*/ false), E; I != E; ++I) {
     StringRef Line = *I;
@@ -817,15 +893,7 @@ static void runMRIScript() {
 
     switch (Command) {
     case MRICommand::AddLib: {
-      auto BufOrErr = MemoryBuffer::getFile(Rest, -1, false);
-      failIfError(BufOrErr.getError(), "Could not open library");
-      ArchiveBuffers.push_back(std::move(*BufOrErr));
-      auto LibOrErr =
-          object::Archive::create(ArchiveBuffers.back()->getMemBufferRef());
-      failIfError(errorToErrorCode(LibOrErr.takeError()),
-                  "Could not parse library");
-      Archives.push_back(std::move(*LibOrErr));
-      object::Archive &Lib = *Archives.back();
+      object::Archive &Lib = readLibrary(Rest);
       {
         Error Err = Error::success();
         for (auto &Member : Lib.children(Err))
@@ -884,7 +952,7 @@ static int ar_main(int argc, char **argv) {
   BumpPtrAllocator Alloc;
   StringSaver Saver(Alloc);
   cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv);
-  for(size_t i = 1; i < Argv.size(); ++i) {
+  for (size_t i = 1; i < Argv.size(); ++i) {
     StringRef Arg = Argv[i];
     const char *match;
     auto MatchFlagWithArg = [&](const char *expected) {
@@ -895,8 +963,7 @@ static int ar_main(int argc, char **argv) {
         match = Argv[i];
         return true;
       }
-      if (Arg.startswith(expected) && Arg.size() > len &&
-                 Arg[len] == '=') {
+      if (Arg.startswith(expected) && Arg.size() > len && Arg[len] == '=') {
         match = Arg.data() + len + 1;
         return true;
       }
@@ -905,7 +972,7 @@ static int ar_main(int argc, char **argv) {
     if (handleGenericOption(Argv[i]))
       return 0;
     if (Arg == "--") {
-      for(; i < Argv.size(); ++i)
+      for (; i < Argv.size(); ++i)
         PositionalArgs.push_back(Argv[i]);
       break;
     }
@@ -918,11 +985,11 @@ static int ar_main(int argc, char **argv) {
         MRI = true;
       } else if (MatchFlagWithArg("format")) {
         FormatType = StringSwitch<Format>(match)
-            .Case("default", Default)
-            .Case("gnu", GNU)
-            .Case("darwin", DARWIN)
-            .Case("bsd", BSD)
-            .Default(Unknown);
+                         .Case("default", Default)
+                         .Case("gnu", GNU)
+                         .Case("darwin", DARWIN)
+                         .Case("bsd", BSD)
+                         .Default(Unknown);
         if (FormatType == Unknown)
           fail(std::string("Invalid format ") + match);
       } else if (MatchFlagWithArg("plugin")) {
@@ -942,7 +1009,7 @@ static int ar_main(int argc, char **argv) {
 
 static int ranlib_main(int argc, char **argv) {
   bool ArchiveSpecified = false;
-  for(int i = 1; i < argc; ++i) {
+  for (int i = 1; i < argc; ++i) {
     if (handleGenericOption(argv[i])) {
       return 0;
     } else {
diff --git a/tools/llvm-c-test/echo.cpp b/tools/llvm-c-test/echo.cpp
index d4c61e2d13c4cea6381061244ca797947d1e0fc9..db926e8aceaac9c18ee6277bbc7c14a6e3cef687 100644
--- a/tools/llvm-c-test/echo.cpp
+++ b/tools/llvm-c-test/echo.cpp
@@ -240,7 +240,17 @@ static LLVMValueRef clone_constant_impl(LLVMValueRef Cst, LLVMModuleRef M) {
     // Try function
     if (LLVMIsAFunction(Cst)) {
       check_value_kind(Cst, LLVMFunctionValueKind);
-      LLVMValueRef Dst = LLVMGetNamedFunction(M, Name);
+
+      LLVMValueRef Dst = nullptr;
+      // Try an intrinsic
+      unsigned ID = LLVMGetIntrinsicID(Cst);
+      if (ID > 0 && !LLVMIntrinsicIsOverloaded(ID)) {
+        Dst = LLVMGetIntrinsicDeclaration(M, ID, nullptr, 0);
+      } else {
+        // Try a normal function
+        Dst = LLVMGetNamedFunction(M, Name);
+      }
+
       if (Dst)
         return Dst;
       report_fatal_error("Could not find function");
diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp
index b2673c1407f42b8819e9d069555eae23b2ac51fc..acff8bb3e89bef92e9126194f5efe162a8d02ac7 100644
--- a/tools/llvm-diff/DifferenceEngine.cpp
+++ b/tools/llvm-diff/DifferenceEngine.cpp
@@ -629,8 +629,8 @@ void FunctionDifferenceEngine::runBlockDiff(BasicBlock::iterator LStart,
   // If the terminators have different kinds, but one is an invoke and the
   // other is an unconditional branch immediately following a call, unify
   // the results and the destinations.
-  TerminatorInst *LTerm = LStart->getParent()->getTerminator();
-  TerminatorInst *RTerm = RStart->getParent()->getTerminator();
+  Instruction *LTerm = LStart->getParent()->getTerminator();
+  Instruction *RTerm = RStart->getParent()->getTerminator();
   if (isa<BranchInst>(LTerm) && isa<InvokeInst>(RTerm)) {
     if (cast<BranchInst>(LTerm)->isConditional()) return;
     BasicBlock::iterator I = LTerm->getIterator();
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 42992641eb7e4d1516c0b86ea0d37427aa80ed2d..d9e8e36efe5c57d602b0b74c3cf360f84b7fef55 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -226,7 +226,7 @@ static alias VerboseAlias("v", desc("Alias for -verbose."), aliasopt(Verbose),
 static void error(StringRef Prefix, std::error_code EC) {
   if (!EC)
     return;
-  errs() << Prefix << ": " << EC.message() << "\n";
+  WithColor::error() << Prefix << ": " << EC.message() << "\n";
   exit(1);
 }
 
@@ -422,8 +422,8 @@ static bool dumpObjectFile(ObjectFile &Obj, DWARFContext &DICtx, Twine Filename,
     for (auto name : Name)
       Names.insert((IgnoreCase && !UseRegex) ? StringRef(name).lower() : name);
 
-    filterByName(Names, DICtx.compile_units(), OS);
-    filterByName(Names, DICtx.dwo_compile_units(), OS);
+    filterByName(Names, DICtx.normal_units(), OS);
+    filterByName(Names, DICtx.dwo_units(), OS);
     return true;
   }
 
@@ -571,6 +571,14 @@ int main(int argc, char **argv) {
     return 0;
   }
 
+  // FIXME: Audit interactions between these two options and make them
+  //        compatible.
+  if (Diff && Verbose) {
+    WithColor::error() << "incompatible arguments: specifying both -diff and "
+                          "-verbose is currently not supported";
+    return 0;
+  }
+
   std::unique_ptr<ToolOutputFile> OutputFile;
   if (!OutputFilename.empty()) {
     std::error_code EC;
@@ -624,7 +632,7 @@ int main(int argc, char **argv) {
 
   if (Verify) {
     // If we encountered errors during verify, exit with a non-zero exit status.
-    if (!std::all_of(Objects.begin(), Objects.end(), [&](std::string Object) {
+    if (!all_of(Objects, [&](std::string Object) {
           return handleFile(Object, verifyObjectFile, OS);
         }))
       exit(1);
diff --git a/tools/llvm-exegesis/lib/AArch64/Target.cpp b/tools/llvm-exegesis/lib/AArch64/Target.cpp
index 90c5927ad2939a931d08fc7ed13b08ab1c0e7fc8..0197420f43364ba343d610fa06a3e724a721d943 100644
--- a/tools/llvm-exegesis/lib/AArch64/Target.cpp
+++ b/tools/llvm-exegesis/lib/AArch64/Target.cpp
@@ -11,6 +11,7 @@
 #include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -52,6 +53,10 @@ static llvm::MCInst loadImmediate(unsigned Reg, unsigned RegBitWidth,
 } // namespace
 
 class ExegesisAArch64Target : public ExegesisTarget {
+public:
+  ExegesisAArch64Target() : ExegesisTarget({}) {}
+
+private:
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
                                      unsigned Reg,
                                      const llvm::APInt &Value) const override {
@@ -90,3 +95,4 @@ void InitializeAArch64ExegesisTarget() {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Analysis.cpp b/tools/llvm-exegesis/lib/Analysis.cpp
index eaacb5b1d6579e2432d57e2673f422a1984d51af..0dd6bcbd46619c73cad0947d72c6e04e5ceae953 100644
--- a/tools/llvm-exegesis/lib/Analysis.cpp
+++ b/tools/llvm-exegesis/lib/Analysis.cpp
@@ -15,6 +15,7 @@
 #include <unordered_set>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 static const char kCsvSep = ',';
@@ -657,11 +658,11 @@ llvm::Error Analysis::run<Analysis::PrintSchedClassInconsistencies>(
 
     // Print any scheduling class that has at least one cluster that does not
     // match the checked-in data.
-    if (std::all_of(SchedClassClusters.begin(), SchedClassClusters.end(),
-                    [this, &RSCAndPoints](const SchedClassCluster &C) {
-                      return C.measurementsMatch(*SubtargetInfo_,
-                                                 RSCAndPoints.RSC, Clustering_);
-                    }))
+    if (llvm::all_of(SchedClassClusters,
+                     [this, &RSCAndPoints](const SchedClassCluster &C) {
+                       return C.measurementsMatch(
+                           *SubtargetInfo_, RSCAndPoints.RSC, Clustering_);
+                     }))
       continue; // Nothing weird.
 
     OS << "<div class=\"inconsistency\"><p>Sched Class <span "
@@ -796,3 +797,4 @@ std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Analysis.h b/tools/llvm-exegesis/lib/Analysis.h
index a65a2f1b1da31d549a56381b0890d5c87af6ad1a..9ee1493f4e051939aa0e846c5527da2d13f7f774 100644
--- a/tools/llvm-exegesis/lib/Analysis.h
+++ b/tools/llvm-exegesis/lib/Analysis.h
@@ -30,6 +30,7 @@
 #include <string>
 #include <unordered_map>
 
+namespace llvm {
 namespace exegesis {
 
 // A helper class to analyze benchmark results for a target.
@@ -135,5 +136,6 @@ std::vector<std::pair<uint16_t, float>> computeIdealizedProcResPressure(
     llvm::SmallVector<llvm::MCWriteProcResEntry, 8> WPRS);
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
diff --git a/tools/llvm-exegesis/lib/Assembler.cpp b/tools/llvm-exegesis/lib/Assembler.cpp
index 2b67682cde7be359e2098f54a41d67fd03eb16af..2e3712ce7dc7d859e27f0bc746c5758f4ae9ec14 100644
--- a/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/tools/llvm-exegesis/lib/Assembler.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/MemoryBuffer.h"
 
+namespace llvm {
 namespace exegesis {
 
 static constexpr const char ModuleID[] = "ExegesisInfoTest";
@@ -33,6 +34,7 @@ generateSnippetSetupCode(const ExegesisTarget &ET,
                          const llvm::MCSubtargetInfo *const MSI,
                          llvm::ArrayRef<RegisterValue> RegisterInitialValues,
                          bool &IsSnippetSetupComplete) {
+  IsSnippetSetupComplete = true;
   std::vector<llvm::MCInst> Result;
   for (const RegisterValue &RV : RegisterInitialValues) {
     // Load a constant in the register.
@@ -109,6 +111,8 @@ static void fillMachineFunction(llvm::MachineFunction &MF,
         Builder.addReg(Op.getReg(), Flags);
       } else if (Op.isImm()) {
         Builder.addImm(Op.getImm());
+      } else if (!Op.isValid()) {
+        llvm_unreachable("Operand is not set");
       } else {
         llvm_unreachable("Not yet implemented");
       }
@@ -138,8 +142,10 @@ llvm::BitVector getFunctionReservedRegs(const llvm::TargetMachine &TM) {
       llvm::make_unique<llvm::LLVMContext>();
   std::unique_ptr<llvm::Module> Module =
       createModule(Context, TM.createDataLayout());
+  // TODO: This only works for targets implementing LLVMTargetMachine.
+  const LLVMTargetMachine &LLVMTM = static_cast<const LLVMTargetMachine&>(TM);
   std::unique_ptr<llvm::MachineModuleInfo> MMI =
-      llvm::make_unique<llvm::MachineModuleInfo>(&TM);
+      llvm::make_unique<llvm::MachineModuleInfo>(&LLVMTM);
   llvm::MachineFunction &MF =
       createVoidVoidPtrMachineFunction(FunctionID, Module.get(), MMI.get());
   // Saving reserved registers for client.
@@ -170,7 +176,7 @@ void assembleToStream(const ExegesisTarget &ET,
   for (const unsigned Reg : LiveIns)
     MF.getRegInfo().addLiveIn(Reg);
 
-  bool IsSnippetSetupComplete = false;
+  bool IsSnippetSetupComplete;
   std::vector<llvm::MCInst> Code =
       generateSnippetSetupCode(ET, TM->getMCSubtargetInfo(),
                                RegisterInitialValues, IsSnippetSetupComplete);
@@ -292,3 +298,4 @@ ExecutableFunction::ExecutableFunction(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Assembler.h b/tools/llvm-exegesis/lib/Assembler.h
index f2a77168cb7fb50135408f3b6357d31dbaf3006f..ee6bc86f3788487f12c70ca8ae17536dd5eaf889 100644
--- a/tools/llvm-exegesis/lib/Assembler.h
+++ b/tools/llvm-exegesis/lib/Assembler.h
@@ -32,6 +32,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 
+namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget;
@@ -82,5 +83,6 @@ struct ExecutableFunction {
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_ASSEMBLER_H
diff --git a/tools/llvm-exegesis/lib/BenchmarkCode.h b/tools/llvm-exegesis/lib/BenchmarkCode.h
index b10dca5c25eb7f2972e5be30164998f8e9c83a25..38bea2519a64ecad80c765a4607f7462629a54b4 100644
--- a/tools/llvm-exegesis/lib/BenchmarkCode.h
+++ b/tools/llvm-exegesis/lib/BenchmarkCode.h
@@ -15,6 +15,7 @@
 #include <string>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 // A collection of instructions that are to be assembled, executed and measured.
@@ -35,5 +36,6 @@ struct BenchmarkCode {
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKCODE_H
diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/tools/llvm-exegesis/lib/BenchmarkResult.cpp
index 5d4912ea40749c135f332a2d5f7c0cb81d81903c..0507ae8959d0bfea2d754a54e49b429c2f23d6ae 100644
--- a/tools/llvm-exegesis/lib/BenchmarkResult.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkResult.cpp
@@ -22,6 +22,10 @@ static constexpr const char kIntegerPrefix[] = "i_0x";
 static constexpr const char kDoublePrefix[] = "f_";
 static constexpr const char kInvalidOperand[] = "INVALID";
 
+namespace llvm {
+
+namespace {
+
 // A mutable struct holding an LLVMState that can be passed through the
 // serialization process to encode/decode registers and instructions.
 struct YamlContext {
@@ -141,13 +145,13 @@ private:
     return 0;
   }
 
-  const exegesis::LLVMState *State;
+  const llvm::exegesis::LLVMState *State;
   std::string LastError;
   llvm::raw_string_ostream ErrorStream;
 };
+} // namespace
 
 // Defining YAML traits for IO.
-namespace llvm {
 namespace yaml {
 
 static YamlContext &getTypedContext(void *Ctx) {
@@ -294,7 +298,6 @@ struct MappingContextTraits<exegesis::InstructionBenchmark, YamlContext> {
 };
 
 } // namespace yaml
-} // namespace llvm
 
 namespace exegesis {
 
@@ -341,7 +344,7 @@ InstructionBenchmark::readYamls(const LLVMState &State,
 
 void InstructionBenchmark::writeYamlTo(const LLVMState &State,
                                        llvm::raw_ostream &OS) {
-  llvm::yaml::Output Yout(OS);
+  llvm::yaml::Output Yout(OS, nullptr /*Ctx*/, 200 /*WrapColumn*/);
   YamlContext Context(State);
   Yout.beginDocuments();
   llvm::yaml::yamlize(Yout, *this, /*unused*/ true, Context);
@@ -384,3 +387,4 @@ void PerInstructionStats::push(const BenchmarkMeasure &BM) {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/BenchmarkResult.h b/tools/llvm-exegesis/lib/BenchmarkResult.h
index 961c07b99dd21261facf6cb711fda81d18e529a3..773a2e50abc4d70053db917658120d236bf5edbc 100644
--- a/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -28,6 +28,7 @@
 #include <unordered_map>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 struct InstructionBenchmarkKey {
@@ -111,5 +112,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRESULT_H
diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 7addb0acd7ebee51ff4fea587b9e5b7411d366ed..437503f848659dc6377253e28d56ea90bcb53475 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -13,13 +13,16 @@
 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
+#include "PerfHelper.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Program.h"
 
+namespace llvm {
 namespace exegesis {
 
 BenchmarkFailure::BenchmarkFailure(const llvm::Twine &S)
@@ -43,6 +46,55 @@ GenerateInstructions(const BenchmarkCode &BC, const size_t MinInstructions) {
   return Code;
 }
 
+namespace {
+class FunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor {
+public:
+  FunctionExecutorImpl(const LLVMState &State,
+                       llvm::object::OwningBinary<llvm::object::ObjectFile> Obj,
+                       BenchmarkRunner::ScratchSpace *Scratch)
+      : Function(State.createTargetMachine(), std::move(Obj)),
+        Scratch(Scratch) {}
+
+private:
+  llvm::Expected<int64_t> runAndMeasure(const char *Counters) const override {
+    // We sum counts when there are several counters for a single ProcRes
+    // (e.g. P23 on SandyBridge).
+    int64_t CounterValue = 0;
+    llvm::SmallVector<llvm::StringRef, 2> CounterNames;
+    llvm::StringRef(Counters).split(CounterNames, '+');
+    char *const ScratchPtr = Scratch->ptr();
+    for (auto &CounterName : CounterNames) {
+      CounterName = CounterName.trim();
+      pfm::PerfEvent PerfEvent(CounterName);
+      if (!PerfEvent.valid())
+        llvm::report_fatal_error(
+            llvm::Twine("invalid perf event '").concat(CounterName).concat("'"));
+      pfm::Counter Counter(PerfEvent);
+      Scratch->clear();
+      {
+        llvm::CrashRecoveryContext CRC;
+        llvm::CrashRecoveryContext::Enable();
+        const bool Crashed = !CRC.RunSafely([this, &Counter, ScratchPtr]() {
+          Counter.start();
+          this->Function(ScratchPtr);
+          Counter.stop();
+        });
+        llvm::CrashRecoveryContext::Disable();
+        // FIXME: Better diagnosis.
+        if (Crashed)
+          return llvm::make_error<BenchmarkFailure>(
+              "snippet crashed while running");
+      }
+      CounterValue += Counter.read();
+    }
+    return CounterValue;
+  }
+
+  const ExecutableFunction Function;
+  BenchmarkRunner::ScratchSpace *const Scratch;
+};
+} // namespace
+
 InstructionBenchmark
 BenchmarkRunner::runConfiguration(const BenchmarkCode &BC,
                                   unsigned NumRepetitions) const {
@@ -86,16 +138,21 @@ BenchmarkRunner::runConfiguration(const BenchmarkCode &BC,
   }
   llvm::outs() << "Check generated assembly with: /usr/bin/objdump -d "
                << *ObjectFilePath << "\n";
-  const ExecutableFunction EF(State.createTargetMachine(),
-                              getObjectFromFile(*ObjectFilePath));
-  InstrBenchmark.Measurements = runMeasurements(EF, *Scratch);
+  const FunctionExecutorImpl Executor(State, getObjectFromFile(*ObjectFilePath),
+                                      Scratch.get());
+  auto Measurements = runMeasurements(Executor);
+  if (llvm::Error E = Measurements.takeError()) {
+    InstrBenchmark.Error = llvm::toString(std::move(E));
+    return InstrBenchmark;
+  }
+  InstrBenchmark.Measurements = std::move(*Measurements);
   assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions");
   for (BenchmarkMeasure &BM : InstrBenchmark.Measurements) {
     // Scale the measurements by instruction.
     BM.PerInstructionValue /= InstrBenchmark.NumRepetitions;
     // Scale the measurements by snippet.
     BM.PerSnippetValue *= static_cast<double>(BC.Instructions.size()) /
-                   InstrBenchmark.NumRepetitions;
+                          InstrBenchmark.NumRepetitions;
   }
 
   return InstrBenchmark;
@@ -115,4 +172,7 @@ BenchmarkRunner::writeObjectFile(const BenchmarkCode &BC,
   return ResultPath.str();
 }
 
+BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {}
+
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/BenchmarkRunner.h b/tools/llvm-exegesis/lib/BenchmarkRunner.h
index e5b567f24637c7f1a352462181ec56f5ad8a5ec4..4f77f492ab4b09e6b64df77f276b836d63e3ccee 100644
--- a/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -27,6 +27,7 @@
 #include <memory>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 // A class representing failures that happened during Benchmark, they are used
@@ -64,13 +65,21 @@ public:
     char *const AlignedPtr;
   };
 
+  // A helper to measure counters while executing a function in a sandboxed
+  // context.
+  class FunctionExecutor {
+  public:
+    virtual ~FunctionExecutor();
+    virtual llvm::Expected<int64_t>
+    runAndMeasure(const char *Counters) const = 0;
+  };
+
 protected:
   const LLVMState &State;
 
 private:
-  virtual std::vector<BenchmarkMeasure>
-  runMeasurements(const ExecutableFunction &EF,
-                  ScratchSpace &Scratch) const = 0;
+  virtual llvm::Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const = 0;
 
   llvm::Expected<std::string>
   writeObjectFile(const BenchmarkCode &Configuration,
@@ -82,5 +91,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_BENCHMARKRUNNER_H
diff --git a/tools/llvm-exegesis/lib/Clustering.cpp b/tools/llvm-exegesis/lib/Clustering.cpp
index b63afec945fa7c5b7eb6c56d972f1c919ce86927..761629167bb65fdc63ca9f5302c5dfbbc950c282 100644
--- a/tools/llvm-exegesis/lib/Clustering.cpp
+++ b/tools/llvm-exegesis/lib/Clustering.cpp
@@ -11,6 +11,7 @@
 #include <string>
 #include <unordered_set>
 
+namespace llvm {
 namespace exegesis {
 
 // The clustering problem has the following characteristics:
@@ -170,3 +171,4 @@ InstructionBenchmarkClustering::create(
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Clustering.h b/tools/llvm-exegesis/lib/Clustering.h
index c811020e0fe820c998160f2252053614878c3d51..9dc0adffb1e5e6fac949e53f3e01bec83bdddfb7 100644
--- a/tools/llvm-exegesis/lib/Clustering.h
+++ b/tools/llvm-exegesis/lib/Clustering.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/Error.h"
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
 class InstructionBenchmarkClustering {
@@ -109,5 +110,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CLUSTERING_H
diff --git a/tools/llvm-exegesis/lib/CodeTemplate.cpp b/tools/llvm-exegesis/lib/CodeTemplate.cpp
index 34433daa23186006e977fceece4441e8a1bdd305..e159b000755acbec6a11516c5b406fee29d0296f 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.cpp
+++ b/tools/llvm-exegesis/lib/CodeTemplate.cpp
@@ -9,6 +9,7 @@
 
 #include "CodeTemplate.h"
 
+namespace llvm {
 namespace exegesis {
 
 CodeTemplate::CodeTemplate(CodeTemplate &&) = default;
@@ -65,4 +66,54 @@ llvm::MCInst InstructionTemplate::build() const {
   return Result;
 }
 
+bool isEnumValue(ExecutionMode Execution) {
+  return llvm::isPowerOf2_32(static_cast<uint32_t>(Execution));
+}
+
+llvm::StringRef getName(ExecutionMode Bit) {
+  assert(isEnumValue(Bit) && "Bit must be a power of two");
+  switch (Bit) {
+  case ExecutionMode::UNKNOWN:
+    return "UNKNOWN";
+  case ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS:
+    return "ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS";
+  case ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS:
+    return "ALWAYS_SERIAL_TIED_REGS_ALIAS";
+  case ExecutionMode::SERIAL_VIA_MEMORY_INSTR:
+    return "SERIAL_VIA_MEMORY_INSTR";
+  case ExecutionMode::SERIAL_VIA_EXPLICIT_REGS:
+    return "SERIAL_VIA_EXPLICIT_REGS";
+  case ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR:
+    return "SERIAL_VIA_NON_MEMORY_INSTR";
+  case ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF:
+    return "ALWAYS_PARALLEL_MISSING_USE_OR_DEF";
+  case ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS:
+    return "PARALLEL_VIA_EXPLICIT_REGS";
+  }
+  llvm_unreachable("Missing enum case");
+}
+
+llvm::ArrayRef<ExecutionMode> getAllExecutionBits() {
+  static const ExecutionMode kAllExecutionModeBits[] = {
+      ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS,
+      ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+      ExecutionMode::SERIAL_VIA_MEMORY_INSTR,
+      ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+      ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+      ExecutionMode::ALWAYS_PARALLEL_MISSING_USE_OR_DEF,
+      ExecutionMode::PARALLEL_VIA_EXPLICIT_REGS,
+  };
+  return llvm::makeArrayRef(kAllExecutionModeBits);
+}
+
+llvm::SmallVector<ExecutionMode, 4>
+getExecutionModeBits(ExecutionMode Execution) {
+  llvm::SmallVector<ExecutionMode, 4> Result;
+  for (const auto Bit : getAllExecutionBits())
+    if ((Execution & Bit) == Bit)
+      Result.push_back(Bit);
+  return Result;
+}
+
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/CodeTemplate.h b/tools/llvm-exegesis/lib/CodeTemplate.h
index e5006eb74c9b9baafe3b1822bcbaf61164e8cb47..4c55487f3d12161c977c5097c2b5de16967ddb23 100644
--- a/tools/llvm-exegesis/lib/CodeTemplate.h
+++ b/tools/llvm-exegesis/lib/CodeTemplate.h
@@ -17,7 +17,9 @@
 #define LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H
 
 #include "MCInstrDescView.h"
+#include "llvm/ADT/BitmaskEnum.h"
 
+namespace llvm {
 namespace exegesis {
 
 // A template for an Instruction holding values for each of its Variables.
@@ -45,9 +47,65 @@ struct InstructionTemplate {
   llvm::SmallVector<llvm::MCOperand, 4> VariableValues;
 };
 
+enum class ExecutionMode : uint8_t {
+  UNKNOWN = 0U,
+  // The instruction is always serial because implicit Use and Def alias.
+  // e.g. AAA (alias via EFLAGS)
+  ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS = 1u << 0,
+
+  // The instruction is always serial because one Def is tied to a Use.
+  // e.g. AND32ri (alias via tied GR32)
+  ALWAYS_SERIAL_TIED_REGS_ALIAS = 1u << 1,
+
+  // The execution can be made serial by inserting a second instruction that
+  // clobbers/reads memory.
+  // e.g. MOV8rm
+  SERIAL_VIA_MEMORY_INSTR = 1u << 2,
+
+  // The execution can be made serial by picking one Def that aliases with one
+  // Use.
+  // e.g. VXORPSrr XMM1, XMM1, XMM2
+  SERIAL_VIA_EXPLICIT_REGS = 1u << 3,
+
+  // The execution can be made serial by inserting a second instruction that
+  // uses one of the Defs and defs one of the Uses.
+  // e.g.
+  // 1st instruction: MMX_PMOVMSKBrr ECX, MM7
+  // 2nd instruction: MMX_MOVD64rr MM7, ECX
+  //  or instruction: MMX_MOVD64to64rr MM7, ECX
+  //  or instruction: MMX_PINSRWrr MM7, MM7, ECX, 1
+  SERIAL_VIA_NON_MEMORY_INSTR = 1u << 4,
+
+  // The execution is always parallel because the instruction is missing Use or
+  // Def operands.
+  ALWAYS_PARALLEL_MISSING_USE_OR_DEF = 1u << 5,
+
+  // The execution can be made parallel by repeating the same instruction but
+  // making sure that Defs of one instruction do not alias with Uses of the
+  // second one.
+  PARALLEL_VIA_EXPLICIT_REGS = 1u << 6,
+
+  LLVM_MARK_AS_BITMASK_ENUM(/*Largest*/ PARALLEL_VIA_EXPLICIT_REGS)
+};
+
+// Returns whether Execution is one of the values defined in the enum above.
+bool isEnumValue(ExecutionMode Execution);
+
+// Returns a human readable string for the enum.
+llvm::StringRef getName(ExecutionMode Execution);
+
+// Returns a sequence of increasing powers of two corresponding to all the
+// Execution flags.
+llvm::ArrayRef<ExecutionMode> getAllExecutionBits();
+
+// Decomposes Execution into individual set bits.
+llvm::SmallVector<ExecutionMode, 4> getExecutionModeBits(ExecutionMode);
+
+LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
+
 // A CodeTemplate is a set of InstructionTemplates that may not be fully
 // specified (i.e. some variables are not yet set). This allows the
-// BenchmarkRunner to instantiate it many times with specific values to study
+// SnippetGenerator to instantiate it many times with specific values to study
 // their impact on instruction's performance.
 struct CodeTemplate {
   CodeTemplate() = default;
@@ -57,6 +115,7 @@ struct CodeTemplate {
   CodeTemplate(const CodeTemplate &) = delete;
   CodeTemplate &operator=(const CodeTemplate &) = delete;
 
+  ExecutionMode Execution = ExecutionMode::UNKNOWN;
   // Some information about how this template has been created.
   std::string Info;
   // The list of the instructions for this template.
@@ -67,5 +126,6 @@ struct CodeTemplate {
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_CODETEMPLATE_H
diff --git a/tools/llvm-exegesis/lib/Latency.cpp b/tools/llvm-exegesis/lib/Latency.cpp
index 4173cf3f9a1b4b87488b4d4cea727c54b12db8f3..3d18e37f4c3806de4b45d490a53aaf7fb474fd41 100644
--- a/tools/llvm-exegesis/lib/Latency.cpp
+++ b/tools/llvm-exegesis/lib/Latency.cpp
@@ -13,71 +13,161 @@
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
 #include "PerfHelper.h"
+#include "Target.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/FormatVariadic.h"
 
+namespace llvm {
 namespace exegesis {
 
-LatencySnippetGenerator::~LatencySnippetGenerator() = default;
+struct ExecutionClass {
+  ExecutionMode Mask;
+  const char *Description;
+} static const kExecutionClasses[] = {
+    {ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS |
+         ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS,
+     "Repeating a single implicitly serial instruction"},
+    {ExecutionMode::SERIAL_VIA_EXPLICIT_REGS,
+     "Repeating a single explicitly serial instruction"},
+    {ExecutionMode::SERIAL_VIA_MEMORY_INSTR |
+         ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR,
+     "Repeating two instructions"},
+};
+
+static constexpr size_t kMaxAliasingInstructions = 10;
 
-llvm::Expected<CodeTemplate>
-LatencySnippetGenerator::generateTwoInstructionPrototype(
-    const Instruction &Instr) const {
+static std::vector<Instruction>
+computeAliasingInstructions(const LLVMState &State, const Instruction &Instr,
+                            size_t MaxAliasingInstructions) {
+  // Randomly iterate the set of instructions.
   std::vector<unsigned> Opcodes;
   Opcodes.resize(State.getInstrInfo().getNumOpcodes());
   std::iota(Opcodes.begin(), Opcodes.end(), 0U);
   std::shuffle(Opcodes.begin(), Opcodes.end(), randomGenerator());
+
+  std::vector<Instruction> AliasingInstructions;
   for (const unsigned OtherOpcode : Opcodes) {
-    if (OtherOpcode == Instr.Description->Opcode)
+    if (OtherOpcode == Instr.Description->getOpcode())
       continue;
-    const auto &OtherInstrDesc = State.getInstrInfo().get(OtherOpcode);
-    const Instruction OtherInstr(OtherInstrDesc, RATC);
+    const Instruction &OtherInstr = State.getIC().getInstr(OtherOpcode);
     if (OtherInstr.hasMemoryOperands())
       continue;
-    const AliasingConfigurations Forward(Instr, OtherInstr);
-    const AliasingConfigurations Back(OtherInstr, Instr);
-    if (Forward.empty() || Back.empty())
-      continue;
-    InstructionTemplate ThisIT(Instr);
-    InstructionTemplate OtherIT(OtherInstr);
-    if (!Forward.hasImplicitAliasing())
-      setRandomAliasing(Forward, ThisIT, OtherIT);
-    if (!Back.hasImplicitAliasing())
-      setRandomAliasing(Back, OtherIT, ThisIT);
-    CodeTemplate CT;
-    CT.Info = llvm::formatv("creating cycle through {0}.",
-                            State.getInstrInfo().getName(OtherOpcode));
-    CT.Instructions.push_back(std::move(ThisIT));
-    CT.Instructions.push_back(std::move(OtherIT));
-    return std::move(CT);
+    if (Instr.hasAliasingRegistersThrough(OtherInstr))
+      AliasingInstructions.push_back(std::move(OtherInstr));
+    if (AliasingInstructions.size() >= MaxAliasingInstructions)
+      break;
   }
-  return llvm::make_error<BenchmarkFailure>(
-      "Infeasible : Didn't find any scheme to make the instruction serial");
+  return AliasingInstructions;
 }
 
-llvm::Expected<CodeTemplate>
-LatencySnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
-  const Instruction Instr(State.getInstrInfo().get(Opcode), RATC);
+static ExecutionMode getExecutionModes(const Instruction &Instr) {
+  ExecutionMode EM = ExecutionMode::UNKNOWN;
+  if (Instr.hasAliasingImplicitRegisters())
+    EM |= ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS;
+  if (Instr.hasTiedRegisters())
+    EM |= ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS;
   if (Instr.hasMemoryOperands())
+    EM |= ExecutionMode::SERIAL_VIA_MEMORY_INSTR;
+  else {
+    if (Instr.hasAliasingRegisters())
+      EM |= ExecutionMode::SERIAL_VIA_EXPLICIT_REGS;
+    if (Instr.hasOneUseOrOneDef())
+      EM |= ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR;
+  }
+  return EM;
+}
+
+static void appendCodeTemplates(const LLVMState &State,
+                                const Instruction &Instr,
+                                ExecutionMode ExecutionModeBit,
+                                llvm::StringRef ExecutionClassDescription,
+                                std::vector<CodeTemplate> &CodeTemplates) {
+  assert(isEnumValue(ExecutionModeBit) && "Bit must be a power of two");
+  switch (ExecutionModeBit) {
+  case ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS:
+    // Nothing to do, the instruction is always serial.
+    LLVM_FALLTHROUGH;
+  case ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS: {
+    // Picking whatever value for the tied variable will make the instruction
+    // serial.
+    CodeTemplate CT;
+    CT.Execution = ExecutionModeBit;
+    CT.Info = ExecutionClassDescription;
+    CT.Instructions.push_back(Instr);
+    CodeTemplates.push_back(std::move(CT));
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_MEMORY_INSTR: {
+    // Select back-to-back memory instruction.
+    // TODO: Implement me.
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_EXPLICIT_REGS: {
+    // Making the execution of this instruction serial by selecting one def
+    // register to alias with one use register.
+    const AliasingConfigurations SelfAliasing(Instr, Instr);
+    assert(!SelfAliasing.empty() && !SelfAliasing.hasImplicitAliasing() &&
+           "Instr must alias itself explicitly");
+    InstructionTemplate IT(Instr);
+    // This is a self aliasing instruction so defs and uses are from the same
+    // instance, hence twice IT in the following call.
+    setRandomAliasing(SelfAliasing, IT, IT);
+    CodeTemplate CT;
+    CT.Execution = ExecutionModeBit;
+    CT.Info = ExecutionClassDescription;
+    CT.Instructions.push_back(std::move(IT));
+    CodeTemplates.push_back(std::move(CT));
+    return;
+  }
+  case ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR: {
+    // Select back-to-back non-memory instruction.
+    for (const auto OtherInstr :
+         computeAliasingInstructions(State, Instr, kMaxAliasingInstructions)) {
+      const AliasingConfigurations Forward(Instr, OtherInstr);
+      const AliasingConfigurations Back(OtherInstr, Instr);
+      InstructionTemplate ThisIT(Instr);
+      InstructionTemplate OtherIT(OtherInstr);
+      if (!Forward.hasImplicitAliasing())
+        setRandomAliasing(Forward, ThisIT, OtherIT);
+      if (!Back.hasImplicitAliasing())
+        setRandomAliasing(Back, OtherIT, ThisIT);
+      CodeTemplate CT;
+      CT.Execution = ExecutionModeBit;
+      CT.Info = ExecutionClassDescription;
+      CT.Instructions.push_back(std::move(ThisIT));
+      CT.Instructions.push_back(std::move(OtherIT));
+      CodeTemplates.push_back(std::move(CT));
+    }
+    return;
+  }
+  default:
+    llvm_unreachable("Unhandled enum value");
+  }
+}
+
+LatencySnippetGenerator::~LatencySnippetGenerator() = default;
+
+llvm::Expected<std::vector<CodeTemplate>>
+LatencySnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
+  std::vector<CodeTemplate> Results;
+  const ExecutionMode EM = getExecutionModes(Instr);
+  for (const auto EC : kExecutionClasses) {
+    for (const auto ExecutionModeBit : getExecutionModeBits(EM & EC.Mask))
+      appendCodeTemplates(State, Instr, ExecutionModeBit, EC.Description,
+                          Results);
+    if (!Results.empty())
+      break;
+  }
+  if (Results.empty())
     return llvm::make_error<BenchmarkFailure>(
-        "Infeasible : has memory operands");
-  if (auto CT = generateSelfAliasingCodeTemplate(Instr))
-    return CT;
-  else
-    llvm::consumeError(CT.takeError());
-  // No self aliasing, trying to create a dependency through another opcode.
-  return generateTwoInstructionPrototype(Instr);
+        "No strategy found to make the execution serial");
+  return std::move(Results);
 }
 
 const char *LatencyBenchmarkRunner::getCounterName() const {
-  if (!State.getSubtargetInfo().getSchedModel().hasExtraProcessorInfo())
-    llvm::report_fatal_error("sched model is missing extra processor info!");
-  const char *CounterName = State.getSubtargetInfo()
-                                .getSchedModel()
-                                .getExtraProcessorInfo()
-                                .PfmCounters.CycleCounter;
+  const char *CounterName = State.getPfmCounters().CycleCounter;
   if (!CounterName)
     llvm::report_fatal_error("sched model does not define a cycle counter");
   return CounterName;
@@ -85,30 +175,27 @@ const char *LatencyBenchmarkRunner::getCounterName() const {
 
 LatencyBenchmarkRunner::~LatencyBenchmarkRunner() = default;
 
-std::vector<BenchmarkMeasure>
-LatencyBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
-                                        ScratchSpace &Scratch) const {
+llvm::Expected<std::vector<BenchmarkMeasure>>
+LatencyBenchmarkRunner::runMeasurements(
+    const FunctionExecutor &Executor) const {
   // Cycle measurements include some overhead from the kernel. Repeat the
   // measure several times and take the minimum value.
   constexpr const int NumMeasurements = 30;
-  int64_t MinLatency = std::numeric_limits<int64_t>::max();
+  int64_t MinValue = std::numeric_limits<int64_t>::max();
   const char *CounterName = getCounterName();
   if (!CounterName)
     llvm::report_fatal_error("could not determine cycle counter name");
-  const pfm::PerfEvent CyclesPerfEvent(CounterName);
-  if (!CyclesPerfEvent.valid())
-    llvm::report_fatal_error("invalid perf event");
   for (size_t I = 0; I < NumMeasurements; ++I) {
-    pfm::Counter Counter(CyclesPerfEvent);
-    Scratch.clear();
-    Counter.start();
-    Function(Scratch.ptr());
-    Counter.stop();
-    const int64_t Value = Counter.read();
-    if (Value < MinLatency)
-      MinLatency = Value;
+    auto ExpectedCounterValue = Executor.runAndMeasure(CounterName);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    if (*ExpectedCounterValue < MinValue)
+      MinValue = *ExpectedCounterValue;
   }
-  return {BenchmarkMeasure::Create("latency", MinLatency)};
+  std::vector<BenchmarkMeasure> Result = {
+      BenchmarkMeasure::Create("latency", MinValue)};
+  return std::move(Result);
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Latency.h b/tools/llvm-exegesis/lib/Latency.h
index 37feb62e3dc34b09faea0fd9c689cc1799d24c5b..fef72cde5a6a224c5fdb56a4581dea03299ade07 100644
--- a/tools/llvm-exegesis/lib/Latency.h
+++ b/tools/llvm-exegesis/lib/Latency.h
@@ -19,6 +19,7 @@
 #include "MCInstrDescView.h"
 #include "SnippetGenerator.h"
 
+namespace llvm {
 namespace exegesis {
 
 class LatencySnippetGenerator : public SnippetGenerator {
@@ -26,12 +27,8 @@ public:
   LatencySnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
   ~LatencySnippetGenerator() override;
 
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override;
-
-private:
-  llvm::Expected<CodeTemplate>
-  generateTwoInstructionPrototype(const Instruction &Instr) const;
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override;
 };
 
 class LatencyBenchmarkRunner : public BenchmarkRunner {
@@ -41,12 +38,12 @@ public:
   ~LatencyBenchmarkRunner() override;
 
 private:
-  std::vector<BenchmarkMeasure>
-  runMeasurements(const ExecutableFunction &EF,
-                  ScratchSpace &Scratch) const override;
+  llvm::Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
 
   virtual const char *getCounterName() const;
 };
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H
diff --git a/tools/llvm-exegesis/lib/LlvmState.cpp b/tools/llvm-exegesis/lib/LlvmState.cpp
index 9ff42ca71fd2ad1963f74ec3b6f2e1a88f1491f4..b5580c83cf5c2952c7b8964c35a8597bf1359927 100644
--- a/tools/llvm-exegesis/lib/LlvmState.cpp
+++ b/tools/llvm-exegesis/lib/LlvmState.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+namespace llvm {
 namespace exegesis {
 
 LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
@@ -35,11 +36,17 @@ LLVMState::LLVMState(const std::string &Triple, const std::string &CpuName) {
     llvm::errs() << "no exegesis target for " << Triple << ", using default\n";
     TheExegesisTarget = &ExegesisTarget::getDefault();
   }
+  PfmCounters = &TheExegesisTarget->getPfmCounters(CpuName);
+
+  RATC.reset(new RegisterAliasingTrackerCache(
+      getRegInfo(), getFunctionReservedRegs(getTargetMachine())));
+  IC.reset(new InstructionsCache(getInstrInfo(), getRATC()));
 }
 
-LLVMState::LLVMState()
+LLVMState::LLVMState(const std::string &CpuName)
     : LLVMState(llvm::sys::getProcessTriple(),
-                llvm::sys::getHostCPUName().str()) {}
+                CpuName.empty() ? llvm::sys::getHostCPUName().str() : CpuName) {
+}
 
 std::unique_ptr<llvm::LLVMTargetMachine>
 LLVMState::createTargetMachine() const {
@@ -69,3 +76,4 @@ bool LLVMState::canAssemble(const llvm::MCInst &Inst) const {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/LlvmState.h b/tools/llvm-exegesis/lib/LlvmState.h
index c84db300841e7abf0f5a304b114c011cd4911452..159a8a51c5cd1d6e48f1e84fd396850fbaeab6f2 100644
--- a/tools/llvm-exegesis/lib/LlvmState.h
+++ b/tools/llvm-exegesis/lib/LlvmState.h
@@ -15,6 +15,8 @@
 #ifndef LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
 #define LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
 
+#include "MCInstrDescView.h"
+#include "RegisterAliasing.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -24,15 +26,18 @@
 #include <memory>
 #include <string>
 
+namespace llvm {
 namespace exegesis {
 
 class ExegesisTarget;
+struct PfmCountersInfo;
 
 // An object to initialize LLVM and prepare objects needed to run the
 // measurements.
 class LLVMState {
 public:
-  LLVMState();
+  // Uses the host triple. If CpuName is empty, uses the host CPU.
+  LLVMState(const std::string &CpuName);
 
   LLVMState(const std::string &Triple,
             const std::string &CpuName); // For tests.
@@ -55,11 +60,20 @@ public:
     return *TargetMachine->getMCSubtargetInfo();
   }
 
+  const RegisterAliasingTrackerCache &getRATC() const { return *RATC; }
+  const InstructionsCache &getIC() const { return *IC; }
+
+  const PfmCountersInfo &getPfmCounters() const { return *PfmCounters; }
+
 private:
   const ExegesisTarget *TheExegesisTarget;
   std::unique_ptr<const llvm::TargetMachine> TargetMachine;
+  std::unique_ptr<const RegisterAliasingTrackerCache> RATC;
+  std::unique_ptr<const InstructionsCache> IC;
+  const PfmCountersInfo *PfmCounters;
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_LLVMSTATE_H
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.cpp b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
index 75d85873146a30eb7b8419728d99f20306746183..e0521af4d19e11bc8899c9ed0720447633f819d9 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.cpp
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.cpp
@@ -15,6 +15,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 
+namespace llvm {
 namespace exegesis {
 
 unsigned Variable::getIndex() const {
@@ -27,7 +28,14 @@ unsigned Variable::getPrimaryOperandIndex() const {
   return TiedOperands[0];
 }
 
-bool Variable::hasTiedOperands() const { return TiedOperands.size() > 1; }
+bool Variable::hasTiedOperands() const {
+  assert(TiedOperands.size() <= 2 &&
+         "No more than two operands can be tied together");
+  // By definition only Use and Def operands can be tied together.
+  // TiedOperands[0] is the Def operand (LLVM stores defs first).
+  // TiedOperands[1] is the Use operand.
+  return TiedOperands.size() > 1;
+}
 
 unsigned Operand::getIndex() const {
   assert(Index >= 0 && "Index must be set");
@@ -87,24 +95,25 @@ const llvm::MCOperandInfo &Operand::getExplicitOperandInfo() const {
   return *Info;
 }
 
-Instruction::Instruction(const llvm::MCInstrDesc &MCInstrDesc,
-                         const RegisterAliasingTrackerCache &RATC)
-    : Description(&MCInstrDesc) {
+Instruction::Instruction(const llvm::MCInstrInfo &InstrInfo,
+                         const RegisterAliasingTrackerCache &RATC,
+                         unsigned Opcode)
+    : Description(&InstrInfo.get(Opcode)), Name(InstrInfo.getName(Opcode)) {
   unsigned OpIndex = 0;
-  for (; OpIndex < MCInstrDesc.getNumOperands(); ++OpIndex) {
-    const auto &OpInfo = MCInstrDesc.opInfo_begin()[OpIndex];
+  for (; OpIndex < Description->getNumOperands(); ++OpIndex) {
+    const auto &OpInfo = Description->opInfo_begin()[OpIndex];
     Operand Operand;
     Operand.Index = OpIndex;
-    Operand.IsDef = (OpIndex < MCInstrDesc.getNumDefs());
+    Operand.IsDef = (OpIndex < Description->getNumDefs());
     // TODO(gchatelet): Handle isLookupPtrRegClass.
     if (OpInfo.RegClass >= 0)
       Operand.Tracker = &RATC.getRegisterClass(OpInfo.RegClass);
     Operand.TiedToIndex =
-        MCInstrDesc.getOperandConstraint(OpIndex, llvm::MCOI::TIED_TO);
+        Description->getOperandConstraint(OpIndex, llvm::MCOI::TIED_TO);
     Operand.Info = &OpInfo;
     Operands.push_back(Operand);
   }
-  for (const llvm::MCPhysReg *MCPhysReg = MCInstrDesc.getImplicitDefs();
+  for (const llvm::MCPhysReg *MCPhysReg = Description->getImplicitDefs();
        MCPhysReg && *MCPhysReg; ++MCPhysReg, ++OpIndex) {
     Operand Operand;
     Operand.Index = OpIndex;
@@ -113,7 +122,7 @@ Instruction::Instruction(const llvm::MCInstrDesc &MCInstrDesc,
     Operand.ImplicitReg = MCPhysReg;
     Operands.push_back(Operand);
   }
-  for (const llvm::MCPhysReg *MCPhysReg = MCInstrDesc.getImplicitUses();
+  for (const llvm::MCPhysReg *MCPhysReg = Description->getImplicitUses();
        MCPhysReg && *MCPhysReg; ++MCPhysReg, ++OpIndex) {
     Operand Operand;
     Operand.Index = OpIndex;
@@ -166,7 +175,7 @@ const Operand &Instruction::getPrimaryOperand(const Variable &Var) const {
 }
 
 bool Instruction::hasMemoryOperands() const {
-  return std::any_of(Operands.begin(), Operands.end(), [](const Operand &Op) {
+  return any_of(Operands, [](const Operand &Op) {
     return Op.isReg() && Op.isExplicit() && Op.isMemory();
   });
 }
@@ -196,8 +205,13 @@ bool Instruction::hasAliasingRegisters() const {
   return AllDefRegs.anyCommon(AllUseRegs);
 }
 
+bool Instruction::hasOneUseOrOneDef() const {
+  return AllDefRegs.count() || AllUseRegs.count();
+}
+
 void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
                        llvm::raw_ostream &Stream) const {
+  Stream << "- " << Name << "\n";
   for (const auto &Op : Operands) {
     Stream << "- Op" << Op.getIndex();
     if (Op.isExplicit())
@@ -227,10 +241,15 @@ void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
   }
   for (const auto &Var : Variables) {
     Stream << "- Var" << Var.getIndex();
-    Stream << " (";
-    for (auto OperandIndex : Var.TiedOperands)
+    Stream << " [";
+    bool IsFirst = true;
+    for (auto OperandIndex : Var.TiedOperands) {
+      if (!IsFirst)
+        Stream << ",";
       Stream << "Op" << OperandIndex;
-    Stream << ")";
+      IsFirst = false;
+    }
+    Stream << "]";
     Stream << "\n";
   }
   if (hasMemoryOperands())
@@ -243,6 +262,17 @@ void Instruction::dump(const llvm::MCRegisterInfo &RegInfo,
     Stream << "- hasAliasingRegisters\n";
 }
 
+InstructionsCache::InstructionsCache(const llvm::MCInstrInfo &InstrInfo,
+                                     const RegisterAliasingTrackerCache &RATC)
+    : InstrInfo(InstrInfo), RATC(RATC) {}
+
+const Instruction &InstructionsCache::getInstr(unsigned Opcode) const {
+  auto &Found = Instructions[Opcode];
+  if (!Found)
+    Found.reset(new Instruction(InstrInfo, RATC, Opcode));
+  return *Found;
+}
+
 bool RegisterOperandAssignment::
 operator==(const RegisterOperandAssignment &Other) const {
   return std::tie(Op, Reg) == std::tie(Other.Op, Other.Reg);
@@ -281,8 +311,7 @@ bool AliasingConfigurations::hasImplicitAliasing() const {
 }
 
 AliasingConfigurations::AliasingConfigurations(
-    const Instruction &DefInstruction, const Instruction &UseInstruction)
-    : DefInstruction(DefInstruction), UseInstruction(UseInstruction) {
+    const Instruction &DefInstruction, const Instruction &UseInstruction) {
   if (UseInstruction.AllUseRegs.anyCommon(DefInstruction.AllDefRegs)) {
     auto CommonRegisters = UseInstruction.AllUseRegs;
     CommonRegisters &= DefInstruction.AllDefRegs;
@@ -326,3 +355,4 @@ void DumpMCInst(const llvm::MCRegisterInfo &MCRegisterInfo,
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/MCInstrDescView.h b/tools/llvm-exegesis/lib/MCInstrDescView.h
index 39e5c4a5f5b2c6c5cd3fb0fa2b26c0b5f3f42a5b..58efd2a4e41c1af7a447a43cc57e77a7ea8356da 100644
--- a/tools/llvm-exegesis/lib/MCInstrDescView.h
+++ b/tools/llvm-exegesis/lib/MCInstrDescView.h
@@ -20,6 +20,7 @@
 #define LLVM_TOOLS_LLVM_EXEGESIS_MCINSTRDESCVIEW_H
 
 #include <random>
+#include <unordered_map>
 
 #include "RegisterAliasing.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -28,6 +29,7 @@
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 // A variable represents the value associated to an Operand or a set of Operands
@@ -80,7 +82,7 @@ struct Operand {
   const llvm::MCOperandInfo &getExplicitOperandInfo() const;
 
   // Please use the accessors above and not the following fields.
-  unsigned Index = 0;
+  int Index = -1;
   bool IsDef = false;
   const RegisterAliasingTracker *Tracker = nullptr; // Set for Register Op.
   const llvm::MCOperandInfo *Info = nullptr;        // Set for Explicit Op.
@@ -92,8 +94,8 @@ struct Operand {
 // A view over an MCInstrDesc offering a convenient interface to compute
 // Register aliasing.
 struct Instruction {
-  Instruction(const llvm::MCInstrDesc &MCInstrDesc,
-              const RegisterAliasingTrackerCache &ATC);
+  Instruction(const llvm::MCInstrInfo &InstrInfo,
+              const RegisterAliasingTrackerCache &RATC, unsigned Opcode);
 
   // Returns the Operand linked to this Variable.
   // In case the Variable is tied, the primary (i.e. Def) Operand is returned.
@@ -125,11 +127,17 @@ struct Instruction {
   // reads or write the same memory region.
   bool hasMemoryOperands() const;
 
+  // Returns whether this instruction as at least one use or one def.
+  // Repeating this instruction may execute sequentially by adding an
+  // instruction that aliases one of these.
+  bool hasOneUseOrOneDef() const;
+
   // Convenient function to help with debugging.
   void dump(const llvm::MCRegisterInfo &RegInfo,
             llvm::raw_ostream &Stream) const;
 
   const llvm::MCInstrDesc *Description; // Never nullptr.
+  llvm::StringRef Name;                 // The name of this instruction.
   llvm::SmallVector<Operand, 8> Operands;
   llvm::SmallVector<Variable, 4> Variables;
   llvm::BitVector ImplDefRegs; // The set of aliased implicit def registers.
@@ -138,6 +146,22 @@ struct Instruction {
   llvm::BitVector AllUseRegs;  // The set of all aliased use registers.
 };
 
+// Instructions are expensive to instantiate. This class provides a cache of
+// Instructions with lazy construction.
+struct InstructionsCache {
+  InstructionsCache(const llvm::MCInstrInfo &InstrInfo,
+                    const RegisterAliasingTrackerCache &RATC);
+
+  // Returns the Instruction object corresponding to this Opcode.
+  const Instruction &getInstr(unsigned Opcode) const;
+
+private:
+  const llvm::MCInstrInfo &InstrInfo;
+  const RegisterAliasingTrackerCache &RATC;
+  mutable std::unordered_map<unsigned, std::unique_ptr<Instruction>>
+      Instructions;
+};
+
 // Represents the assignment of a Register to an Operand.
 struct RegisterOperandAssignment {
   RegisterOperandAssignment(const Operand *Operand, llvm::MCPhysReg Reg)
@@ -173,10 +197,7 @@ struct AliasingConfigurations {
 
   bool empty() const; // True if no aliasing configuration is found.
   bool hasImplicitAliasing() const;
-  void setExplicitAliasing() const;
 
-  const Instruction &DefInstruction;
-  const Instruction &UseInstruction;
   llvm::SmallVector<AliasingRegisterOperands, 32> Configurations;
 };
 
@@ -188,5 +209,6 @@ void DumpMCInst(const llvm::MCRegisterInfo &MCRegisterInfo,
                 const llvm::MCInst &MCInst, llvm::raw_ostream &OS);
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_MCINSTRDESCVIEW_H
diff --git a/tools/llvm-exegesis/lib/PerfHelper.cpp b/tools/llvm-exegesis/lib/PerfHelper.cpp
index c145ea8404b4e292365bbd183c47277e65827f46..c1c242ca88fa55b1afe0b06bee696c097272a56f 100644
--- a/tools/llvm-exegesis/lib/PerfHelper.cpp
+++ b/tools/llvm-exegesis/lib/PerfHelper.cpp
@@ -17,6 +17,7 @@
 #endif
 #include <cassert>
 
+namespace llvm {
 namespace exegesis {
 namespace pfm {
 
@@ -136,3 +137,4 @@ int64_t Counter::read() const { return 42; }
 
 } // namespace pfm
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/PerfHelper.h b/tools/llvm-exegesis/lib/PerfHelper.h
index 8c3f13e6c5cdc709f3e776f2f6e723ac53b9a864..2d0810846606a7ce798b648c3fe82a2e3168f628 100644
--- a/tools/llvm-exegesis/lib/PerfHelper.h
+++ b/tools/llvm-exegesis/lib/PerfHelper.h
@@ -23,6 +23,7 @@
 
 struct perf_event_attr;
 
+namespace llvm {
 namespace exegesis {
 namespace pfm {
 
@@ -102,5 +103,6 @@ void Measure(
 
 } // namespace pfm
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_PERFHELPER_H
diff --git a/tools/llvm-exegesis/lib/RegisterAliasing.cpp b/tools/llvm-exegesis/lib/RegisterAliasing.cpp
index 039f78db985faf0dd99a4e0011e0345f269a47f3..54041ca30aa02c47706b8214a310ae84d45b8e85 100644
--- a/tools/llvm-exegesis/lib/RegisterAliasing.cpp
+++ b/tools/llvm-exegesis/lib/RegisterAliasing.cpp
@@ -9,6 +9,7 @@
 
 #include "RegisterAliasing.h"
 
+namespace llvm {
 namespace exegesis {
 
 llvm::BitVector getAliasedBits(const llvm::MCRegisterInfo &RegInfo,
@@ -81,3 +82,4 @@ RegisterAliasingTrackerCache::getRegisterClass(unsigned RegClassIndex) const {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/RegisterAliasing.h b/tools/llvm-exegesis/lib/RegisterAliasing.h
index 064d9333beb4c7bef1edaebbc8b92812e1c06513..94a2eb07f4954f911605a2ea63413ab778e76683 100644
--- a/tools/llvm-exegesis/lib/RegisterAliasing.h
+++ b/tools/llvm-exegesis/lib/RegisterAliasing.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/PackedVector.h"
 #include "llvm/MC/MCRegisterInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 // Returns the registers that are aliased by the ones set in SourceBits.
@@ -104,5 +105,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_ALIASINGTRACKER_H
diff --git a/tools/llvm-exegesis/lib/RegisterValue.cpp b/tools/llvm-exegesis/lib/RegisterValue.cpp
index 1982a6c53b2177f3ad8975320045c4718267ec1f..2bf996cead48dbf91a422cb318ca3edfb57a647d 100644
--- a/tools/llvm-exegesis/lib/RegisterValue.cpp
+++ b/tools/llvm-exegesis/lib/RegisterValue.cpp
@@ -10,6 +10,7 @@
 #include "RegisterValue.h"
 #include "llvm/ADT/APFloat.h"
 
+namespace llvm {
 namespace exegesis {
 
 static llvm::APFloat getFloatValue(const llvm::fltSemantics &FltSemantics,
@@ -47,3 +48,4 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics,
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/RegisterValue.h b/tools/llvm-exegesis/lib/RegisterValue.h
index a4ef8e0ba1e5bba11844c7db8bfdd2272c6be63d..51ea30ac8eb47e54d0a05340eb37caa3c6d51f54 100644
--- a/tools/llvm-exegesis/lib/RegisterValue.h
+++ b/tools/llvm-exegesis/lib/RegisterValue.h
@@ -17,6 +17,7 @@
 #include <llvm/ADT/APFloat.h>
 #include <llvm/ADT/APInt.h>
 
+namespace llvm {
 namespace exegesis {
 
 // A simple object storing the value for a particular register.
@@ -43,3 +44,4 @@ llvm::APInt bitcastFloatValue(const llvm::fltSemantics &FltSemantics,
                               PredefinedValues Value);
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.cpp b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
index 3765776f7249f7e6554287f1ac52e1007fbd979a..eb6a8577b5773cde66edf261e0bd30608868b673 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.cpp
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.cpp
@@ -20,38 +20,46 @@
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Program.h"
 
+namespace llvm {
 namespace exegesis {
 
+std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT) {
+  std::vector<CodeTemplate> Result;
+  Result.push_back(std::move(CT));
+  return Result;
+}
+
 SnippetGeneratorFailure::SnippetGeneratorFailure(const llvm::Twine &S)
     : llvm::StringError(S, llvm::inconvertibleErrorCode()) {}
 
-SnippetGenerator::SnippetGenerator(const LLVMState &State)
-    : State(State), RATC(State.getRegInfo(),
-                         getFunctionReservedRegs(State.getTargetMachine())) {}
+SnippetGenerator::SnippetGenerator(const LLVMState &State) : State(State) {}
 
 SnippetGenerator::~SnippetGenerator() = default;
 
 llvm::Expected<std::vector<BenchmarkCode>>
-SnippetGenerator::generateConfigurations(unsigned Opcode) const {
-  if (auto E = generateCodeTemplate(Opcode)) {
-    CodeTemplate &CT = E.get();
-    const llvm::BitVector &ForbiddenRegs =
-        CT.ScratchSpacePointerInReg
-            ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits()
-            : RATC.emptyRegisters();
+SnippetGenerator::generateConfigurations(const Instruction &Instr) const {
+  if (auto E = generateCodeTemplates(Instr)) {
+    const auto &RATC = State.getRATC();
     std::vector<BenchmarkCode> Output;
-    // TODO: Generate as many BenchmarkCode as needed.
-    {
-      BenchmarkCode BC;
-      BC.Info = CT.Info;
-      for (InstructionTemplate &IT : CT.Instructions) {
-        randomizeUnsetVariables(ForbiddenRegs, IT);
-        BC.Instructions.push_back(IT.build());
+    for (CodeTemplate &CT : E.get()) {
+      const llvm::BitVector &ForbiddenRegs =
+          CT.ScratchSpacePointerInReg
+              ? RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits()
+              : RATC.emptyRegisters();
+      // TODO: Generate as many BenchmarkCode as needed.
+      {
+        BenchmarkCode BC;
+        BC.Info = CT.Info;
+        for (InstructionTemplate &IT : CT.Instructions) {
+          randomizeUnsetVariables(ForbiddenRegs, IT);
+          BC.Instructions.push_back(IT.build());
+        }
+        if (CT.ScratchSpacePointerInReg)
+          BC.LiveIns.push_back(CT.ScratchSpacePointerInReg);
+        BC.RegisterInitialValues =
+            computeRegisterInitialValues(CT.Instructions);
+        Output.push_back(std::move(BC));
       }
-      if (CT.ScratchSpacePointerInReg)
-        BC.LiveIns.push_back(CT.ScratchSpacePointerInReg);
-      BC.RegisterInitialValues = computeRegisterInitialValues(CT.Instructions);
-      Output.push_back(std::move(BC));
     }
     return Output;
   } else
@@ -64,7 +72,7 @@ std::vector<RegisterValue> SnippetGenerator::computeRegisterInitialValues(
   // Ignore memory operands which are handled separately.
   // Loop invariant: DefinedRegs[i] is true iif it has been set at least once
   // before the current instruction.
-  llvm::BitVector DefinedRegs = RATC.emptyRegisters();
+  llvm::BitVector DefinedRegs = State.getRATC().emptyRegisters();
   std::vector<RegisterValue> RIV;
   for (const InstructionTemplate &IT : Instructions) {
     // Returns the register that this Operand sets or uses, or 0 if this is not
@@ -100,13 +108,14 @@ std::vector<RegisterValue> SnippetGenerator::computeRegisterInitialValues(
   return RIV;
 }
 
-llvm::Expected<CodeTemplate> SnippetGenerator::generateSelfAliasingCodeTemplate(
-    const Instruction &Instr) const {
+llvm::Expected<std::vector<CodeTemplate>>
+generateSelfAliasingCodeTemplates(const Instruction &Instr) {
   const AliasingConfigurations SelfAliasing(Instr, Instr);
-  if (SelfAliasing.empty()) {
+  if (SelfAliasing.empty())
     return llvm::make_error<SnippetGeneratorFailure>("empty self aliasing");
-  }
-  CodeTemplate CT;
+  std::vector<CodeTemplate> Result;
+  Result.emplace_back();
+  CodeTemplate &CT = Result.back();
   InstructionTemplate IT(Instr);
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "implicit Self cycles, picking random values.";
@@ -117,16 +126,18 @@ llvm::Expected<CodeTemplate> SnippetGenerator::generateSelfAliasingCodeTemplate(
     setRandomAliasing(SelfAliasing, IT, IT);
   }
   CT.Instructions.push_back(std::move(IT));
-  return std::move(CT);
+  return std::move(Result);
 }
 
-llvm::Expected<CodeTemplate>
-SnippetGenerator::generateUnconstrainedCodeTemplate(const Instruction &Instr,
-                                                    llvm::StringRef Msg) const {
-  CodeTemplate CT;
+llvm::Expected<std::vector<CodeTemplate>>
+generateUnconstrainedCodeTemplates(const Instruction &Instr,
+                                   llvm::StringRef Msg) {
+  std::vector<CodeTemplate> Result;
+  Result.emplace_back();
+  CodeTemplate &CT = Result.back();
   CT.Info = llvm::formatv("{0}, repeating an unconstrained assignment", Msg);
   CT.Instructions.emplace_back(Instr);
-  return std::move(CT);
+  return std::move(Result);
 }
 
 std::mt19937 &randomGenerator() {
@@ -212,3 +223,4 @@ void randomizeUnsetVariables(const llvm::BitVector &ForbiddenRegs,
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/SnippetGenerator.h b/tools/llvm-exegesis/lib/SnippetGenerator.h
index 9493c5848165f463f0a5e4af8b900c65adc6a701..967b273182b74b3f2e00fd83b56bf1ce169512f7 100644
--- a/tools/llvm-exegesis/lib/SnippetGenerator.h
+++ b/tools/llvm-exegesis/lib/SnippetGenerator.h
@@ -28,8 +28,20 @@
 #include <memory>
 #include <vector>
 
+namespace llvm {
 namespace exegesis {
 
+std::vector<CodeTemplate> getSingleton(CodeTemplate &&CT);
+
+// Generates code templates that has a self-dependency.
+llvm::Expected<std::vector<CodeTemplate>>
+generateSelfAliasingCodeTemplates(const Instruction &Instr);
+
+// Generates code templates without assignment constraints.
+llvm::Expected<std::vector<CodeTemplate>>
+generateUnconstrainedCodeTemplates(const Instruction &Instr,
+                                   llvm::StringRef Msg);
+
 // A class representing failures that happened during Benchmark, they are used
 // to report informations to the user.
 class SnippetGeneratorFailure : public llvm::StringError {
@@ -46,7 +58,7 @@ public:
 
   // Calls generateCodeTemplate and expands it into one or more BenchmarkCode.
   llvm::Expected<std::vector<BenchmarkCode>>
-  generateConfigurations(unsigned Opcode) const;
+  generateConfigurations(const Instruction &Instr) const;
 
   // Given a snippet, computes which registers the setup code needs to define.
   std::vector<RegisterValue> computeRegisterInitialValues(
@@ -54,20 +66,11 @@ public:
 
 protected:
   const LLVMState &State;
-  const RegisterAliasingTrackerCache RATC;
-
-  // Generates a single code template that has a self-dependency.
-  llvm::Expected<CodeTemplate>
-  generateSelfAliasingCodeTemplate(const Instruction &Instr) const;
-  // Generates a single code template without assignment constraints.
-  llvm::Expected<CodeTemplate>
-  generateUnconstrainedCodeTemplate(const Instruction &Instr,
-                                    llvm::StringRef Msg) const;
 
 private:
   // API to be implemented by subclasses.
-  virtual llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const = 0;
+  virtual llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const = 0;
 };
 
 // A global Random Number Generator to randomize configurations.
@@ -90,5 +93,6 @@ void randomizeUnsetVariables(const llvm::BitVector &ForbiddenRegs,
                              InstructionTemplate &IT);
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_SNIPPETGENERATOR_H
diff --git a/tools/llvm-exegesis/lib/Target.cpp b/tools/llvm-exegesis/lib/Target.cpp
index 8baa8499c9263b30fab62bcbac5cb59c27de8fde..06557770418c3278988421a7cead7cafba7834b0 100644
--- a/tools/llvm-exegesis/lib/Target.cpp
+++ b/tools/llvm-exegesis/lib/Target.cpp
@@ -11,6 +11,7 @@
 #include "Latency.h"
 #include "Uops.h"
 
+namespace llvm {
 namespace exegesis {
 
 ExegesisTarget::~ExegesisTarget() {} // anchor.
@@ -84,10 +85,37 @@ ExegesisTarget::createUopsBenchmarkRunner(const LLVMState &State) const {
   return llvm::make_unique<UopsBenchmarkRunner>(State);
 }
 
+static_assert(std::is_pod<PfmCountersInfo>::value,
+              "We shouldn't have dynamic initialization here");
+const PfmCountersInfo PfmCountersInfo::Default = {nullptr, nullptr, nullptr, 0u};
+
+const PfmCountersInfo &
+ExegesisTarget::getPfmCounters(llvm::StringRef CpuName) const {
+  assert(std::is_sorted(
+             CpuPfmCounters.begin(), CpuPfmCounters.end(),
+             [](const CpuAndPfmCounters &LHS, const CpuAndPfmCounters &RHS) {
+               return strcmp(LHS.CpuName, RHS.CpuName) < 0;
+             }) &&
+         "CpuPfmCounters table is not sorted");
+
+  // Find entry
+  auto Found =
+      std::lower_bound(CpuPfmCounters.begin(), CpuPfmCounters.end(), CpuName);
+  if (Found == CpuPfmCounters.end() ||
+      llvm::StringRef(Found->CpuName) != CpuName) {
+    return PfmCountersInfo::Default;
+  }
+  assert(Found->PCI && "Missing counters");
+  return *Found->PCI;
+}
+
 namespace {
 
 // Default implementation.
 class ExegesisDefaultTarget : public ExegesisTarget {
+public:
+  ExegesisDefaultTarget() : ExegesisTarget({}) {}
+
 private:
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
                                      unsigned Reg,
@@ -109,3 +137,4 @@ const ExegesisTarget &ExegesisTarget::getDefault() {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index dd778d35b72d8a057dd43a83d94921564efe35fb..b0f0e996173634ea493edb540e4858d48d3c1899 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -28,10 +28,45 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
+struct PfmCountersInfo {
+  // An optional name of a performance counter that can be used to measure
+  // cycles.
+  const char *CycleCounter;
+
+  // An optional name of a performance counter that can be used to measure
+  // uops.
+  const char *UopsCounter;
+
+  // An IssueCounter specifies how to measure uops issued to specific proc
+  // resources.
+  struct IssueCounter {
+    const char *Counter;
+    // The name of the ProcResource that this counter measures.
+    const char *ProcResName;
+  };
+  // An optional list of IssueCounters.
+  const IssueCounter *IssueCounters;
+  unsigned NumIssueCounters;
+
+  static const PfmCountersInfo Default;
+};
+
+struct CpuAndPfmCounters {
+  const char *CpuName;
+  const PfmCountersInfo *PCI;
+  bool operator<(llvm::StringRef S) const {
+    return llvm::StringRef(CpuName) < S;
+  }
+};
+
 class ExegesisTarget {
 public:
+  explicit ExegesisTarget(llvm::ArrayRef<CpuAndPfmCounters> CpuPfmCounters)
+      : CpuPfmCounters(CpuPfmCounters) {}
+
   // Targets can use this to add target-specific passes in assembleToStream();
   virtual void addTargetSpecificPasses(llvm::PassManagerBase &PM) const {}
 
@@ -82,6 +117,10 @@ public:
 
   virtual ~ExegesisTarget();
 
+  // Returns the Pfm counters for the given CPU (or the default if no pfm
+  // counters are defined for this CPU).
+  const PfmCountersInfo &getPfmCounters(llvm::StringRef CpuName) const;
+
 private:
   virtual bool matchesArch(llvm::Triple::ArchType Arch) const = 0;
 
@@ -97,8 +136,10 @@ private:
       const LLVMState &State) const;
 
   const ExegesisTarget *Next = nullptr;
+  const llvm::ArrayRef<CpuAndPfmCounters> CpuPfmCounters;
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_TARGET_H
diff --git a/tools/llvm-exegesis/lib/Uops.cpp b/tools/llvm-exegesis/lib/Uops.cpp
index 2208e2a3821102c9404b1b7ecc723b9a52ac02a2..9768f4533f782e3f1831fb7002fd765bd8eb9cfd 100644
--- a/tools/llvm-exegesis/lib/Uops.cpp
+++ b/tools/llvm-exegesis/lib/Uops.cpp
@@ -12,7 +12,6 @@
 #include "Assembler.h"
 #include "BenchmarkRunner.h"
 #include "MCInstrDescView.h"
-#include "PerfHelper.h"
 #include "Target.h"
 
 // FIXME: Load constants into registers (e.g. with fld1) to not break
@@ -79,6 +78,7 @@
 // In that case we just use a greedy register assignment and hope for the
 // best.
 
+namespace llvm {
 namespace exegesis {
 
 static llvm::SmallVector<const Variable *, 8>
@@ -124,21 +124,19 @@ void UopsSnippetGenerator::instantiateMemoryOperands(
          "not enough scratch space");
 }
 
-llvm::Expected<CodeTemplate>
-UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
-  const auto &ET = State.getExegesisTarget();
+llvm::Expected<std::vector<CodeTemplate>>
+UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
   CodeTemplate CT;
-
   const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
-  const Instruction Instr(State.getInstrInfo().get(Opcode), RATC);
   if (Instr.hasMemoryOperands()) {
+    const auto &ET = State.getExegesisTarget();
     CT.ScratchSpacePointerInReg =
         ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
     if (CT.ScratchSpacePointerInReg == 0)
       return llvm::make_error<BenchmarkFailure>(
           "Infeasible : target does not support memory instructions");
     ScratchSpaceAliasedRegs =
-        &RATC.getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
+        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
     // If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
     // FIXME: We could make a copy of the scratch register.
     for (const auto &Op : Instr.Operands) {
@@ -155,13 +153,13 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
     CT.Info = "instruction is parallel, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return std::move(CT);
+    return getSingleton(std::move(CT));
   }
   if (SelfAliasing.hasImplicitAliasing()) {
     CT.Info = "instruction is serial, repeating a random one.";
     CT.Instructions.push_back(std::move(IT));
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return std::move(CT);
+    return getSingleton(std::move(CT));
   }
   const auto TiedVariables = getVariablesWithTiedOperands(Instr);
   if (!TiedVariables.empty()) {
@@ -183,14 +181,15 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
       CT.Instructions.push_back(std::move(TmpIT));
     }
     instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-    return std::move(CT);
+    return getSingleton(std::move(CT));
   }
+  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
   // No tied variables, we pick random values for defs.
   llvm::BitVector Defs(State.getRegInfo().getNumRegs());
   for (const auto &Op : Instr.Operands) {
     if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
       auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
-      remove(PossibleRegisters, RATC.reservedRegisters());
+      remove(PossibleRegisters, ReservedRegisters);
       // Do not use the scratch memory address register.
       if (ScratchSpaceAliasedRegs)
         remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
@@ -205,7 +204,7 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
   for (const auto &Op : Instr.Operands) {
     if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
       auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
-      remove(PossibleRegisters, RATC.reservedRegisters());
+      remove(PossibleRegisters, ReservedRegisters);
       // Do not use the scratch memory address register.
       if (ScratchSpaceAliasedRegs)
         remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
@@ -219,56 +218,37 @@ UopsSnippetGenerator::generateCodeTemplate(unsigned Opcode) const {
       "instruction has no tied variables picking Uses different from defs";
   CT.Instructions.push_back(std::move(IT));
   instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
-  return std::move(CT);
+  return getSingleton(std::move(CT));
 }
 
-std::vector<BenchmarkMeasure>
-UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
-                                     ScratchSpace &Scratch) const {
-  const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
-
-  const auto RunMeasurement = [&Function,
-                               &Scratch](const char *const Counters) {
-    // We sum counts when there are several counters for a single ProcRes
-    // (e.g. P23 on SandyBridge).
-    int64_t CounterValue = 0;
-    llvm::SmallVector<llvm::StringRef, 2> CounterNames;
-    llvm::StringRef(Counters).split(CounterNames, ',');
-    for (const auto &CounterName : CounterNames) {
-      pfm::PerfEvent UopPerfEvent(CounterName);
-      if (!UopPerfEvent.valid())
-        llvm::report_fatal_error(
-            llvm::Twine("invalid perf event ").concat(Counters));
-      pfm::Counter Counter(UopPerfEvent);
-      Scratch.clear();
-      Counter.start();
-      Function(Scratch.ptr());
-      Counter.stop();
-      CounterValue += Counter.read();
-    }
-    return CounterValue;
-  };
-
+llvm::Expected<std::vector<BenchmarkMeasure>>
+UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
   std::vector<BenchmarkMeasure> Result;
-  const auto &PfmCounters = SchedModel.getExtraProcessorInfo().PfmCounters;
+  const PfmCountersInfo &PCI = State.getPfmCounters();
   // Uops per port.
-  for (unsigned ProcResIdx = 1;
-       ProcResIdx < SchedModel.getNumProcResourceKinds(); ++ProcResIdx) {
-    const char *const Counters = PfmCounters.IssueCounters[ProcResIdx];
-    if (!Counters)
+  for (const auto *IssueCounter = PCI.IssueCounters,
+                  *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
+       IssueCounter != IssueCounterEnd; ++IssueCounter) {
+    if (!IssueCounter->Counter)
       continue;
-    const double CounterValue = RunMeasurement(Counters);
-    Result.push_back(BenchmarkMeasure::Create(
-        SchedModel.getProcResource(ProcResIdx)->Name, CounterValue));
+    auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
+                                              *ExpectedCounterValue));
   }
   // NumMicroOps.
-  if (const char *const UopsCounter = PfmCounters.UopsCounter) {
-    const double CounterValue = RunMeasurement(UopsCounter);
-    Result.push_back(BenchmarkMeasure::Create("NumMicroOps", CounterValue));
+  if (const char *const UopsCounter = PCI.UopsCounter) {
+    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(
+        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
   }
-  return Result;
+  return std::move(Result);
 }
 
 constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/lib/Uops.h b/tools/llvm-exegesis/lib/Uops.h
index 33d0d8b159616aca84794f06061ffcc2a2eea713..b2a5ea177f44a33e031cba3ba08a3c66b6eb97ab 100644
--- a/tools/llvm-exegesis/lib/Uops.h
+++ b/tools/llvm-exegesis/lib/Uops.h
@@ -18,6 +18,7 @@
 #include "BenchmarkRunner.h"
 #include "SnippetGenerator.h"
 
+namespace llvm {
 namespace exegesis {
 
 class UopsSnippetGenerator : public SnippetGenerator {
@@ -25,8 +26,8 @@ public:
   UopsSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
   ~UopsSnippetGenerator() override;
 
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override;
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override;
 
   static constexpr const size_t kMinNumDifferentAddresses = 6;
 
@@ -68,11 +69,11 @@ public:
   static constexpr const size_t kMinNumDifferentAddresses = 6;
 
 private:
-  std::vector<BenchmarkMeasure>
-  runMeasurements(const ExecutableFunction &EF,
-                  ScratchSpace &Scratch) const override;
+  llvm::Expected<std::vector<BenchmarkMeasure>>
+  runMeasurements(const FunctionExecutor &Executor) const override;
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_EXEGESIS_UOPS_H
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index 4a9cb08e27a6f0f0010d5bb100e74e2e44615804..618e4d77db4cd5fe8e6ad3a0f08b2c67b5409186 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -17,87 +17,226 @@
 #include "X86Subtarget.h"
 #include "llvm/MC/MCInstBuilder.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
 
-// Common code for X86 Uops and Latency runners.
-template <typename Impl> class X86SnippetGenerator : public Impl {
-  using Impl::Impl;
-
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override {
-    // Test whether we can generate a snippet for this instruction.
-    const auto &InstrInfo = this->State.getInstrInfo();
-    const auto OpcodeName = InstrInfo.getName(Opcode);
-    if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
-        OpcodeName.startswith("ADJCALLSTACK")) {
+// Returns an error if we cannot handle the memory references in this
+// instruction.
+Error isInvalidMemoryInstr(const Instruction &Instr) {
+  switch (Instr.Description->TSFlags & X86II::FormMask) {
+  default:
+    llvm_unreachable("Unknown FormMask value");
+  // These have no memory access.
+  case X86II::Pseudo:
+  case X86II::RawFrm:
+  case X86II::MRMDestReg:
+  case X86II::MRMSrcReg:
+  case X86II::MRMSrcReg4VOp3:
+  case X86II::MRMSrcRegOp4:
+  case X86II::MRMXr:
+  case X86II::MRM0r:
+  case X86II::MRM1r:
+  case X86II::MRM2r:
+  case X86II::MRM3r:
+  case X86II::MRM4r:
+  case X86II::MRM5r:
+  case X86II::MRM6r:
+  case X86II::MRM7r:
+  case X86II::MRM_C0:
+  case X86II::MRM_C1:
+  case X86II::MRM_C2:
+  case X86II::MRM_C3:
+  case X86II::MRM_C4:
+  case X86II::MRM_C5:
+  case X86II::MRM_C6:
+  case X86II::MRM_C7:
+  case X86II::MRM_C8:
+  case X86II::MRM_C9:
+  case X86II::MRM_CA:
+  case X86II::MRM_CB:
+  case X86II::MRM_CC:
+  case X86II::MRM_CD:
+  case X86II::MRM_CE:
+  case X86II::MRM_CF:
+  case X86II::MRM_D0:
+  case X86II::MRM_D1:
+  case X86II::MRM_D2:
+  case X86II::MRM_D3:
+  case X86II::MRM_D4:
+  case X86II::MRM_D5:
+  case X86II::MRM_D6:
+  case X86II::MRM_D7:
+  case X86II::MRM_D8:
+  case X86II::MRM_D9:
+  case X86II::MRM_DA:
+  case X86II::MRM_DB:
+  case X86II::MRM_DC:
+  case X86II::MRM_DD:
+  case X86II::MRM_DE:
+  case X86II::MRM_DF:
+  case X86II::MRM_E0:
+  case X86II::MRM_E1:
+  case X86II::MRM_E2:
+  case X86II::MRM_E3:
+  case X86II::MRM_E4:
+  case X86II::MRM_E5:
+  case X86II::MRM_E6:
+  case X86II::MRM_E7:
+  case X86II::MRM_E8:
+  case X86II::MRM_E9:
+  case X86II::MRM_EA:
+  case X86II::MRM_EB:
+  case X86II::MRM_EC:
+  case X86II::MRM_ED:
+  case X86II::MRM_EE:
+  case X86II::MRM_EF:
+  case X86II::MRM_F0:
+  case X86II::MRM_F1:
+  case X86II::MRM_F2:
+  case X86II::MRM_F3:
+  case X86II::MRM_F4:
+  case X86II::MRM_F5:
+  case X86II::MRM_F6:
+  case X86II::MRM_F7:
+  case X86II::MRM_F8:
+  case X86II::MRM_F9:
+  case X86II::MRM_FA:
+  case X86II::MRM_FB:
+  case X86II::MRM_FC:
+  case X86II::MRM_FD:
+  case X86II::MRM_FE:
+  case X86II::MRM_FF:
+  case X86II::RawFrmImm8:
+    return Error::success();
+  case X86II::AddRegFrm:
+    return (Instr.Description->Opcode == X86::POP16r || Instr.Description->Opcode == X86::POP32r ||
+            Instr.Description->Opcode == X86::PUSH16r || Instr.Description->Opcode == X86::PUSH32r)
+               ? make_error<BenchmarkFailure>(
+                     "unsupported opcode: unsupported memory access")
+               : Error::success();
+  // These access memory and are handled.
+  case X86II::MRMDestMem:
+  case X86II::MRMSrcMem:
+  case X86II::MRMSrcMem4VOp3:
+  case X86II::MRMSrcMemOp4:
+  case X86II::MRMXm:
+  case X86II::MRM0m:
+  case X86II::MRM1m:
+  case X86II::MRM2m:
+  case X86II::MRM3m:
+  case X86II::MRM4m:
+  case X86II::MRM5m:
+  case X86II::MRM6m:
+  case X86II::MRM7m:
+    return Error::success();
+  // These access memory and are not handled yet.
+  case X86II::RawFrmImm16:
+  case X86II::RawFrmMemOffs:
+  case X86II::RawFrmSrc:
+  case X86II::RawFrmDst:
+  case X86II::RawFrmDstSrc:
+    return make_error<BenchmarkFailure>(
+        "unsupported opcode: non uniform memory access");
+  }
+}
+
+static llvm::Error IsInvalidOpcode(const Instruction &Instr) {
+  const auto OpcodeName = Instr.Name;
+  if ((Instr.Description->TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return llvm::make_error<BenchmarkFailure>(
+        "unsupported opcode: pseudo instruction");
+  if (OpcodeName.startswith("POPF") || OpcodeName.startswith("PUSHF") ||
+      OpcodeName.startswith("ADJCALLSTACK"))
+    return llvm::make_error<BenchmarkFailure>(
+        "unsupported opcode: Push/Pop/AdjCallStack");
+  if (llvm::Error Error = isInvalidMemoryInstr(Instr))
+    return Error;
+  // We do not handle instructions with OPERAND_PCREL.
+  for (const Operand &Op : Instr.Operands)
+    if (Op.isExplicit() &&
+        Op.getExplicitOperandInfo().OperandType == llvm::MCOI::OPERAND_PCREL)
       return llvm::make_error<BenchmarkFailure>(
-          "Unsupported opcode: Push/Pop/AdjCallStack");
-    }
+          "unsupported opcode: PC relative operand");
+  // We do not handle second-form X87 instructions. We only handle first-form
+  // ones (_Fp), see comment in X86InstrFPStack.td.
+  for (const Operand &Op : Instr.Operands)
+    if (Op.isReg() && Op.isExplicit() &&
+        Op.getExplicitOperandInfo().RegClass == llvm::X86::RSTRegClassID)
+      return llvm::make_error<BenchmarkFailure>(
+          "unsupported second-form X87 instruction");
+  return llvm::Error::success();
+}
+
+static unsigned GetX86FPFlags(const Instruction &Instr) {
+  return Instr.Description->TSFlags & llvm::X86II::FPTypeMask;
+}
+
+class X86LatencySnippetGenerator : public LatencySnippetGenerator {
+public:
+  using LatencySnippetGenerator::LatencySnippetGenerator;
 
-    // Handle X87.
-    const auto &InstrDesc = InstrInfo.get(Opcode);
-    const unsigned FPInstClass = InstrDesc.TSFlags & llvm::X86II::FPTypeMask;
-    const Instruction Instr(InstrDesc, this->RATC);
-    switch (FPInstClass) {
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override {
+    if (auto E = IsInvalidOpcode(Instr))
+      return std::move(E);
+
+    switch (GetX86FPFlags(Instr)) {
     case llvm::X86II::NotFP:
-      break;
+      return LatencySnippetGenerator::generateCodeTemplates(Instr);
     case llvm::X86II::ZeroArgFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 ZeroArgFP");
     case llvm::X86II::OneArgFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 OneArgFP");
+    case llvm::X86II::SpecialFP:
+    case llvm::X86II::CompareFP:
+    case llvm::X86II::CondMovFP:
+      return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
     case llvm::X86II::OneArgFPRW:
-    case llvm::X86II::TwoArgFP: {
+    case llvm::X86II::TwoArgFP:
       // These are instructions like
       //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
       //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
       // They are intrinsically serial and do not modify the state of the stack.
-      // We generate the same code for latency and uops.
-      return this->generateSelfAliasingCodeTemplate(Instr);
-    }
-    case llvm::X86II::CompareFP:
-      return Impl::handleCompareFP(Instr);
-    case llvm::X86II::CondMovFP:
-      return Impl::handleCondMovFP(Instr);
-    case llvm::X86II::SpecialFP:
-      return llvm::make_error<BenchmarkFailure>("Unsupported x87 SpecialFP");
+      return generateSelfAliasingCodeTemplates(Instr);
     default:
       llvm_unreachable("Unknown FP Type!");
     }
-
-    // Fallback to generic implementation.
-    return Impl::Base::generateCodeTemplate(Opcode);
   }
 };
 
-class X86LatencyImpl : public LatencySnippetGenerator {
-protected:
-  using Base = LatencySnippetGenerator;
-  using Base::Base;
-  llvm::Expected<CodeTemplate> handleCompareFP(const Instruction &Instr) const {
-    return llvm::make_error<SnippetGeneratorFailure>(
-        "Unsupported x87 CompareFP");
-  }
-  llvm::Expected<CodeTemplate> handleCondMovFP(const Instruction &Instr) const {
-    return llvm::make_error<SnippetGeneratorFailure>(
-        "Unsupported x87 CondMovFP");
-  }
-};
+class X86UopsSnippetGenerator : public UopsSnippetGenerator {
+public:
+  using UopsSnippetGenerator::UopsSnippetGenerator;
 
-class X86UopsImpl : public UopsSnippetGenerator {
-protected:
-  using Base = UopsSnippetGenerator;
-  using Base::Base;
-  // We can compute uops for any FP instruction that does not grow or shrink the
-  // stack (either do not touch the stack or push as much as they pop).
-  llvm::Expected<CodeTemplate> handleCompareFP(const Instruction &Instr) const {
-    return generateUnconstrainedCodeTemplate(
-        Instr, "instruction does not grow/shrink the FP stack");
-  }
-  llvm::Expected<CodeTemplate> handleCondMovFP(const Instruction &Instr) const {
-    return generateUnconstrainedCodeTemplate(
-        Instr, "instruction does not grow/shrink the FP stack");
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override {
+    if (auto E = IsInvalidOpcode(Instr))
+      return std::move(E);
+
+    switch (GetX86FPFlags(Instr)) {
+    case llvm::X86II::NotFP:
+      return UopsSnippetGenerator::generateCodeTemplates(Instr);
+    case llvm::X86II::ZeroArgFP:
+    case llvm::X86II::OneArgFP:
+    case llvm::X86II::SpecialFP:
+      return llvm::make_error<BenchmarkFailure>("Unsupported x87 Instruction");
+    case llvm::X86II::OneArgFPRW:
+    case llvm::X86II::TwoArgFP:
+      // These are instructions like
+      //   - `ST(0) = fsqrt(ST(0))` (OneArgFPRW)
+      //   - `ST(0) = ST(0) + ST(i)` (TwoArgFP)
+      // They are intrinsically serial and do not modify the state of the stack.
+      // We generate the same code for latency and uops.
+      return generateSelfAliasingCodeTemplates(Instr);
+    case llvm::X86II::CompareFP:
+    case llvm::X86II::CondMovFP:
+      // We can compute uops for any FP instruction that does not grow or shrink
+      // the stack (either do not touch the stack or push as much as they pop).
+      return generateUnconstrainedCodeTemplates(
+          Instr, "instruction does not grow/shrink the FP stack");
+    default:
+      llvm_unreachable("Unknown FP Type!");
+    }
   }
 };
 
@@ -182,12 +321,10 @@ struct ConstantInliner {
     return std::move(Instructions);
   }
 
-  std::vector<llvm::MCInst>
-  loadX87AndFinalize(unsigned Reg, unsigned RegBitWidth, unsigned Opcode) {
-    assert((RegBitWidth & 7) == 0 &&
-           "RegBitWidth must be a multiple of 8 bits");
-    initStack(RegBitWidth / 8);
-    add(llvm::MCInstBuilder(Opcode)
+  std::vector<llvm::MCInst> loadX87STAndFinalize(unsigned Reg) {
+    initStack(kF80Bytes);
+    add(llvm::MCInstBuilder(llvm::X86::LD_F80m)
+            // Address = ESP
             .addReg(llvm::X86::RSP) // BaseReg
             .addImm(1)              // ScaleAmt
             .addReg(0)              // IndexReg
@@ -195,7 +332,21 @@ struct ConstantInliner {
             .addReg(0));            // Segment
     if (Reg != llvm::X86::ST0)
       add(llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(Reg));
-    add(releaseStackSpace(RegBitWidth / 8));
+    add(releaseStackSpace(kF80Bytes));
+    return std::move(Instructions);
+  }
+
+  std::vector<llvm::MCInst> loadX87FPAndFinalize(unsigned Reg) {
+    initStack(kF80Bytes);
+    add(llvm::MCInstBuilder(llvm::X86::LD_Fp80m)
+            .addReg(Reg)
+            // Address = ESP
+            .addReg(llvm::X86::RSP) // BaseReg
+            .addImm(1)              // ScaleAmt
+            .addReg(0)              // IndexReg
+            .addImm(0)              // Disp
+            .addReg(0));            // Segment
+    add(releaseStackSpace(kF80Bytes));
     return std::move(Instructions);
   }
 
@@ -206,6 +357,8 @@ struct ConstantInliner {
   }
 
 private:
+  static constexpr const unsigned kF80Bytes = 10; // 80 bits.
+
   ConstantInliner &add(const llvm::MCInst &Inst) {
     Instructions.push_back(Inst);
     return *this;
@@ -239,7 +392,13 @@ private:
   std::vector<llvm::MCInst> Instructions;
 };
 
+#include "X86GenExegesis.inc"
+
 class ExegesisX86Target : public ExegesisTarget {
+public:
+  ExegesisX86Target() : ExegesisTarget(X86CpuPfmCounters) {}
+
+private:
   void addTargetSpecificPasses(llvm::PassManagerBase &PM) const override {
     // Lowers FP pseudo-instructions, e.g. ABS_Fp32 -> ABS_F.
     PM.add(llvm::createX86FloatingPointStackifierPass());
@@ -258,33 +417,28 @@ class ExegesisX86Target : public ExegesisTarget {
 
   void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                           unsigned Offset) const override {
-    // FIXME: For instructions that read AND write to memory, we use the same
-    // value for input and output.
-    for (size_t I = 0, E = IT.Instr.Operands.size(); I < E; ++I) {
-      const Operand *Op = &IT.Instr.Operands[I];
-      if (Op->isExplicit() && Op->isMemory()) {
-        // Case 1: 5-op memory.
-        assert((I + 5 <= E) && "x86 memory references are always 5 ops");
-        IT.getValueFor(*Op) = llvm::MCOperand::createReg(Reg); // BaseReg
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createImm(1); // ScaleAmt
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createReg(0); // IndexReg
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createImm(Offset); // Disp
-        Op = &IT.Instr.Operands[++I];
-        assert(Op->isMemory());
-        assert(Op->isExplicit());
-        IT.getValueFor(*Op) = llvm::MCOperand::createReg(0); // Segment
-        // Case2: segment:index addressing. We assume that ES is 0.
+    assert(!isInvalidMemoryInstr(IT.Instr) &&
+           "fillMemoryOperands requires a valid memory instruction");
+    int MemOpIdx = X86II::getMemoryOperandNo(IT.Instr.Description->TSFlags);
+    assert(MemOpIdx >= 0 && "invalid memory operand index");
+    // getMemoryOperandNo() ignores tied operands, so we have to add them back.
+    for (unsigned I = 0; I <= static_cast<unsigned>(MemOpIdx); ++I) {
+      const auto &Op = IT.Instr.Operands[I];
+      if (Op.isTied() && Op.getTiedToIndex() < I) {
+        ++MemOpIdx;
       }
     }
+    // Now fill in the memory operands.
+    const auto SetOp = [&IT](int OpIdx, const MCOperand &OpVal) {
+      const auto Op = IT.Instr.Operands[OpIdx];
+      assert(Op.isMemory() && Op.isExplicit() && "invalid memory pattern");
+      IT.getValueFor(Op) = OpVal;
+    };
+    SetOp(MemOpIdx + 0, MCOperand::createReg(Reg));    // BaseReg
+    SetOp(MemOpIdx + 1, MCOperand::createImm(1));      // ScaleAmt
+    SetOp(MemOpIdx + 2, MCOperand::createReg(0));      // IndexReg
+    SetOp(MemOpIdx + 3, MCOperand::createImm(Offset)); // Disp
+    SetOp(MemOpIdx + 4, MCOperand::createReg(0));      // Segment
   }
 
   std::vector<llvm::MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
@@ -318,12 +472,12 @@ class ExegesisX86Target : public ExegesisTarget {
       if (STI.getFeatureBits()[llvm::X86::FeatureAVX512])
         return CI.loadAndFinalize(Reg, 512, llvm::X86::VMOVDQU32Zrm);
     if (llvm::X86::RSTRegClass.contains(Reg)) {
-      if (Value.getBitWidth() == 32)
-        return CI.loadX87AndFinalize(Reg, 32, llvm::X86::LD_F32m);
-      if (Value.getBitWidth() == 64)
-        return CI.loadX87AndFinalize(Reg, 64, llvm::X86::LD_F64m);
-      if (Value.getBitWidth() == 80)
-        return CI.loadX87AndFinalize(Reg, 80, llvm::X86::LD_F80m);
+      return CI.loadX87STAndFinalize(Reg);
+    }
+    if (llvm::X86::RFP32RegClass.contains(Reg) ||
+        llvm::X86::RFP64RegClass.contains(Reg) ||
+        llvm::X86::RFP80RegClass.contains(Reg)) {
+      return CI.loadX87FPAndFinalize(Reg);
     }
     if (Reg == llvm::X86::EFLAGS)
       return CI.popFlagAndFinalize();
@@ -332,12 +486,12 @@ class ExegesisX86Target : public ExegesisTarget {
 
   std::unique_ptr<SnippetGenerator>
   createLatencySnippetGenerator(const LLVMState &State) const override {
-    return llvm::make_unique<X86SnippetGenerator<X86LatencyImpl>>(State);
+    return llvm::make_unique<X86LatencySnippetGenerator>(State);
   }
 
   std::unique_ptr<SnippetGenerator>
   createUopsSnippetGenerator(const LLVMState &State) const override {
-    return llvm::make_unique<X86SnippetGenerator<X86UopsImpl>>(State);
+    return llvm::make_unique<X86UopsSnippetGenerator>(State);
   }
 
   bool matchesArch(llvm::Triple::ArchType Arch) const override {
@@ -357,3 +511,4 @@ void InitializeX86ExegesisTarget() {
 }
 
 } // namespace exegesis
+} // namespace llvm
diff --git a/tools/llvm-exegesis/llvm-exegesis.cpp b/tools/llvm-exegesis/llvm-exegesis.cpp
index 8fed1375c6fa2c78aa81139db1d103b14c82bd17..a28e68ec006d3d7bbadd0f866fddc6500697c928 100644
--- a/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -38,72 +38,81 @@
 #include <algorithm>
 #include <string>
 
-static llvm::cl::opt<unsigned>
-    OpcodeIndex("opcode-index", llvm::cl::desc("opcode to measure, by index"),
-                llvm::cl::init(0));
-
-static llvm::cl::opt<std::string>
-    OpcodeName("opcode-name", llvm::cl::desc("opcode to measure, by name"),
-               llvm::cl::init(""));
-
-static llvm::cl::opt<std::string>
-    SnippetsFile("snippets-file", llvm::cl::desc("code snippets to measure"),
-                 llvm::cl::init(""));
-
-static llvm::cl::opt<std::string>
-    BenchmarkFile("benchmarks-file", llvm::cl::desc(""), llvm::cl::init(""));
-
-static llvm::cl::opt<exegesis::InstructionBenchmark::ModeE> BenchmarkMode(
-    "mode", llvm::cl::desc("the mode to run"),
-    llvm::cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency,
-                                "latency", "Instruction Latency"),
-                     clEnumValN(exegesis::InstructionBenchmark::Uops, "uops",
-                                "Uop Decomposition"),
-                     // When not asking for a specific benchmark mode, we'll
-                     // analyse the results.
-                     clEnumValN(exegesis::InstructionBenchmark::Unknown,
-                                "analysis", "Analysis")));
-
-static llvm::cl::opt<unsigned>
+namespace llvm {
+namespace exegesis {
+
+static cl::opt<int> OpcodeIndex("opcode-index",
+                                cl::desc("opcode to measure, by index"),
+                                cl::init(0));
+
+static cl::opt<std::string>
+    OpcodeNames("opcode-name",
+                cl::desc("comma-separated list of opcodes to measure, by name"),
+                cl::init(""));
+
+static cl::opt<std::string> SnippetsFile("snippets-file",
+                                         cl::desc("code snippets to measure"),
+                                         cl::init(""));
+
+static cl::opt<std::string> BenchmarkFile("benchmarks-file", cl::desc(""),
+                                          cl::init(""));
+
+static cl::opt<exegesis::InstructionBenchmark::ModeE>
+    BenchmarkMode("mode", cl::desc("the mode to run"),
+                  cl::values(clEnumValN(exegesis::InstructionBenchmark::Latency,
+                                        "latency", "Instruction Latency"),
+                             clEnumValN(exegesis::InstructionBenchmark::Uops,
+                                        "uops", "Uop Decomposition"),
+                             // When not asking for a specific benchmark mode,
+                             // we'll analyse the results.
+                             clEnumValN(exegesis::InstructionBenchmark::Unknown,
+                                        "analysis", "Analysis")));
+
+static cl::opt<unsigned>
     NumRepetitions("num-repetitions",
-                   llvm::cl::desc("number of time to repeat the asm snippet"),
-                   llvm::cl::init(10000));
+                   cl::desc("number of time to repeat the asm snippet"),
+                   cl::init(10000));
 
-static llvm::cl::opt<bool> IgnoreInvalidSchedClass(
+static cl::opt<bool> IgnoreInvalidSchedClass(
     "ignore-invalid-sched-class",
-    llvm::cl::desc("ignore instructions that do not define a sched class"),
-    llvm::cl::init(false));
+    cl::desc("ignore instructions that do not define a sched class"),
+    cl::init(false));
 
-static llvm::cl::opt<unsigned> AnalysisNumPoints(
+static cl::opt<unsigned> AnalysisNumPoints(
     "analysis-numpoints",
-    llvm::cl::desc("minimum number of points in an analysis cluster"),
-    llvm::cl::init(3));
+    cl::desc("minimum number of points in an analysis cluster"), cl::init(3));
 
-static llvm::cl::opt<float>
+static cl::opt<float>
     AnalysisEpsilon("analysis-epsilon",
-                    llvm::cl::desc("dbscan epsilon for analysis clustering"),
-                    llvm::cl::init(0.1));
+                    cl::desc("dbscan epsilon for analysis clustering"),
+                    cl::init(0.1));
 
-static llvm::cl::opt<std::string>
-    AnalysisClustersOutputFile("analysis-clusters-output-file",
-                               llvm::cl::desc(""), llvm::cl::init("-"));
-static llvm::cl::opt<std::string>
+static cl::opt<std::string>
+    AnalysisClustersOutputFile("analysis-clusters-output-file", cl::desc(""),
+                               cl::init("-"));
+static cl::opt<std::string>
     AnalysisInconsistenciesOutputFile("analysis-inconsistencies-output-file",
-                                      llvm::cl::desc(""), llvm::cl::init("-"));
+                                      cl::desc(""), cl::init("-"));
 
-namespace exegesis {
+static cl::opt<std::string>
+    CpuName("mcpu",
+            cl::desc(
+                "cpu name to use for pfm counters, leave empty to autodetect"),
+            cl::init(""));
 
-static llvm::ExitOnError ExitOnErr;
+
+static ExitOnError ExitOnErr;
 
 #ifdef LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET
 void LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
 #endif
 
-// Checks that only one of OpcodeName, OpcodeIndex or SnippetsFile is provided,
-// and returns the opcode index or 0 if snippets should be read from
+// Checks that only one of OpcodeNames, OpcodeIndex or SnippetsFile is provided,
+// and returns the opcode indices or {} if snippets should be read from
 // `SnippetsFile`.
-static unsigned getOpcodeOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
-  const size_t NumSetFlags = (OpcodeName.empty() ? 0 : 1) +
+static std::vector<unsigned>
+getOpcodesOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
+  const size_t NumSetFlags = (OpcodeNames.empty() ? 0 : 1) +
                              (OpcodeIndex == 0 ? 0 : 1) +
                              (SnippetsFile.empty() ? 0 : 1);
   if (NumSetFlags != 1)
@@ -111,25 +120,42 @@ static unsigned getOpcodeOrDie(const llvm::MCInstrInfo &MCInstrInfo) {
         "please provide one and only one of 'opcode-index', 'opcode-name' or "
         "'snippets-file'");
   if (!SnippetsFile.empty())
-    return 0;
+    return {};
   if (OpcodeIndex > 0)
-    return OpcodeIndex;
+    return {static_cast<unsigned>(OpcodeIndex)};
+  if (OpcodeIndex < 0) {
+    std::vector<unsigned> Result;
+    for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
+      Result.push_back(I);
+    return Result;
+  }
   // Resolve opcode name -> opcode.
-  for (unsigned I = 0, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
-    if (MCInstrInfo.getName(I) == OpcodeName)
-      return I;
-  llvm::report_fatal_error(llvm::Twine("unknown opcode ").concat(OpcodeName));
+  const auto ResolveName =
+      [&MCInstrInfo](llvm::StringRef OpcodeName) -> unsigned {
+    for (unsigned I = 1, E = MCInstrInfo.getNumOpcodes(); I < E; ++I)
+      if (MCInstrInfo.getName(I) == OpcodeName)
+        return I;
+    return 0u;
+  };
+  llvm::SmallVector<llvm::StringRef, 2> Pieces;
+  llvm::StringRef(OpcodeNames.getValue())
+      .split(Pieces, ",", /* MaxSplit */ -1, /* KeepEmpty */ false);
+  std::vector<unsigned> Result;
+  for (const llvm::StringRef OpcodeName : Pieces) {
+    if (unsigned Opcode = ResolveName(OpcodeName))
+      Result.push_back(Opcode);
+    else
+      llvm::report_fatal_error(
+          llvm::Twine("unknown opcode ").concat(OpcodeName));
+  }
+  return Result;
 }
 
 // Generates code snippets for opcode `Opcode`.
 static llvm::Expected<std::vector<BenchmarkCode>>
 generateSnippets(const LLVMState &State, unsigned Opcode) {
-  const std::unique_ptr<SnippetGenerator> Generator =
-      State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State);
-  if (!Generator)
-    llvm::report_fatal_error("cannot create snippet generator");
-
-  const llvm::MCInstrDesc &InstrDesc = State.getInstrInfo().get(Opcode);
+  const Instruction &Instr = State.getIC().getInstr(Opcode);
+  const llvm::MCInstrDesc &InstrDesc = *Instr.Description;
   // Ignore instructions that we cannot run.
   if (InstrDesc.isPseudo())
     return llvm::make_error<BenchmarkFailure>("Unsupported opcode: isPseudo");
@@ -140,7 +166,11 @@ generateSnippets(const LLVMState &State, unsigned Opcode) {
     return llvm::make_error<BenchmarkFailure>(
         "Unsupported opcode: isCall/isReturn");
 
-  return Generator->generateConfigurations(Opcode);
+  const std::unique_ptr<SnippetGenerator> Generator =
+      State.getExegesisTarget().createSnippetGenerator(BenchmarkMode, State);
+  if (!Generator)
+    llvm::report_fatal_error("cannot create snippet generator");
+  return Generator->generateConfigurations(Instr);
 }
 
 namespace {
@@ -298,19 +328,30 @@ void benchmarkMain() {
   LLVM_EXEGESIS_INITIALIZE_NATIVE_TARGET();
 #endif
 
-  const LLVMState State;
-  const auto Opcode = getOpcodeOrDie(State.getInstrInfo());
+  const LLVMState State(CpuName);
+  const auto Opcodes = getOpcodesOrDie(State.getInstrInfo());
 
   std::vector<BenchmarkCode> Configurations;
-  if (Opcode > 0) {
-    // Ignore instructions without a sched class if -ignore-invalid-sched-class
-    // is passed.
-    if (IgnoreInvalidSchedClass &&
-        State.getInstrInfo().get(Opcode).getSchedClass() == 0) {
-      llvm::errs() << "ignoring instruction without sched class\n";
-      return;
+  if (!Opcodes.empty()) {
+    for (const unsigned Opcode : Opcodes) {
+      // Ignore instructions without a sched class if
+      // -ignore-invalid-sched-class is passed.
+      if (IgnoreInvalidSchedClass &&
+          State.getInstrInfo().get(Opcode).getSchedClass() == 0) {
+        llvm::errs() << State.getInstrInfo().getName(Opcode)
+                     << ": ignoring instruction without sched class\n";
+        continue;
+      }
+      auto ConfigsForInstr = generateSnippets(State, Opcode);
+      if (!ConfigsForInstr) {
+        llvm::logAllUnhandledErrors(
+            ConfigsForInstr.takeError(), llvm::errs(),
+            llvm::Twine(State.getInstrInfo().getName(Opcode)).concat(": "));
+        continue;
+      }
+      std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(),
+                std::back_inserter(Configurations));
     }
-    Configurations = ExitOnErr(generateSnippets(State, Opcode));
   } else {
     Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
   }
@@ -365,7 +406,7 @@ static void analysisMain() {
   llvm::InitializeNativeTargetAsmPrinter();
   llvm::InitializeNativeTargetDisassembler();
   // Read benchmarks.
-  const LLVMState State;
+  const LLVMState State("");
   const std::vector<InstructionBenchmark> Points =
       ExitOnErr(InstructionBenchmark::readYamls(State, BenchmarkFile));
   llvm::outs() << "Parsed " << Points.size() << " benchmark points\n";
@@ -396,9 +437,11 @@ static void analysisMain() {
 }
 
 } // namespace exegesis
+} // namespace llvm
 
 int main(int Argc, char **Argv) {
-  llvm::cl::ParseCommandLineOptions(Argc, Argv, "");
+  using namespace llvm;
+  cl::ParseCommandLineOptions(Argc, Argv, "");
 
   exegesis::ExitOnErr.setExitCodeMapper([](const llvm::Error &Err) {
     if (Err.isA<llvm::StringError>())
@@ -406,7 +449,7 @@ int main(int Argc, char **Argv) {
     return EXIT_FAILURE;
   });
 
-  if (BenchmarkMode == exegesis::InstructionBenchmark::Unknown) {
+  if (exegesis::BenchmarkMode == exegesis::InstructionBenchmark::Unknown) {
     exegesis::analysisMain();
   } else {
     exegesis::benchmarkMain();
diff --git a/tools/llvm-lto2/llvm-lto2.cpp b/tools/llvm-lto2/llvm-lto2.cpp
index 442973f90209310bb39a926f89eedaf5e021aa04..26426367e252ed4a8650bada41ed90109a098e3e 100644
--- a/tools/llvm-lto2/llvm-lto2.cpp
+++ b/tools/llvm-lto2/llvm-lto2.cpp
@@ -23,6 +23,7 @@
 #include "llvm/LTO/LTO.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
 
@@ -388,6 +389,7 @@ static int dumpSymtab(int argc, char **argv) {
 }
 
 int main(int argc, char **argv) {
+  InitLLVM X(argc, argv);
   InitializeAllTargets();
   InitializeAllTargetMCs();
   InitializeAllAsmPrinters();
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 0263c866f77230137771f55d73c8aeabb3a4d876..c0976502f54533ffa1e7e93263f0c48fd07498d3 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -164,6 +164,10 @@ MainFileName("main-file-name",
 static cl::opt<bool> SaveTempLabels("save-temp-labels",
                                     cl::desc("Don't discard temporary labels"));
 
+static cl::opt<bool> LexMasmIntegers(
+    "masm-integers",
+    cl::desc("Enable binary and hex masm integers (0b110 and 0ABCh)"));
+
 static cl::opt<bool> NoExecStack("no-exec-stack",
                                  cl::desc("File doesn't need an exec stack"));
 
@@ -293,6 +297,7 @@ static int AssembleInput(const char *ProgName, const Target *TheTarget,
     return SymbolResult;
   Parser->setShowParsedOperands(ShowInstOperands);
   Parser->setTargetParser(*TAP);
+  Parser->getLexer().setLexMasmIntegers(LexMasmIntegers);
 
   int Res = Parser->Run(NoInitialTextSection);
 
diff --git a/tools/llvm-mca/CMakeLists.txt b/tools/llvm-mca/CMakeLists.txt
index fead673ef698d5d133ef6a59c332ef8710d275d0..4339d48d461831e7fe3c96c561106d069f508fb9 100644
--- a/tools/llvm-mca/CMakeLists.txt
+++ b/tools/llvm-mca/CMakeLists.txt
@@ -14,6 +14,7 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_tool(llvm-mca
   llvm-mca.cpp
   CodeRegion.cpp
+  CodeRegionGenerator.cpp
   PipelinePrinter.cpp
   Views/DispatchStatistics.cpp
   Views/InstructionInfoView.cpp
diff --git a/tools/llvm-mca/CodeRegion.cpp b/tools/llvm-mca/CodeRegion.cpp
index 896865996504201e11b6f0e93adf115fac33ece2..29a27c50c171f3ff5f99fedf53ffc4a1c51af298 100644
--- a/tools/llvm-mca/CodeRegion.cpp
+++ b/tools/llvm-mca/CodeRegion.cpp
@@ -14,11 +14,10 @@
 
 #include "CodeRegion.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
-bool CodeRegion::isLocInRange(SMLoc Loc) const {
+bool CodeRegion::isLocInRange(llvm::SMLoc Loc) const {
   if (RangeEnd.isValid() && Loc.getPointer() > RangeEnd.getPointer())
     return false;
   if (RangeStart.isValid() && Loc.getPointer() < RangeStart.getPointer())
@@ -26,11 +25,11 @@ bool CodeRegion::isLocInRange(SMLoc Loc) const {
   return true;
 }
 
-void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) {
+void CodeRegions::beginRegion(llvm::StringRef Description, llvm::SMLoc Loc) {
   assert(!Regions.empty() && "Missing Default region");
   const CodeRegion &CurrentRegion = *Regions.back();
   if (CurrentRegion.startLoc().isValid() && !CurrentRegion.endLoc().isValid()) {
-    SM.PrintMessage(Loc, SourceMgr::DK_Warning,
+    SM.PrintMessage(Loc, llvm::SourceMgr::DK_Warning,
                     "Ignoring invalid region start");
     return;
   }
@@ -41,26 +40,28 @@ void CodeRegions::beginRegion(StringRef Description, SMLoc Loc) {
   addRegion(Description, Loc);
 }
 
-void CodeRegions::endRegion(SMLoc Loc) {
+void CodeRegions::endRegion(llvm::SMLoc Loc) {
   assert(!Regions.empty() && "Missing Default region");
   CodeRegion &CurrentRegion = *Regions.back();
   if (CurrentRegion.endLoc().isValid()) {
-    SM.PrintMessage(Loc, SourceMgr::DK_Warning, "Ignoring invalid region end");
+    SM.PrintMessage(Loc, llvm::SourceMgr::DK_Warning,
+                    "Ignoring invalid region end");
     return;
   }
 
   CurrentRegion.setEndLocation(Loc);
 }
 
-void CodeRegions::addInstruction(std::unique_ptr<const MCInst> Instruction) {
-  const SMLoc &Loc = Instruction->getLoc();
+void CodeRegions::addInstruction(const llvm::MCInst &Instruction) {
+  const llvm::SMLoc &Loc = Instruction.getLoc();
   const auto It =
       std::find_if(Regions.rbegin(), Regions.rend(),
                    [Loc](const std::unique_ptr<CodeRegion> &Region) {
                      return Region->isLocInRange(Loc);
                    });
   if (It != Regions.rend())
-    (*It)->addInstruction(std::move(Instruction));
+    (*It)->addInstruction(Instruction);
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/CodeRegion.h b/tools/llvm-mca/CodeRegion.h
index 7f0025e4884cab21a2979af43e670c307cf8e215..867aa18bb4fe0d3f70c24767b5bb347b7b4983d8 100644
--- a/tools/llvm-mca/CodeRegion.h
+++ b/tools/llvm-mca/CodeRegion.h
@@ -34,12 +34,14 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_H
 #define LLVM_TOOLS_LLVM_MCA_CODEREGION_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
 #include <vector>
 
+namespace llvm {
 namespace mca {
 
 /// A region of assembly code.
@@ -49,7 +51,7 @@ class CodeRegion {
   // An optional descriptor for this region.
   llvm::StringRef Description;
   // Instructions that form this region.
-  std::vector<std::unique_ptr<const llvm::MCInst>> Instructions;
+  std::vector<llvm::MCInst> Instructions;
   // Source location range.
   llvm::SMLoc RangeStart;
   llvm::SMLoc RangeEnd;
@@ -61,8 +63,8 @@ public:
   CodeRegion(llvm::StringRef Desc, llvm::SMLoc Start)
       : Description(Desc), RangeStart(Start), RangeEnd() {}
 
-  void addInstruction(std::unique_ptr<const llvm::MCInst> Instruction) {
-    Instructions.emplace_back(std::move(Instruction));
+  void addInstruction(const llvm::MCInst &Instruction) {
+    Instructions.emplace_back(Instruction);
   }
 
   llvm::SMLoc startLoc() const { return RangeStart; }
@@ -72,10 +74,7 @@ public:
   bool empty() const { return Instructions.empty(); }
   bool isLocInRange(llvm::SMLoc Loc) const;
 
-  const std::vector<std::unique_ptr<const llvm::MCInst>> &
-  getInstructions() const {
-    return Instructions;
-  }
+  llvm::ArrayRef<llvm::MCInst> getInstructions() const { return Instructions; }
 
   llvm::StringRef getDescription() const { return Description; }
 };
@@ -106,26 +105,26 @@ public:
 
   void beginRegion(llvm::StringRef Description, llvm::SMLoc Loc);
   void endRegion(llvm::SMLoc Loc);
-  void addInstruction(std::unique_ptr<const llvm::MCInst> Instruction);
+  void addInstruction(const llvm::MCInst &Instruction);
+  llvm::SourceMgr &getSourceMgr() const { return SM; }
 
   CodeRegions(llvm::SourceMgr &S) : SM(S) {
     // Create a default region for the input code sequence.
     addRegion("Default", llvm::SMLoc());
   }
 
-  const std::vector<std::unique_ptr<const llvm::MCInst>> &
-  getInstructionSequence(unsigned Idx) const {
+  llvm::ArrayRef<llvm::MCInst> getInstructionSequence(unsigned Idx) const {
     return Regions[Idx]->getInstructions();
   }
 
   bool empty() const {
-    return std::all_of(Regions.begin(), Regions.end(),
-                       [](const std::unique_ptr<CodeRegion> &Region) {
-                         return Region->empty();
-                       });
+    return llvm::all_of(Regions, [](const std::unique_ptr<CodeRegion> &Region) {
+      return Region->empty();
+    });
   }
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/CodeRegionGenerator.cpp b/tools/llvm-mca/CodeRegionGenerator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5bd37adeeae999a911f208ba9d6eb2a1e7915e7b
--- /dev/null
+++ b/tools/llvm-mca/CodeRegionGenerator.cpp
@@ -0,0 +1,137 @@
+//===----------------------- CodeRegionGenerator.cpp ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines classes responsible for generating llvm-mca
+/// CodeRegions from various types of input. llvm-mca only analyzes CodeRegions,
+/// so the classes here provide the input-to-CodeRegions translation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeRegionGenerator.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/SMLoc.h"
+#include <memory>
+
+namespace llvm {
+namespace mca {
+
+// This virtual dtor serves as the anchor for the CodeRegionGenerator class.
+CodeRegionGenerator::~CodeRegionGenerator() {}
+
+// A comment consumer that parses strings.  The only valid tokens are strings.
+class MCACommentConsumer : public AsmCommentConsumer {
+public:
+  CodeRegions &Regions;
+
+  MCACommentConsumer(CodeRegions &R) : Regions(R) {}
+  void HandleComment(SMLoc Loc, StringRef CommentText) override;
+};
+
+// This class provides the callbacks that occur when parsing input assembly.
+class MCStreamerWrapper final : public MCStreamer {
+  CodeRegions &Regions;
+
+public:
+  MCStreamerWrapper(MCContext &Context, mca::CodeRegions &R)
+      : MCStreamer(Context), Regions(R) {}
+
+  // We only want to intercept the emission of new instructions.
+  virtual void EmitInstruction(const MCInst &Inst,
+                               const MCSubtargetInfo & /* unused */,
+                               bool /* unused */) override {
+    Regions.addInstruction(Inst);
+  }
+
+  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
+    return true;
+  }
+
+  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                        unsigned ByteAlignment) override {}
+  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+                    uint64_t Size = 0, unsigned ByteAlignment = 0,
+                    SMLoc Loc = SMLoc()) override {}
+  void EmitGPRel32Value(const MCExpr *Value) override {}
+  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+  void EmitCOFFSymbolType(int Type) override {}
+  void EndCOFFSymbolDef() override {}
+
+  ArrayRef<MCInst> GetInstructionSequence(unsigned Index) const {
+    return Regions.getInstructionSequence(Index);
+  }
+};
+
+void MCACommentConsumer::HandleComment(SMLoc Loc, StringRef CommentText) {
+  // Skip empty comments.
+  StringRef Comment(CommentText);
+  if (Comment.empty())
+    return;
+
+  // Skip spaces and tabs.
+  unsigned Position = Comment.find_first_not_of(" \t");
+  if (Position >= Comment.size())
+    // We reached the end of the comment. Bail out.
+    return;
+
+  Comment = Comment.drop_front(Position);
+  if (Comment.consume_front("LLVM-MCA-END")) {
+    Regions.endRegion(Loc);
+    return;
+  }
+
+  // Try to parse the LLVM-MCA-BEGIN comment.
+  if (!Comment.consume_front("LLVM-MCA-BEGIN"))
+    return;
+
+  // Skip spaces and tabs.
+  Position = Comment.find_first_not_of(" \t");
+  if (Position < Comment.size())
+    Comment = Comment.drop_front(Position);
+  // Use the rest of the string as a descriptor for this code snippet.
+  Regions.beginRegion(Comment, Loc);
+}
+
+Expected<const CodeRegions &> AsmCodeRegionGenerator::parseCodeRegions() {
+  MCTargetOptions Opts;
+  Opts.PreserveAsmComments = false;
+  MCStreamerWrapper Str(Ctx, Regions);
+
+  // Create a MCAsmParser and setup the lexer to recognize llvm-mca ASM
+  // comments.
+  std::unique_ptr<MCAsmParser> Parser(
+      createMCAsmParser(Regions.getSourceMgr(), Ctx, Str, MAI));
+  MCAsmLexer &Lexer = Parser->getLexer();
+  MCACommentConsumer CC(Regions);
+  Lexer.setCommentConsumer(&CC);
+
+  // Create a target-specific parser and perform the parse.
+  std::unique_ptr<MCTargetAsmParser> TAP(
+      TheTarget.createMCAsmParser(STI, *Parser, MCII, Opts));
+  if (!TAP)
+    return make_error<StringError>(
+        "This target does not support assembly parsing.",
+        inconvertibleErrorCode());
+  Parser->setTargetParser(*TAP);
+  Parser->Run(false);
+
+  // Get the assembler dialect from the input.  llvm-mca will use this as the
+  // default dialect when printing reports.
+  AssemblerDialect = Parser->getAssemblerDialect();
+  return Regions;
+}
+
+} // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/CodeRegionGenerator.h b/tools/llvm-mca/CodeRegionGenerator.h
new file mode 100644
index 0000000000000000000000000000000000000000..892cafb926861a44fb5f5d56efddc06444aa62a7
--- /dev/null
+++ b/tools/llvm-mca/CodeRegionGenerator.h
@@ -0,0 +1,70 @@
+//===----------------------- CodeRegionGenerator.h --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares classes responsible for generating llvm-mca
+/// CodeRegions from various types of input. llvm-mca only analyzes CodeRegions,
+/// so the classes here provide the input-to-CodeRegions translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H
+#define LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H
+
+#include "CodeRegion.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <memory>
+
+namespace llvm {
+namespace mca {
+
+/// This class is responsible for parsing the input given to the llvm-mca
+/// driver, and converting that into a CodeRegions instance.
+class CodeRegionGenerator {
+protected:
+  CodeRegions Regions;
+  CodeRegionGenerator(const CodeRegionGenerator &) = delete;
+  CodeRegionGenerator &operator=(const CodeRegionGenerator &) = delete;
+
+public:
+  CodeRegionGenerator(SourceMgr &SM) : Regions(SM) {}
+  virtual ~CodeRegionGenerator();
+  virtual Expected<const CodeRegions &> parseCodeRegions() = 0;
+};
+
+/// This class is responsible for parsing input ASM and generating
+/// a CodeRegions instance.
+class AsmCodeRegionGenerator final : public CodeRegionGenerator {
+  const Target &TheTarget;
+  MCContext &Ctx;
+  const MCAsmInfo &MAI;
+  const MCSubtargetInfo &STI;
+  const MCInstrInfo &MCII;
+  unsigned AssemblerDialect; // This is set during parsing.
+
+public:
+  AsmCodeRegionGenerator(const Target &T, SourceMgr &SM, MCContext &C,
+                         const MCAsmInfo &A, const MCSubtargetInfo &S,
+                         const MCInstrInfo &I)
+      : CodeRegionGenerator(SM), TheTarget(T), Ctx(C), MAI(A), STI(S), MCII(I),
+        AssemblerDialect(0) {}
+
+  unsigned getAssemblerDialect() const { return AssemblerDialect; }
+  Expected<const CodeRegions &> parseCodeRegions() override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif // LLVM_TOOLS_LLVM_MCA_CODEREGION_GENERATOR_H
diff --git a/tools/llvm-mca/PipelinePrinter.cpp b/tools/llvm-mca/PipelinePrinter.cpp
index 619f22cc810e4f17c5f61c9e0ed4cf45c6313466..18ef45fc2a6553752c45340f42bb438ae8be4102 100644
--- a/tools/llvm-mca/PipelinePrinter.cpp
+++ b/tools/llvm-mca/PipelinePrinter.cpp
@@ -15,12 +15,12 @@
 #include "PipelinePrinter.h"
 #include "Views/View.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void PipelinePrinter::printReport(llvm::raw_ostream &OS) const {
   for (const auto &V : Views)
     V->printView(OS);
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/PipelinePrinter.h b/tools/llvm-mca/PipelinePrinter.h
index a90b3a2af42fa057bc7fd811cb0a966d292895f2..7e426383f21bc8fd8f5d295190eaa87ca63d813b 100644
--- a/tools/llvm-mca/PipelinePrinter.h
+++ b/tools/llvm-mca/PipelinePrinter.h
@@ -24,6 +24,7 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 /// A printer class that knows how to collects statistics on the
@@ -48,5 +49,6 @@ public:
   void printReport(llvm::raw_ostream &OS) const;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_PIPELINEPRINTER_H
diff --git a/tools/llvm-mca/Views/DispatchStatistics.cpp b/tools/llvm-mca/Views/DispatchStatistics.cpp
index cccb09a9fa7ddaae8e7c1ab7fdf10a6475b0c0cd..2562c82407bf52423603d015ce95d69e3c909f19 100644
--- a/tools/llvm-mca/Views/DispatchStatistics.cpp
+++ b/tools/llvm-mca/Views/DispatchStatistics.cpp
@@ -16,8 +16,7 @@
 #include "Views/DispatchStatistics.h"
 #include "llvm/Support/Format.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 void DispatchStatistics::onEvent(const HWStallEvent &Event) {
@@ -33,7 +32,7 @@ void DispatchStatistics::onEvent(const HWInstructionEvent &Event) {
   NumDispatched += DE.MicroOpcodes;
 }
 
-void DispatchStatistics::printDispatchHistogram(llvm::raw_ostream &OS) const {
+void DispatchStatistics::printDispatchHistogram(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   TempStream << "\n\nDispatch Logic - "
@@ -84,3 +83,4 @@ void DispatchStatistics::printDispatchStalls(raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/DispatchStatistics.h b/tools/llvm-mca/Views/DispatchStatistics.h
index 0f6f75e0954fb2944a9b95ffd2f33e7ea82cc461..6679c81efe954ada2db98196bab5d851828ee206 100644
--- a/tools/llvm-mca/Views/DispatchStatistics.h
+++ b/tools/llvm-mca/Views/DispatchStatistics.h
@@ -39,6 +39,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class DispatchStatistics : public View {
@@ -80,5 +81,6 @@ public:
   }
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/InstructionInfoView.cpp b/tools/llvm-mca/Views/InstructionInfoView.cpp
index a2e3001383a112d038c7d2f9b588530aa51efc02..5016afb49e442af600a4d8f2161bae973aecc793 100644
--- a/tools/llvm-mca/Views/InstructionInfoView.cpp
+++ b/tools/llvm-mca/Views/InstructionInfoView.cpp
@@ -14,15 +14,13 @@
 
 #include "Views/InstructionInfoView.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void InstructionInfoView::printView(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   const MCSchedModel &SM = STI.getSchedModel();
-  unsigned Instructions = Source.size();
 
   std::string Instruction;
   raw_string_ostream InstrStream(Instruction);
@@ -32,8 +30,7 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
              << "[4]: MayLoad\n[5]: MayStore\n[6]: HasSideEffects (U)\n\n";
 
   TempStream << "[1]    [2]    [3]    [4]    [5]    [6]    Instructions:\n";
-  for (unsigned I = 0, E = Instructions; I < E; ++I) {
-    const MCInst &Inst = Source.getMCInstFromIndex(I);
+  for (const MCInst &Inst : Source) {
     const MCInstrDesc &MCDesc = MCII.get(Inst.getOpcode());
 
     // Obtain the scheduling class information from the instruction.
@@ -89,3 +86,4 @@ void InstructionInfoView::printView(raw_ostream &OS) const {
   OS << Buffer;
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/InstructionInfoView.h b/tools/llvm-mca/Views/InstructionInfoView.h
index 435c058d82432a11cd5c2be3113e4ed0304eb552..3ef95d474490b0e9c7dc0b210460f792b70c5974 100644
--- a/tools/llvm-mca/Views/InstructionInfoView.h
+++ b/tools/llvm-mca/Views/InstructionInfoView.h
@@ -35,8 +35,9 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONINFOVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -44,23 +45,25 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 /// A view that prints out generic instruction information.
 class InstructionInfoView : public View {
   const llvm::MCSubtargetInfo &STI;
   const llvm::MCInstrInfo &MCII;
-  const SourceMgr &Source;
+  llvm::ArrayRef<llvm::MCInst> Source;
   llvm::MCInstPrinter &MCIP;
 
 public:
   InstructionInfoView(const llvm::MCSubtargetInfo &sti,
-                      const llvm::MCInstrInfo &mcii, const SourceMgr &S,
-                      llvm::MCInstPrinter &IP)
+                      const llvm::MCInstrInfo &mcii,
+                      llvm::ArrayRef<llvm::MCInst> S, llvm::MCInstPrinter &IP)
       : STI(sti), MCII(mcii), Source(S), MCIP(IP) {}
 
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.cpp b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
index 7dbc76a51e190a429019d0c7f102a1dec398e243..06202bc414212af9fe369d931154567509a23b9b 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.cpp
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.cpp
@@ -15,16 +15,18 @@
 #include "Views/RegisterFileStatistics.h"
 #include "llvm/Support/Format.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
-void RegisterFileStatistics::initializeRegisterFileInfo() {
+RegisterFileStatistics::RegisterFileStatistics(const MCSubtargetInfo &sti)
+    : STI(sti) {
   const MCSchedModel &SM = STI.getSchedModel();
-  RegisterFileUsage Empty = {0, 0, 0};
+  RegisterFileUsage RFUEmpty = {0, 0, 0};
+  MoveEliminationInfo MEIEmpty = {0, 0, 0, 0, 0};
   if (!SM.hasExtraProcessorInfo()) {
     // Assume a single register file.
-    RegisterFiles.emplace_back(Empty);
+    PRFUsage.emplace_back(RFUEmpty);
+    MoveElimInfo.emplace_back(MEIEmpty);
     return;
   }
 
@@ -35,8 +37,42 @@ void RegisterFileStatistics::initializeRegisterFileInfo() {
   // be skipped. If there are no user defined register files, then reserve a
   // single entry for the default register file at index #0.
   unsigned NumRegFiles = std::max(PI.NumRegisterFiles, 1U);
-  RegisterFiles.resize(NumRegFiles);
-  std::fill(RegisterFiles.begin(), RegisterFiles.end(), Empty);
+
+  PRFUsage.resize(NumRegFiles);
+  std::fill(PRFUsage.begin(), PRFUsage.end(), RFUEmpty);
+
+  MoveElimInfo.resize(NumRegFiles);
+  std::fill(MoveElimInfo.begin(), MoveElimInfo.end(), MEIEmpty);
+}
+
+void RegisterFileStatistics::updateRegisterFileUsage(
+    ArrayRef<unsigned> UsedPhysRegs) {
+  for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I) {
+    RegisterFileUsage &RFU = PRFUsage[I];
+    unsigned NumUsedPhysRegs = UsedPhysRegs[I];
+    RFU.CurrentlyUsedMappings += NumUsedPhysRegs;
+    RFU.TotalMappings += NumUsedPhysRegs;
+    RFU.MaxUsedMappings =
+        std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings);
+  }
+}
+
+void RegisterFileStatistics::updateMoveElimInfo(const Instruction &Inst) {
+  if (!Inst.isOptimizableMove())
+    return;
+
+  assert(Inst.getDefs().size() == 1 && "Expected a single definition!");
+  assert(Inst.getUses().size() == 1 && "Expected a single register use!");
+  const WriteState &WS = Inst.getDefs()[0];
+  const ReadState &RS = Inst.getUses()[0];
+
+  MoveEliminationInfo &Info =
+      MoveElimInfo[Inst.getDefs()[0].getRegisterFileID()];
+  Info.TotalMoveEliminationCandidates++;
+  if (WS.isEliminated())
+    Info.CurrentMovesEliminated++;
+  if (WS.isWriteZero() && RS.isReadZero())
+    Info.TotalMovesThatPropagateZero++;
 }
 
 void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) {
@@ -45,21 +81,24 @@ void RegisterFileStatistics::onEvent(const HWInstructionEvent &Event) {
     break;
   case HWInstructionEvent::Retired: {
     const auto &RE = static_cast<const HWInstructionRetiredEvent &>(Event);
-    for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I)
-      RegisterFiles[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I];
+    for (unsigned I = 0, E = PRFUsage.size(); I < E; ++I)
+      PRFUsage[I].CurrentlyUsedMappings -= RE.FreedPhysRegs[I];
     break;
   }
   case HWInstructionEvent::Dispatched: {
     const auto &DE = static_cast<const HWInstructionDispatchedEvent &>(Event);
-    for (unsigned I = 0, E = RegisterFiles.size(); I < E; ++I) {
-      RegisterFileUsage &RFU = RegisterFiles[I];
-      unsigned NumUsedPhysRegs = DE.UsedPhysRegs[I];
-      RFU.CurrentlyUsedMappings += NumUsedPhysRegs;
-      RFU.TotalMappings += NumUsedPhysRegs;
-      RFU.MaxUsedMappings =
-          std::max(RFU.MaxUsedMappings, RFU.CurrentlyUsedMappings);
-    }
+    updateRegisterFileUsage(DE.UsedPhysRegs);
+    updateMoveElimInfo(*DE.IR.getInstruction());
+  }
   }
+}
+
+void RegisterFileStatistics::onCycleEnd() {
+  for (MoveEliminationInfo &MEI : MoveElimInfo) {
+    unsigned &CurrentMax = MEI.MaxMovesEliminatedPerCycle;
+    CurrentMax = std::max(CurrentMax, MEI.CurrentMovesEliminated);
+    MEI.TotalMovesEliminated += MEI.CurrentMovesEliminated;
+    MEI.CurrentMovesEliminated = 0;
   }
 }
 
@@ -68,14 +107,14 @@ void RegisterFileStatistics::printView(raw_ostream &OS) const {
   raw_string_ostream TempStream(Buffer);
 
   TempStream << "\n\nRegister File statistics:";
-  const RegisterFileUsage &GlobalUsage = RegisterFiles[0];
+  const RegisterFileUsage &GlobalUsage = PRFUsage[0];
   TempStream << "\nTotal number of mappings created:    "
              << GlobalUsage.TotalMappings;
   TempStream << "\nMax number of mappings used:         "
              << GlobalUsage.MaxUsedMappings << '\n';
 
-  for (unsigned I = 1, E = RegisterFiles.size(); I < E; ++I) {
-    const RegisterFileUsage &RFU = RegisterFiles[I];
+  for (unsigned I = 1, E = PRFUsage.size(); I < E; ++I) {
+    const RegisterFileUsage &RFU = PRFUsage[I];
     // Obtain the register file descriptor from the scheduling model.
     assert(STI.getSchedModel().hasExtraProcessorInfo() &&
            "Unable to find register file info!");
@@ -98,6 +137,27 @@ void RegisterFileStatistics::printView(raw_ostream &OS) const {
                << RFU.TotalMappings;
     TempStream << "\n   Max number of mappings used:      "
                << RFU.MaxUsedMappings << '\n';
+    const MoveEliminationInfo &MEI = MoveElimInfo[I];
+
+    if (MEI.TotalMoveEliminationCandidates) {
+      TempStream << "   Number of optimizable moves:      "
+                 << MEI.TotalMoveEliminationCandidates;
+      double EliminatedMovProportion = (double)MEI.TotalMovesEliminated /
+                                       MEI.TotalMoveEliminationCandidates *
+                                       100.0;
+      double ZeroMovProportion = (double)MEI.TotalMovesThatPropagateZero /
+                                 MEI.TotalMoveEliminationCandidates * 100.0;
+      TempStream << "\n   Number of moves eliminated:       "
+                 << MEI.TotalMovesEliminated << "  "
+                 << format("(%.1f%%)",
+                           floor((EliminatedMovProportion * 10) + 0.5) / 10);
+      TempStream << "\n   Number of zero moves:             "
+                 << MEI.TotalMovesThatPropagateZero << "  "
+                 << format("(%.1f%%)",
+                           floor((ZeroMovProportion * 10) + 0.5) / 10);
+      TempStream << "\n   Max moves eliminated per cycle:   "
+                 << MEI.MaxMovesEliminatedPerCycle << '\n';
+    }
   }
 
   TempStream.flush();
@@ -105,3 +165,4 @@ void RegisterFileStatistics::printView(raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/RegisterFileStatistics.h b/tools/llvm-mca/Views/RegisterFileStatistics.h
index 3dcac4d4f75f2a5094e6090fefaf9ce2cc22b0f6..a2c52a668daef74ea0388751ec78a653ed5d47d5 100644
--- a/tools/llvm-mca/Views/RegisterFileStatistics.h
+++ b/tools/llvm-mca/Views/RegisterFileStatistics.h
@@ -21,6 +21,10 @@
 ///    Number of physical registers:     72
 ///    Total number of mappings created: 0
 ///    Max number of mappings used:      0
+///    Number of optimizable moves:      200
+///    Number of moves eliminated:       200 (100.0%)
+///    Number of zero moves:             200 (100.0%)
+///    Max moves eliminated per cycle:   2
 ///
 /// *  Register File #2 -- IntegerPRF:
 ///    Number of physical registers:     64
@@ -36,6 +40,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
+namespace llvm {
 namespace mca {
 
 class RegisterFileStatistics : public View {
@@ -48,20 +53,29 @@ class RegisterFileStatistics : public View {
     unsigned CurrentlyUsedMappings;
   };
 
+  struct MoveEliminationInfo {
+    unsigned TotalMoveEliminationCandidates;
+    unsigned TotalMovesEliminated;
+    unsigned TotalMovesThatPropagateZero;
+    unsigned MaxMovesEliminatedPerCycle;
+    unsigned CurrentMovesEliminated;
+  };
+
   // There is one entry for each register file implemented by the processor.
-  llvm::SmallVector<RegisterFileUsage, 4> RegisterFiles;
+  llvm::SmallVector<RegisterFileUsage, 4> PRFUsage;
+  llvm::SmallVector<MoveEliminationInfo, 4> MoveElimInfo;
 
-  void initializeRegisterFileInfo();
+  void updateRegisterFileUsage(ArrayRef<unsigned> UsedPhysRegs);
+  void updateMoveElimInfo(const Instruction &Inst);
 
 public:
-  RegisterFileStatistics(const llvm::MCSubtargetInfo &sti) : STI(sti) {
-    initializeRegisterFileInfo();
-  }
+  RegisterFileStatistics(const llvm::MCSubtargetInfo &sti);
 
+  void onCycleEnd() override;
   void onEvent(const HWInstructionEvent &Event) override;
-
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/ResourcePressureView.cpp b/tools/llvm-mca/Views/ResourcePressureView.cpp
index bba1e70bc2686a0d5a0fd81209cdd3cbf1aab04e..6df61840437df225ff41bdb3419e469dee5cb4b6 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.cpp
+++ b/tools/llvm-mca/Views/ResourcePressureView.cpp
@@ -16,11 +16,13 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
-void ResourcePressureView::initialize() {
+ResourcePressureView::ResourcePressureView(const llvm::MCSubtargetInfo &sti,
+                                           MCInstPrinter &Printer,
+                                           ArrayRef<MCInst> S)
+    : STI(sti), MCIP(Printer), Source(S), LastInstructionIdx(0) {
   // Populate the map of resource descriptors.
   unsigned R2VIndex = 0;
   const MCSchedModel &SM = STI.getSchedModel();
@@ -41,9 +43,15 @@ void ResourcePressureView::initialize() {
 }
 
 void ResourcePressureView::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Dispatched) {
+    LastInstructionIdx = Event.IR.getSourceIndex();
+    return;
+  }
+
   // We're only interested in Issue events.
   if (Event.Type != HWInstructionEvent::Issued)
     return;
+
   const auto &IssueEvent = static_cast<const HWInstructionIssuedEvent &>(Event);
   const unsigned SourceIdx = Event.IR.getSourceIndex() % Source.size();
   for (const std::pair<ResourceRef, ResourceCycles> &Use :
@@ -92,8 +100,7 @@ static void printResourcePressure(formatted_raw_ostream &OS, double Pressure,
   OS.PadToColumn(Col);
 }
 
-void ResourcePressureView::printResourcePressurePerIteration(
-    raw_ostream &OS, unsigned Executions) const {
+void ResourcePressureView::printResourcePressurePerIter(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   formatted_raw_ostream FOS(TempStream);
@@ -126,6 +133,7 @@ void ResourcePressureView::printResourcePressurePerIteration(
   FOS << '\n';
   FOS.flush();
 
+  const unsigned Executions = LastInstructionIdx / Source.size() + 1;
   for (unsigned I = 0, E = NumResourceUnits; I < E; ++I) {
     double Usage = ResourceUsage[I + Source.size() * E];
     printResourcePressure(FOS, Usage / Executions, (I + 1) * 7);
@@ -135,8 +143,7 @@ void ResourcePressureView::printResourcePressurePerIteration(
   OS << Buffer;
 }
 
-void ResourcePressureView::printResourcePressurePerInstruction(
-    raw_ostream &OS, unsigned Executions) const {
+void ResourcePressureView::printResourcePressurePerInst(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   formatted_raw_ostream FOS(TempStream);
@@ -148,13 +155,16 @@ void ResourcePressureView::printResourcePressurePerInstruction(
   std::string Instruction;
   raw_string_ostream InstrStream(Instruction);
 
-  for (unsigned I = 0, E = Source.size(); I < E; ++I) {
+  unsigned InstrIndex = 0;
+  const unsigned Executions = LastInstructionIdx / Source.size() + 1;
+  for (const MCInst &MCI : Source) {
+    unsigned BaseEltIdx = InstrIndex * NumResourceUnits;
     for (unsigned J = 0; J < NumResourceUnits; ++J) {
-      double Usage = ResourceUsage[J + I * NumResourceUnits];
+      double Usage = ResourceUsage[J + BaseEltIdx];
       printResourcePressure(FOS, Usage / Executions, (J + 1) * 7);
     }
 
-    MCIP.printInst(&Source.getMCInstFromIndex(I), InstrStream, "", STI);
+    MCIP.printInst(&MCI, InstrStream, "", STI);
     InstrStream.flush();
     StringRef Str(Instruction);
 
@@ -167,6 +177,9 @@ void ResourcePressureView::printResourcePressurePerInstruction(
     FOS.flush();
     OS << Buffer;
     Buffer = "";
+
+    ++InstrIndex;
   }
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/ResourcePressureView.h b/tools/llvm-mca/Views/ResourcePressureView.h
index ad9c29a55e5cbab3f0b7799ba5a5db20ad85ba3a..572ce6fe6b7007eed7dc655d2c748b4892e74b80 100644
--- a/tools/llvm-mca/Views/ResourcePressureView.h
+++ b/tools/llvm-mca/Views/ResourcePressureView.h
@@ -58,13 +58,14 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_RESOURCEPRESSUREVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include <map>
 
+namespace llvm {
 namespace mca {
 
 /// This class collects resource pressure statistics and it is able to print
@@ -72,7 +73,8 @@ namespace mca {
 class ResourcePressureView : public View {
   const llvm::MCSubtargetInfo &STI;
   llvm::MCInstPrinter &MCIP;
-  const SourceMgr &Source;
+  llvm::ArrayRef<llvm::MCInst> Source;
+  unsigned LastInstructionIdx;
 
   // Map to quickly obtain the ResourceUsage column index from a processor
   // resource ID.
@@ -82,28 +84,21 @@ class ResourcePressureView : public View {
   std::vector<ResourceCycles> ResourceUsage;
   unsigned NumResourceUnits;
 
-  const llvm::MCInst &GetMCInstFromIndex(unsigned Index) const;
-  void printResourcePressurePerIteration(llvm::raw_ostream &OS,
-                                         unsigned Executions) const;
-  void printResourcePressurePerInstruction(llvm::raw_ostream &OS,
-                                           unsigned Executions) const;
-  void initialize();
+  void printResourcePressurePerIter(llvm::raw_ostream &OS) const;
+  void printResourcePressurePerInst(llvm::raw_ostream &OS) const;
 
 public:
   ResourcePressureView(const llvm::MCSubtargetInfo &sti,
-                       llvm::MCInstPrinter &Printer, const SourceMgr &SM)
-      : STI(sti), MCIP(Printer), Source(SM) {
-    initialize();
-  }
+                       llvm::MCInstPrinter &Printer,
+                       llvm::ArrayRef<llvm::MCInst> S);
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void printView(llvm::raw_ostream &OS) const override {
-    unsigned Executions = Source.getNumIterations();
-    printResourcePressurePerIteration(OS, Executions);
-    printResourcePressurePerInstruction(OS, Executions);
+    printResourcePressurePerIter(OS);
+    printResourcePressurePerInst(OS);
   }
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
index d5aab396b4c5353a6ae9fccd766685d690c7ed22..7e2fd316c97760b95942fde254ece4506c8d6b67 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.cpp
@@ -15,8 +15,7 @@
 #include "Views/RetireControlUnitStatistics.h"
 #include "llvm/Support/Format.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
@@ -24,7 +23,7 @@ void RetireControlUnitStatistics::onEvent(const HWInstructionEvent &Event) {
     ++NumRetired;
 }
 
-void RetireControlUnitStatistics::printView(llvm::raw_ostream &OS) const {
+void RetireControlUnitStatistics::printView(raw_ostream &OS) const {
   std::string Buffer;
   raw_string_ostream TempStream(Buffer);
   TempStream << "\n\nRetire Control Unit - "
@@ -47,3 +46,4 @@ void RetireControlUnitStatistics::printView(llvm::raw_ostream &OS) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/RetireControlUnitStatistics.h b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
index 0531e389c903bda07cc662cf95ab0fe9dd41f296..9a4821ec31a157675bcb7188b2d823931798d26d 100644
--- a/tools/llvm-mca/Views/RetireControlUnitStatistics.h
+++ b/tools/llvm-mca/Views/RetireControlUnitStatistics.h
@@ -30,6 +30,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class RetireControlUnitStatistics : public View {
@@ -48,13 +49,12 @@ public:
   RetireControlUnitStatistics() : NumRetired(0), NumCycles(0) {}
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void onCycleBegin() override { NumCycles++; }
-
   void onCycleEnd() override { updateHistograms(); }
 
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.cpp b/tools/llvm-mca/Views/SchedulerStatistics.cpp
index bc91bf04a81cb07fa007f6e36ae22ad10832a9df..edd6056c1e8aea585ea9b391697cd14421f53787 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.cpp
+++ b/tools/llvm-mca/Views/SchedulerStatistics.cpp
@@ -16,8 +16,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 void SchedulerStatistics::onEvent(const HWInstructionEvent &Event) {
@@ -121,9 +120,10 @@ void SchedulerStatistics::printSchedulerUsage(raw_ostream &OS) const {
   FOS.flush();
 }
 
-void SchedulerStatistics::printView(llvm::raw_ostream &OS) const {
+void SchedulerStatistics::printView(raw_ostream &OS) const {
   printSchedulerStats(OS);
   printSchedulerUsage(OS);
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/SchedulerStatistics.h b/tools/llvm-mca/Views/SchedulerStatistics.h
index de70db26ed4da8c37ebb3073b682e833154553d4..56dd3af19124aa516e5e0e0423e013d6d9bc5644 100644
--- a/tools/llvm-mca/Views/SchedulerStatistics.h
+++ b/tools/llvm-mca/Views/SchedulerStatistics.h
@@ -42,6 +42,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class SchedulerStatistics final : public View {
@@ -70,9 +71,7 @@ public:
         Usage(STI.getSchedModel().NumProcResourceKinds, {0, 0, 0}) {}
 
   void onEvent(const HWInstructionEvent &Event) override;
-
   void onCycleBegin() override { NumCycles++; }
-
   void onCycleEnd() override { updateHistograms(); }
 
   // Increases the number of used scheduler queue slots of every buffered
@@ -88,5 +87,6 @@ public:
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/SummaryView.cpp b/tools/llvm-mca/Views/SummaryView.cpp
index eb4c50c5d1f49046efd8069ada6334cd3f18f889..fdf27600c933e204cfb8aad446e7cab955c7cf58 100644
--- a/tools/llvm-mca/Views/SummaryView.cpp
+++ b/tools/llvm-mca/Views/SummaryView.cpp
@@ -18,21 +18,23 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Format.h"
 
+namespace llvm {
 namespace mca {
 
 #define DEBUG_TYPE "llvm-mca"
 
-using namespace llvm;
-
-SummaryView::SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+SummaryView::SummaryView(const MCSchedModel &Model, ArrayRef<MCInst> S,
                          unsigned Width)
-    : SM(Model), Source(S), DispatchWidth(Width), TotalCycles(0),
-      NumMicroOps(0), ProcResourceUsage(Model.getNumProcResourceKinds(), 0),
-      ProcResourceMasks(Model.getNumProcResourceKinds(), 0) {
+    : SM(Model), Source(S), DispatchWidth(Width), LastInstructionIdx(0),
+      TotalCycles(0), NumMicroOps(0),
+      ProcResourceUsage(Model.getNumProcResourceKinds(), 0) {
   computeProcResourceMasks(SM, ProcResourceMasks);
 }
 
 void SummaryView::onEvent(const HWInstructionEvent &Event) {
+  if (Event.Type == HWInstructionEvent::Dispatched)
+    LastInstructionIdx = Event.IR.getSourceIndex();
+
   // We are only interested in the "instruction retired" events generated by
   // the retire stage for instructions that are part of iteration #0.
   if (Event.Type != HWInstructionEvent::Retired ||
@@ -58,8 +60,8 @@ void SummaryView::onEvent(const HWInstructionEvent &Event) {
 }
 
 void SummaryView::printView(raw_ostream &OS) const {
-  unsigned Iterations = Source.getNumIterations();
   unsigned Instructions = Source.size();
+  unsigned Iterations = (LastInstructionIdx / Instructions) + 1;
   unsigned TotalInstructions = Instructions * Iterations;
   unsigned TotalUOps = NumMicroOps * Iterations;
   double IPC = (double)TotalInstructions / TotalCycles;
@@ -85,3 +87,4 @@ void SummaryView::printView(raw_ostream &OS) const {
   OS << Buffer;
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/SummaryView.h b/tools/llvm-mca/Views/SummaryView.h
index 13875976d398f6eef199e48565b073408d659200..f59fd4233fbecc03bf7f88a036659e8b7bff1cb5 100644
--- a/tools/llvm-mca/Views/SummaryView.h
+++ b/tools/llvm-mca/Views/SummaryView.h
@@ -29,19 +29,20 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_SUMMARYVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
 /// A view that collects and prints a few performance numbers.
 class SummaryView : public View {
   const llvm::MCSchedModel &SM;
-  const SourceMgr &Source;
+  llvm::ArrayRef<llvm::MCInst> Source;
   const unsigned DispatchWidth;
+  unsigned LastInstructionIdx;
   unsigned TotalCycles;
   // The total number of micro opcodes contributed by a block of instructions.
   unsigned NumMicroOps;
@@ -62,15 +63,15 @@ class SummaryView : public View {
   double getBlockRThroughput() const;
 
 public:
-  SummaryView(const llvm::MCSchedModel &Model, const SourceMgr &S,
+  SummaryView(const llvm::MCSchedModel &Model, llvm::ArrayRef<llvm::MCInst> S,
               unsigned Width);
 
   void onCycleEnd() override { ++TotalCycles; }
-
   void onEvent(const HWInstructionEvent &Event) override;
 
   void printView(llvm::raw_ostream &OS) const override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/TimelineView.cpp b/tools/llvm-mca/Views/TimelineView.cpp
index 1ad7271b2a4745a1d1b1bd59dc7a2ab66a6222b0..7d55bbc99c73d7666319a40063e99a5689df0924 100644
--- a/tools/llvm-mca/Views/TimelineView.cpp
+++ b/tools/llvm-mca/Views/TimelineView.cpp
@@ -14,20 +14,18 @@
 
 #include "Views/TimelineView.h"
 
-using namespace llvm;
-
+namespace llvm {
 namespace mca {
 
 TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
-                           const SourceMgr &S, unsigned MaxIterations,
+                           llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations,
                            unsigned Cycles)
-    : STI(sti), MCIP(Printer), AsmSequence(S), CurrentCycle(0),
+    : STI(sti), MCIP(Printer), Source(S), CurrentCycle(0),
       MaxCycle(Cycles == 0 ? 80 : Cycles), LastCycle(0), WaitTime(S.size()),
       UsedBuffer(S.size()) {
-  unsigned NumInstructions = AsmSequence.size();
-  if (!MaxIterations)
-    MaxIterations = DEFAULT_ITERATIONS;
-  NumInstructions *= std::min(MaxIterations, AsmSequence.getNumIterations());
+  unsigned NumInstructions = Source.size();
+  assert(Iterations && "Invalid number of iterations specified!");
+  NumInstructions *= Iterations;
   Timeline.resize(NumInstructions);
   TimelineViewEntry InvalidTVEntry = {-1, 0, 0, 0, 0};
   std::fill(Timeline.begin(), Timeline.end(), InvalidTVEntry);
@@ -42,7 +40,7 @@ TimelineView::TimelineView(const MCSubtargetInfo &sti, MCInstPrinter &Printer,
 
 void TimelineView::onReservedBuffers(const InstRef &IR,
                                      ArrayRef<unsigned> Buffers) {
-  if (IR.getSourceIndex() >= AsmSequence.size())
+  if (IR.getSourceIndex() >= Source.size())
     return;
 
   const MCSchedModel &SM = STI.getSchedModel();
@@ -72,7 +70,7 @@ void TimelineView::onEvent(const HWInstructionEvent &Event) {
     // Update the WaitTime entry which corresponds to this Index.
     assert(TVEntry.CycleDispatched >= 0 && "Invalid TVEntry found!");
     unsigned CycleDispatched = static_cast<unsigned>(TVEntry.CycleDispatched);
-    WaitTimeEntry &WTEntry = WaitTime[Index % AsmSequence.size()];
+    WaitTimeEntry &WTEntry = WaitTime[Index % Source.size()];
     WTEntry.CyclesSpentInSchedulerQueue +=
         TVEntry.CycleIssued - CycleDispatched;
     assert(CycleDispatched <= TVEntry.CycleReady &&
@@ -176,12 +174,11 @@ void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
   raw_string_ostream InstrStream(Instruction);
 
   formatted_raw_ostream FOS(OS);
-  unsigned Executions = Timeline.size() / AsmSequence.size();
-  for (unsigned I = 0, E = WaitTime.size(); I < E; ++I) {
-    printWaitTimeEntry(FOS, WaitTime[I], I, Executions);
+  unsigned Executions = Timeline.size() / Source.size();
+  unsigned IID = 0;
+  for (const MCInst &Inst : Source) {
+    printWaitTimeEntry(FOS, WaitTime[IID], IID, Executions);
     // Append the instruction info at the end of the line.
-    const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
-
     MCIP.printInst(&Inst, InstrStream, "", STI);
     InstrStream.flush();
 
@@ -191,6 +188,8 @@ void TimelineView::printAverageWaitTimes(raw_ostream &OS) const {
     FOS << "   " << Str << '\n';
     FOS.flush();
     Instruction = "";
+
+    ++IID;
   }
 }
 
@@ -266,25 +265,30 @@ void TimelineView::printTimeline(raw_ostream &OS) const {
   std::string Instruction;
   raw_string_ostream InstrStream(Instruction);
 
-  for (unsigned I = 0, E = Timeline.size(); I < E; ++I) {
-    const TimelineViewEntry &Entry = Timeline[I];
-    if (Entry.CycleRetired == 0)
-      return;
-
-    unsigned Iteration = I / AsmSequence.size();
-    unsigned SourceIndex = I % AsmSequence.size();
-    printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
-    // Append the instruction info at the end of the line.
-    const MCInst &Inst = AsmSequence.getMCInstFromIndex(I);
-    MCIP.printInst(&Inst, InstrStream, "", STI);
-    InstrStream.flush();
-
-    // Consume any tabs or spaces at the beginning of the string.
-    StringRef Str(Instruction);
-    Str = Str.ltrim();
-    FOS << "   " << Str << '\n';
-    FOS.flush();
-    Instruction = "";
+  unsigned IID = 0;
+  const unsigned Iterations = Timeline.size() / Source.size();
+  for (unsigned Iteration = 0; Iteration < Iterations; ++Iteration) {
+    for (const MCInst &Inst : Source) {
+      const TimelineViewEntry &Entry = Timeline[IID];
+      if (Entry.CycleRetired == 0)
+        return;
+
+      unsigned SourceIndex = IID % Source.size();
+      printTimelineViewEntry(FOS, Entry, Iteration, SourceIndex);
+      // Append the instruction info at the end of the line.
+      MCIP.printInst(&Inst, InstrStream, "", STI);
+      InstrStream.flush();
+
+      // Consume any tabs or spaces at the beginning of the string.
+      StringRef Str(Instruction);
+      Str = Str.ltrim();
+      FOS << "   " << Str << '\n';
+      FOS.flush();
+      Instruction = "";
+
+      ++IID;
+    }
   }
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/TimelineView.h b/tools/llvm-mca/Views/TimelineView.h
index 361e37ac625299c8bfc055e3ec25d1fb31f151e6..ee981800161c90c0112fc7c3f432ce2e8b26e34c 100644
--- a/tools/llvm-mca/Views/TimelineView.h
+++ b/tools/llvm-mca/Views/TimelineView.h
@@ -100,13 +100,15 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
 #define LLVM_TOOLS_LLVM_MCA_TIMELINEVIEW_H
 
-#include "SourceMgr.h"
 #include "Views/View.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
 /// This class listens to instruction state transition events
@@ -119,7 +121,7 @@ namespace mca {
 class TimelineView : public View {
   const llvm::MCSubtargetInfo &STI;
   llvm::MCInstPrinter &MCIP;
-  const SourceMgr &AsmSequence;
+  llvm::ArrayRef<llvm::MCInst> Source;
 
   unsigned CurrentCycle;
   unsigned MaxCycle;
@@ -152,8 +154,6 @@ class TimelineView : public View {
                           const WaitTimeEntry &E, unsigned Index,
                           unsigned Executions) const;
 
-  const unsigned DEFAULT_ITERATIONS = 10;
-
   // Display characters for the TimelineView report output.
   struct DisplayChar {
     static const char Dispatched = 'D';
@@ -166,7 +166,7 @@ class TimelineView : public View {
 
 public:
   TimelineView(const llvm::MCSubtargetInfo &sti, llvm::MCInstPrinter &Printer,
-               const SourceMgr &Sequence, unsigned MaxIterations,
+               llvm::ArrayRef<llvm::MCInst> S, unsigned Iterations,
                unsigned Cycles);
 
   // Event handlers.
@@ -184,5 +184,6 @@ public:
   }
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/Views/View.cpp b/tools/llvm-mca/Views/View.cpp
index 1cf4daeec84ea8047a94c8d246cb345f4b2e4376..6cfb9dd9f3948b5c24e046d64edda1f2c777933f 100644
--- a/tools/llvm-mca/Views/View.cpp
+++ b/tools/llvm-mca/Views/View.cpp
@@ -14,7 +14,9 @@
 
 #include "Views/View.h"
 
+namespace llvm {
 namespace mca {
 
 void View::anchor() {}
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/Views/View.h b/tools/llvm-mca/Views/View.h
index 9ba94a5da9771fde44f36b2d0fc4a50903987c0a..c332bb53938ab732f535babf0d7eb79273703e58 100644
--- a/tools/llvm-mca/Views/View.h
+++ b/tools/llvm-mca/Views/View.h
@@ -19,6 +19,7 @@
 #include "HWEventListener.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
 class View : public HWEventListener {
@@ -28,5 +29,6 @@ public:
   void anchor() override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Context.h b/tools/llvm-mca/include/Context.h
index 9d64ae32f1ca94413d9ac6ddae25762bff497ff0..ebd1528e371f722e970785dc689f383ad1e813a5 100644
--- a/tools/llvm-mca/include/Context.h
+++ b/tools/llvm-mca/include/Context.h
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include <memory>
 
+namespace llvm {
 namespace mca {
 
 /// This is a convenience struct to hold the parameters necessary for creating
@@ -42,13 +43,12 @@ struct PipelineOptions {
 };
 
 class Context {
-  llvm::SmallVector<std::unique_ptr<HardwareUnit>, 4> Hardware;
-  const llvm::MCRegisterInfo &MRI;
-  const llvm::MCSubtargetInfo &STI;
+  SmallVector<std::unique_ptr<HardwareUnit>, 4> Hardware;
+  const MCRegisterInfo &MRI;
+  const MCSubtargetInfo &STI;
 
 public:
-  Context(const llvm::MCRegisterInfo &R, const llvm::MCSubtargetInfo &S)
-      : MRI(R), STI(S) {}
+  Context(const MCRegisterInfo &R, const MCSubtargetInfo &S) : MRI(R), STI(S) {}
   Context(const Context &C) = delete;
   Context &operator=(const Context &C) = delete;
 
@@ -64,4 +64,5 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 #endif // LLVM_TOOLS_LLVM_MCA_CONTEXT_H
diff --git a/tools/llvm-mca/include/HWEventListener.h b/tools/llvm-mca/include/HWEventListener.h
index cef78041565f36286c818a413b7fe37840ce04e0..0216fae7866b71ead66e3aa5635997664e1915c9 100644
--- a/tools/llvm-mca/include/HWEventListener.h
+++ b/tools/llvm-mca/include/HWEventListener.h
@@ -19,6 +19,7 @@
 #include "Support.h"
 #include "llvm/ADT/ArrayRef.h"
 
+namespace llvm {
 namespace mca {
 
 // An HWInstructionEvent represents state changes of instructions that
@@ -61,23 +62,22 @@ public:
 class HWInstructionIssuedEvent : public HWInstructionEvent {
 public:
   using ResourceRef = std::pair<uint64_t, uint64_t>;
-  HWInstructionIssuedEvent(
-      const InstRef &IR,
-      llvm::ArrayRef<std::pair<ResourceRef, ResourceCycles>> UR)
+  HWInstructionIssuedEvent(const InstRef &IR,
+                           ArrayRef<std::pair<ResourceRef, ResourceCycles>> UR)
       : HWInstructionEvent(HWInstructionEvent::Issued, IR), UsedResources(UR) {}
 
-  llvm::ArrayRef<std::pair<ResourceRef, ResourceCycles>> UsedResources;
+  ArrayRef<std::pair<ResourceRef, ResourceCycles>> UsedResources;
 };
 
 class HWInstructionDispatchedEvent : public HWInstructionEvent {
 public:
-  HWInstructionDispatchedEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs,
+  HWInstructionDispatchedEvent(const InstRef &IR, ArrayRef<unsigned> Regs,
                                unsigned UOps)
       : HWInstructionEvent(HWInstructionEvent::Dispatched, IR),
         UsedPhysRegs(Regs), MicroOpcodes(UOps) {}
   // Number of physical register allocated for this instruction. There is one
   // entry per register file.
-  llvm::ArrayRef<unsigned> UsedPhysRegs;
+  ArrayRef<unsigned> UsedPhysRegs;
   // Number of micro opcodes dispatched.
   // This field is often set to the total number of micro-opcodes specified by
   // the instruction descriptor of IR.
@@ -92,12 +92,12 @@ public:
 
 class HWInstructionRetiredEvent : public HWInstructionEvent {
 public:
-  HWInstructionRetiredEvent(const InstRef &IR, llvm::ArrayRef<unsigned> Regs)
+  HWInstructionRetiredEvent(const InstRef &IR, ArrayRef<unsigned> Regs)
       : HWInstructionEvent(HWInstructionEvent::Retired, IR),
         FreedPhysRegs(Regs) {}
   // Number of register writes that have been architecturally committed. There
   // is one entry per register file.
-  llvm::ArrayRef<unsigned> FreedPhysRegs;
+  ArrayRef<unsigned> FreedPhysRegs;
 };
 
 // A HWStallEvent represents a pipeline stall caused by the lack of hardware
@@ -141,9 +141,9 @@ public:
   // Events generated by the Scheduler when buffered resources are
   // consumed/freed for an instruction.
   virtual void onReservedBuffers(const InstRef &Inst,
-                                 llvm::ArrayRef<unsigned> Buffers) {}
+                                 ArrayRef<unsigned> Buffers) {}
   virtual void onReleasedBuffers(const InstRef &Inst,
-                                 llvm::ArrayRef<unsigned> Buffers) {}
+                                 ArrayRef<unsigned> Buffers) {}
 
   virtual ~HWEventListener() {}
 
@@ -151,5 +151,6 @@ private:
   virtual void anchor();
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h b/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
index e8c496ab967a98a5f6863ed3dbdcff72bbc379d2..5070418c11bffc3f97a2dfd8d21d36adb9eb22cd 100644
--- a/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/HardwareUnit.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
 #define LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
 
+namespace llvm {
 namespace mca {
 
 class HardwareUnit {
@@ -28,4 +29,5 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 #endif // LLVM_TOOLS_LLVM_MCA_HARDWAREUNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/LSUnit.h b/tools/llvm-mca/include/HardwareUnits/LSUnit.h
index b348c973ee04cee7d91dbd688a994b1b3892f883..6b36282ca7253aa7499d9c32f91ab42558bdf49d 100644
--- a/tools/llvm-mca/include/HardwareUnits/LSUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/LSUnit.h
@@ -19,6 +19,7 @@
 #include "HardwareUnits/HardwareUnit.h"
 #include <set>
 
+namespace llvm {
 namespace mca {
 
 class InstRef;
@@ -128,11 +129,7 @@ public:
   void dump() const;
 #endif
 
-  enum Status {
-    LSU_AVAILABLE = 0,
-    LSU_LQUEUE_FULL,
-    LSU_SQUEUE_FULL
-  };
+  enum Status { LSU_AVAILABLE = 0, LSU_LQUEUE_FULL, LSU_SQUEUE_FULL };
 
   // Returns LSU_AVAILABLE if there are enough load/store queue entries to serve
   // IR. It also returns LSU_AVAILABLE if IR is not a memory operation.
@@ -156,5 +153,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
index 1026079c3772386add10a6b82136fe99f9dd2048..d9949bf4f6a13be5208240ba299cb4d7927895c1 100644
--- a/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
+++ b/tools/llvm-mca/include/HardwareUnits/RegisterFile.h
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 class ReadState;
@@ -33,7 +34,7 @@ class WriteRef;
 /// Manages hardware register files, and tracks register definitions for
 /// register renaming purposes.
 class RegisterFile : public HardwareUnit {
-  const llvm::MCRegisterInfo &MRI;
+  const MCRegisterInfo &MRI;
 
   // class RegisterMappingTracker is a  physical register file (PRF) descriptor.
   // There is one RegisterMappingTracker for every PRF definition in the
@@ -68,9 +69,11 @@ class RegisterFile : public HardwareUnit {
     bool AllowZeroMoveEliminationOnly;
 
     RegisterMappingTracker(unsigned NumPhysRegisters,
-                           unsigned MaxMoveEliminated = 0U)
+                           unsigned MaxMoveEliminated = 0U,
+                           bool AllowZeroMoveElimOnly = false)
         : NumPhysRegs(NumPhysRegisters), NumUsedPhysRegs(0),
-          MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U) {}
+          MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U),
+          AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly) {}
   };
 
   // A vector of register file descriptors.  This set always contains at least
@@ -82,7 +85,7 @@ class RegisterFile : public HardwareUnit {
   //
   // Users can limit the number of physical registers that are available in
   // regsiter file #0 specifying command line flag `-register-file-size=<uint>`.
-  llvm::SmallVector<RegisterMappingTracker, 4> RegisterFiles;
+  SmallVector<RegisterMappingTracker, 4> RegisterFiles;
 
   // This type is used to propagate information about the owner of a register,
   // and the cost of allocating it in the PRF. Register cost is defined as the
@@ -98,7 +101,7 @@ class RegisterFile : public HardwareUnit {
   //
   // There is a RegisterRenamingInfo object for every logical register defined
   // by the target. RegisteRenamingInfo objects are stored into vector
-  // `RegisterMappings`, and llvm::MCPhysReg IDs can be used to reference
+  // `RegisterMappings`, and MCPhysReg IDs can be used to reference
   // elements in that vector.
   //
   // Each RegisterRenamingInfo is owned by a PRF, and field `IndexPlusCost`
@@ -107,12 +110,18 @@ class RegisterFile : public HardwareUnit {
   //
   // Field `AllowMoveElimination` is set for registers that are used as
   // destination by optimizable register moves.
+  //
+  // Field `AliasRegID` is set by writes from register moves that have been
+  // eliminated at register renaming stage. A move eliminated at register
+  // renaming stage is effectively bypassed, and its write aliases the source
+  // register definition.
   struct RegisterRenamingInfo {
     IndexPlusCostPairTy IndexPlusCost;
-    llvm::MCPhysReg RenameAs;
+    MCPhysReg RenameAs;
+    MCPhysReg AliasRegID;
     bool AllowMoveElimination;
     RegisterRenamingInfo()
-        : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U),
+        : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U), AliasRegID(0U),
           AllowMoveElimination(false) {}
   };
 
@@ -135,7 +144,7 @@ class RegisterFile : public HardwareUnit {
 
   // Used to track zero registers. There is one bit for each register defined by
   // the target. Bits are set for registers that are known to be zero.
-  llvm::APInt ZeroRegisters;
+  APInt ZeroRegisters;
 
   // This method creates a new register file descriptor.
   // The new register file owns all of the registers declared by register
@@ -151,49 +160,56 @@ class RegisterFile : public HardwareUnit {
   // Here FPRegisterFile contains all the registers defined by register class
   // VR128RegClass and VR256RegClass. FPRegisterFile implements 60
   // registers which can be used for register renaming purpose.
-  void
-  addRegisterFile(llvm::ArrayRef<llvm::MCRegisterCostEntry> RegisterClasses,
-                  unsigned NumPhysRegs);
+  void addRegisterFile(const MCRegisterFileDesc &RF,
+                       ArrayRef<MCRegisterCostEntry> Entries);
 
   // Consumes physical registers in each register file specified by the
   // `IndexPlusCostPairTy`. This method is called from `addRegisterMapping()`.
   void allocatePhysRegs(const RegisterRenamingInfo &Entry,
-                        llvm::MutableArrayRef<unsigned> UsedPhysRegs);
+                        MutableArrayRef<unsigned> UsedPhysRegs);
 
   // Releases previously allocated physical registers from the register file(s).
   // This method is called from `invalidateRegisterMapping()`.
   void freePhysRegs(const RegisterRenamingInfo &Entry,
-                    llvm::MutableArrayRef<unsigned> FreedPhysRegs);
+                    MutableArrayRef<unsigned> FreedPhysRegs);
+
+  // Collects writes that are in a RAW dependency with RS.
+  // This method is called from `addRegisterRead()`.
+  void collectWrites(const ReadState &RS,
+                     SmallVectorImpl<WriteRef> &Writes) const;
 
   // Create an instance of RegisterMappingTracker for every register file
   // specified by the processor model.
   // If no register file is specified, then this method creates a default
   // register file with an unbounded number of physical registers.
-  void initialize(const llvm::MCSchedModel &SM, unsigned NumRegs);
+  void initialize(const MCSchedModel &SM, unsigned NumRegs);
 
 public:
-  RegisterFile(const llvm::MCSchedModel &SM, const llvm::MCRegisterInfo &mri,
+  RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
                unsigned NumRegs = 0);
 
   // This method updates the register mappings inserting a new register
   // definition. This method is also responsible for updating the number of
   // allocated physical registers in each register file modified by the write.
   // No physical regiser is allocated if this write is from a zero-idiom.
-  void addRegisterWrite(WriteRef Write,
-                        llvm::MutableArrayRef<unsigned> UsedPhysRegs);
+  void addRegisterWrite(WriteRef Write, MutableArrayRef<unsigned> UsedPhysRegs);
+
+  // Collect writes that are in a data dependency with RS, and update RS
+  // internal state.
+  void addRegisterRead(ReadState &RS, SmallVectorImpl<WriteRef> &Writes) const;
 
   // Removes write \param WS from the register mappings.
   // Physical registers may be released to reflect this update.
   // No registers are released if this write is from a zero-idiom.
   void removeRegisterWrite(const WriteState &WS,
-                           llvm::MutableArrayRef<unsigned> FreedPhysRegs);
+                           MutableArrayRef<unsigned> FreedPhysRegs);
 
   // Returns true if a move from RS to WS can be eliminated.
   // On success, it updates WriteState by setting flag `WS.isEliminated`.
   // If RS is a read from a zero register, and WS is eliminated, then
   // `WS.WritesZero` is also set, so that method addRegisterWrite() would not
   // reserve a physical register for it.
-  bool tryEliminateMove(WriteState &WS, const ReadState &RS);
+  bool tryEliminateMove(WriteState &WS, ReadState &RS);
 
   // Checks if there are enough physical registers in the register files.
   // Returns a "response mask" where each bit represents the response from a
@@ -204,9 +220,9 @@ public:
   //
   // Current implementation can simulate up to 32 register files (including the
   // special register file at index #0).
-  unsigned isAvailable(llvm::ArrayRef<unsigned> Regs) const;
-  void collectWrites(llvm::SmallVectorImpl<WriteRef> &Writes,
-                     unsigned RegID) const;
+  unsigned isAvailable(ArrayRef<unsigned> Regs) const;
+
+  // Returns the number of PRFs implemented by this processor.
   unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
 
   // Notify each PRF that a new cycle just started.
@@ -218,5 +234,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
diff --git a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h b/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
index dfac15f53fc966c299d6dd482858c0b9717c61d7..065ead8f1a8abd300ad06f4e0fae775b8cb061b8 100644
--- a/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
+++ b/tools/llvm-mca/include/HardwareUnits/ResourceManager.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
 /// Used to notify the internal state of a processor resource.
@@ -188,8 +189,7 @@ class ResourceState {
   }
 
 public:
-  ResourceState(const llvm::MCProcResourceDesc &Desc, unsigned Index,
-                uint64_t Mask);
+  ResourceState(const MCProcResourceDesc &Desc, unsigned Index, uint64_t Mask);
 
   unsigned getProcResourceID() const { return ProcResourceDescIndex; }
   uint64_t getResourceMask() const { return ResourceMask; }
@@ -210,9 +210,7 @@ public:
   /// `NumUnits` available units.
   bool isReady(unsigned NumUnits = 1) const;
 
-  bool isAResourceGroup() const {
-    return llvm::countPopulation(ResourceMask) > 1;
-  }
+  bool isAResourceGroup() const { return countPopulation(ResourceMask) > 1; }
 
   bool containsResource(uint64_t ID) const { return ResourceMask & ID; }
 
@@ -227,7 +225,7 @@ public:
   }
 
   unsigned getNumUnits() const {
-    return isAResourceGroup() ? 1U : llvm::countPopulation(ResourceSizeMask);
+    return isAResourceGroup() ? 1U : countPopulation(ResourceSizeMask);
   }
 
   /// Checks if there is an available slot in the resource buffer.
@@ -285,10 +283,10 @@ class ResourceManager {
 
   // Keeps track of which resources are busy, and how many cycles are left
   // before those become usable again.
-  llvm::SmallDenseMap<ResourceRef, unsigned> BusyResources;
+  SmallDenseMap<ResourceRef, unsigned> BusyResources;
 
   // A table to map processor resource IDs to processor resource masks.
-  llvm::SmallVector<uint64_t, 8> ProcResID2Mask;
+  SmallVector<uint64_t, 8> ProcResID2Mask;
 
   // Returns the actual resource unit that will be used.
   ResourceRef selectPipe(uint64_t ResourceID);
@@ -304,7 +302,7 @@ class ResourceManager {
                              uint64_t ResourceMask);
 
 public:
-  ResourceManager(const llvm::MCSchedModel &SM);
+  ResourceManager(const MCSchedModel &SM);
   virtual ~ResourceManager() = default;
 
   // Overrides the selection strategy for the resource at index ResourceID in
@@ -318,17 +316,17 @@ public:
 
   // Returns RS_BUFFER_AVAILABLE if buffered resources are not reserved, and if
   // there are enough available slots in the buffers.
-  ResourceStateEvent canBeDispatched(llvm::ArrayRef<uint64_t> Buffers) const;
+  ResourceStateEvent canBeDispatched(ArrayRef<uint64_t> Buffers) const;
 
   // Return the processor resource identifier associated to this Mask.
   unsigned resolveResourceMask(uint64_t Mask) const;
 
   // Consume a slot in every buffered resource from array 'Buffers'. Resource
   // units that are dispatch hazards (i.e. BufferSize=0) are marked as reserved.
-  void reserveBuffers(llvm::ArrayRef<uint64_t> Buffers);
+  void reserveBuffers(ArrayRef<uint64_t> Buffers);
 
   // Release buffer entries previously allocated by method reserveBuffers.
-  void releaseBuffers(llvm::ArrayRef<uint64_t> Buffers);
+  void releaseBuffers(ArrayRef<uint64_t> Buffers);
 
   // Reserve a processor resource. A reserved resource is not available for
   // instruction issue until it is released.
@@ -345,9 +343,9 @@ public:
 
   void issueInstruction(
       const InstrDesc &Desc,
-      llvm::SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
+      SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
 
-  void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &ResourcesFreed);
+  void cycleEvent(SmallVectorImpl<ResourceRef> &ResourcesFreed);
 
 #ifndef NDEBUG
   void dump() const {
@@ -357,5 +355,6 @@ public:
 #endif
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_RESOURCE_MANAGER_H
diff --git a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h b/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
index 552a2094ff13901b4a8b1bf0204403f800c8e727..12e0a1fba136ea926c1ce3f48d8e9b7fdcf7fa60 100644
--- a/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
+++ b/tools/llvm-mca/include/HardwareUnits/RetireControlUnit.h
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCSchedule.h"
 #include <vector>
 
+namespace llvm {
 namespace mca {
 
 /// This class tracks which instructions are in-flight (i.e., dispatched but not
@@ -62,7 +63,7 @@ private:
   std::vector<RUToken> Queue;
 
 public:
-  RetireControlUnit(const llvm::MCSchedModel &SM);
+  RetireControlUnit(const MCSchedModel &SM);
 
   bool isEmpty() const { return AvailableSlots == Queue.size(); }
   bool isAvailable(unsigned Quantity = 1) const {
@@ -98,5 +99,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_RETIRE_CONTROL_UNIT_H
diff --git a/tools/llvm-mca/include/HardwareUnits/Scheduler.h b/tools/llvm-mca/include/HardwareUnits/Scheduler.h
index db124958ee5eeb43a04b0d6b08cb09f59907122d..17332b430d2edff7f8c5d00ff4e31f75f5757242 100644
--- a/tools/llvm-mca/include/HardwareUnits/Scheduler.h
+++ b/tools/llvm-mca/include/HardwareUnits/Scheduler.h
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
 class SchedulerStrategy {
@@ -104,25 +105,25 @@ class Scheduler : public HardwareUnit {
   /// Issue an instruction without updating the ready queue.
   void issueInstructionImpl(
       InstRef &IR,
-      llvm::SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
+      SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Pipes);
 
   // Identify instructions that have finished executing, and remove them from
   // the IssuedSet. References to executed instructions are added to input
   // vector 'Executed'.
-  void updateIssuedSet(llvm::SmallVectorImpl<InstRef> &Executed);
+  void updateIssuedSet(SmallVectorImpl<InstRef> &Executed);
 
   // Try to promote instructions from WaitSet to ReadySet.
   // Add promoted instructions to the 'Ready' vector in input.
-  void promoteToReadySet(llvm::SmallVectorImpl<InstRef> &Ready);
+  void promoteToReadySet(SmallVectorImpl<InstRef> &Ready);
 
 public:
-  Scheduler(const llvm::MCSchedModel &Model, LSUnit *Lsu)
-      : LSU(Lsu), Resources(llvm::make_unique<ResourceManager>(Model)) {
+  Scheduler(const MCSchedModel &Model, LSUnit *Lsu)
+      : LSU(Lsu), Resources(make_unique<ResourceManager>(Model)) {
     initializeStrategy(nullptr);
   }
-  Scheduler(const llvm::MCSchedModel &Model, LSUnit *Lsu,
+  Scheduler(const MCSchedModel &Model, LSUnit *Lsu,
             std::unique_ptr<SchedulerStrategy> SelectStrategy)
-      : LSU(Lsu), Resources(llvm::make_unique<ResourceManager>(Model)) {
+      : LSU(Lsu), Resources(make_unique<ResourceManager>(Model)) {
     initializeStrategy(std::move(SelectStrategy));
   }
   Scheduler(std::unique_ptr<ResourceManager> RM, LSUnit *Lsu,
@@ -167,8 +168,8 @@ public:
   /// result of this event.
   void issueInstruction(
       InstRef &IR,
-      llvm::SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Used,
-      llvm::SmallVectorImpl<InstRef> &Ready);
+      SmallVectorImpl<std::pair<ResourceRef, ResourceCycles>> &Used,
+      SmallVectorImpl<InstRef> &Ready);
 
   /// Returns true if IR has to be issued immediately, or if IR is a zero
   /// latency instruction.
@@ -181,9 +182,9 @@ public:
   /// have changed in state, and that are now available to new instructions.
   /// Instructions executed are added to vector Executed, while vector Ready is
   /// populated with instructions that have become ready in this new cycle.
-  void cycleEvent(llvm::SmallVectorImpl<ResourceRef> &Freed,
-                  llvm::SmallVectorImpl<InstRef> &Ready,
-                  llvm::SmallVectorImpl<InstRef> &Executed);
+  void cycleEvent(SmallVectorImpl<ResourceRef> &Freed,
+                  SmallVectorImpl<InstRef> &Ready,
+                  SmallVectorImpl<InstRef> &Executed);
 
   /// Convert a resource mask into a valid llvm processor resource identifier.
   unsigned getResourceID(uint64_t Mask) const {
@@ -202,12 +203,13 @@ public:
   // This routine performs a sanity check.  This routine should only be called
   // when we know that 'IR' is not in the scheduler's instruction queues.
   void sanityCheck(const InstRef &IR) const {
-    assert(llvm::find(WaitSet, IR) == WaitSet.end());
-    assert(llvm::find(ReadySet, IR) == ReadySet.end());
-    assert(llvm::find(IssuedSet, IR) == IssuedSet.end());
+    assert(find(WaitSet, IR) == WaitSet.end() && "Already in the wait set!");
+    assert(find(ReadySet, IR) == ReadySet.end() && "Already in the ready set!");
+    assert(find(IssuedSet, IR) == IssuedSet.end() && "Already executing!");
   }
 #endif // !NDEBUG
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_SCHEDULER_H
diff --git a/tools/llvm-mca/include/InstrBuilder.h b/tools/llvm-mca/include/InstrBuilder.h
index ff7fb52044a4b4f71f9226473a3fbd40116767b8..67aa889cf7bf5ebc0751ffc523c3703f42516760 100644
--- a/tools/llvm-mca/include/InstrBuilder.h
+++ b/tools/llvm-mca/include/InstrBuilder.h
@@ -17,13 +17,13 @@
 
 #include "Instruction.h"
 #include "Support.h"
-#include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 /// A builder class that knows how to construct Instruction objects.
@@ -37,54 +37,34 @@ namespace mca {
 /// Information from the machine scheduling model is used to identify processor
 /// resources that are consumed by an instruction.
 class InstrBuilder {
-  const llvm::MCSubtargetInfo &STI;
-  const llvm::MCInstrInfo &MCII;
-  const llvm::MCRegisterInfo &MRI;
-  const llvm::MCInstrAnalysis &MCIA;
-  llvm::MCInstPrinter &MCIP;
-  llvm::SmallVector<uint64_t, 8> ProcResourceMasks;
+  const MCSubtargetInfo &STI;
+  const MCInstrInfo &MCII;
+  const MCRegisterInfo &MRI;
+  const MCInstrAnalysis &MCIA;
+  SmallVector<uint64_t, 8> ProcResourceMasks;
 
-  llvm::DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
-  llvm::DenseMap<const llvm::MCInst *, std::unique_ptr<const InstrDesc>>
-      VariantDescriptors;
+  DenseMap<unsigned short, std::unique_ptr<const InstrDesc>> Descriptors;
+  DenseMap<const MCInst *, std::unique_ptr<const InstrDesc>> VariantDescriptors;
 
-  llvm::Expected<const InstrDesc &>
-  createInstrDescImpl(const llvm::MCInst &MCI);
-  llvm::Expected<const InstrDesc &>
-  getOrCreateInstrDesc(const llvm::MCInst &MCI);
+  Expected<const InstrDesc &> createInstrDescImpl(const MCInst &MCI);
+  Expected<const InstrDesc &> getOrCreateInstrDesc(const MCInst &MCI);
 
   InstrBuilder(const InstrBuilder &) = delete;
   InstrBuilder &operator=(const InstrBuilder &) = delete;
 
-  llvm::Error populateWrites(InstrDesc &ID, const llvm::MCInst &MCI,
-                             unsigned SchedClassID);
-  llvm::Error populateReads(InstrDesc &ID, const llvm::MCInst &MCI,
-                            unsigned SchedClassID);
-  llvm::Error verifyInstrDesc(const InstrDesc &ID,
-                              const llvm::MCInst &MCI) const;
+  Error populateWrites(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
+  Error populateReads(InstrDesc &ID, const MCInst &MCI, unsigned SchedClassID);
+  Error verifyInstrDesc(const InstrDesc &ID, const MCInst &MCI) const;
 
 public:
-  InstrBuilder(const llvm::MCSubtargetInfo &sti, const llvm::MCInstrInfo &mcii,
-               const llvm::MCRegisterInfo &mri,
-               const llvm::MCInstrAnalysis &mcia, llvm::MCInstPrinter &mcip)
-      : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), MCIP(mcip),
-        ProcResourceMasks(STI.getSchedModel().getNumProcResourceKinds()) {
-    computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
-  }
-
-  // Returns an array of processor resource masks.
-  // Masks are computed by function mca::computeProcResourceMasks. see
-  // Support.h for a description of how masks are computed and how masks can be
-  // used to solve set membership problems.
-  llvm::ArrayRef<uint64_t> getProcResourceMasks() const {
-    return ProcResourceMasks;
-  }
+  InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
+               const MCRegisterInfo &RI, const MCInstrAnalysis &IA);
 
   void clear() { VariantDescriptors.shrink_and_clear(); }
 
-  llvm::Expected<std::unique_ptr<Instruction>>
-  createInstruction(const llvm::MCInst &MCI);
+  Expected<std::unique_ptr<Instruction>> createInstruction(const MCInst &MCI);
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Instruction.h b/tools/llvm-mca/include/Instruction.h
index 0d7db11795efaeb61c5ca5e9ed04df2afdc3910a..7407283bca2c3933aacd83ed3db269349975b774 100644
--- a/tools/llvm-mca/include/Instruction.h
+++ b/tools/llvm-mca/include/Instruction.h
@@ -16,7 +16,9 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
 #define LLVM_TOOLS_LLVM_MCA_INSTRUCTION_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/MathExtras.h"
 
 #ifndef NDEBUG
@@ -27,6 +29,7 @@
 #include <set>
 #include <vector>
 
+namespace llvm {
 namespace mca {
 
 constexpr int UNKNOWN_CYCLES = -512;
@@ -86,7 +89,7 @@ class ReadState;
 /// register write. It also tracks how many cycles are left before the write
 /// back stage.
 class WriteState {
-  const WriteDescriptor &WD;
+  const WriteDescriptor *WD;
   // On instruction issue, this field is set equal to the write latency.
   // Before instruction issue, this field defaults to -512, a special
   // value that represents an "unknown" number of cycles.
@@ -98,6 +101,9 @@ class WriteState {
   // field RegisterID from WD.
   unsigned RegisterID;
 
+  // Physical register file that serves register RegisterID.
+  unsigned PRFID;
+
   // True if this write implicitly clears the upper portion of RegisterID's
   // super-registers.
   bool ClearsSuperRegs;
@@ -131,16 +137,18 @@ class WriteState {
 public:
   WriteState(const WriteDescriptor &Desc, unsigned RegID,
              bool clearsSuperRegs = false, bool writesZero = false)
-      : WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
-        ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
+      : WD(&Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
+        PRFID(0), ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
         IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {}
-  WriteState(const WriteState &Other) = delete;
-  WriteState &operator=(const WriteState &Other) = delete;
+
+  WriteState(const WriteState &Other) = default;
+  WriteState &operator=(const WriteState &Other) = default;
 
   int getCyclesLeft() const { return CyclesLeft; }
-  unsigned getWriteResourceID() const { return WD.SClassOrWriteResourceID; }
+  unsigned getWriteResourceID() const { return WD->SClassOrWriteResourceID; }
   unsigned getRegisterID() const { return RegisterID; }
-  unsigned getLatency() const { return WD.Latency; }
+  unsigned getRegisterFileID() const { return PRFID; }
+  unsigned getLatency() const { return WD->Latency; }
 
   void addUser(ReadState *Use, int ReadAdvance);
 
@@ -164,6 +172,8 @@ public:
     IsEliminated = true;
   }
 
+  void setPRF(unsigned PRF) { PRFID = PRF; }
+
   // On every cycle, update CyclesLeft and notify dependent users.
   void cycleEvent();
   void onInstructionIssued();
@@ -178,9 +188,11 @@ public:
 /// A read may be dependent on more than one write. This occurs when some
 /// writes only partially update the register associated to this read.
 class ReadState {
-  const ReadDescriptor &RD;
+  const ReadDescriptor *RD;
   // Physical register identified associated to this read.
   unsigned RegisterID;
+  // Physical register file that serves register RegisterID.
+  unsigned PRFID;
   // Number of writes that contribute to the definition of RegisterID.
   // In the absence of partial register updates, the number of DependentWrites
   // cannot be more than one.
@@ -197,23 +209,24 @@ class ReadState {
   // This field is set to true only if there are no dependent writes, and
   // there are no `CyclesLeft' to wait.
   bool IsReady;
+  // True if this is a read from a known zero register.
+  bool IsZero;
   // True if this register read is from a dependency-breaking instruction.
   bool IndependentFromDef;
 
 public:
   ReadState(const ReadDescriptor &Desc, unsigned RegID)
-      : RD(Desc), RegisterID(RegID), DependentWrites(0),
+      : RD(&Desc), RegisterID(RegID), PRFID(0), DependentWrites(0),
         CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true),
-        IndependentFromDef(false) {}
-  ReadState(const ReadState &Other) = delete;
-  ReadState &operator=(const ReadState &Other) = delete;
+        IsZero(false), IndependentFromDef(false) {}
 
-  const ReadDescriptor &getDescriptor() const { return RD; }
-  unsigned getSchedClass() const { return RD.SchedClassID; }
+  const ReadDescriptor &getDescriptor() const { return *RD; }
+  unsigned getSchedClass() const { return RD->SchedClassID; }
   unsigned getRegisterID() const { return RegisterID; }
+  unsigned getRegisterFileID() const { return PRFID; }
 
   bool isReady() const { return IsReady; }
-  bool isImplicitRead() const { return RD.isImplicitRead(); }
+  bool isImplicitRead() const { return RD->isImplicitRead(); }
 
   bool isIndependentFromDef() const { return IndependentFromDef; }
   void setIndependentFromDef() { IndependentFromDef = true; }
@@ -224,6 +237,10 @@ public:
     DependentWrites = Writes;
     IsReady = !Writes;
   }
+
+  bool isReadZero() const { return IsZero; }
+  void setReadZero() { IsZero = true; }
+  void setPRF(unsigned ID) { PRFID = ID; }
 };
 
 /// A sequence of cycles.
@@ -313,13 +330,59 @@ struct InstrDesc {
   InstrDesc &operator=(const InstrDesc &Other) = delete;
 };
 
+/// Base class for instructions consumed by the simulation pipeline.
+///
+/// This class tracks data dependencies as well as generic properties
+/// of the instruction.
+class InstructionBase {
+  const InstrDesc &Desc;
+
+  // This field is set for instructions that are candidates for move
+  // elimination. For more information about move elimination, see the
+  // definition of RegisterMappingTracker in RegisterFile.h
+  bool IsOptimizableMove;
+
+  // Output dependencies.
+  // One entry per each implicit and explicit register definition.
+  SmallVector<WriteState, 4> Defs;
+
+  // Input dependencies.
+  // One entry per each implicit and explicit register use.
+  SmallVector<ReadState, 4> Uses;
+
+public:
+  InstructionBase(const InstrDesc &D) : Desc(D), IsOptimizableMove(false) {}
+
+  SmallVectorImpl<WriteState> &getDefs() { return Defs; }
+  const ArrayRef<WriteState> getDefs() const { return Defs; }
+  SmallVectorImpl<ReadState> &getUses() { return Uses; }
+  const ArrayRef<ReadState> getUses() const { return Uses; }
+  const InstrDesc &getDesc() const { return Desc; }
+
+  unsigned getLatency() const { return Desc.MaxLatency; }
+
+  bool hasDependentUsers() const {
+    return any_of(Defs,
+                  [](const WriteState &Def) { return Def.getNumUsers() > 0; });
+  }
+
+  unsigned getNumUsers() const {
+    unsigned NumUsers = 0;
+    for (const WriteState &Def : Defs)
+      NumUsers += Def.getNumUsers();
+    return NumUsers;
+  }
+
+  // Returns true if this instruction is a candidate for move elimination.
+  bool isOptimizableMove() const { return IsOptimizableMove; }
+  void setOptimizableMove() { IsOptimizableMove = true; }
+};
+
 /// An instruction propagated through the simulated instruction pipeline.
 ///
 /// This class is used to monitor changes to the internal state of instructions
 /// that are sent to the various components of the simulated hardware pipeline.
-class Instruction {
-  const InstrDesc &Desc;
-
+class Instruction : public InstructionBase {
   enum InstrStage {
     IS_INVALID,   // Instruction in an invalid state.
     IS_AVAILABLE, // Instruction dispatched but operands are not ready.
@@ -339,53 +402,14 @@ class Instruction {
   // Retire Unit token ID for this instruction.
   unsigned RCUTokenID;
 
-  // This field is set for instructions that are candidates for move
-  // elimination. For more information about move elimination, see the
-  // definition of RegisterMappingTracker in RegisterFile.h
-  //
-  // TODO: Teach subtargets how to describe optimizable register moves.
-  bool IsOptimizableMove;
-
-  using UniqueDef = std::unique_ptr<WriteState>;
-  using UniqueUse = std::unique_ptr<ReadState>;
-  using VecDefs = std::vector<UniqueDef>;
-  using VecUses = std::vector<UniqueUse>;
-
-  // Output dependencies.
-  // One entry per each implicit and explicit register definition.
-  VecDefs Defs;
-
-  // Input dependencies.
-  // One entry per each implicit and explicit register use.
-  VecUses Uses;
-
 public:
   Instruction(const InstrDesc &D)
-      : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
-        IsOptimizableMove(false) {}
-  Instruction(const Instruction &Other) = delete;
-  Instruction &operator=(const Instruction &Other) = delete;
-
-  VecDefs &getDefs() { return Defs; }
-  const VecDefs &getDefs() const { return Defs; }
-  VecUses &getUses() { return Uses; }
-  const VecUses &getUses() const { return Uses; }
-  const InstrDesc &getDesc() const { return Desc; }
+      : InstructionBase(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES),
+        RCUTokenID(0) {}
+
   unsigned getRCUTokenID() const { return RCUTokenID; }
   int getCyclesLeft() const { return CyclesLeft; }
 
-  bool hasDependentUsers() const {
-    return llvm::any_of(
-        Defs, [](const UniqueDef &Def) { return Def->getNumUsers() > 0; });
-  }
-
-  unsigned getNumUsers() const {
-    unsigned NumUsers = 0;
-    for (const UniqueDef &Def : Defs)
-      NumUsers += Def->getNumUsers();
-    return NumUsers;
-  }
-
   // Transition to the dispatch stage, and assign a RCUToken to this
   // instruction. The RCUToken is used to track the completion of every
   // register write performed by this instruction.
@@ -409,13 +433,10 @@ public:
   bool isExecuted() const { return Stage == IS_EXECUTED; }
   bool isRetired() const { return Stage == IS_RETIRED; }
 
-  // Returns true if this instruction is a candidate for move elimination.
-  bool isOptimizableMove() const { return IsOptimizableMove; }
-  void setOptimizableMove() { IsOptimizableMove = true; }
   bool isEliminated() const {
-    return isReady() && Defs.size() &&
-           llvm::all_of(Defs,
-                        [](const UniqueDef &D) { return D->isEliminated(); });
+    return isReady() && getDefs().size() &&
+           all_of(getDefs(),
+                  [](const WriteState &W) { return W.isEliminated(); });
   }
 
   // Forces a transition from state IS_AVAILABLE to state IS_EXECUTED.
@@ -446,18 +467,18 @@ public:
   const Instruction *getInstruction() const { return Data.second; }
 
   /// Returns true if this references a valid instruction.
-  bool isValid() const { return Data.second; }
+  operator bool() const { return Data.second != nullptr; }
 
   /// Invalidate this reference.
   void invalidate() { Data.second = nullptr; }
 
 #ifndef NDEBUG
-  void print(llvm::raw_ostream &OS) const { OS << getSourceIndex(); }
+  void print(raw_ostream &OS) const { OS << getSourceIndex(); }
 #endif
 };
 
 #ifndef NDEBUG
-inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const InstRef &IR) {
+inline raw_ostream &operator<<(raw_ostream &OS, const InstRef &IR) {
   IR.print(OS);
   return OS;
 }
@@ -503,5 +524,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Pipeline.h b/tools/llvm-mca/include/Pipeline.h
index ad487e7564bea830907d4ae64ac37ea7a82ff0b6..47ff07b288288bb2fc667f158e6e88b51de919df 100644
--- a/tools/llvm-mca/include/Pipeline.h
+++ b/tools/llvm-mca/include/Pipeline.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
 class HWEventListener;
@@ -54,11 +55,11 @@ class Pipeline {
   Pipeline &operator=(const Pipeline &P) = delete;
 
   /// An ordered list of stages that define this instruction pipeline.
-  llvm::SmallVector<std::unique_ptr<Stage>, 8> Stages;
+  SmallVector<std::unique_ptr<Stage>, 8> Stages;
   std::set<HWEventListener *> Listeners;
   unsigned Cycles;
 
-  llvm::Error runCycle();
+  Error runCycle();
   bool hasWorkToProcess();
   void notifyCycleBegin();
   void notifyCycleEnd();
@@ -66,9 +67,10 @@ class Pipeline {
 public:
   Pipeline() : Cycles(0) {}
   void appendStage(std::unique_ptr<Stage> S);
-  llvm::Error run();
+  Error run();
   void addEventListener(HWEventListener *Listener);
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_PIPELINE_H
diff --git a/tools/llvm-mca/include/SourceMgr.h b/tools/llvm-mca/include/SourceMgr.h
index 573ca7a9a00312b0bdcd7ec5928ea0d6ef3a5102..e5180107011b19a3feab4faa9a92ca4c00f278cb 100644
--- a/tools/llvm-mca/include/SourceMgr.h
+++ b/tools/llvm-mca/include/SourceMgr.h
@@ -16,49 +16,42 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
 #define LLVM_TOOLS_LLVM_MCA_SOURCEMGR_H
 
-#include "llvm/MC/MCInst.h"
-#include <vector>
+#include "llvm/ADT/ArrayRef.h"
 
+namespace llvm {
 namespace mca {
 
-typedef std::pair<unsigned, const llvm::MCInst *> SourceRef;
+class Instruction;
+
+typedef std::pair<unsigned, const Instruction &> SourceRef;
 
 class SourceMgr {
-  using InstVec = std::vector<std::unique_ptr<const llvm::MCInst>>;
-  const InstVec &Sequence;
+  using UniqueInst = std::unique_ptr<Instruction>;
+  ArrayRef<UniqueInst> Sequence;
   unsigned Current;
-  unsigned Iterations;
+  const unsigned Iterations;
   static const unsigned DefaultIterations = 100;
 
 public:
-  SourceMgr(const InstVec &MCInstSequence, unsigned NumIterations)
-      : Sequence(MCInstSequence), Current(0),
-        Iterations(NumIterations ? NumIterations : DefaultIterations) {}
+  SourceMgr(ArrayRef<UniqueInst> S, unsigned Iter)
+      : Sequence(S), Current(0), Iterations(Iter ? Iter : DefaultIterations) {}
 
-  unsigned getCurrentIteration() const { return Current / Sequence.size(); }
   unsigned getNumIterations() const { return Iterations; }
   unsigned size() const { return Sequence.size(); }
-  const InstVec &getSequence() const { return Sequence; }
-
-  bool hasNext() const { return Current < (Iterations * size()); }
-  void updateNext() { Current++; }
+  bool hasNext() const { return Current < (Iterations * Sequence.size()); }
+  void updateNext() { ++Current; }
 
-  const SourceRef peekNext() const {
+  SourceRef peekNext() const {
     assert(hasNext() && "Already at end of sequence!");
-    unsigned Index = getCurrentInstructionIndex();
-    return SourceRef(Current, Sequence[Index].get());
-  }
-
-  unsigned getCurrentInstructionIndex() const {
-    return Current % Sequence.size();
+    return SourceRef(Current, *Sequence[Current % Sequence.size()]);
   }
 
-  const llvm::MCInst &getMCInstFromIndex(unsigned Index) const {
-    return *Sequence[Index % size()];
-  }
-
-  bool isEmpty() const { return size() == 0; }
+  using const_iterator = ArrayRef<UniqueInst>::const_iterator;
+  const_iterator begin() const { return Sequence.begin(); }
+  const_iterator end() const { return Sequence.end(); }
 };
+
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Stages/DispatchStage.h b/tools/llvm-mca/include/Stages/DispatchStage.h
index 5a2ac3e6088aef45a159662a39a4d8843b1ef90d..29cace1022e08ddbb746e6b4fe16d7686ba511ce 100644
--- a/tools/llvm-mca/include/Stages/DispatchStage.h
+++ b/tools/llvm-mca/include/Stages/DispatchStage.h
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
+namespace llvm {
 namespace mca {
 
 // Implements the hardware dispatch logic.
@@ -52,30 +53,25 @@ class DispatchStage final : public Stage {
   unsigned AvailableEntries;
   unsigned CarryOver;
   InstRef CarriedOver;
-  const llvm::MCSubtargetInfo &STI;
+  const MCSubtargetInfo &STI;
   RetireControlUnit &RCU;
   RegisterFile &PRF;
 
   bool checkRCU(const InstRef &IR) const;
   bool checkPRF(const InstRef &IR) const;
   bool canDispatch(const InstRef &IR) const;
-  llvm::Error dispatch(InstRef IR);
+  Error dispatch(InstRef IR);
 
-  void updateRAWDependencies(ReadState &RS, const llvm::MCSubtargetInfo &STI);
+  void updateRAWDependencies(ReadState &RS, const MCSubtargetInfo &STI);
 
   void notifyInstructionDispatched(const InstRef &IR,
-                                   llvm::ArrayRef<unsigned> UsedPhysRegs,
+                                   ArrayRef<unsigned> UsedPhysRegs,
                                    unsigned uOps) const;
 
-  void collectWrites(llvm::SmallVectorImpl<WriteRef> &Vec,
-                     unsigned RegID) const {
-    return PRF.collectWrites(Vec, RegID);
-  }
-
 public:
-  DispatchStage(const llvm::MCSubtargetInfo &Subtarget,
-                const llvm::MCRegisterInfo &MRI, unsigned MaxDispatchWidth,
-                RetireControlUnit &R, RegisterFile &F)
+  DispatchStage(const MCSubtargetInfo &Subtarget, const MCRegisterInfo &MRI,
+                unsigned MaxDispatchWidth, RetireControlUnit &R,
+                RegisterFile &F)
       : DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
         CarryOver(0U), CarriedOver(), STI(Subtarget), RCU(R), PRF(F) {}
 
@@ -84,13 +80,14 @@ public:
   // The dispatch logic internally doesn't buffer instructions. So there is
   // never work to do at the beginning of every cycle.
   bool hasWorkToComplete() const override { return false; }
-  llvm::Error cycleStart() override;
-  llvm::Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error execute(InstRef &IR) override;
 
 #ifndef NDEBUG
   void dump() const;
 #endif
 };
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_DISPATCH_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/ExecuteStage.h b/tools/llvm-mca/include/Stages/ExecuteStage.h
index 63e6f0bc2b8944979e10f3ebfdad497f28369890..91b24059c950df6dfdc762c063ce79a2b474922c 100644
--- a/tools/llvm-mca/include/Stages/ExecuteStage.h
+++ b/tools/llvm-mca/include/Stages/ExecuteStage.h
@@ -23,19 +23,20 @@
 #include "Stages/Stage.h"
 #include "llvm/ADT/ArrayRef.h"
 
+namespace llvm {
 namespace mca {
 
 class ExecuteStage final : public Stage {
   Scheduler &HWS;
 
-  llvm::Error issueInstruction(InstRef &IR);
+  Error issueInstruction(InstRef &IR);
 
   // Called at the beginning of each cycle to issue already dispatched
   // instructions to the underlying pipelines.
-  llvm::Error issueReadyInstructions();
+  Error issueReadyInstructions();
 
   // Used to notify instructions eliminated at register renaming stage.
-  llvm::Error handleInstructionEliminated(InstRef &IR);
+  Error handleInstructionEliminated(InstRef &IR);
 
   ExecuteStage(const ExecuteStage &Other) = delete;
   ExecuteStage &operator=(const ExecuteStage &Other) = delete;
@@ -59,12 +60,12 @@ public:
   // state changes, and processor resources freed by the scheduler.
   // Instructions that transitioned to the 'Executed' state are automatically
   // moved to the next stage (i.e. RetireStage).
-  llvm::Error cycleStart() override;
-  llvm::Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error execute(InstRef &IR) override;
 
   void notifyInstructionIssued(
       const InstRef &IR,
-      llvm::ArrayRef<std::pair<ResourceRef, ResourceCycles>> Used) const;
+      ArrayRef<std::pair<ResourceRef, ResourceCycles>> Used) const;
   void notifyInstructionExecuted(const InstRef &IR) const;
   void notifyInstructionReady(const InstRef &IR) const;
   void notifyResourceAvailable(const ResourceRef &RR) const;
@@ -74,5 +75,6 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_EXECUTE_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/FetchStage.h b/tools/llvm-mca/include/Stages/FetchStage.h
index 10a89c9446988a5279ee4cc6c20308cac5a4ba83..55bf2011b32ba5f312d52c650ed941f69170e71a 100644
--- a/tools/llvm-mca/include/Stages/FetchStage.h
+++ b/tools/llvm-mca/include/Stages/FetchStage.h
@@ -16,37 +16,36 @@
 #ifndef LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
 #define LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
 
-#include "InstrBuilder.h"
 #include "SourceMgr.h"
 #include "Stages/Stage.h"
 #include <map>
 
+namespace llvm {
 namespace mca {
 
 class FetchStage final : public Stage {
-  std::unique_ptr<Instruction> CurrentInstruction;
+  InstRef CurrentInstruction;
   using InstMap = std::map<unsigned, std::unique_ptr<Instruction>>;
   InstMap Instructions;
-  InstrBuilder &IB;
   SourceMgr &SM;
 
   // Updates the program counter, and sets 'CurrentInstruction'.
-  llvm::Error getNextInstruction();
+  void getNextInstruction();
 
   FetchStage(const FetchStage &Other) = delete;
   FetchStage &operator=(const FetchStage &Other) = delete;
 
 public:
-  FetchStage(InstrBuilder &IB, SourceMgr &SM)
-      : CurrentInstruction(), IB(IB), SM(SM) {}
+  FetchStage(SourceMgr &SM) : CurrentInstruction(), SM(SM) {}
 
   bool isAvailable(const InstRef &IR) const override;
   bool hasWorkToComplete() const override;
-  llvm::Error execute(InstRef &IR) override;
-  llvm::Error cycleStart() override;
-  llvm::Error cycleEnd() override;
+  Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error cycleEnd() override;
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_FETCH_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/InstructionTables.h b/tools/llvm-mca/include/Stages/InstructionTables.h
index 16be004d1152986d29babd10ce4647633f4cc10f..e618d06b1b74740a23bb7eb48650390c03d6bc0c 100644
--- a/tools/llvm-mca/include/Stages/InstructionTables.h
+++ b/tools/llvm-mca/include/Stages/InstructionTables.h
@@ -18,25 +18,28 @@
 #define LLVM_TOOLS_LLVM_MCA_INSTRUCTIONTABLES_H
 
 #include "HardwareUnits/Scheduler.h"
-#include "InstrBuilder.h"
 #include "Stages/Stage.h"
+#include "Support.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
 class InstructionTables final : public Stage {
-  const llvm::MCSchedModel &SM;
-  InstrBuilder &IB;
-  llvm::SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
+  const MCSchedModel &SM;
+  SmallVector<std::pair<ResourceRef, ResourceCycles>, 4> UsedResources;
+  SmallVector<uint64_t, 8> Masks;
 
 public:
-  InstructionTables(const llvm::MCSchedModel &Model, InstrBuilder &Builder)
-      : Stage(), SM(Model), IB(Builder) {}
+  InstructionTables(const MCSchedModel &Model) : Stage(), SM(Model) {
+    computeProcResourceMasks(Model, Masks);
+  }
 
   bool hasWorkToComplete() const override { return false; }
-  llvm::Error execute(InstRef &IR) override;
+  Error execute(InstRef &IR) override;
 };
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/include/Stages/RetireStage.h b/tools/llvm-mca/include/Stages/RetireStage.h
index 2041105a194b35ce1e6f1e20b638422ef61fcf89..28eda40984f36caf3bb19415a7c3545b565af8bf 100644
--- a/tools/llvm-mca/include/Stages/RetireStage.h
+++ b/tools/llvm-mca/include/Stages/RetireStage.h
@@ -21,6 +21,7 @@
 #include "HardwareUnits/RetireControlUnit.h"
 #include "Stages/Stage.h"
 
+namespace llvm {
 namespace mca {
 
 class RetireStage final : public Stage {
@@ -36,11 +37,12 @@ public:
       : Stage(), RCU(R), PRF(F) {}
 
   bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
-  llvm::Error cycleStart() override;
-  llvm::Error execute(InstRef &IR) override;
+  Error cycleStart() override;
+  Error execute(InstRef &IR) override;
   void notifyInstructionRetired(const InstRef &IR) const;
 };
 
 } // namespace mca
+} // namespace llvm
 
 #endif // LLVM_TOOLS_LLVM_MCA_RETIRE_STAGE_H
diff --git a/tools/llvm-mca/include/Stages/Stage.h b/tools/llvm-mca/include/Stages/Stage.h
index 5470c9cf0d973685f9555b4e5667d002aeee7c49..5665fc453bfcae42279b63ad8401a3cc88633614 100644
--- a/tools/llvm-mca/include/Stages/Stage.h
+++ b/tools/llvm-mca/include/Stages/Stage.h
@@ -20,6 +20,7 @@
 #include "llvm/Support/Error.h"
 #include <set>
 
+namespace llvm {
 namespace mca {
 
 class InstRef;
@@ -46,13 +47,13 @@ public:
 
   /// Called once at the start of each cycle.  This can be used as a setup
   /// phase to prepare for the executions during the cycle.
-  virtual llvm::Error cycleStart() { return llvm::ErrorSuccess(); }
+  virtual Error cycleStart() { return ErrorSuccess(); }
 
   /// Called once at the end of each cycle.
-  virtual llvm::Error cycleEnd() { return llvm::ErrorSuccess(); }
+  virtual Error cycleEnd() { return ErrorSuccess(); }
 
   /// The primary action that this stage performs on instruction IR.
-  virtual llvm::Error execute(InstRef &IR) = 0;
+  virtual Error execute(InstRef &IR) = 0;
 
   void setNextInSequence(Stage *NextStage) {
     assert(!NextInSequence && "This stage already has a NextInSequence!");
@@ -67,7 +68,7 @@ public:
   ///
   /// Stages are responsible for moving instructions to their immediate
   /// successor stages.
-  llvm::Error moveToTheNextStage(InstRef &IR) {
+  Error moveToTheNextStage(InstRef &IR) {
     assert(checkNextStage(IR) && "Next stage is not ready!");
     return NextInSequence->execute(IR);
   }
@@ -83,4 +84,5 @@ public:
 };
 
 } // namespace mca
+} // namespace llvm
 #endif // LLVM_TOOLS_LLVM_MCA_STAGE_H
diff --git a/tools/llvm-mca/include/Support.h b/tools/llvm-mca/include/Support.h
index 91c8e1b41773089265ca06f6102bb2825dc9a1fe..e7a4e33ed74ecdfc8ae4e5381e693b90d473c25c 100644
--- a/tools/llvm-mca/include/Support.h
+++ b/tools/llvm-mca/include/Support.h
@@ -18,9 +18,30 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/Error.h"
 
+namespace llvm {
 namespace mca {
 
+template <typename T>
+class InstructionError : public ErrorInfo<InstructionError<T>> {
+public:
+  static char ID;
+  std::string Message;
+  const T &Inst;
+
+  InstructionError(std::string M, const T &MCI)
+      : Message(std::move(M)), Inst(MCI) {}
+
+  void log(raw_ostream &OS) const override { OS << Message; }
+
+  std::error_code convertToErrorCode() const override {
+    return inconvertibleErrorCode();
+  }
+};
+
+template <typename T> char InstructionError<T>::ID;
+
 /// This class represents the number of cycles per resource (fractions of
 /// cycles).  That quantity is managed here as a ratio, and accessed via the
 /// double cast-operator below.  The two quantities, number of cycles and
@@ -49,8 +70,7 @@ public:
     else {
       // Create a common denominator for LHS and RHS by calculating the least
       // common multiple from the GCD.
-      unsigned GCD =
-          llvm::GreatestCommonDivisor64(Denominator, RHS.Denominator);
+      unsigned GCD = GreatestCommonDivisor64(Denominator, RHS.Denominator);
       unsigned LCM = (Denominator * RHS.Denominator) / GCD;
       unsigned LHSNumerator = Numerator * (LCM / Denominator);
       unsigned RHSNumerator = RHS.Numerator * (LCM / RHS.Denominator);
@@ -83,16 +103,17 @@ public:
 ///
 /// Resource masks are used by the ResourceManager to solve set membership
 /// problems with simple bit manipulation operations.
-void computeProcResourceMasks(const llvm::MCSchedModel &SM,
-                              llvm::SmallVectorImpl<uint64_t> &Masks);
+void computeProcResourceMasks(const MCSchedModel &SM,
+                              SmallVectorImpl<uint64_t> &Masks);
 
 /// Compute the reciprocal block throughput from a set of processor resource
 /// cycles. The reciprocal block throughput is computed as the MAX between:
 ///  - NumMicroOps / DispatchWidth
 ///  - ProcResourceCycles / #ProcResourceUnits  (for every consumed resource).
-double computeBlockRThroughput(const llvm::MCSchedModel &SM,
-                               unsigned DispatchWidth, unsigned NumMicroOps,
-                               llvm::ArrayRef<unsigned> ProcResourceUsage);
+double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
+                               unsigned NumMicroOps,
+                               ArrayRef<unsigned> ProcResourceUsage);
 } // namespace mca
+} // namespace llvm
 
 #endif
diff --git a/tools/llvm-mca/lib/Context.cpp b/tools/llvm-mca/lib/Context.cpp
index c84ea73c4d2bc57a927864fcb57b7e554fc37938..5b6f52478ddbb60b84ef0b79f3b2b718092e5339 100644
--- a/tools/llvm-mca/lib/Context.cpp
+++ b/tools/llvm-mca/lib/Context.cpp
@@ -24,10 +24,9 @@
 #include "Stages/FetchStage.h"
 #include "Stages/RetireStage.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 std::unique_ptr<Pipeline>
 Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
                                SourceMgr &SrcMgr) {
@@ -41,7 +40,7 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
   auto HWS = llvm::make_unique<Scheduler>(SM, LSU.get());
 
   // Create the pipeline stages.
-  auto Fetch = llvm::make_unique<FetchStage>(IB, SrcMgr);
+  auto Fetch = llvm::make_unique<FetchStage>(SrcMgr);
   auto Dispatch = llvm::make_unique<DispatchStage>(STI, MRI, Opts.DispatchWidth,
                                                    *RCU, *PRF);
   auto Execute = llvm::make_unique<ExecuteStage>(*HWS);
@@ -63,3 +62,4 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, InstrBuilder &IB,
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HWEventListener.cpp b/tools/llvm-mca/lib/HWEventListener.cpp
index f27a04a9a9801ef2642c16516eb4fd2d8349c76e..3930e2555a9ba33df5c9b42496fdc1b16e711c58 100644
--- a/tools/llvm-mca/lib/HWEventListener.cpp
+++ b/tools/llvm-mca/lib/HWEventListener.cpp
@@ -14,8 +14,10 @@
 
 #include "HWEventListener.h"
 
+namespace llvm {
 namespace mca {
 
 // Anchor the vtable here.
 void HWEventListener::anchor() {}
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
index daeda06d859f3f5aee366ecfbb75136aca0040b7..4e46ffacbd4044799c8a86a8916dd8455c882e3a 100644
--- a/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/HardwareUnit.cpp
@@ -15,9 +15,11 @@
 
 #include "HardwareUnits/HardwareUnit.h"
 
+namespace llvm {
 namespace mca {
 
 // Pin the vtable with this method.
 HardwareUnit::~HardwareUnit() = default;
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
index aca90165af239450e5c687535d1611c8ff49608a..6923c6e0dc8ff532af346f2c9fb70f9013b038a4 100644
--- a/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/LSUnit.cpp
@@ -17,10 +17,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 #ifndef NDEBUG
@@ -164,3 +163,4 @@ void LSUnit::onInstructionExecuted(const InstRef &IR) {
   }
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
index 51a247861395eb6176f5e8a936e257bf9f9626c7..6bc63a0db5000787beaf91458aa84ce75d5bccfe 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RegisterFile.cpp
@@ -18,10 +18,9 @@
 #include "Instruction.h"
 #include "llvm/Support/Debug.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 RegisterFile::RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
@@ -37,7 +36,7 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
   // declared by the target. The number of physical registers in the default
   // register file is set equal to `NumRegs`. A value of zero for `NumRegs`
   // means: this register file has an unbounded number of physical registers.
-  addRegisterFile({} /* all registers */, NumRegs);
+  RegisterFiles.emplace_back(NumRegs);
   if (!SM.hasExtraProcessorInfo())
     return;
 
@@ -45,18 +44,18 @@ void RegisterFile::initialize(const MCSchedModel &SM, unsigned NumRegs) {
   // object. The size of every register file, as well as the mapping between
   // register files and register classes is specified via tablegen.
   const MCExtraProcessorInfo &Info = SM.getExtraProcessorInfo();
-  for (unsigned I = 0, E = Info.NumRegisterFiles; I < E; ++I) {
+
+  // Skip invalid register file at index 0.
+  for (unsigned I = 1, E = Info.NumRegisterFiles; I < E; ++I) {
     const MCRegisterFileDesc &RF = Info.RegisterFiles[I];
-    // Skip invalid register files with zero physical registers.
-    unsigned Length = RF.NumRegisterCostEntries;
-    if (!RF.NumPhysRegs)
-      continue;
+    assert(RF.NumPhysRegs && "Invalid PRF with zero physical registers!");
+
     // The cost of a register definition is equivalent to the number of
     // physical registers that are allocated at register renaming stage.
+    unsigned Length = RF.NumRegisterCostEntries;
     const MCRegisterCostEntry *FirstElt =
         &Info.RegisterCostTable[RF.RegisterCostEntryIdx];
-    addRegisterFile(ArrayRef<MCRegisterCostEntry>(FirstElt, Length),
-                    RF.NumPhysRegs);
+    addRegisterFile(RF, ArrayRef<MCRegisterCostEntry>(FirstElt, Length));
   }
 }
 
@@ -65,15 +64,16 @@ void RegisterFile::cycleStart() {
     RMT.NumMoveEliminated = 0;
 }
 
-void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
-                                   unsigned NumPhysRegs) {
+void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF,
+                                   ArrayRef<MCRegisterCostEntry> Entries) {
   // A default register file is always allocated at index #0. That register file
   // is mainly used to count the total number of mappings created by all
   // register files at runtime. Users can limit the number of available physical
   // registers in register file #0 through the command line flag
   // `-register-file-size`.
   unsigned RegisterFileIndex = RegisterFiles.size();
-  RegisterFiles.emplace_back(NumPhysRegs);
+  RegisterFiles.emplace_back(RF.NumPhysRegs, RF.MaxMovesEliminatedPerCycle,
+                             RF.AllowZeroMoveEliminationOnly);
 
   // Special case where there is no register class identifier in the set.
   // An empty set of register classes means: this register file contains all
@@ -99,6 +99,7 @@ void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
       }
       IPC = std::make_pair(RegisterFileIndex, RCE.Cost);
       Entry.RenameAs = Reg;
+      Entry.AllowMoveElimination = RCE.AllowMoveElimination;
 
       // Assume the same cost for each sub-register.
       for (MCSubRegIterator I(Reg, &MRI); I.isValid(); ++I) {
@@ -169,8 +170,10 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
   // implicitly clears the upper portion of the underlying register.
   // If a write clears its super-registers, then it is renamed as `RenameAs`.
   bool IsWriteZero = WS.isWriteZero();
-  bool ShouldAllocatePhysRegs = !IsWriteZero;
+  bool IsEliminated = WS.isEliminated();
+  bool ShouldAllocatePhysRegs = !IsWriteZero && !IsEliminated;
   const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  WS.setPRF(RRI.IndexPlusCost.first);
 
   if (RRI.RenameAs && RRI.RenameAs != RegID) {
     RegID = RRI.RenameAs;
@@ -185,6 +188,7 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
       if (OtherWrite.getWriteState() &&
           (OtherWrite.getSourceIndex() != Write.getSourceIndex())) {
         // This partial write has a false dependency on RenameAs.
+        assert(!IsEliminated && "Unexpected partial update!");
         WS.setDependentWrite(OtherWrite.getWriteState());
       }
     }
@@ -203,22 +207,33 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
       ZeroRegisters.clearBit(*I);
   }
 
-  // Update the mapping for register RegID including its sub-registers.
-  RegisterMappings[RegID].first = Write;
-  for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
-    RegisterMappings[*I].first = Write;
+  // If this is move has been eliminated, then the call to tryEliminateMove
+  // should have already updated all the register mappings.
+  if (!IsEliminated) {
+    // Update the mapping for register RegID including its sub-registers.
+    RegisterMappings[RegID].first = Write;
+    RegisterMappings[RegID].second.AliasRegID = 0U;
+    for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
+      RegisterMappings[*I].first = Write;
+      RegisterMappings[*I].second.AliasRegID = 0U;
+    }
 
-  // No physical registers are allocated for instructions that are optimized in
-  // hardware. For example, zero-latency data-dependency breaking instructions
-  // don't consume physical registers.
-  if (ShouldAllocatePhysRegs)
-    allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+    // No physical registers are allocated for instructions that are optimized
+    // in hardware. For example, zero-latency data-dependency breaking
+    // instructions don't consume physical registers.
+    if (ShouldAllocatePhysRegs)
+      allocatePhysRegs(RegisterMappings[RegID].second, UsedPhysRegs);
+  }
 
   if (!WS.clearsSuperRegisters())
     return;
 
   for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
-    RegisterMappings[*I].first = Write;
+    if (!IsEliminated) {
+      RegisterMappings[*I].first = Write;
+      RegisterMappings[*I].second.AliasRegID = 0U;
+    }
+
     if (IsWriteZero)
       ZeroRegisters.setBit(*I);
     else
@@ -228,6 +243,11 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
 
 void RegisterFile::removeRegisterWrite(
     const WriteState &WS, MutableArrayRef<unsigned> FreedPhysRegs) {
+  // Early exit if this write was eliminated. A write eliminated at register
+  // renaming stage generates an alias, and it is not added to the PRF.
+  if (WS.isEliminated())
+    return;
+
   unsigned RegID = WS.getRegisterID();
 
   assert(RegID != 0 && "Invalidating an already invalid register?");
@@ -269,14 +289,10 @@ void RegisterFile::removeRegisterWrite(
   }
 }
 
-bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
+bool RegisterFile::tryEliminateMove(WriteState &WS, ReadState &RS) {
   const RegisterMapping &RMFrom = RegisterMappings[RS.getRegisterID()];
   const RegisterMapping &RMTo = RegisterMappings[WS.getRegisterID()];
 
-  // Early exit if the PRF doesn't support move elimination for this register.
-  if (!RMTo.second.AllowMoveElimination)
-    return false;
-
   // From and To must be owned by the same PRF.
   const RegisterRenamingInfo &RRIFrom = RMFrom.second;
   const RegisterRenamingInfo &RRITo = RMTo.second;
@@ -298,9 +314,13 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
   // For now, we assume that there is a strong correlation between registers
   // that allow move elimination, and how those same registers are renamed in
   // hardware.
-  if (RRITo.RenameAs && RRITo.RenameAs != WS.getRegisterID())
+  if (RRITo.RenameAs && RRITo.RenameAs != WS.getRegisterID()) {
+    // Early exit if the PRF doesn't support move elimination for this register.
+    if (!RegisterMappings[RRITo.RenameAs].second.AllowMoveElimination)
+      return false;
     if (!WS.clearsSuperRegisters())
       return false;
+  }
 
   RegisterMappingTracker &RMT = RegisterFiles[RegisterFileIndex];
   if (RMT.MaxMoveEliminatedPerCycle &&
@@ -311,18 +331,46 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
   if (RMT.AllowZeroMoveEliminationOnly && !IsZeroMove)
     return false;
 
+  MCPhysReg FromReg = RS.getRegisterID();
+  MCPhysReg ToReg = WS.getRegisterID();
+
+  // Construct an alias.
+  MCPhysReg AliasReg = FromReg;
+  if (RRIFrom.RenameAs)
+    AliasReg = RRIFrom.RenameAs;
+
+  const RegisterRenamingInfo &RMAlias = RegisterMappings[AliasReg].second;
+  if (RMAlias.AliasRegID)
+    AliasReg = RMAlias.AliasRegID;
+
+  if (AliasReg != ToReg) {
+    RegisterMappings[ToReg].second.AliasRegID = AliasReg;
+    for (MCSubRegIterator I(ToReg, &MRI); I.isValid(); ++I)
+      RegisterMappings[*I].second.AliasRegID = AliasReg;
+  }
+
   RMT.NumMoveEliminated++;
-  if (IsZeroMove)
+  if (IsZeroMove) {
     WS.setWriteZero();
+    RS.setReadZero();
+  }
   WS.setEliminated();
+
   return true;
 }
 
-void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
-                                 unsigned RegID) const {
+void RegisterFile::collectWrites(const ReadState &RS,
+                                 SmallVectorImpl<WriteRef> &Writes) const {
+  unsigned RegID = RS.getRegisterID();
   assert(RegID && RegID < RegisterMappings.size());
   LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
                     << MRI.getName(RegID) << '\n');
+
+  // Check if this is an alias.
+  const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  if (RRI.AliasRegID)
+    RegID = RRI.AliasRegID;
+
   const WriteRef &WR = RegisterMappings[RegID].first;
   if (WR.isValid())
     Writes.push_back(WR);
@@ -335,11 +383,13 @@ void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
   }
 
   // Remove duplicate entries and resize the input vector.
-  sort(Writes, [](const WriteRef &Lhs, const WriteRef &Rhs) {
-    return Lhs.getWriteState() < Rhs.getWriteState();
-  });
-  auto It = std::unique(Writes.begin(), Writes.end());
-  Writes.resize(std::distance(Writes.begin(), It));
+  if (Writes.size() > 1) {
+    sort(Writes, [](const WriteRef &Lhs, const WriteRef &Rhs) {
+      return Lhs.getWriteState() < Rhs.getWriteState();
+    });
+    auto It = std::unique(Writes.begin(), Writes.end());
+    Writes.resize(std::distance(Writes.begin(), It));
+  }
 
   LLVM_DEBUG({
     for (const WriteRef &WR : Writes) {
@@ -351,6 +401,20 @@ void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
   });
 }
 
+void RegisterFile::addRegisterRead(ReadState &RS,
+                                   SmallVectorImpl<WriteRef> &Defs) const {
+  unsigned RegID = RS.getRegisterID();
+  const RegisterRenamingInfo &RRI = RegisterMappings[RegID].second;
+  RS.setPRF(RRI.IndexPlusCost.first);
+  if (RS.isIndependentFromDef())
+    return;
+
+  if (ZeroRegisters[RS.getRegisterID()])
+    RS.setReadZero();
+  collectWrites(RS, Defs);
+  RS.setDependentWrites(Defs.size());
+}
+
 unsigned RegisterFile::isAvailable(ArrayRef<unsigned> Regs) const {
   SmallVector<unsigned, 4> NumPhysRegs(getNumRegisterFiles());
 
@@ -424,3 +488,4 @@ void RegisterFile::dump() const {
 #endif
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
index 46a374c210266553c21a153889f9ece5ab65ccc1..e371f50ed4892aa9a88bf84af0a15b2e4e75ad6b 100644
--- a/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/ResourceManager.cpp
@@ -18,10 +18,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 ResourceStrategy::~ResourceStrategy() = default;
 
@@ -97,8 +96,7 @@ getStrategyFor(const ResourceState &RS) {
   return std::unique_ptr<ResourceStrategy>(nullptr);
 }
 
-ResourceManager::ResourceManager(const MCSchedModel &SM)
-    : ProcResID2Mask(SM.getNumProcResourceKinds()) {
+ResourceManager::ResourceManager(const MCSchedModel &SM) {
   computeProcResourceMasks(SM, ProcResID2Mask);
   Resources.resize(SM.getNumProcResourceKinds());
   Strategies.resize(SM.getNumProcResourceKinds());
@@ -218,13 +216,12 @@ void ResourceManager::releaseBuffers(ArrayRef<uint64_t> Buffers) {
 }
 
 bool ResourceManager::canBeIssued(const InstrDesc &Desc) const {
-  return std::all_of(Desc.Resources.begin(), Desc.Resources.end(),
-                     [&](const std::pair<uint64_t, const ResourceUsage> &E) {
-                       unsigned NumUnits =
-                           E.second.isReserved() ? 0U : E.second.NumUnits;
-                       unsigned Index = getResourceStateIndex(E.first);
-                       return Resources[Index]->isReady(NumUnits);
-                     });
+  return all_of(
+      Desc.Resources, [&](const std::pair<uint64_t, const ResourceUsage> &E) {
+        unsigned NumUnits = E.second.isReserved() ? 0U : E.second.NumUnits;
+        unsigned Index = getResourceStateIndex(E.first);
+        return Resources[Index]->isReady(NumUnits);
+      });
 }
 
 // Returns true if all resources are in-order, and there is at least one
@@ -307,3 +304,4 @@ void ResourceManager::releaseResource(uint64_t ResourceID) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp b/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
index af1b01f49dc030a11d2d405d3c6e920eb2da2144..0456e1d7a5bf9fb8fded20f42323e906ab9d93c6 100644
--- a/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/RetireControlUnit.cpp
@@ -15,10 +15,9 @@
 #include "HardwareUnits/RetireControlUnit.h"
 #include "llvm/Support/Debug.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 RetireControlUnit::RetireControlUnit(const MCSchedModel &SM)
@@ -63,7 +62,7 @@ const RetireControlUnit::RUToken &RetireControlUnit::peekCurrentToken() const {
 void RetireControlUnit::consumeCurrentToken() {
   const RetireControlUnit::RUToken &Current = peekCurrentToken();
   assert(Current.NumSlots && "Reserved zero slots?");
-  assert(Current.IR.isValid() && "Invalid RUToken in the RCU queue.");
+  assert(Current.IR && "Invalid RUToken in the RCU queue.");
 
   // Update the slot index to be the next item in the circular queue.
   CurrentInstructionSlotIdx += Current.NumSlots;
@@ -73,7 +72,7 @@ void RetireControlUnit::consumeCurrentToken() {
 
 void RetireControlUnit::onInstructionExecuted(unsigned TokenID) {
   assert(Queue.size() > TokenID);
-  assert(Queue[TokenID].Executed == false && Queue[TokenID].IR.isValid());
+  assert(Queue[TokenID].Executed == false && Queue[TokenID].IR);
   Queue[TokenID].Executed = true;
 }
 
@@ -85,3 +84,4 @@ void RetireControlUnit::dump() const {
 #endif
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp b/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
index 8bfa761c8a16980b7999418774b31a612591cc0b..b1ac8d99b865b83b3af292c5382b60a7e96baf8d 100644
--- a/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
+++ b/tools/llvm-mca/lib/HardwareUnits/Scheduler.cpp
@@ -15,10 +15,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
 void Scheduler::initializeStrategy(std::unique_ptr<SchedulerStrategy> S) {
@@ -108,7 +107,7 @@ void Scheduler::promoteToReadySet(SmallVectorImpl<InstRef> &Ready) {
   unsigned RemovedElements = 0;
   for (auto I = WaitSet.begin(), E = WaitSet.end(); I != E;) {
     InstRef &IR = *I;
-    if (!IR.isValid())
+    if (!IR)
       break;
 
     // Check if this instruction is now ready. In case, force
@@ -160,7 +159,7 @@ void Scheduler::updateIssuedSet(SmallVectorImpl<InstRef> &Executed) {
   unsigned RemovedElements = 0;
   for (auto I = IssuedSet.begin(), E = IssuedSet.end(); I != E;) {
     InstRef &IR = *I;
-    if (!IR.isValid())
+    if (!IR)
       break;
     Instruction &IS = *IR.getInstruction();
     if (!IS.isExecuted()) {
@@ -243,3 +242,4 @@ bool Scheduler::isReady(const InstRef &IR) const {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/InstrBuilder.cpp b/tools/llvm-mca/lib/InstrBuilder.cpp
index 0a26f40b940313ca3cc1cfd507cafea7d347831f..535ad4d57feea8d15f91533fb9e0128c6c78e6fe 100644
--- a/tools/llvm-mca/lib/InstrBuilder.cpp
+++ b/tools/llvm-mca/lib/InstrBuilder.cpp
@@ -22,9 +22,16 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
+InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
+                           const llvm::MCInstrInfo &mcii,
+                           const llvm::MCRegisterInfo &mri,
+                           const llvm::MCInstrAnalysis &mcia)
+    : STI(sti), MCII(mcii), MRI(mri), MCIA(mcia) {
+  computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
+}
 
 static void initializeUsedResources(InstrDesc &ID,
                                     const MCSchedClassDesc &SCDesc,
@@ -215,9 +222,8 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
   }
 
   if (CurrentDef != NumExplicitDefs) {
-    return make_error<StringError>(
-        "error: Expected more register operand definitions.",
-        inconvertibleErrorCode());
+    return make_error<InstructionError<MCInst>>(
+        "Expected more register operand definitions.", MCI);
   }
 
   CurrentDef = 0;
@@ -253,11 +259,12 @@ Error InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
     // Always assume that the optional definition is the last operand of the
     // MCInst sequence.
     const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
-    if (i == MCI.getNumOperands() || !Op.isReg())
-      return make_error<StringError>(
-          "error: expected a register operand for an optional "
-          "definition. Instruction has not be correctly analyzed.",
-          inconvertibleErrorCode());
+    if (i == MCI.getNumOperands() || !Op.isReg()) {
+      std::string Message =
+          "expected a register operand for an optional definition. Instruction "
+          "has not been correctly analyzed.";
+      return make_error<InstructionError<MCInst>>(Message, MCI);
+    }
 
     WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
     Write.OpIndex = MCI.getNumOperands() - 1;
@@ -284,9 +291,8 @@ Error InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
   }
 
   if (NumExplicitDefs) {
-    return make_error<StringError>(
-        "error: Expected more register operand definitions. ",
-        inconvertibleErrorCode());
+    return make_error<InstructionError<MCInst>>(
+        "Expected more register operand definitions.", MCI);
   }
 
   unsigned NumExplicitUses = MCI.getNumOperands() - i;
@@ -332,23 +338,18 @@ Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID,
   if (!UsesMemory && !UsesBuffers && !UsesResources)
     return ErrorSuccess();
 
-  std::string ToString;
-  raw_string_ostream OS(ToString);
+  StringRef Message;
   if (UsesMemory) {
-    WithColor::error() << "found an inconsistent instruction that decodes "
-                       << "into zero opcodes and that consumes load/store "
-                       << "unit resources.\n";
+    Message = "found an inconsistent instruction that decodes "
+              "into zero opcodes and that consumes load/store "
+              "unit resources.";
   } else {
-    WithColor::error() << "found an inconsistent instruction that decodes"
-                       << " to zero opcodes and that consumes scheduler "
-                       << "resources.\n";
+    Message = "found an inconsistent instruction that decodes "
+              "to zero opcodes and that consumes scheduler "
+              "resources.";
   }
 
-  MCIP.printInst(&MCI, OS, "", STI);
-  OS.flush();
-  WithColor::note() << "instruction: " << ToString << '\n';
-  return make_error<StringError>("Invalid instruction definition found",
-                                 inconvertibleErrorCode());
+  return make_error<InstructionError<MCInst>>(Message, MCI);
 }
 
 Expected<const InstrDesc &>
@@ -371,24 +372,17 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
       SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
 
     if (!SchedClassID) {
-      return make_error<StringError>("unable to resolve this variant class.",
-                                     inconvertibleErrorCode());
+      return make_error<InstructionError<MCInst>>(
+          "unable to resolve scheduling class for write variant.", MCI);
     }
   }
 
   // Check if this instruction is supported. Otherwise, report an error.
   const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
   if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
-    std::string ToString;
-    raw_string_ostream OS(ToString);
-    WithColor::error() << "found an unsupported instruction in the input"
-                       << " assembly sequence.\n";
-    MCIP.printInst(&MCI, OS, "", STI);
-    OS.flush();
-    WithColor::note() << "instruction: " << ToString << '\n';
-    return make_error<StringError>(
-        "Don't know how to analyze unsupported instructions",
-        inconvertibleErrorCode());
+    return make_error<InstructionError<MCInst>>(
+        "found an unsupported instruction in the input assembly sequence.",
+        MCI);
   }
 
   // Create a new empty descriptor.
@@ -463,6 +457,8 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
   bool IsZeroIdiom = MCIA.isZeroIdiom(MCI, Mask, ProcID);
   bool IsDepBreaking =
       IsZeroIdiom || MCIA.isDependencyBreaking(MCI, Mask, ProcID);
+  if (MCIA.isOptimizableRegisterMove(MCI, ProcID))
+    NewIS->setOptimizableMove();
 
   // Initialize Reads first.
   for (const ReadDescriptor &RD : D.Reads) {
@@ -485,14 +481,15 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
 
     // Okay, this is a register operand. Create a ReadState for it.
     assert(RegID > 0 && "Invalid register ID found!");
-    auto RS = llvm::make_unique<ReadState>(RD, RegID);
+    NewIS->getUses().emplace_back(RD, RegID);
+    ReadState &RS = NewIS->getUses().back();
 
     if (IsDepBreaking) {
       // A mask of all zeroes means: explicit input operands are not
       // independent.
       if (Mask.isNullValue()) {
         if (!RD.isImplicitRead())
-          RS->setIndependentFromDef();
+          RS.setIndependentFromDef();
       } else {
         // Check if this register operand is independent according to `Mask`.
         // Note that Mask may not have enough bits to describe all explicit and
@@ -502,11 +499,10 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
         if (Mask.getBitWidth() > RD.UseIndex) {
           // Okay. This map describe register use `RD.UseIndex`.
           if (Mask[RD.UseIndex])
-            RS->setIndependentFromDef();
+            RS.setIndependentFromDef();
         }
       }
     }
-    NewIS->getUses().emplace_back(std::move(RS));
   }
 
   // Early exit if there are no writes.
@@ -533,12 +529,13 @@ InstrBuilder::createInstruction(const MCInst &MCI) {
     }
 
     assert(RegID && "Expected a valid register ID!");
-    NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(
+    NewIS->getDefs().emplace_back(
         WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex],
-        /* WritesZero */ IsZeroIdiom));
+        /* WritesZero */ IsZeroIdiom);
     ++WriteIndex;
   }
 
   return std::move(NewIS);
 }
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Instruction.cpp b/tools/llvm-mca/lib/Instruction.cpp
index 511e7b2070341bec9db6bdacae8cf5dae7875b61..832a6199f00ca363891cd3e5c780c2bb921f9b96 100644
--- a/tools/llvm-mca/lib/Instruction.cpp
+++ b/tools/llvm-mca/lib/Instruction.cpp
@@ -16,10 +16,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void ReadState::writeStartEvent(unsigned Cycles) {
   assert(DependentWrites);
   assert(CyclesLeft == UNKNOWN_CYCLES);
@@ -93,7 +92,7 @@ void ReadState::cycleEvent() {
 
 #ifndef NDEBUG
 void WriteState::dump() const {
-  dbgs() << "{ OpIdx=" << WD.OpIndex << ", Lat=" << getLatency() << ", RegID "
+  dbgs() << "{ OpIdx=" << WD->OpIndex << ", Lat=" << getLatency() << ", RegID "
          << getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }";
 }
 
@@ -120,10 +119,10 @@ void Instruction::execute() {
   Stage = IS_EXECUTING;
 
   // Set the cycles left before the write-back stage.
-  CyclesLeft = Desc.MaxLatency;
+  CyclesLeft = getLatency();
 
-  for (UniqueDef &Def : Defs)
-    Def->onInstructionIssued();
+  for (WriteState &WS : getDefs())
+    WS.onInstructionIssued();
 
   // Transition to the "executed" stage if this is a zero-latency instruction.
   if (!CyclesLeft)
@@ -139,21 +138,21 @@ void Instruction::forceExecuted() {
 void Instruction::update() {
   assert(isDispatched() && "Unexpected instruction stage found!");
 
-  if (!all_of(Uses, [](const UniqueUse &Use) { return Use->isReady(); }))
+  if (!all_of(getUses(), [](const ReadState &Use) { return Use.isReady(); }))
     return;
 
   // A partial register write cannot complete before a dependent write.
-  auto IsDefReady = [&](const UniqueDef &Def) {
-    if (const WriteState *Write = Def->getDependentWrite()) {
+  auto IsDefReady = [&](const WriteState &Def) {
+    if (const WriteState *Write = Def.getDependentWrite()) {
       int WriteLatency = Write->getCyclesLeft();
       if (WriteLatency == UNKNOWN_CYCLES)
         return false;
-      return static_cast<unsigned>(WriteLatency) < Desc.MaxLatency;
+      return static_cast<unsigned>(WriteLatency) < getLatency();
     }
     return true;
   };
 
-  if (all_of(Defs, IsDefReady))
+  if (all_of(getDefs(), IsDefReady))
     Stage = IS_READY;
 }
 
@@ -162,8 +161,8 @@ void Instruction::cycleEvent() {
     return;
 
   if (isDispatched()) {
-    for (UniqueUse &Use : Uses)
-      Use->cycleEvent();
+    for (ReadState &Use : getUses())
+      Use.cycleEvent();
 
     update();
     return;
@@ -171,8 +170,8 @@ void Instruction::cycleEvent() {
 
   assert(isExecuting() && "Instruction not in-flight?");
   assert(CyclesLeft && "Instruction already executed?");
-  for (UniqueDef &Def : Defs)
-    Def->cycleEvent();
+  for (WriteState &Def : getDefs())
+    Def.cycleEvent();
   CyclesLeft--;
   if (!CyclesLeft)
     Stage = IS_EXECUTED;
@@ -181,3 +180,4 @@ void Instruction::cycleEvent() {
 const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Pipeline.cpp b/tools/llvm-mca/lib/Pipeline.cpp
index 2d9aa6b2a31631ce33d41157676cc630517ada26..309f415913d746b53639293e354266cc503bca9d 100644
--- a/tools/llvm-mca/lib/Pipeline.cpp
+++ b/tools/llvm-mca/lib/Pipeline.cpp
@@ -17,12 +17,11 @@
 #include "HWEventListener.h"
 #include "llvm/Support/Debug.h"
 
+namespace llvm {
 namespace mca {
 
 #define DEBUG_TYPE "llvm-mca"
 
-using namespace llvm;
-
 void Pipeline::addEventListener(HWEventListener *Listener) {
   if (Listener)
     Listeners.insert(Listener);
@@ -39,13 +38,14 @@ bool Pipeline::hasWorkToProcess() {
 Error Pipeline::run() {
   assert(!Stages.empty() && "Unexpected empty pipeline found!");
 
-  while (hasWorkToProcess()) {
+  do {
     notifyCycleBegin();
     if (Error Err = runCycle())
       return Err;
     notifyCycleEnd();
     ++Cycles;
-  }
+  } while (hasWorkToProcess());
+
   return ErrorSuccess();
 }
 
@@ -94,3 +94,4 @@ void Pipeline::notifyCycleEnd() {
     Listener->onCycleEnd();
 }
 } // namespace mca.
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/DispatchStage.cpp b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
index c33b86027dae1333e597febaa585f0d6998710b7..838dbad22e3ad5c03ffb5d1c8fda39374bb914d0 100644
--- a/tools/llvm-mca/lib/Stages/DispatchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/DispatchStage.cpp
@@ -21,10 +21,9 @@
 #include "HardwareUnits/Scheduler.h"
 #include "llvm/Support/Debug.h"
 
-using namespace llvm;
-
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
@@ -37,9 +36,8 @@ void DispatchStage::notifyInstructionDispatched(const InstRef &IR,
 
 bool DispatchStage::checkPRF(const InstRef &IR) const {
   SmallVector<unsigned, 4> RegDefs;
-  for (const std::unique_ptr<WriteState> &RegDef :
-       IR.getInstruction()->getDefs())
-    RegDefs.emplace_back(RegDef->getRegisterID());
+  for (const WriteState &RegDef : IR.getInstruction()->getDefs())
+    RegDefs.emplace_back(RegDef.getRegisterID());
 
   const unsigned RegisterMask = PRF.isAvailable(RegDefs);
   // A mask with all zeroes means: register files are available.
@@ -69,8 +67,9 @@ void DispatchStage::updateRAWDependencies(ReadState &RS,
                                           const MCSubtargetInfo &STI) {
   SmallVector<WriteRef, 4> DependentWrites;
 
-  collectWrites(DependentWrites, RS.getRegisterID());
-  RS.setDependentWrites(DependentWrites.size());
+  // Collect all the dependent writes, and update RS internal state.
+  PRF.addRegisterRead(RS, DependentWrites);
+
   // We know that this read depends on all the writes in DependentWrites.
   // For each write, check if we have ReadAdvance information, and use it
   // to figure out in how many cycles this read becomes available.
@@ -101,10 +100,11 @@ Error DispatchStage::dispatch(InstRef IR) {
   }
 
   // Check if this is an optimizable reg-reg move.
+  bool IsEliminated = false;
   if (IS.isOptimizableMove()) {
     assert(IS.getDefs().size() == 1 && "Expected a single input!");
     assert(IS.getUses().size() == 1 && "Expected a single output!");
-    PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]);
+    IsEliminated = PRF.tryEliminateMove(IS.getDefs()[0], IS.getUses()[0]);
   }
 
   // A dependency-breaking instruction doesn't have to wait on the register
@@ -113,17 +113,20 @@ Error DispatchStage::dispatch(InstRef IR) {
   // instruction. A dependency-breaking instruction is a zero-latency
   // instruction that doesn't consume hardware resources.
   // An example of dependency-breaking instruction on X86 is a zero-idiom XOR.
-  for (std::unique_ptr<ReadState> &RS : IS.getUses())
-    if (!RS->isIndependentFromDef())
-      updateRAWDependencies(*RS, STI);
+  //
+  // We also don't update data dependencies for instructions that have been
+  // eliminated at register renaming stage.
+  if (!IsEliminated) {
+    for (ReadState &RS : IS.getUses())
+      updateRAWDependencies(RS, STI);
+  }
 
   // By default, a dependency-breaking zero-idiom is expected to be optimized
   // at register renaming stage. That means, no physical register is allocated
   // to the instruction.
   SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
-  for (std::unique_ptr<WriteState> &WS : IS.getDefs())
-    PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), WS.get()),
-                         RegisterFiles);
+  for (WriteState &WS : IS.getDefs())
+    PRF.addRegisterWrite(WriteRef(IR.getSourceIndex(), &WS), RegisterFiles);
 
   // Reserve slots in the RCU, and notify the instruction that it has been
   // dispatched to the schedulers for execution.
@@ -147,7 +150,7 @@ Error DispatchStage::cycleStart() {
   AvailableEntries = CarryOver >= DispatchWidth ? 0 : DispatchWidth - CarryOver;
   unsigned DispatchedOpcodes = DispatchWidth - AvailableEntries;
   CarryOver -= DispatchedOpcodes;
-  assert(CarriedOver.isValid() && "Invalid dispatched instruction");
+  assert(CarriedOver && "Invalid dispatched instruction");
 
   SmallVector<unsigned, 8> RegisterFiles(PRF.getNumRegisterFiles(), 0U);
   notifyInstructionDispatched(CarriedOver, RegisterFiles, DispatchedOpcodes);
@@ -179,3 +182,4 @@ void DispatchStage::dump() const {
 }
 #endif
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp b/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
index fa297148167a81fa0e0923d326aefe17924b63df..298f08a2887f951117d3a79d629ee705a8e1d66e 100644
--- a/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
+++ b/tools/llvm-mca/lib/Stages/ExecuteStage.cpp
@@ -21,10 +21,9 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 HWStallEvent::GenericEventType toHWStallEventType(Scheduler::Status Status) {
   switch (Status) {
   case Scheduler::SC_LOAD_QUEUE_FULL:
@@ -73,7 +72,7 @@ Error ExecuteStage::issueInstruction(InstRef &IR) {
 
 Error ExecuteStage::issueReadyInstructions() {
   InstRef IR = HWS.select();
-  while (IR.isValid()) {
+  while (IR) {
     if (Error Err = issueInstruction(IR))
       return Err;
 
@@ -107,7 +106,6 @@ Error ExecuteStage::cycleStart() {
   return issueReadyInstructions();
 }
 
-
 #ifndef NDEBUG
 static void verifyInstructionEliminated(const InstRef &IR) {
   const Instruction &Inst = *IR.getInstruction();
@@ -121,7 +119,6 @@ static void verifyInstructionEliminated(const InstRef &IR) {
 }
 #endif
 
-
 Error ExecuteStage::handleInstructionEliminated(InstRef &IR) {
 #ifndef NDEBUG
   verifyInstructionEliminated(IR);
@@ -219,3 +216,4 @@ void ExecuteStage::notifyReservedOrReleasedBuffers(const InstRef &IR,
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/FetchStage.cpp b/tools/llvm-mca/lib/Stages/FetchStage.cpp
index e2cdad37ee1454814483824cc767fa0996f12a45..6e91dd6121d35fb18f00fdd6583cdeb22029fef9 100644
--- a/tools/llvm-mca/lib/Stages/FetchStage.cpp
+++ b/tools/llvm-mca/lib/Stages/FetchStage.cpp
@@ -14,54 +14,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "Stages/FetchStage.h"
+#include "Instruction.h"
 
+namespace llvm {
 namespace mca {
 
-bool FetchStage::hasWorkToComplete() const {
-  return CurrentInstruction.get() || SM.hasNext();
-}
+bool FetchStage::hasWorkToComplete() const { return CurrentInstruction; }
 
 bool FetchStage::isAvailable(const InstRef & /* unused */) const {
-  if (!CurrentInstruction)
-    return false;
-  assert(SM.hasNext() && "Unexpected internal state!");
-  const SourceRef SR = SM.peekNext();
-  InstRef IR(SR.first, CurrentInstruction.get());
-  return checkNextStage(IR);
+  if (CurrentInstruction)
+    return checkNextStage(CurrentInstruction);
+  return false;
 }
 
-llvm::Error FetchStage::getNextInstruction() {
+void FetchStage::getNextInstruction() {
   assert(!CurrentInstruction && "There is already an instruction to process!");
   if (!SM.hasNext())
-    return llvm::ErrorSuccess();
-  const SourceRef SR = SM.peekNext();
-  llvm::Expected<std::unique_ptr<Instruction>> InstOrErr =
-      IB.createInstruction(*SR.second);
-  if (!InstOrErr)
-    return InstOrErr.takeError();
-  CurrentInstruction = std::move(InstOrErr.get());
-  return llvm::ErrorSuccess();
+    return;
+  SourceRef SR = SM.peekNext();
+  std::unique_ptr<Instruction> Inst = llvm::make_unique<Instruction>(SR.second);
+  CurrentInstruction = InstRef(SR.first, Inst.get());
+  Instructions[SR.first] = std::move(Inst);
+  SM.updateNext();
 }
 
 llvm::Error FetchStage::execute(InstRef & /*unused */) {
   assert(CurrentInstruction && "There is no instruction to process!");
-  const SourceRef SR = SM.peekNext();
-  InstRef IR(SR.first, CurrentInstruction.get());
-  assert(checkNextStage(IR) && "Invalid fetch!");
-
-  Instructions[IR.getSourceIndex()] = std::move(CurrentInstruction);
-  if (llvm::Error Val = moveToTheNextStage(IR))
+  if (llvm::Error Val = moveToTheNextStage(CurrentInstruction))
     return Val;
 
-  SM.updateNext();
-
   // Move the program counter.
-  return getNextInstruction();
+  CurrentInstruction.invalidate();
+  getNextInstruction();
+  return llvm::ErrorSuccess();
 }
 
 llvm::Error FetchStage::cycleStart() {
   if (!CurrentInstruction)
-    return getNextInstruction();
+    getNextInstruction();
   return llvm::ErrorSuccess();
 }
 
@@ -80,3 +70,4 @@ llvm::Error FetchStage::cycleEnd() {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/InstructionTables.cpp b/tools/llvm-mca/lib/Stages/InstructionTables.cpp
index e49eb446062422d6e2e49f6e628474df7dfdd366..33c30e7f95c0f7142b0732df0a20d858f0550cbf 100644
--- a/tools/llvm-mca/lib/Stages/InstructionTables.cpp
+++ b/tools/llvm-mca/lib/Stages/InstructionTables.cpp
@@ -17,12 +17,10 @@
 
 #include "Stages/InstructionTables.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 Error InstructionTables::execute(InstRef &IR) {
-  ArrayRef<uint64_t> Masks = IB.getProcResourceMasks();
   const InstrDesc &Desc = IR.getInstruction()->getDesc();
   UsedResources.clear();
 
@@ -68,3 +66,4 @@ Error InstructionTables::execute(InstRef &IR) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/RetireStage.cpp b/tools/llvm-mca/lib/Stages/RetireStage.cpp
index 3c923e4bb050377cb6665e2fb3ac17d25fc87f34..47eed5f2c9c651beefae47f364ee0f9d764dda85 100644
--- a/tools/llvm-mca/lib/Stages/RetireStage.cpp
+++ b/tools/llvm-mca/lib/Stages/RetireStage.cpp
@@ -20,6 +20,7 @@
 
 #define DEBUG_TYPE "llvm-mca"
 
+namespace llvm {
 namespace mca {
 
 llvm::Error RetireStage::cycleStart() {
@@ -52,9 +53,10 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) const {
   llvm::SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
   const Instruction &Inst = *IR.getInstruction();
 
-  for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
-    PRF.removeRegisterWrite(*WS.get(), FreedRegs);
+  for (const WriteState &WS : Inst.getDefs())
+    PRF.removeRegisterWrite(WS, FreedRegs);
   notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Stages/Stage.cpp b/tools/llvm-mca/lib/Stages/Stage.cpp
index e8cd74f2163bf5bf0dc0b61d4147394312199e98..c3cfe47d24e19feba002cec357a148355685da91 100644
--- a/tools/llvm-mca/lib/Stages/Stage.cpp
+++ b/tools/llvm-mca/lib/Stages/Stage.cpp
@@ -15,6 +15,7 @@
 
 #include "Stages/Stage.h"
 
+namespace llvm {
 namespace mca {
 
 // Pin the vtable here in the implementation file.
@@ -25,3 +26,4 @@ void Stage::addListener(HWEventListener *Listener) {
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/lib/Support.cpp b/tools/llvm-mca/lib/Support.cpp
index 8f6b8a91f38f499c3c913665d2bfee44c4840020..a6ff26dafb5baa49e52d1fd3291c3bd22596c712 100644
--- a/tools/llvm-mca/lib/Support.cpp
+++ b/tools/llvm-mca/lib/Support.cpp
@@ -16,10 +16,9 @@
 #include "Support.h"
 #include "llvm/MC/MCSchedule.h"
 
+namespace llvm {
 namespace mca {
 
-using namespace llvm;
-
 void computeProcResourceMasks(const MCSchedModel &SM,
                               SmallVectorImpl<uint64_t> &Masks) {
   unsigned ProcResourceID = 0;
@@ -77,3 +76,4 @@ double computeBlockRThroughput(const MCSchedModel &SM, unsigned DispatchWidth,
 }
 
 } // namespace mca
+} // namespace llvm
diff --git a/tools/llvm-mca/llvm-mca.cpp b/tools/llvm-mca/llvm-mca.cpp
index 9466ae7e84d6d060648622c81ea167f88e45d6ad..3a066f713bc80d13b7bfbe100f946f4dc5d28471 100644
--- a/tools/llvm-mca/llvm-mca.cpp
+++ b/tools/llvm-mca/llvm-mca.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeRegion.h"
+#include "CodeRegionGenerator.h"
 #include "PipelinePrinter.h"
 #include "Stages/FetchStage.h"
 #include "Stages/InstructionTables.h"
@@ -35,12 +36,11 @@
 #include "Views/TimelineView.h"
 #include "include/Context.h"
 #include "include/Pipeline.h"
+#include "include/Support.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
@@ -68,13 +68,13 @@ static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
                                            cl::value_desc("filename"));
 
 static cl::opt<std::string>
-    ArchName("march", cl::desc("Target arch to assemble for, "
-                               "see -version for available targets"),
+    ArchName("march", cl::desc("Target architecture. "
+                               "See -version for available targets"),
              cl::cat(ToolOptions));
 
 static cl::opt<std::string>
-    TripleName("mtriple", cl::desc("Target triple to assemble for, "
-                                   "see -version for available targets"),
+    TripleName("mtriple",
+               cl::desc("Target triple. See -version for available targets"),
                cl::cat(ToolOptions));
 
 static cl::opt<std::string>
@@ -198,59 +198,6 @@ const Target *getTarget(const char *ProgName) {
   return TheTarget;
 }
 
-// A comment consumer that parses strings.
-// The only valid tokens are strings.
-class MCACommentConsumer : public AsmCommentConsumer {
-public:
-  mca::CodeRegions &Regions;
-
-  MCACommentConsumer(mca::CodeRegions &R) : Regions(R) {}
-  void HandleComment(SMLoc Loc, StringRef CommentText) override {
-    // Skip empty comments.
-    StringRef Comment(CommentText);
-    if (Comment.empty())
-      return;
-
-    // Skip spaces and tabs
-    unsigned Position = Comment.find_first_not_of(" \t");
-    if (Position >= Comment.size())
-      // We reached the end of the comment. Bail out.
-      return;
-
-    Comment = Comment.drop_front(Position);
-    if (Comment.consume_front("LLVM-MCA-END")) {
-      Regions.endRegion(Loc);
-      return;
-    }
-
-    // Now try to parse string LLVM-MCA-BEGIN
-    if (!Comment.consume_front("LLVM-MCA-BEGIN"))
-      return;
-
-    // Skip spaces and tabs
-    Position = Comment.find_first_not_of(" \t");
-    if (Position < Comment.size())
-      Comment = Comment.drop_front(Position);
-    // Use the rest of the string as a descriptor for this code snippet.
-    Regions.beginRegion(Comment, Loc);
-  }
-};
-
-int AssembleInput(MCAsmParser &Parser, const Target *TheTarget,
-                  MCSubtargetInfo &STI, MCInstrInfo &MCII,
-                  MCTargetOptions &MCOptions) {
-  std::unique_ptr<MCTargetAsmParser> TAP(
-      TheTarget->createMCAsmParser(STI, Parser, MCII, MCOptions));
-
-  if (!TAP) {
-    WithColor::error() << "this target does not support assembly parsing.\n";
-    return 1;
-  }
-
-  Parser.setTargetParser(*TAP);
-  return Parser.Run(false);
-}
-
 ErrorOr<std::unique_ptr<ToolOutputFile>> getOutputStream() {
   if (OutputFilename == "")
     OutputFilename = "-";
@@ -261,40 +208,6 @@ ErrorOr<std::unique_ptr<ToolOutputFile>> getOutputStream() {
     return std::move(Out);
   return EC;
 }
-
-class MCStreamerWrapper final : public MCStreamer {
-  mca::CodeRegions &Regions;
-
-public:
-  MCStreamerWrapper(MCContext &Context, mca::CodeRegions &R)
-      : MCStreamer(Context), Regions(R) {}
-
-  // We only want to intercept the emission of new instructions.
-  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
-                               bool /* unused */) override {
-    Regions.addInstruction(llvm::make_unique<const MCInst>(Inst));
-  }
-
-  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override {
-    return true;
-  }
-
-  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
-                        unsigned ByteAlignment) override {}
-  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
-                    uint64_t Size = 0, unsigned ByteAlignment = 0,
-                    SMLoc Loc = SMLoc()) override {}
-  void EmitGPRel32Value(const MCExpr *Value) override {}
-  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
-  void EmitCOFFSymbolStorageClass(int StorageClass) override {}
-  void EmitCOFFSymbolType(int Type) override {}
-  void EndCOFFSymbolDef() override {}
-
-  const std::vector<std::unique_ptr<const MCInst>> &
-  GetInstructionSequence(unsigned Index) const {
-    return Regions.getInstructionSequence(Index);
-  }
-};
 } // end of anonymous namespace
 
 static void processOptionImpl(cl::opt<bool> &O, const cl::opt<bool> &Default) {
@@ -324,6 +237,16 @@ static void processViewOptions() {
   processOptionImpl(PrintRetireStats, Default);
 }
 
+// Returns true on success.
+static bool runPipeline(mca::Pipeline &P) {
+  // Handle pipeline errors here.
+  if (auto Err = P.run()) {
+    WithColor::error() << toString(std::move(Err));
+    return false;
+  }
+  return true;
+}
+
 int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
 
@@ -341,9 +264,6 @@ int main(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv,
                               "llvm machine code performance analyzer.\n");
 
-  MCTargetOptions MCOptions;
-  MCOptions.PreserveAsmComments = false;
-
   // Get the target from the triple. If a triple is not specified, then select
   // the default triple for the host. If the triple doesn't correspond to any
   // registered target, then exit with an error message.
@@ -383,9 +303,6 @@ int main(int argc, char **argv) {
 
   std::unique_ptr<buffer_ostream> BOS;
 
-  mca::CodeRegions Regions(SrcMgr);
-  MCStreamerWrapper Str(Ctx, Regions);
-
   std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
 
   std::unique_ptr<MCInstrAnalysis> MCIA(
@@ -418,14 +335,14 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  std::unique_ptr<MCAsmParser> P(createMCAsmParser(SrcMgr, Ctx, Str, *MAI));
-  MCAsmLexer &Lexer = P->getLexer();
-  MCACommentConsumer CC(Regions);
-  Lexer.setCommentConsumer(&CC);
-
-  if (AssembleInput(*P, TheTarget, *STI, *MCII, MCOptions))
+  // Parse the input and create CodeRegions that llvm-mca can analyze.
+  mca::AsmCodeRegionGenerator CRG(*TheTarget, SrcMgr, Ctx, *MAI, *STI, *MCII);
+  Expected<const mca::CodeRegions &> RegionsOrErr = CRG.parseCodeRegions();
+  if (auto Err = RegionsOrErr.takeError()) {
+    WithColor::error() << Err << "\n";
     return 1;
-
+  }
+  const mca::CodeRegions &Regions = *RegionsOrErr;
   if (Regions.empty()) {
     WithColor::error() << "no assembly instructions found.\n";
     return 1;
@@ -438,7 +355,7 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  unsigned AssemblerDialect = P->getAssemblerDialect();
+  unsigned AssemblerDialect = CRG.getAssemblerDialect();
   if (OutputAsmVariant >= 0)
     AssemblerDialect = static_cast<unsigned>(OutputAsmVariant);
   std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
@@ -460,7 +377,7 @@ int main(int argc, char **argv) {
     Width = DispatchWidth;
 
   // Create an instruction builder.
-  mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA, *IP);
+  mca::InstrBuilder IB(*STI, *MCII, *MRI, *MCIA);
 
   // Create a context to control ownership of the pipeline hardware.
   mca::Context MCA(*MRI, *STI);
@@ -470,6 +387,7 @@ int main(int argc, char **argv) {
 
   // Number each region in the sequence.
   unsigned RegionIdx = 0;
+
   for (const std::unique_ptr<mca::CodeRegion> &Region : Regions) {
     // Skip empty code regions.
     if (Region->empty())
@@ -485,26 +403,53 @@ int main(int argc, char **argv) {
       TOF->os() << "\n\n";
     }
 
-    mca::SourceMgr S(Region->getInstructions(),
-                     PrintInstructionTables ? 1 : Iterations);
+    // Lower the MCInst sequence into an mca::Instruction sequence.
+    ArrayRef<MCInst> Insts = Region->getInstructions();
+    std::vector<std::unique_ptr<mca::Instruction>> LoweredSequence;
+    for (const MCInst &MCI : Insts) {
+      llvm::Expected<std::unique_ptr<mca::Instruction>> Inst =
+          IB.createInstruction(MCI);
+      if (!Inst) {
+        if (auto NewE = handleErrors(
+                Inst.takeError(),
+                [&IP, &STI](const mca::InstructionError<MCInst> &IE) {
+                  std::string InstructionStr;
+                  raw_string_ostream SS(InstructionStr);
+                  WithColor::error() << IE.Message << '\n';
+                  IP->printInst(&IE.Inst, SS, "", *STI);
+                  SS.flush();
+                  WithColor::note() << "instruction: " << InstructionStr
+                                    << '\n';
+                })) {
+          // Default case.
+          WithColor::error() << toString(std::move(NewE));
+        }
+        return 1;
+      }
+
+      LoweredSequence.emplace_back(std::move(Inst.get()));
+    }
+
+    mca::SourceMgr S(LoweredSequence, PrintInstructionTables ? 1 : Iterations);
 
     if (PrintInstructionTables) {
       //  Create a pipeline, stages, and a printer.
       auto P = llvm::make_unique<mca::Pipeline>();
-      P->appendStage(llvm::make_unique<mca::FetchStage>(IB, S));
-      P->appendStage(llvm::make_unique<mca::InstructionTables>(SM, IB));
+      P->appendStage(llvm::make_unique<mca::FetchStage>(S));
+      P->appendStage(llvm::make_unique<mca::InstructionTables>(SM));
       mca::PipelinePrinter Printer(*P);
 
       // Create the views for this pipeline, execute, and emit a report.
       if (PrintInstructionInfoView) {
-        Printer.addView(
-            llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
+        Printer.addView(llvm::make_unique<mca::InstructionInfoView>(
+            *STI, *MCII, Insts, *IP));
       }
       Printer.addView(
-          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
-      auto Err = P->run();
-      if (Err)
-        report_fatal_error(toString(std::move(Err)));
+          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts));
+
+      if (!runPipeline(*P))
+        return 1;
+
       Printer.printReport(TOF->os());
       continue;
     }
@@ -514,11 +459,11 @@ int main(int argc, char **argv) {
     mca::PipelinePrinter Printer(*P);
 
     if (PrintSummaryView)
-      Printer.addView(llvm::make_unique<mca::SummaryView>(SM, S, Width));
+      Printer.addView(llvm::make_unique<mca::SummaryView>(SM, Insts, Width));
 
     if (PrintInstructionInfoView)
       Printer.addView(
-          llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, S, *IP));
+          llvm::make_unique<mca::InstructionInfoView>(*STI, *MCII, Insts, *IP));
 
     if (PrintDispatchStats)
       Printer.addView(llvm::make_unique<mca::DispatchStatistics>());
@@ -534,16 +479,19 @@ int main(int argc, char **argv) {
 
     if (PrintResourcePressureView)
       Printer.addView(
-          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, S));
+          llvm::make_unique<mca::ResourcePressureView>(*STI, *IP, Insts));
 
     if (PrintTimelineView) {
+      unsigned TimelineIterations =
+          TimelineMaxIterations ? TimelineMaxIterations : 10;
       Printer.addView(llvm::make_unique<mca::TimelineView>(
-          *STI, *IP, S, TimelineMaxIterations, TimelineMaxCycles));
+          *STI, *IP, Insts, std::min(TimelineIterations, S.getNumIterations()),
+          TimelineMaxCycles));
     }
 
-    auto Err = P->run();
-    if (Err)
-      report_fatal_error(toString(std::move(Err)));
+    if (!runPipeline(*P))
+      return 1;
+
     Printer.printReport(TOF->os());
 
     // Clear the InstrBuilder internal state in preparation for another round.
diff --git a/tools/llvm-mt/Opts.td b/tools/llvm-mt/Opts.td
index 6dc3eea524e64d7caa2dabe0b0750d88fdf3855a..da5b2c992ee35c26fadf7c8cd8e865e655ea1773 100644
--- a/tools/llvm-mt/Opts.td
+++ b/tools/llvm-mt/Opts.td
@@ -23,6 +23,7 @@ def validate_file_hashes : Joined<["/", "-"], "validate_file_hashes:">, HelpText
 def canonicalize : Flag<["/", "-"], "canonicalize:">, HelpText<"Not supported">, Group<unsupported>;
 def check_for_duplicates : Flag<["/", "-"], "check_for_duplicates:">, HelpText<"Not supported">, Group<unsupported>;
 def make_cdfs : Flag<["/", "-"], "makecdfs:">, HelpText<"Not supported">, Group<unsupported>;
+def notify_update : Flag<["/", "-"], "notify_update">, HelpText<"Not supported">, Group<unsupported>;
 def verbose : Flag<["/", "-"], "verbose">, HelpText<"Not supported">, Group<unsupported>;
 def help : Flag<["/", "-"], "?">;
 def help_long : Flag<["/", "-"], "help">, Alias<help>;
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 7e1fd86d0b01111e2c062641c9a78a9cc1c33204..21f3a2bade5e2ca812972705a2972c39d3045f47 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -183,6 +183,8 @@ cl::opt<bool> DyldInfoOnly("dyldinfo-only",
 cl::opt<bool> NoLLVMBitcode("no-llvm-bc",
                             cl::desc("Disable LLVM bitcode reader"));
 
+cl::extrahelp HelpResponse("\nPass @FILE as argument to read options from FILE.\n");
+
 bool PrintAddress = true;
 
 bool MultipleFiles = false;
@@ -1179,8 +1181,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
       // see if this symbol is a symbol from that section and if not skip it.
       if (Nsect && Nsect != getNsectInMachO(*MachO, Sym))
         continue;
-      NMSymbol S;
-      memset(&S, '\0', sizeof(S));
+      NMSymbol S = {};
       S.Size = 0;
       S.Address = 0;
       if (PrintSize) {
@@ -1274,8 +1275,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
           }
         }
         if (!found) {
-          NMSymbol S;
-          memset(&S, '\0', sizeof(NMSymbol));
+          NMSymbol S = {};
           S.Address = Entry.address() + BaseSegmentAddress;
           S.Size = 0;
           S.TypeChar = '\0';
@@ -1365,8 +1365,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
 
             // Now create the undefined symbol using the referened dynamic
             // library.
-            NMSymbol U;
-            memset(&U, '\0', sizeof(NMSymbol));
+            NMSymbol U = {};
             U.Address = 0;
             U.Size = 0;
             U.TypeChar = 'U';
@@ -1432,8 +1431,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
         }
         if (!found) {
           LastSymbolName = Entry.symbolName();
-          NMSymbol B;
-          memset(&B, '\0', sizeof(NMSymbol));
+          NMSymbol B = {};
           B.Address = 0;
           B.Size = 0;
           B.TypeChar = 'U';
@@ -1492,8 +1490,7 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
         }
         if (!found) {
           LastSymbolName = Entry.symbolName();
-          NMSymbol L;
-          memset(&L, '\0', sizeof(NMSymbol));
+          NMSymbol L = {};
           L.Name = Entry.symbolName();
           L.Address = 0;
           L.Size = 0;
@@ -1631,9 +1628,8 @@ dumpSymbolNamesFromObject(SymbolicFile &Obj, bool printName,
         }
         // See this address is not already in the symbol table fake up an
         // nlist for it.
-	if (!found) {
-          NMSymbol F;
-          memset(&F, '\0', sizeof(NMSymbol));
+        if (!found) {
+          NMSymbol F = {};
           F.Name = "<redacted function X>";
           F.Address = FoundFns[f] + BaseSegmentAddress;
           F.Size = 0;
@@ -1753,12 +1749,14 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
         outs() << "Archive map\n";
         for (; I != E; ++I) {
           Expected<Archive::Child> C = I->getMember();
-          if (!C)
+          if (!C) {
             error(C.takeError(), Filename);
+            break;
+          }
           Expected<StringRef> FileNameOrErr = C->getName();
           if (!FileNameOrErr) {
             error(FileNameOrErr.takeError(), Filename);
-            return;
+            break;
           }
           StringRef SymName = I->getName();
           outs() << SymName << " in " << FileNameOrErr.get() << "\n";
@@ -1898,7 +1896,6 @@ static void dumpSymbolNamesFromFile(std::string &Filename) {
         if (HostArchName == I->getArchFlagName()) {
           Expected<std::unique_ptr<ObjectFile>> ObjOrErr = I->getAsObjectFile();
           std::string ArchiveName;
-          ArchiveName.clear();
           if (ObjOrErr) {
             ObjectFile &Obj = *ObjOrErr.get();
             dumpSymbolNamesFromObject(Obj, false);
diff --git a/tools/llvm-objcopy/Buffer.cpp b/tools/llvm-objcopy/Buffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8044b023aaad0f1809981ef17dcdf40185067c84
--- /dev/null
+++ b/tools/llvm-objcopy/Buffer.cpp
@@ -0,0 +1,51 @@
+//===- Buffer.cpp ---------------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Buffer.h"
+#include "llvm-objcopy.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+
+namespace llvm {
+namespace objcopy {
+
+Buffer::~Buffer() {}
+
+void FileBuffer::allocate(size_t Size) {
+  Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+      FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable);
+  handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) {
+    error("failed to open " + getName() + ": " + E.message());
+  });
+  Buf = std::move(*BufferOrErr);
+}
+
+Error FileBuffer::commit() { return Buf->commit(); }
+
+uint8_t *FileBuffer::getBufferStart() {
+  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+}
+
+void MemBuffer::allocate(size_t Size) {
+  Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName());
+}
+
+Error MemBuffer::commit() { return Error::success(); }
+
+uint8_t *MemBuffer::getBufferStart() {
+  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
+}
+
+std::unique_ptr<WritableMemoryBuffer> MemBuffer::releaseMemoryBuffer() {
+  return std::move(Buf);
+}
+
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/Buffer.h b/tools/llvm-objcopy/Buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5b9c5b2d22b1d38379b61b9193a6aefbf05a424
--- /dev/null
+++ b/tools/llvm-objcopy/Buffer.h
@@ -0,0 +1,66 @@
+//===- Buffer.h -------------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_BUFFER_H
+#define LLVM_TOOLS_OBJCOPY_BUFFER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+
+namespace llvm {
+namespace objcopy {
+
+// The class Buffer abstracts out the common interface of FileOutputBuffer and
+// WritableMemoryBuffer so that the hierarchy of Writers depends on this
+// abstract interface and doesn't depend on a particular implementation.
+// TODO: refactor the buffer classes in LLVM to enable us to use them here
+// directly.
+class Buffer {
+  StringRef Name;
+
+public:
+  virtual ~Buffer();
+  virtual void allocate(size_t Size) = 0;
+  virtual uint8_t *getBufferStart() = 0;
+  virtual Error commit() = 0;
+
+  explicit Buffer(StringRef Name) : Name(Name) {}
+  StringRef getName() const { return Name; }
+};
+
+class FileBuffer : public Buffer {
+  std::unique_ptr<FileOutputBuffer> Buf;
+
+public:
+  void allocate(size_t Size) override;
+  uint8_t *getBufferStart() override;
+  Error commit() override;
+
+  explicit FileBuffer(StringRef FileName) : Buffer(FileName) {}
+};
+
+class MemBuffer : public Buffer {
+  std::unique_ptr<WritableMemoryBuffer> Buf;
+
+public:
+  void allocate(size_t Size) override;
+  uint8_t *getBufferStart() override;
+  Error commit() override;
+
+  explicit MemBuffer(StringRef Name) : Buffer(Name) {}
+
+  std::unique_ptr<WritableMemoryBuffer> releaseMemoryBuffer();
+};
+
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_BUFFER_H
diff --git a/tools/llvm-objcopy/CMakeLists.txt b/tools/llvm-objcopy/CMakeLists.txt
index b0cd66be5b3a314ce51b67e9858e859f183566f3..afbf78791766334db9a0d31dad9c0a4ca184c5bb 100644
--- a/tools/llvm-objcopy/CMakeLists.txt
+++ b/tools/llvm-objcopy/CMakeLists.txt
@@ -14,8 +14,11 @@ tablegen(LLVM StripOpts.inc -gen-opt-parser-defs)
 add_public_tablegen_target(StripOptsTableGen)
 
 add_llvm_tool(llvm-objcopy
+  Buffer.cpp
+  CopyConfig.cpp
   llvm-objcopy.cpp
-  Object.cpp
+  ELF/ELFObjcopy.cpp
+  ELF/Object.cpp
   DEPENDS
   ObjcopyOptsTableGen
   StripOptsTableGen
diff --git a/tools/llvm-objcopy/CopyConfig.cpp b/tools/llvm-objcopy/CopyConfig.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..67963a22a1c0eacc65547f327108e6b2f569f300
--- /dev/null
+++ b/tools/llvm-objcopy/CopyConfig.cpp
@@ -0,0 +1,447 @@
+//===- CopyConfig.cpp -----------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CopyConfig.h"
+#include "llvm-objcopy.h"
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <memory>
+#include <string>
+
+namespace llvm {
+namespace objcopy {
+
+namespace {
+enum ObjcopyID {
+  OBJCOPY_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  OBJCOPY_##ID,
+#include "ObjcopyOpts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE;
+#include "ObjcopyOpts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info ObjcopyInfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  {OBJCOPY_##PREFIX,                                                           \
+   NAME,                                                                       \
+   HELPTEXT,                                                                   \
+   METAVAR,                                                                    \
+   OBJCOPY_##ID,                                                               \
+   opt::Option::KIND##Class,                                                   \
+   PARAM,                                                                      \
+   FLAGS,                                                                      \
+   OBJCOPY_##GROUP,                                                            \
+   OBJCOPY_##ALIAS,                                                            \
+   ALIASARGS,                                                                  \
+   VALUES},
+#include "ObjcopyOpts.inc"
+#undef OPTION
+};
+
+class ObjcopyOptTable : public opt::OptTable {
+public:
+  ObjcopyOptTable() : OptTable(ObjcopyInfoTable) {}
+};
+
+enum StripID {
+  STRIP_INVALID = 0, // This is not an option ID.
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  STRIP_##ID,
+#include "StripOpts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE;
+#include "StripOpts.inc"
+#undef PREFIX
+
+static const opt::OptTable::Info StripInfoTable[] = {
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
+               HELPTEXT, METAVAR, VALUES)                                      \
+  {STRIP_##PREFIX, NAME,       HELPTEXT,                                       \
+   METAVAR,        STRIP_##ID, opt::Option::KIND##Class,                       \
+   PARAM,          FLAGS,      STRIP_##GROUP,                                  \
+   STRIP_##ALIAS,  ALIASARGS,  VALUES},
+#include "StripOpts.inc"
+#undef OPTION
+};
+
+class StripOptTable : public opt::OptTable {
+public:
+  StripOptTable() : OptTable(StripInfoTable) {}
+};
+
+enum SectionFlag {
+  SecNone = 0,
+  SecAlloc = 1 << 0,
+  SecLoad = 1 << 1,
+  SecNoload = 1 << 2,
+  SecReadonly = 1 << 3,
+  SecDebug = 1 << 4,
+  SecCode = 1 << 5,
+  SecData = 1 << 6,
+  SecRom = 1 << 7,
+  SecMerge = 1 << 8,
+  SecStrings = 1 << 9,
+  SecContents = 1 << 10,
+  SecShare = 1 << 11,
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SecShare)
+};
+
+} // namespace
+
+static SectionFlag parseSectionRenameFlag(StringRef SectionName) {
+  return llvm::StringSwitch<SectionFlag>(SectionName)
+      .Case("alloc", SectionFlag::SecAlloc)
+      .Case("load", SectionFlag::SecLoad)
+      .Case("noload", SectionFlag::SecNoload)
+      .Case("readonly", SectionFlag::SecReadonly)
+      .Case("debug", SectionFlag::SecDebug)
+      .Case("code", SectionFlag::SecCode)
+      .Case("data", SectionFlag::SecData)
+      .Case("rom", SectionFlag::SecRom)
+      .Case("merge", SectionFlag::SecMerge)
+      .Case("strings", SectionFlag::SecStrings)
+      .Case("contents", SectionFlag::SecContents)
+      .Case("share", SectionFlag::SecShare)
+      .Default(SectionFlag::SecNone);
+}
+
+static SectionRename parseRenameSectionValue(StringRef FlagValue) {
+  if (!FlagValue.contains('='))
+    error("Bad format for --rename-section: missing '='");
+
+  // Initial split: ".foo" = ".bar,f1,f2,..."
+  auto Old2New = FlagValue.split('=');
+  SectionRename SR;
+  SR.OriginalName = Old2New.first;
+
+  // Flags split: ".bar" "f1" "f2" ...
+  SmallVector<StringRef, 6> NameAndFlags;
+  Old2New.second.split(NameAndFlags, ',');
+  SR.NewName = NameAndFlags[0];
+
+  if (NameAndFlags.size() > 1) {
+    SectionFlag Flags = SectionFlag::SecNone;
+    for (size_t I = 1, Size = NameAndFlags.size(); I < Size; ++I) {
+      SectionFlag Flag = parseSectionRenameFlag(NameAndFlags[I]);
+      if (Flag == SectionFlag::SecNone)
+        error("Unrecognized section flag '" + NameAndFlags[I] +
+              "'. Flags supported for GNU compatibility: alloc, load, noload, "
+              "readonly, debug, code, data, rom, share, contents, merge, "
+              "strings.");
+      Flags |= Flag;
+    }
+
+    SR.NewFlags = 0;
+    if (Flags & SectionFlag::SecAlloc)
+      *SR.NewFlags |= ELF::SHF_ALLOC;
+    if (!(Flags & SectionFlag::SecReadonly))
+      *SR.NewFlags |= ELF::SHF_WRITE;
+    if (Flags & SectionFlag::SecCode)
+      *SR.NewFlags |= ELF::SHF_EXECINSTR;
+    if (Flags & SectionFlag::SecMerge)
+      *SR.NewFlags |= ELF::SHF_MERGE;
+    if (Flags & SectionFlag::SecStrings)
+      *SR.NewFlags |= ELF::SHF_STRINGS;
+  }
+
+  return SR;
+}
+
+static const StringMap<MachineInfo> ArchMap{
+    // Name, {EMachine, 64bit, LittleEndian}
+    {"aarch64", {ELF::EM_AARCH64, true, true}},
+    {"arm", {ELF::EM_ARM, false, true}},
+    {"i386", {ELF::EM_386, false, true}},
+    {"i386:x86-64", {ELF::EM_X86_64, true, true}},
+    {"powerpc:common64", {ELF::EM_PPC64, true, true}},
+    {"sparc", {ELF::EM_SPARC, false, true}},
+    {"x86-64", {ELF::EM_X86_64, true, true}},
+};
+
+static const MachineInfo &getMachineInfo(StringRef Arch) {
+  auto Iter = ArchMap.find(Arch);
+  if (Iter == std::end(ArchMap))
+    error("Invalid architecture: '" + Arch + "'");
+  return Iter->getValue();
+}
+
+static void addGlobalSymbolsFromFile(std::vector<std::string> &Symbols,
+                                     StringRef Filename) {
+  SmallVector<StringRef, 16> Lines;
+  auto BufOrErr = MemoryBuffer::getFile(Filename);
+  if (!BufOrErr)
+    reportError(Filename, BufOrErr.getError());
+
+  BufOrErr.get()->getBuffer().split(Lines, '\n');
+  for (StringRef Line : Lines) {
+    // Ignore everything after '#', trim whitespace, and only add the symbol if
+    // it's not empty.
+    auto TrimmedLine = Line.split('#').first.trim();
+    if (!TrimmedLine.empty())
+      Symbols.push_back(TrimmedLine.str());
+  }
+}
+
+// ParseObjcopyOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseObjcopyOptions will print the help messege and
+// exit.
+DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
+  ObjcopyOptTable T;
+  unsigned MissingArgumentIndex, MissingArgumentCount;
+  llvm::opt::InputArgList InputArgs =
+      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+
+  if (InputArgs.size() == 0) {
+    T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool");
+    exit(1);
+  }
+
+  if (InputArgs.hasArg(OBJCOPY_help)) {
+    T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool");
+    exit(0);
+  }
+
+  if (InputArgs.hasArg(OBJCOPY_version)) {
+    cl::PrintVersionMessage();
+    exit(0);
+  }
+
+  SmallVector<const char *, 2> Positional;
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN))
+    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT))
+    Positional.push_back(Arg->getValue());
+
+  if (Positional.empty())
+    error("No input file specified");
+
+  if (Positional.size() > 2)
+    error("Too many positional arguments");
+
+  CopyConfig Config;
+  Config.InputFilename = Positional[0];
+  Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
+  if (InputArgs.hasArg(OBJCOPY_target) &&
+      (InputArgs.hasArg(OBJCOPY_input_target) ||
+       InputArgs.hasArg(OBJCOPY_output_target)))
+    error("--target cannot be used with --input-target or --output-target");
+
+  if (InputArgs.hasArg(OBJCOPY_target)) {
+    Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_target);
+    Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_target);
+  } else {
+    Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
+    Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
+  }
+  if (Config.InputFormat == "binary") {
+    auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture);
+    if (BinaryArch.empty())
+      error("Specified binary input without specifiying an architecture");
+    Config.BinaryArch = getMachineInfo(BinaryArch);
+  }
+
+  if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections,
+                                      OBJCOPY_compress_debug_sections_eq)) {
+    Config.CompressionType = DebugCompressionType::Z;
+
+    if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) {
+      Config.CompressionType =
+          StringSwitch<DebugCompressionType>(
+              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq))
+              .Case("zlib-gnu", DebugCompressionType::GNU)
+              .Case("zlib", DebugCompressionType::Z)
+              .Default(DebugCompressionType::None);
+      if (Config.CompressionType == DebugCompressionType::None)
+        error("Invalid or unsupported --compress-debug-sections format: " +
+              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq));
+      if (!zlib::isAvailable())
+        error("LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress.");
+    }
+  }
+
+  Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
+  Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
+  Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols);
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
+    if (!StringRef(Arg->getValue()).contains('='))
+      error("Bad format for --redefine-sym");
+    auto Old2New = StringRef(Arg->getValue()).split('=');
+    if (!Config.SymbolsToRename.insert(Old2New).second)
+      error("Multiple redefinition of symbol " + Old2New.first);
+  }
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) {
+    SectionRename SR = parseRenameSectionValue(StringRef(Arg->getValue()));
+    if (!Config.SectionsToRename.try_emplace(SR.OriginalName, SR).second)
+      error("Multiple renames of section " + SR.OriginalName);
+  }
+
+  for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section))
+    Config.ToRemove.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep))
+    Config.Keep.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep))
+    Config.OnlyKeep.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_add_section))
+    Config.AddSection.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section))
+    Config.DumpSection.push_back(Arg->getValue());
+  Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all);
+  Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu);
+  Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug);
+  Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo);
+  Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections);
+  Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc);
+  Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded);
+  Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo);
+  Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
+  Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken);
+  Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all);
+  Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug);
+  Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
+  Config.DecompressDebugSections =
+      InputArgs.hasArg(OBJCOPY_decompress_debug_sections);
+  for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol))
+    Config.SymbolsToLocalize.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol))
+    Config.SymbolsToKeepGlobal.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols))
+    addGlobalSymbolsFromFile(Config.SymbolsToKeepGlobal, Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol))
+    Config.SymbolsToGlobalize.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol))
+    Config.SymbolsToWeaken.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol))
+    Config.SymbolsToRemove.push_back(Arg->getValue());
+  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol))
+    Config.SymbolsToKeep.push_back(Arg->getValue());
+
+  Config.DeterministicArchives = InputArgs.hasFlag(
+      OBJCOPY_enable_deterministic_archives,
+      OBJCOPY_disable_deterministic_archives, /*default=*/true);
+
+  Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates);
+
+  if (Config.DecompressDebugSections &&
+      Config.CompressionType != DebugCompressionType::None) {
+    error("Cannot specify --compress-debug-sections at the same time as "
+          "--decompress-debug-sections at the same time");
+  }
+
+  if (Config.DecompressDebugSections && !zlib::isAvailable())
+    error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress.");
+
+  DriverConfig DC;
+  DC.CopyConfigs.push_back(std::move(Config));
+  return DC;
+}
+
+// ParseStripOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseStripOptions will print the help messege and
+// exit.
+DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
+  StripOptTable T;
+  unsigned MissingArgumentIndex, MissingArgumentCount;
+  llvm::opt::InputArgList InputArgs =
+      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+
+  if (InputArgs.size() == 0) {
+    T.PrintHelp(errs(), "llvm-strip [options] file...", "strip tool");
+    exit(1);
+  }
+
+  if (InputArgs.hasArg(STRIP_help)) {
+    T.PrintHelp(outs(), "llvm-strip [options] file...", "strip tool");
+    exit(0);
+  }
+
+  if (InputArgs.hasArg(STRIP_version)) {
+    cl::PrintVersionMessage();
+    exit(0);
+  }
+
+  SmallVector<const char *, 2> Positional;
+  for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN))
+    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
+  for (auto Arg : InputArgs.filtered(STRIP_INPUT))
+    Positional.push_back(Arg->getValue());
+
+  if (Positional.empty())
+    error("No input file specified");
+
+  if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output))
+    error("Multiple input files cannot be used in combination with -o");
+
+  CopyConfig Config;
+  Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug);
+
+  Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all);
+  Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded);
+  Config.StripAll = InputArgs.hasArg(STRIP_strip_all);
+  Config.StripAllGNU = InputArgs.hasArg(STRIP_strip_all_gnu);
+
+  if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll &&
+      !Config.StripAllGNU)
+    Config.StripAll = true;
+
+  for (auto Arg : InputArgs.filtered(STRIP_keep))
+    Config.Keep.push_back(Arg->getValue());
+
+  for (auto Arg : InputArgs.filtered(STRIP_remove_section))
+    Config.ToRemove.push_back(Arg->getValue());
+
+  for (auto Arg : InputArgs.filtered(STRIP_keep_symbol))
+    Config.SymbolsToKeep.push_back(Arg->getValue());
+
+  Config.DeterministicArchives =
+      InputArgs.hasFlag(STRIP_enable_deterministic_archives,
+                        STRIP_disable_deterministic_archives, /*default=*/true);
+
+  Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates);
+
+  DriverConfig DC;
+  if (Positional.size() == 1) {
+    Config.InputFilename = Positional[0];
+    Config.OutputFilename =
+        InputArgs.getLastArgValue(STRIP_output, Positional[0]);
+    DC.CopyConfigs.push_back(std::move(Config));
+  } else {
+    for (const char *Filename : Positional) {
+      Config.InputFilename = Filename;
+      Config.OutputFilename = Filename;
+      DC.CopyConfigs.push_back(Config);
+    }
+  }
+
+  return DC;
+}
+
+} // namespace objcopy
+} // namespace llvm
diff --git a/tools/llvm-objcopy/CopyConfig.h b/tools/llvm-objcopy/CopyConfig.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ebe2a072bb4b3ca12590338cb22050e91ec6f20
--- /dev/null
+++ b/tools/llvm-objcopy/CopyConfig.h
@@ -0,0 +1,114 @@
+//===- CopyConfig.h -------------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H
+#define LLVM_TOOLS_LLVM_OBJCOPY_COPY_CONFIG_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+// Necessary for llvm::DebugCompressionType::None
+#include "llvm/Target/TargetOptions.h"
+#include <string>
+#include <vector>
+
+namespace llvm {
+namespace objcopy {
+
+// This type keeps track of the machine info for various architectures. This
+// lets us map architecture names to ELF types and the e_machine value of the
+// ELF file.
+struct MachineInfo {
+  uint16_t EMachine;
+  bool Is64Bit;
+  bool IsLittleEndian;
+};
+
+struct SectionRename {
+  StringRef OriginalName;
+  StringRef NewName;
+  Optional<uint64_t> NewFlags;
+};
+
+// Configuration for copying/stripping a single file.
+struct CopyConfig {
+  // Main input/output options
+  StringRef InputFilename;
+  StringRef InputFormat;
+  StringRef OutputFilename;
+  StringRef OutputFormat;
+
+  // Only applicable for --input-format=Binary
+  MachineInfo BinaryArch;
+
+  // Advanced options
+  StringRef AddGnuDebugLink;
+  StringRef SplitDWO;
+  StringRef SymbolsPrefix;
+
+  // Repeated options
+  std::vector<StringRef> AddSection;
+  std::vector<StringRef> DumpSection;
+  std::vector<StringRef> Keep;
+  std::vector<StringRef> OnlyKeep;
+  std::vector<StringRef> SymbolsToGlobalize;
+  std::vector<StringRef> SymbolsToKeep;
+  std::vector<StringRef> SymbolsToLocalize;
+  std::vector<StringRef> SymbolsToRemove;
+  std::vector<StringRef> SymbolsToWeaken;
+  std::vector<StringRef> ToRemove;
+  std::vector<std::string> SymbolsToKeepGlobal;
+
+  // Map options
+  StringMap<SectionRename> SectionsToRename;
+  StringMap<StringRef> SymbolsToRename;
+
+  // Boolean options
+  bool DeterministicArchives = true;
+  bool DiscardAll = false;
+  bool ExtractDWO = false;
+  bool KeepFileSymbols = false;
+  bool LocalizeHidden = false;
+  bool OnlyKeepDebug = false;
+  bool PreserveDates = false;
+  bool StripAll = false;
+  bool StripAllGNU = false;
+  bool StripDWO = false;
+  bool StripDebug = false;
+  bool StripNonAlloc = false;
+  bool StripSections = false;
+  bool StripUnneeded = false;
+  bool Weaken = false;
+  bool DecompressDebugSections = false;
+  DebugCompressionType CompressionType = DebugCompressionType::None;
+};
+
+// Configuration for the overall invocation of this tool. When invoked as
+// objcopy, will always contain exactly one CopyConfig. When invoked as strip,
+// will contain one or more CopyConfigs.
+struct DriverConfig {
+  SmallVector<CopyConfig, 1> CopyConfigs;
+};
+
+// ParseObjcopyOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseObjcopyOptions will print the help messege and
+// exit.
+DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr);
+
+// ParseStripOptions returns the config and sets the input arguments. If a
+// help flag is set then ParseStripOptions will print the help messege and
+// exit.
+DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr);
+
+} // namespace objcopy
+} // namespace llvm
+
+#endif
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a367a30c467f5dc50bc2d7f86121dc542b2eea5c
--- /dev/null
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -0,0 +1,506 @@
+//===- ELFObjcopy.cpp -----------------------------------------------------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ELFObjcopy.h"
+#include "Buffer.h"
+#include "CopyConfig.h"
+#include "llvm-objcopy.h"
+#include "Object.h"
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compression.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace llvm {
+namespace objcopy {
+namespace elf {
+
+using namespace object;
+using namespace ELF;
+using SectionPred = std::function<bool(const SectionBase &Sec)>;
+
+static bool isDebugSection(const SectionBase &Sec) {
+  return StringRef(Sec.Name).startswith(".debug") ||
+         StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index";
+}
+
+static bool isDWOSection(const SectionBase &Sec) {
+  return StringRef(Sec.Name).endswith(".dwo");
+}
+
+static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
+  // We can't remove the section header string table.
+  if (&Sec == Obj.SectionNames)
+    return false;
+  // Short of keeping the string table we want to keep everything that is a DWO
+  // section and remove everything else.
+  return !isDWOSection(Sec);
+}
+
+static ElfType getOutputElfType(const Binary &Bin) {
+  // Infer output ELF type from the input ELF object
+  if (isa<ELFObjectFile<ELF32LE>>(Bin))
+    return ELFT_ELF32LE;
+  if (isa<ELFObjectFile<ELF64LE>>(Bin))
+    return ELFT_ELF64LE;
+  if (isa<ELFObjectFile<ELF32BE>>(Bin))
+    return ELFT_ELF32BE;
+  if (isa<ELFObjectFile<ELF64BE>>(Bin))
+    return ELFT_ELF64BE;
+  llvm_unreachable("Invalid ELFType");
+}
+
+static ElfType getOutputElfType(const MachineInfo &MI) {
+  // Infer output ELF type from the binary arch specified
+  if (MI.Is64Bit)
+    return MI.IsLittleEndian ? ELFT_ELF64LE : ELFT_ELF64BE;
+  else
+    return MI.IsLittleEndian ? ELFT_ELF32LE : ELFT_ELF32BE;
+}
+
+static std::unique_ptr<Writer> createWriter(const CopyConfig &Config,
+                                            Object &Obj, Buffer &Buf,
+                                            ElfType OutputElfType) {
+  if (Config.OutputFormat == "binary") {
+    return llvm::make_unique<BinaryWriter>(Obj, Buf);
+  }
+  // Depending on the initial ELFT and OutputFormat we need a different Writer.
+  switch (OutputElfType) {
+  case ELFT_ELF32LE:
+    return llvm::make_unique<ELFWriter<ELF32LE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  case ELFT_ELF64LE:
+    return llvm::make_unique<ELFWriter<ELF64LE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  case ELFT_ELF32BE:
+    return llvm::make_unique<ELFWriter<ELF32BE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  case ELFT_ELF64BE:
+    return llvm::make_unique<ELFWriter<ELF64BE>>(Obj, Buf,
+                                                 !Config.StripSections);
+  }
+  llvm_unreachable("Invalid output format");
+}
+
+static void splitDWOToFile(const CopyConfig &Config, const Reader &Reader,
+                           StringRef File, ElfType OutputElfType) {
+  auto DWOFile = Reader.create();
+  DWOFile->removeSections(
+      [&](const SectionBase &Sec) { return onlyKeepDWOPred(*DWOFile, Sec); });
+  FileBuffer FB(File);
+  auto Writer = createWriter(Config, *DWOFile, FB, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
+
+static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
+                               Object &Obj) {
+  for (auto &Sec : Obj.sections()) {
+    if (Sec.Name == SecName) {
+      if (Sec.OriginalData.size() == 0)
+        return make_error<StringError>("Can't dump section \"" + SecName +
+                                           "\": it has no contents",
+                                       object_error::parse_failed);
+      Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
+          FileOutputBuffer::create(Filename, Sec.OriginalData.size());
+      if (!BufferOrErr)
+        return BufferOrErr.takeError();
+      std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
+      std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(),
+                Buf->getBufferStart());
+      if (Error E = Buf->commit())
+        return E;
+      return Error::success();
+    }
+  }
+  return make_error<StringError>("Section not found",
+                                 object_error::parse_failed);
+}
+
+static bool isCompressed(const SectionBase &Section) {
+  const char *Magic = "ZLIB";
+  return StringRef(Section.Name).startswith(".zdebug") ||
+         (Section.OriginalData.size() > strlen(Magic) &&
+          !strncmp(reinterpret_cast<const char *>(Section.OriginalData.data()),
+                   Magic, strlen(Magic))) ||
+         (Section.Flags & ELF::SHF_COMPRESSED);
+}
+
+static bool isCompressable(const SectionBase &Section) {
+  return !isCompressed(Section) && isDebugSection(Section) &&
+         Section.Name != ".gdb_index";
+}
+
+static void replaceDebugSections(
+    const CopyConfig &Config, Object &Obj, SectionPred &RemovePred,
+    function_ref<bool(const SectionBase &)> shouldReplace,
+    function_ref<SectionBase *(const SectionBase *)> addSection) {
+  SmallVector<SectionBase *, 13> ToReplace;
+  SmallVector<RelocationSection *, 13> RelocationSections;
+  for (auto &Sec : Obj.sections()) {
+    if (RelocationSection *R = dyn_cast<RelocationSection>(&Sec)) {
+      if (shouldReplace(*R->getSection()))
+        RelocationSections.push_back(R);
+      continue;
+    }
+
+    if (shouldReplace(Sec))
+      ToReplace.push_back(&Sec);
+  }
+
+  for (SectionBase *S : ToReplace) {
+    SectionBase *NewSection = addSection(S);
+
+    for (RelocationSection *RS : RelocationSections) {
+      if (RS->getSection() == S)
+        RS->setSection(NewSection);
+    }
+  }
+
+  RemovePred = [shouldReplace, RemovePred](const SectionBase &Sec) {
+    return shouldReplace(Sec) || RemovePred(Sec);
+  };
+}
+
+// This function handles the high level operations of GNU objcopy including
+// handling command line options. It's important to outline certain properties
+// we expect to hold of the command line operations. Any operation that "keeps"
+// should keep regardless of a remove. Additionally any removal should respect
+// any previous removals. Lastly whether or not something is removed shouldn't
+// depend a) on the order the options occur in or b) on some opaque priority
+// system. The only priority is that keeps/copies overrule removes.
+static void handleArgs(const CopyConfig &Config, Object &Obj,
+                       const Reader &Reader, ElfType OutputElfType) {
+
+  if (!Config.SplitDWO.empty()) {
+    splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType);
+  }
+
+  // TODO: update or remove symbols only if there is an option that affects
+  // them.
+  if (Obj.SymbolTable) {
+    Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
+      if (!Sym.isCommon() &&
+          ((Config.LocalizeHidden &&
+            (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
+           (!Config.SymbolsToLocalize.empty() &&
+            is_contained(Config.SymbolsToLocalize, Sym.Name))))
+        Sym.Binding = STB_LOCAL;
+
+      // Note: these two globalize flags have very similar names but different
+      // meanings:
+      //
+      // --globalize-symbol: promote a symbol to global
+      // --keep-global-symbol: all symbols except for these should be made local
+      //
+      // If --globalize-symbol is specified for a given symbol, it will be
+      // global in the output file even if it is not included via
+      // --keep-global-symbol. Because of that, make sure to check
+      // --globalize-symbol second.
+      if (!Config.SymbolsToKeepGlobal.empty() &&
+          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name) &&
+          Sym.getShndx() != SHN_UNDEF)
+        Sym.Binding = STB_LOCAL;
+
+      if (!Config.SymbolsToGlobalize.empty() &&
+          is_contained(Config.SymbolsToGlobalize, Sym.Name) &&
+          Sym.getShndx() != SHN_UNDEF)
+        Sym.Binding = STB_GLOBAL;
+
+      if (!Config.SymbolsToWeaken.empty() &&
+          is_contained(Config.SymbolsToWeaken, Sym.Name) &&
+          Sym.Binding == STB_GLOBAL)
+        Sym.Binding = STB_WEAK;
+
+      if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
+          Sym.getShndx() != SHN_UNDEF)
+        Sym.Binding = STB_WEAK;
+
+      const auto I = Config.SymbolsToRename.find(Sym.Name);
+      if (I != Config.SymbolsToRename.end())
+        Sym.Name = I->getValue();
+
+      if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
+        Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
+    });
+
+    // The purpose of this loop is to mark symbols referenced by sections
+    // (like GroupSection or RelocationSection). This way, we know which
+    // symbols are still 'needed' and which are not.
+    if (Config.StripUnneeded) {
+      for (auto &Section : Obj.sections())
+        Section.markSymbols();
+    }
+
+    Obj.removeSymbols([&](const Symbol &Sym) {
+      if ((!Config.SymbolsToKeep.empty() &&
+           is_contained(Config.SymbolsToKeep, Sym.Name)) ||
+          (Config.KeepFileSymbols && Sym.Type == STT_FILE))
+        return false;
+
+      if (Config.DiscardAll && Sym.Binding == STB_LOCAL &&
+          Sym.getShndx() != SHN_UNDEF && Sym.Type != STT_FILE &&
+          Sym.Type != STT_SECTION)
+        return true;
+
+      if (Config.StripAll || Config.StripAllGNU)
+        return true;
+
+      if (!Config.SymbolsToRemove.empty() &&
+          is_contained(Config.SymbolsToRemove, Sym.Name)) {
+        return true;
+      }
+
+      if (Config.StripUnneeded && !Sym.Referenced &&
+          (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
+          Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
+        return true;
+
+      return false;
+    });
+  }
+
+  SectionPred RemovePred = [](const SectionBase &) { return false; };
+
+  // Removes:
+  if (!Config.ToRemove.empty()) {
+    RemovePred = [&Config](const SectionBase &Sec) {
+      return is_contained(Config.ToRemove, Sec.Name);
+    };
+  }
+
+  if (Config.StripDWO || !Config.SplitDWO.empty())
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return isDWOSection(Sec) || RemovePred(Sec);
+    };
+
+  if (Config.ExtractDWO)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      return onlyKeepDWOPred(Obj, Sec) || RemovePred(Sec);
+    };
+
+  if (Config.StripAllGNU)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if ((Sec.Flags & SHF_ALLOC) != 0)
+        return false;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      switch (Sec.Type) {
+      case SHT_SYMTAB:
+      case SHT_REL:
+      case SHT_RELA:
+      case SHT_STRTAB:
+        return true;
+      }
+      return isDebugSection(Sec);
+    };
+
+  if (Config.StripSections) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
+    };
+  }
+
+  if (Config.StripDebug) {
+    RemovePred = [RemovePred](const SectionBase &Sec) {
+      return RemovePred(Sec) || isDebugSection(Sec);
+    };
+  }
+
+  if (Config.StripNonAlloc)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      return (Sec.Flags & SHF_ALLOC) == 0;
+    };
+
+  if (Config.StripAll)
+    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
+      if (RemovePred(Sec))
+        return true;
+      if (&Sec == Obj.SectionNames)
+        return false;
+      if (StringRef(Sec.Name).startswith(".gnu.warning"))
+        return false;
+      return (Sec.Flags & SHF_ALLOC) == 0;
+    };
+
+  // Explicit copies:
+  if (!Config.OnlyKeep.empty()) {
+    RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
+      // Explicitly keep these sections regardless of previous removes.
+      if (is_contained(Config.OnlyKeep, Sec.Name))
+        return false;
+
+      // Allow all implicit removes.
+      if (RemovePred(Sec))
+        return true;
+
+      // Keep special sections.
+      if (Obj.SectionNames == &Sec)
+        return false;
+      if (Obj.SymbolTable == &Sec ||
+          (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec))
+        return false;
+
+      // Remove everything else.
+      return true;
+    };
+  }
+
+  if (!Config.Keep.empty()) {
+    RemovePred = [Config, RemovePred](const SectionBase &Sec) {
+      // Explicitly keep these sections regardless of previous removes.
+      if (is_contained(Config.Keep, Sec.Name))
+        return false;
+      // Otherwise defer to RemovePred.
+      return RemovePred(Sec);
+    };
+  }
+
+  // This has to be the last predicate assignment.
+  // If the option --keep-symbol has been specified
+  // and at least one of those symbols is present
+  // (equivalently, the updated symbol table is not empty)
+  // the symbol table and the string table should not be removed.
+  if ((!Config.SymbolsToKeep.empty() || Config.KeepFileSymbols) &&
+      Obj.SymbolTable && !Obj.SymbolTable->empty()) {
+    RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
+      if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
+        return false;
+      return RemovePred(Sec);
+    };
+  }
+
+  if (Config.CompressionType != DebugCompressionType::None)
+    replaceDebugSections(Config, Obj, RemovePred, isCompressable,
+                         [&Config, &Obj](const SectionBase *S) {
+                           return &Obj.addSection<CompressedSection>(
+                               *S, Config.CompressionType);
+                         });
+  else if (Config.DecompressDebugSections)
+    replaceDebugSections(
+        Config, Obj, RemovePred,
+        [](const SectionBase &S) { return isa<CompressedSection>(&S); },
+        [&Obj](const SectionBase *S) {
+          auto CS = cast<CompressedSection>(S);
+          return &Obj.addSection<DecompressedSection>(*CS);
+        });
+
+  Obj.removeSections(RemovePred);
+
+  if (!Config.SectionsToRename.empty()) {
+    for (auto &Sec : Obj.sections()) {
+      const auto Iter = Config.SectionsToRename.find(Sec.Name);
+      if (Iter != Config.SectionsToRename.end()) {
+        const SectionRename &SR = Iter->second;
+        Sec.Name = SR.NewName;
+        if (SR.NewFlags.hasValue()) {
+          // Preserve some flags which should not be dropped when setting flags.
+          // Also, preserve anything OS/processor dependant.
+          const uint64_t PreserveMask = ELF::SHF_COMPRESSED | ELF::SHF_EXCLUDE |
+                                        ELF::SHF_GROUP | ELF::SHF_LINK_ORDER |
+                                        ELF::SHF_MASKOS | ELF::SHF_MASKPROC |
+                                        ELF::SHF_TLS | ELF::SHF_INFO_LINK;
+          Sec.Flags = (Sec.Flags & PreserveMask) |
+                      (SR.NewFlags.getValue() & ~PreserveMask);
+        }
+      }
+    }
+  }
+
+  if (!Config.AddSection.empty()) {
+    for (const auto &Flag : Config.AddSection) {
+      auto SecPair = Flag.split("=");
+      auto SecName = SecPair.first;
+      auto File = SecPair.second;
+      auto BufOrErr = MemoryBuffer::getFile(File);
+      if (!BufOrErr)
+        reportError(File, BufOrErr.getError());
+      auto Buf = std::move(*BufOrErr);
+      auto BufPtr = reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
+      auto BufSize = Buf->getBufferSize();
+      Obj.addSection<OwnedDataSection>(SecName,
+                                       ArrayRef<uint8_t>(BufPtr, BufSize));
+    }
+  }
+
+  if (!Config.DumpSection.empty()) {
+    for (const auto &Flag : Config.DumpSection) {
+      std::pair<StringRef, StringRef> SecPair = Flag.split("=");
+      StringRef SecName = SecPair.first;
+      StringRef File = SecPair.second;
+      if (Error E = dumpSectionToFile(SecName, File, Obj))
+        reportError(Config.InputFilename, std::move(E));
+    }
+  }
+
+  if (!Config.AddGnuDebugLink.empty())
+    Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink);
+}
+
+void executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
+                               Buffer &Out) {
+  BinaryReader Reader(Config.BinaryArch, &In);
+  std::unique_ptr<Object> Obj = Reader.create();
+
+  const ElfType OutputElfType = getOutputElfType(Config.BinaryArch);
+  handleArgs(Config, *Obj, Reader, OutputElfType);
+  std::unique_ptr<Writer> Writer =
+      createWriter(Config, *Obj, Out, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
+
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::ELFObjectFileBase &In, Buffer &Out) {
+  ELFReader Reader(&In);
+  std::unique_ptr<Object> Obj = Reader.create();
+  const ElfType OutputElfType = getOutputElfType(In);
+  handleArgs(Config, *Obj, Reader, OutputElfType);
+  std::unique_ptr<Writer> Writer =
+      createWriter(Config, *Obj, Out, OutputElfType);
+  Writer->finalize();
+  Writer->write();
+}
+
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
diff --git a/tools/llvm-objcopy/ELF/ELFObjcopy.h b/tools/llvm-objcopy/ELF/ELFObjcopy.h
new file mode 100644
index 0000000000000000000000000000000000000000..43f41c00ce5b12a3498d59db1481c5f2c95974ea
--- /dev/null
+++ b/tools/llvm-objcopy/ELF/ELFObjcopy.h
@@ -0,0 +1,34 @@
+//===- ELFObjcopy.h ---------------------------------------------*- C++ -*-===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H
+#define LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H
+
+namespace llvm {
+class MemoryBuffer;
+
+namespace object {
+class ELFObjectFileBase;
+} // end namespace object
+
+namespace objcopy {
+struct CopyConfig;
+class Buffer;
+
+namespace elf {
+void executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
+                               Buffer &Out);
+void executeObjcopyOnBinary(const CopyConfig &Config,
+                            object::ELFObjectFileBase &In, Buffer &Out);
+
+} // end namespace elf
+} // end namespace objcopy
+} // end namespace llvm
+
+#endif // LLVM_TOOLS_OBJCOPY_ELFOBJCOPY_H
diff --git a/tools/llvm-objcopy/Object.cpp b/tools/llvm-objcopy/ELF/Object.cpp
similarity index 96%
rename from tools/llvm-objcopy/Object.cpp
rename to tools/llvm-objcopy/ELF/Object.cpp
index ddf811a769bde35b98a051092e139cb503c49626..c2af99fc197dbbba05f21a30884aad68f65406ca 100644
--- a/tools/llvm-objcopy/Object.cpp
+++ b/tools/llvm-objcopy/ELF/Object.cpp
@@ -28,42 +28,13 @@
 #include <utility>
 #include <vector>
 
-using namespace llvm;
-using namespace llvm::objcopy;
+namespace llvm {
+namespace objcopy {
+namespace elf {
+
 using namespace object;
 using namespace ELF;
 
-Buffer::~Buffer() {}
-
-void FileBuffer::allocate(size_t Size) {
-  Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
-      FileOutputBuffer::create(getName(), Size, FileOutputBuffer::F_executable);
-  handleAllErrors(BufferOrErr.takeError(), [this](const ErrorInfoBase &E) {
-    error("failed to open " + getName() + ": " + E.message());
-  });
-  Buf = std::move(*BufferOrErr);
-}
-
-Error FileBuffer::commit() { return Buf->commit(); }
-
-uint8_t *FileBuffer::getBufferStart() {
-  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
-}
-
-void MemBuffer::allocate(size_t Size) {
-  Buf = WritableMemoryBuffer::getNewMemBuffer(Size, getName());
-}
-
-Error MemBuffer::commit() { return Error::success(); }
-
-uint8_t *MemBuffer::getBufferStart() {
-  return reinterpret_cast<uint8_t *>(Buf->getBufferStart());
-}
-
-std::unique_ptr<WritableMemoryBuffer> MemBuffer::releaseMemoryBuffer() {
-  return std::move(Buf);
-}
-
 template <class ELFT> void ELFWriter<ELFT>::writePhdr(const Segment &Seg) {
   uint8_t *B = Buf.getBufferStart();
   B += Obj.ProgramHdrSegment.Offset + Seg.Index * sizeof(Elf_Phdr);
@@ -361,6 +332,8 @@ uint16_t Symbol::getShndx() const {
   llvm_unreachable("Symbol with invalid ShndxType encountered");
 }
 
+bool Symbol::isCommon() const { return getShndx() == SHN_COMMON; }
+
 void SymbolTableSection::assignIndices() {
   uint32_t Index = 0;
   for (auto &Sym : Symbols)
@@ -644,12 +617,12 @@ void GnuDebugLinkSection::init(StringRef File, StringRef Data) {
   // establish the order that sections should go in. By using the maximum
   // possible offset we cause this section to wind up at the end.
   OriginalOffset = std::numeric_limits<uint64_t>::max();
-  JamCRC crc;
-  crc.update(ArrayRef<char>(Data.data(), Data.size()));
+  JamCRC CRC;
+  CRC.update(ArrayRef<char>(Data.data(), Data.size()));
   // The CRC32 value needs to be complemented because the JamCRC dosn't
   // finalize the CRC32 value. It also dosn't negate the initial CRC32 value
   // but it starts by default at 0xFFFFFFFF which is the complement of zero.
-  CRC32 = ~crc.getCRC();
+  CRC32 = ~CRC.getCRC();
 }
 
 GnuDebugLinkSection::GnuDebugLinkSection(StringRef File) : FileName(File) {
@@ -777,7 +750,7 @@ void BinaryELFBuilder<ELFT>::addData(SymbolTableSection *SymTab) {
 
   std::string SanitizedFilename = MemBuf->getBufferIdentifier().str();
   std::replace_if(std::begin(SanitizedFilename), std::end(SanitizedFilename),
-                  [](char c) { return !isalnum(c); }, '_');
+                  [](char C) { return !isalnum(C); }, '_');
   Twine Prefix = Twine("_binary_") + SanitizedFilename;
 
   SymTab->addSymbol(Prefix + "_start", STB_GLOBAL, STT_NOTYPE, &DataSection,
@@ -1157,20 +1130,20 @@ std::unique_ptr<Object> BinaryReader::create() const {
 
 std::unique_ptr<Object> ELFReader::create() const {
   auto Obj = llvm::make_unique<Object>();
-  if (auto *o = dyn_cast<ELFObjectFile<ELF32LE>>(Bin)) {
-    ELFBuilder<ELF32LE> Builder(*o, *Obj);
+  if (auto *O = dyn_cast<ELFObjectFile<ELF32LE>>(Bin)) {
+    ELFBuilder<ELF32LE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
-  } else if (auto *o = dyn_cast<ELFObjectFile<ELF64LE>>(Bin)) {
-    ELFBuilder<ELF64LE> Builder(*o, *Obj);
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF64LE>>(Bin)) {
+    ELFBuilder<ELF64LE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
-  } else if (auto *o = dyn_cast<ELFObjectFile<ELF32BE>>(Bin)) {
-    ELFBuilder<ELF32BE> Builder(*o, *Obj);
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF32BE>>(Bin)) {
+    ELFBuilder<ELF32BE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
-  } else if (auto *o = dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
-    ELFBuilder<ELF64BE> Builder(*o, *Obj);
+  } else if (auto *O = dyn_cast<ELFObjectFile<ELF64BE>>(Bin)) {
+    ELFBuilder<ELF64BE> Builder(*O, *Obj);
     Builder.build();
     return Obj;
   }
@@ -1196,7 +1169,9 @@ template <class ELFT> void ELFWriter<ELFT>::writeEhdr() {
   Ehdr.e_machine = Obj.Machine;
   Ehdr.e_version = Obj.Version;
   Ehdr.e_entry = Obj.Entry;
-  Ehdr.e_phnum = size(Obj.segments());
+  // We have to use the fully-qualified name llvm::size
+  // since some compilers complain on ambiguous resolution.
+  Ehdr.e_phnum = llvm::size(Obj.segments());
   Ehdr.e_phoff = (Ehdr.e_phnum != 0) ? Obj.ProgramHdrSegment.Offset : 0;
   Ehdr.e_phentsize = (Ehdr.e_phnum != 0) ? sizeof(Elf_Phdr) : 0;
   Ehdr.e_flags = Obj.Flags;
@@ -1335,7 +1310,7 @@ static uint64_t alignToAddr(uint64_t Offset, uint64_t Addr, uint64_t Align) {
 }
 
 // Orders segments such that if x = y->ParentSegment then y comes before x.
-static void OrderSegments(std::vector<Segment *> &Segments) {
+static void orderSegments(std::vector<Segment *> &Segments) {
   std::stable_sort(std::begin(Segments), std::end(Segments),
                    compareSegmentsByOffset);
 }
@@ -1377,7 +1352,7 @@ static uint64_t LayoutSegments(std::vector<Segment *> &Segments,
 // sections had a ParentSegment or an offset one past the last section if there
 // was a section that didn't have a ParentSegment.
 template <class Range>
-static uint64_t LayoutSections(Range Sections, uint64_t Offset) {
+static uint64_t layoutSections(Range Sections, uint64_t Offset) {
   // Now the offset of every segment has been set we can assign the offsets
   // of each section. For sections that are covered by a segment we should use
   // the segment's original offset and the section's original offset to compute
@@ -1421,13 +1396,13 @@ template <class ELFT> void ELFWriter<ELFT>::assignOffsets() {
     OrderedSegments.push_back(&Segment);
   OrderedSegments.push_back(&Obj.ElfHdrSegment);
   OrderedSegments.push_back(&Obj.ProgramHdrSegment);
-  OrderSegments(OrderedSegments);
+  orderSegments(OrderedSegments);
   // Offset is used as the start offset of the first segment to be laid out.
   // Since the ELF Header (ElfHdrSegment) must be at the start of the file,
   // we start at offset 0.
   uint64_t Offset = 0;
   Offset = LayoutSegments(OrderedSegments, Offset);
-  Offset = LayoutSections(Obj.sections(), Offset);
+  Offset = layoutSections(Obj.sections(), Offset);
   // If we need to write the section header table out then we need to align the
   // Offset so that SHOffset is valid.
   if (WriteSectionHeaders)
@@ -1612,7 +1587,7 @@ void BinaryWriter::finalize() {
       continue;
     AllocatedSections.push_back(&Section);
   }
-  LayoutSections(make_pointee_range(AllocatedSections), Offset);
+  layoutSections(make_pointee_range(AllocatedSections), Offset);
 
   // Now that every section has been laid out we just need to compute the total
   // file size. This might not be the same as the offset returned by
@@ -1628,9 +1603,6 @@ void BinaryWriter::finalize() {
   SecWriter = llvm::make_unique<BinarySectionWriter>(Buf);
 }
 
-namespace llvm {
-namespace objcopy {
-
 template class BinaryELFBuilder<ELF64LE>;
 template class BinaryELFBuilder<ELF64BE>;
 template class BinaryELFBuilder<ELF32LE>;
@@ -1645,5 +1617,7 @@ template class ELFWriter<ELF64LE>;
 template class ELFWriter<ELF64BE>;
 template class ELFWriter<ELF32LE>;
 template class ELFWriter<ELF32BE>;
+
+} // end namespace elf
 } // end namespace objcopy
 } // end namespace llvm
diff --git a/tools/llvm-objcopy/Object.h b/tools/llvm-objcopy/ELF/Object.h
similarity index 93%
rename from tools/llvm-objcopy/Object.h
rename to tools/llvm-objcopy/ELF/Object.h
index 5fb03a5501e76f6576651dcc5d8b36463917449e..91ff1cddac17c6f3bccab3e0801f576ff7bbd9ee 100644
--- a/tools/llvm-objcopy/Object.h
+++ b/tools/llvm-objcopy/ELF/Object.h
@@ -10,6 +10,8 @@
 #ifndef LLVM_TOOLS_OBJCOPY_OBJECT_H
 #define LLVM_TOOLS_OBJCOPY_OBJECT_H
 
+#include "Buffer.h"
+#include "CopyConfig.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -28,8 +30,8 @@
 namespace llvm {
 enum class DebugCompressionType;
 namespace objcopy {
+namespace elf {
 
-class Buffer;
 class SectionBase;
 class Section;
 class OwnedDataSection;
@@ -67,15 +69,6 @@ public:
 
 enum ElfType { ELFT_ELF32LE, ELFT_ELF64LE, ELFT_ELF32BE, ELFT_ELF64BE };
 
-// This type keeps track of the machine info for various architectures. This
-// lets us map architecture names to ELF types and the e_machine value of the
-// ELF file.
-struct MachineInfo {
-  uint16_t EMachine;
-  bool Is64Bit;
-  bool IsLittleEndian;
-};
-
 class SectionVisitor {
 public:
   virtual ~SectionVisitor();
@@ -154,48 +147,6 @@ public:
   explicit BinarySectionWriter(Buffer &Buf) : SectionWriter(Buf) {}
 };
 
-// The class Buffer abstracts out the common interface of FileOutputBuffer and
-// WritableMemoryBuffer so that the hierarchy of Writers depends on this
-// abstract interface and doesn't depend on a particular implementation.
-// TODO: refactor the buffer classes in LLVM to enable us to use them here
-// directly.
-class Buffer {
-  StringRef Name;
-
-public:
-  virtual ~Buffer();
-  virtual void allocate(size_t Size) = 0;
-  virtual uint8_t *getBufferStart() = 0;
-  virtual Error commit() = 0;
-
-  explicit Buffer(StringRef Name) : Name(Name) {}
-  StringRef getName() const { return Name; }
-};
-
-class FileBuffer : public Buffer {
-  std::unique_ptr<FileOutputBuffer> Buf;
-
-public:
-  void allocate(size_t Size) override;
-  uint8_t *getBufferStart() override;
-  Error commit() override;
-
-  explicit FileBuffer(StringRef FileName) : Buffer(FileName) {}
-};
-
-class MemBuffer : public Buffer {
-  std::unique_ptr<WritableMemoryBuffer> Buf;
-
-public:
-  void allocate(size_t Size) override;
-  uint8_t *getBufferStart() override;
-  Error commit() override;
-
-  explicit MemBuffer(StringRef Name) : Buffer(Name) {}
-
-  std::unique_ptr<WritableMemoryBuffer> releaseMemoryBuffer();
-};
-
 class Writer {
 protected:
   Object &Obj;
@@ -464,6 +415,7 @@ struct Symbol {
   bool Referenced = false;
 
   uint16_t getShndx() const;
+  bool isCommon() const;
 };
 
 class SectionIndexSection : public SectionBase {
@@ -620,7 +572,6 @@ public:
   void setFlagWord(ELF::Elf32_Word W) { FlagWord = W; }
   void addMember(SectionBase *Sec) { GroupMembers.push_back(Sec); }
 
-  void initialize(SectionTableRef SecTable) override{};
   void accept(SectionVisitor &) const override;
   void finalize() override;
   void removeSymbols(function_ref<bool(const Symbol &)> ToRemove) override;
@@ -815,6 +766,8 @@ public:
     return *Segments.back();
   }
 };
+
+} // end namespace elf
 } // end namespace objcopy
 } // end namespace llvm
 
diff --git a/tools/llvm-objcopy/ObjcopyOpts.td b/tools/llvm-objcopy/ObjcopyOpts.td
index 18b270b77584cb6fc6a2d7a06cc53aacc6e5b666..285ab9d69db29589d2621f2af63fd09fe4c7282f 100644
--- a/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/tools/llvm-objcopy/ObjcopyOpts.td
@@ -1,76 +1,97 @@
 include "llvm/Option/OptParser.td"
 
-multiclass Eq<string name> {
-  def NAME: Separate<["--", "-"], name>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
+multiclass Eq<string name, string help> {
+  def NAME : Separate<["--", "-"], name>;
+  def NAME #_eq : Joined<["--", "-"], name #"=">,
+                  Alias<!cast<Separate>(NAME)>,
+                  HelpText<help>;
 }
 
 def help : Flag<["-", "--"], "help">;
-defm binary_architecture : Eq<"binary-architecture">,
-                           HelpText<"Used when transforming an architecture-less format (such as binary) to another format">;
-def B : JoinedOrSeparate<["-"], "B">,
-        Alias<binary_architecture>;
-defm input_target : Eq<"input-target">,
-                    HelpText<"Format of the input file">,
+
+defm binary_architecture
+    : Eq<"binary-architecture", "Used when transforming an architecture-less "
+                                "format (such as binary) to another format">;
+def B : JoinedOrSeparate<["-"], "B">, Alias<binary_architecture>;
+
+defm target : Eq<"target", "Format of the input and output file">,
+              Values<"binary">;
+def F : JoinedOrSeparate<["-"], "F">, Alias<target>;
+
+defm input_target : Eq<"input-target", "Format of the input file">,
                     Values<"binary">;
-def I : JoinedOrSeparate<[ "-" ], "I">, Alias<input_target>;
-defm output_target : Eq<"output-target">,
-                     HelpText<"Format of the output file">,
+def I : JoinedOrSeparate<["-"], "I">, Alias<input_target>;
+
+defm output_target : Eq<"output-target", "Format of the output file">,
                      Values<"binary">;
+def O : JoinedOrSeparate<["-"], "O">, Alias<output_target>;
+
 def compress_debug_sections : Flag<["--", "-"], "compress-debug-sections">;
-def compress_debug_sections_eq : Joined<["--", "-"], "compress-debug-sections=">,
-                                 MetaVarName<"[ zlib | zlib-gnu ]">,
-                                 HelpText<"Compress DWARF debug sections using "
-                                          "specified style. Supported styles: "
-                                          "'zlib-gnu' and 'zlib'">;
+def compress_debug_sections_eq
+    : Joined<["--", "-"], "compress-debug-sections=">,
+      MetaVarName<"[ zlib | zlib-gnu ]">,
+      HelpText<"Compress DWARF debug sections using specified style. Supported "
+               "styles: 'zlib-gnu' and 'zlib'">;
 def decompress_debug_sections : Flag<["-", "--"], "decompress-debug-sections">,
                                 HelpText<"Decompress DWARF debug sections.">;
-def O : JoinedOrSeparate<["-"], "O">,
-        Alias<output_target>;
-defm split_dwo : Eq<"split-dwo">,
-                 MetaVarName<"dwo-file">,
-                 HelpText<"Equivalent to extract-dwo on the input file to <dwo-file>, then strip-dwo on the input file">;
+defm split_dwo
+    : Eq<"split-dwo", "Equivalent to extract-dwo on the input file to "
+                      "<dwo-file>, then strip-dwo on the input file">,
+      MetaVarName<"dwo-file">;
 
-def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
+def enable_deterministic_archives
+    : Flag<["-", "--"], "enable-deterministic-archives">,
+      HelpText<"Enable deterministic mode when copying archives (use zero for "
+               "UIDs, GIDs, and timestamps).">;
+def D : Flag<["-"], "D">,
+        Alias<enable_deterministic_archives>,
+        HelpText<"Alias for --enable-deterministic-archives">;
+
+def disable_deterministic_archives
+    : Flag<["-", "--"], "disable-deterministic-archives">,
+      HelpText<"Disable deterministic mode when copying archives (use real "
+               "values for UIDs, GIDs, and timestamps).">;
+def U : Flag<["-"], "U">,
+        Alias<disable_deterministic_archives>,
+        HelpText<"Alias for --disable-deterministic-archives">;
+
+def preserve_dates : Flag<["-", "--"], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
+def p : Flag<["-"], "p">, Alias<preserve_dates>;
+
+defm add_gnu_debuglink
+    : Eq<"add-gnu-debuglink", "Add a .gnu_debuglink for <debug-file>">,
+      MetaVarName<"debug-file">;
 
-def p : Flag<[ "-" ], "p">, Alias<preserve_dates>;
+defm remove_section : Eq<"remove-section", "Remove <section>">,
+                      MetaVarName<"section">;
+def R : JoinedOrSeparate<["-"], "R">, Alias<remove_section>;
 
-defm add_gnu_debuglink : Eq<"add-gnu-debuglink">,
-                         MetaVarName<"debug-file">,
-                         HelpText<"Add a .gnu_debuglink for <debug-file>">;
-defm remove_section : Eq<"remove-section">,
-                      MetaVarName<"section">,
-                      HelpText<"Remove <section>">;
 defm rename_section
-    : Eq<"rename-section">,
-      MetaVarName<"old=new[,flag1,...]">,
+    : Eq<"rename-section",
+         "Renames a section from old to new, optionally with specified flags. "
+         "Flags supported for GNU compatibility: alloc, load, noload, "
+         "readonly, debug, code, data, rom, share, contents, merge, strings.">,
+      MetaVarName<"old=new[,flag1,...]">;
+defm redefine_symbol
+    : Eq<"redefine-sym", "Change the name of a symbol old to new">,
+      MetaVarName<"old=new">;
+defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
+defm only_keep : Eq<"only-keep", "Remove all but <section>">,
+                 MetaVarName<"section">;
+def j : JoinedOrSeparate<["-"], "j">, Alias<only_keep>;
+defm add_section
+    : Eq<"add-section",
+         "Make a section named <section> with the contents of <file>.">,
+      MetaVarName<"section=file">;
+
+def strip_all
+    : Flag<["-", "--"], "strip-all">,
       HelpText<
-          "Renames a section from old to new, optionally with specified flags. "
-          "Flags supported for GNU compatibility: alloc, load, noload, "
-          "readonly, debug, code, data, rom, share, contents, merge, strings.">;
-defm redefine_symbol : Eq<"redefine-sym">,
-                       MetaVarName<"old=new">,
-                       HelpText<"Change the name of a symbol old to new">;
-def R : JoinedOrSeparate<["-"], "R">,
-        Alias<remove_section>;
-defm keep : Eq<"keep">,
-            MetaVarName<"section">,
-            HelpText<"Keep <section>">;
-defm only_keep : Eq<"only-keep">,
-                 MetaVarName<"section">,
-                 HelpText<"Remove all but <section>">;
-def j : JoinedOrSeparate<["-"], "j">,
-                      Alias<only_keep>;
-defm add_section : Eq<"add-section">,
-                   MetaVarName<"section=file">,
-                   HelpText<"Make a section named <section> with the contents of <file>.">;
-def strip_all : Flag<["-", "--"], "strip-all">,
-                HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
-def S : Flag<["-"], "S">,
-        Alias<strip_all>;
+          "Remove non-allocated sections other than .gnu.warning* sections">;
+def S : Flag<["-"], "S">, Alias<strip_all>;
 def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
-                    HelpText<"Compaitable with GNU objcopy's --strip-all">;
+                    HelpText<"Compatible with GNU objcopy's --strip-all">;
 def strip_debug : Flag<["-", "--"], "strip-debug">,
                   HelpText<"Remove all debug information">;
 def strip_dwo : Flag<["-", "--"], "strip-dwo">,
@@ -79,69 +100,67 @@ def strip_sections : Flag<["-", "--"], "strip-sections">,
                      HelpText<"Remove all section headers">;
 def strip_non_alloc : Flag<["-", "--"], "strip-non-alloc">,
                       HelpText<"Remove all non-allocated sections">;
-def extract_dwo : Flag<["-", "--"], "extract-dwo">,
-                  HelpText<"Remove all sections that are not DWARF .dwo sections from file">;
-def localize_hidden : Flag<["-", "--"], "localize-hidden">,
-                      HelpText<"Mark all symbols that have hidden or internal visibility as local">;
-defm localize_symbol : Eq<"localize-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Mark <symbol> as local">;
-def L : JoinedOrSeparate<["-"], "L">,
-        Alias<localize_symbol>;
-defm globalize_symbol : Eq<"globalize-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Mark <symbol> as global">;
+def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
+                     HelpText<"Remove all symbols not needed by relocations">;
 
-defm keep_global_symbol
-    : Eq<"keep-global-symbol">,
-      MetaVarName<"symbol">,
-      HelpText<"Convert all symbols except <symbol> to local. May be repeated "
-               "to convert all except a set of symbols to local.">;
-def G : JoinedOrSeparate<[ "-" ], "G">, Alias<keep_global_symbol>;
+def extract_dwo
+    : Flag<["-", "--"], "extract-dwo">,
+      HelpText<
+          "Remove all sections that are not DWARF .dwo sections from file">;
 
-defm keep_global_symbols
-    : Eq<"keep-global-symbols">,
-      MetaVarName<"filename">,
+def localize_hidden
+    : Flag<["-", "--"], "localize-hidden">,
       HelpText<
-          "Reads a list of symbols from <filename> and runs as if "
-	  "--keep-global-symbol=<symbol> is set for each one. <filename> "
-	  "contains one symbol per line and may contain comments beginning "
-	  "with '#'. Leading and trailing whitespace is stripped from each "
-	  "line. May be repeated to read symbols from many files.">;
+          "Mark all symbols that have hidden or internal visibility as local">;
+defm localize_symbol : Eq<"localize-symbol", "Mark <symbol> as local">,
+                       MetaVarName<"symbol">;
+def L : JoinedOrSeparate<["-"], "L">, Alias<localize_symbol>;
 
-def version : Flag<[ "-", "--" ], "version">,
-              HelpText<"Print the version and exit.">;
+defm globalize_symbol : Eq<"globalize-symbol", "Mark <symbol> as global">,
+                        MetaVarName<"symbol">;
+defm keep_global_symbol
+    : Eq<"keep-global-symbol",
+         "Convert all symbols except <symbol> to local. May be repeated to "
+         "convert all except a set of symbols to local.">,
+      MetaVarName<"symbol">;
+def G : JoinedOrSeparate<["-"], "G">, Alias<keep_global_symbol>;
 
-defm weaken_symbol : Eq<"weaken-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Mark <symbol> as weak">;
-def W : JoinedOrSeparate<["-"], "W">,
-        Alias<weaken_symbol>;
+defm keep_global_symbols
+    : Eq<"keep-global-symbols",
+         "Reads a list of symbols from <filename> and runs as if "
+         "--keep-global-symbol=<symbol> is set for each one. <filename> "
+         "contains one symbol per line and may contain comments beginning with "
+         "'#'. Leading and trailing whitespace is stripped from each line. May "
+         "be repeated to read symbols from many files.">,
+      MetaVarName<"filename">;
+
+defm weaken_symbol : Eq<"weaken-symbol", "Mark <symbol> as weak">,
+                     MetaVarName<"symbol">;
+def W : JoinedOrSeparate<["-"], "W">, Alias<weaken_symbol>;
 def weaken : Flag<["-", "--"], "weaken">,
-                  HelpText<"Mark all global symbols as weak">;
-def discard_all : Flag<["-", "--"], "discard-all">,
-                      HelpText<"Remove all local symbols except file and section symbols">;
-def x : Flag<["-"], "x">,
-        Alias<discard_all>;
-defm strip_symbol : Eq<"strip-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Remove symbol <symbol>">;
-def N : JoinedOrSeparate<["-"], "N">,
-        Alias<strip_symbol>;
-defm keep_symbol : Eq<"keep-symbol">,
-                       MetaVarName<"symbol">,
-                       HelpText<"Do not remove symbol <symbol>">;
-def K : JoinedOrSeparate<["-"], "K">,
-        Alias<keep_symbol>;
-def only_keep_debug : Flag<["-", "--"], "only-keep-debug">,
-                          HelpText<"Currently ignored. Only for compaitability with GNU objcopy.">;
-def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
-                      HelpText<"Remove all symbols not needed by relocations">;
+             HelpText<"Mark all global symbols as weak">;
+def discard_all
+    : Flag<["-", "--"], "discard-all">,
+      HelpText<"Remove all local symbols except file and section symbols">;
+def x : Flag<["-"], "x">, Alias<discard_all>;
+defm strip_symbol : Eq<"strip-symbol", "Remove symbol <symbol>">,
+                    MetaVarName<"symbol">;
+def N : JoinedOrSeparate<["-"], "N">, Alias<strip_symbol>;
+defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
+                   MetaVarName<"symbol">;
+def K : JoinedOrSeparate<["-"], "K">, Alias<keep_symbol>;
+def only_keep_debug
+    : Flag<["-", "--"], "only-keep-debug">,
+      HelpText<"Currently ignored. Only for compatibility with GNU objcopy.">;
 def keep_file_symbols : Flag<["-", "--"], "keep-file-symbols">,
-                      HelpText<"Do not remove file symbols">;
-defm dump_section : Eq<"dump-section">,
-                   MetaVarName<"section=file">,
-                   HelpText<"Dump contents of section named <section> into file <file>">;
-defm prefix_symbols : Eq<"prefix-symbols">,
-                       MetaVarName<"prefix">,
-                       HelpText<"Add <prefix> to the start of every symbol name">;
+                        HelpText<"Do not remove file symbols">;
+defm dump_section
+    : Eq<"dump-section",
+         "Dump contents of section named <section> into file <file>">,
+      MetaVarName<"section=file">;
+defm prefix_symbols
+    : Eq<"prefix-symbols", "Add <prefix> to the start of every symbol name">,
+      MetaVarName<"prefix">;
+
+def version : Flag<["-", "--"], "version">,
+              HelpText<"Print the version and exit.">;
diff --git a/tools/llvm-objcopy/StripOpts.td b/tools/llvm-objcopy/StripOpts.td
index 821dfa3b277be724a57a0bacf6de61aed5d9af6b..3660148f88380ca4fdec73ab86e3b04d30843e5e 100644
--- a/tools/llvm-objcopy/StripOpts.td
+++ b/tools/llvm-objcopy/StripOpts.td
@@ -1,58 +1,65 @@
 include "llvm/Option/OptParser.td"
 
-multiclass Eq<string name> {
-  def NAME: Separate<["--", "-"], name>;
-  def NAME # _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
+multiclass Eq<string name, string help> {
+  def NAME : Separate<["--", "-"], name>;
+  def NAME #_eq : Joined<["--", "-"], name #"=">,
+                  Alias<!cast<Separate>(NAME)>,
+                  HelpText<help>;
 }
 
 def help : Flag<["-", "--"], "help">;
 
-defm output : Eq<"o">,
-              MetaVarName<"output">,
-              HelpText<"Write output to <file>">;
-
-def preserve_dates : Flag<[ "-", "--" ], "preserve-dates">,
+def enable_deterministic_archives
+    : Flag<["-", "--"], "enable-deterministic-archives">,
+      HelpText<"Enable deterministic mode when stripping archives (use zero "
+               "for UIDs, GIDs, and timestamps).">;
+def D : Flag<["-"], "D">,
+        Alias<enable_deterministic_archives>,
+        HelpText<"Alias for --enable-deterministic-archives">;
+
+def disable_deterministic_archives
+    : Flag<["-", "--"], "disable-deterministic-archives">,
+      HelpText<"Disable deterministic mode when stripping archives (use real "
+               "values for UIDs, GIDs, and timestamps).">;
+def U : Flag<["-"], "U">,
+        Alias<disable_deterministic_archives>,
+        HelpText<"Alias for --disable-deterministic-archives">;
+
+defm output : Eq<"o", "Write output to <file>">, MetaVarName<"output">;
+
+def preserve_dates : Flag<["-", "--"], "preserve-dates">,
                      HelpText<"Preserve access and modification timestamps">;
+def p : Flag<["-"], "p">, Alias<preserve_dates>;
 
-def p : Flag<[ "-" ], "p">, Alias<preserve_dates>;
-
-def strip_all : Flag<["-", "--"], "strip-all">,
-                HelpText<"Remove non-allocated sections other than .gnu.warning* sections">;
+def strip_all
+    : Flag<["-", "--"], "strip-all">,
+      HelpText<
+          "Remove non-allocated sections other than .gnu.warning* sections">;
+def s : Flag<["-"], "s">, Alias<strip_all>;
 
+def strip_all_gnu : Flag<["-", "--"], "strip-all-gnu">,
+                    HelpText<"Compatible with GNU strip's --strip-all">;
 def strip_debug : Flag<["-", "--"], "strip-debug">,
                   HelpText<"Remove debugging symbols only">;
+def d : Flag<["-"], "d">, Alias<strip_debug>;
+def g : Flag<["-"], "g">, Alias<strip_debug>;
+def S : Flag<["-"], "S">, Alias<strip_debug>;
+def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
+                     HelpText<"Remove all symbols not needed by relocations">;
 
-def d : Flag<["-"], "d">,
-        Alias<strip_debug>;
-
-def g : Flag<["-"], "g">,
-        Alias<strip_debug>;
-
-def S : Flag<["-"], "S">,
-        Alias<strip_debug>;
-
-defm remove_section : Eq<"remove-section">,
-                      MetaVarName<"section">,
-                      HelpText<"Remove <section>">;
-
-def R : JoinedOrSeparate<["-"], "R">,
-        Alias<remove_section>;
-
-defm keep_symbol : Eq<"keep-symbol">,
-                   MetaVarName<"symbol">,
-                   HelpText<"Do not remove symbol <symbol>">;
+defm remove_section : Eq<"remove-section", "Remove <section>">,
+                      MetaVarName<"section">;
+def R : JoinedOrSeparate<["-"], "R">, Alias<remove_section>;
 
-def K : JoinedOrSeparate<["-"], "K">,
-        Alias<keep_symbol>;
+defm keep : Eq<"keep", "Keep <section>">, MetaVarName<"section">;
+defm keep_symbol : Eq<"keep-symbol", "Do not remove symbol <symbol>">,
+                   MetaVarName<"symbol">;
+def K : JoinedOrSeparate<["-"], "K">, Alias<keep_symbol>;
 
-def discard_all : Flag<["-", "--"], "discard-all">,
-                  HelpText<"Remove all local symbols except file and section symbols">;
+def discard_all
+    : Flag<["-", "--"], "discard-all">,
+      HelpText<"Remove all local symbols except file and section symbols">;
+def x : Flag<["-"], "x">, Alias<discard_all>;
 
-def version : Flag<[ "-", "--" ], "version">,
+def version : Flag<["-", "--"], "version">,
               HelpText<"Print the version and exit.">;
-
-def x : Flag<["-"], "x">,
-        Alias<discard_all>;
-
-def strip_unneeded : Flag<["-", "--"], "strip-unneeded">,
-                      HelpText<"Remove all symbols not needed by relocations">;
diff --git a/tools/llvm-objcopy/llvm-objcopy.cpp b/tools/llvm-objcopy/llvm-objcopy.cpp
index 3e494f92b67cdc33f5222f4db400a795165287d2..755c786cee95e58968872a10e2cc541e1d1c5692 100644
--- a/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -8,16 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm-objcopy.h"
+#include "Buffer.h"
+#include "CopyConfig.h"
+#include "ELF/ELFObjcopy.h"
 
-#include "Object.h"
-#include "llvm/ADT/BitmaskEnum.h"
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/ArchiveWriter.h"
 #include "llvm/Object/Binary.h"
@@ -28,13 +26,9 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/Option.h"
 #include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Compression.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/FileOutputBuffer.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/Memory.h"
 #include "llvm/Support/Path.h"
@@ -44,172 +38,11 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
-#include <functional>
-#include <iterator>
 #include <memory>
 #include <string>
 #include <system_error>
 #include <utility>
 
-using namespace llvm;
-using namespace llvm::objcopy;
-using namespace object;
-using namespace ELF;
-
-namespace {
-
-enum ObjcopyID {
-  OBJCOPY_INVALID = 0, // This is not an option ID.
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  OBJCOPY_##ID,
-#include "ObjcopyOpts.inc"
-#undef OPTION
-};
-
-#define PREFIX(NAME, VALUE) const char *const OBJCOPY_##NAME[] = VALUE;
-#include "ObjcopyOpts.inc"
-#undef PREFIX
-
-static const opt::OptTable::Info ObjcopyInfoTable[] = {
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  {OBJCOPY_##PREFIX,                                                           \
-   NAME,                                                                       \
-   HELPTEXT,                                                                   \
-   METAVAR,                                                                    \
-   OBJCOPY_##ID,                                                               \
-   opt::Option::KIND##Class,                                                   \
-   PARAM,                                                                      \
-   FLAGS,                                                                      \
-   OBJCOPY_##GROUP,                                                            \
-   OBJCOPY_##ALIAS,                                                            \
-   ALIASARGS,                                                                  \
-   VALUES},
-#include "ObjcopyOpts.inc"
-#undef OPTION
-};
-
-class ObjcopyOptTable : public opt::OptTable {
-public:
-  ObjcopyOptTable() : OptTable(ObjcopyInfoTable, true) {}
-};
-
-enum StripID {
-  STRIP_INVALID = 0, // This is not an option ID.
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  STRIP_##ID,
-#include "StripOpts.inc"
-#undef OPTION
-};
-
-#define PREFIX(NAME, VALUE) const char *const STRIP_##NAME[] = VALUE;
-#include "StripOpts.inc"
-#undef PREFIX
-
-static const opt::OptTable::Info StripInfoTable[] = {
-#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM,  \
-               HELPTEXT, METAVAR, VALUES)                                      \
-  {STRIP_##PREFIX, NAME,       HELPTEXT,                                       \
-   METAVAR,        STRIP_##ID, opt::Option::KIND##Class,                       \
-   PARAM,          FLAGS,      STRIP_##GROUP,                                  \
-   STRIP_##ALIAS,  ALIASARGS,  VALUES},
-#include "StripOpts.inc"
-#undef OPTION
-};
-
-class StripOptTable : public opt::OptTable {
-public:
-  StripOptTable() : OptTable(StripInfoTable, true) {}
-};
-
-struct SectionRename {
-  StringRef OriginalName;
-  StringRef NewName;
-  Optional<uint64_t> NewFlags;
-};
-
-// Configuration for copying/stripping a single file.
-struct CopyConfig {
-  // Main input/output options
-  StringRef InputFilename;
-  StringRef InputFormat;
-  StringRef OutputFilename;
-  StringRef OutputFormat;
-
-  // Only applicable for --input-format=Binary
-  MachineInfo BinaryArch;
-
-  // Advanced options
-  StringRef AddGnuDebugLink;
-  StringRef SplitDWO;
-  StringRef SymbolsPrefix;
-
-  // Repeated options
-  std::vector<StringRef> AddSection;
-  std::vector<StringRef> DumpSection;
-  std::vector<StringRef> Keep;
-  std::vector<StringRef> OnlyKeep;
-  std::vector<StringRef> SymbolsToGlobalize;
-  std::vector<StringRef> SymbolsToKeep;
-  std::vector<StringRef> SymbolsToLocalize;
-  std::vector<StringRef> SymbolsToRemove;
-  std::vector<StringRef> SymbolsToWeaken;
-  std::vector<StringRef> ToRemove;
-  std::vector<std::string> SymbolsToKeepGlobal;
-
-  // Map options
-  StringMap<SectionRename> SectionsToRename;
-  StringMap<StringRef> SymbolsToRename;
-
-  // Boolean options
-  bool DiscardAll = false;
-  bool ExtractDWO = false;
-  bool KeepFileSymbols = false;
-  bool LocalizeHidden = false;
-  bool OnlyKeepDebug = false;
-  bool PreserveDates = false;
-  bool StripAll = false;
-  bool StripAllGNU = false;
-  bool StripDWO = false;
-  bool StripDebug = false;
-  bool StripNonAlloc = false;
-  bool StripSections = false;
-  bool StripUnneeded = false;
-  bool Weaken = false;
-  bool DecompressDebugSections = false;
-  DebugCompressionType CompressionType = DebugCompressionType::None;
-};
-
-// Configuration for the overall invocation of this tool. When invoked as
-// objcopy, will always contain exactly one CopyConfig. When invoked as strip,
-// will contain one or more CopyConfigs.
-struct DriverConfig {
-  SmallVector<CopyConfig, 1> CopyConfigs;
-};
-
-using SectionPred = std::function<bool(const SectionBase &Sec)>;
-
-enum SectionFlag {
-  SecNone = 0,
-  SecAlloc = 1 << 0,
-  SecLoad = 1 << 1,
-  SecNoload = 1 << 2,
-  SecReadonly = 1 << 3,
-  SecDebug = 1 << 4,
-  SecCode = 1 << 5,
-  SecData = 1 << 6,
-  SecRom = 1 << 7,
-  SecMerge = 1 << 8,
-  SecStrings = 1 << 9,
-  SecContents = 1 << 10,
-  SecShare = 1 << 11,
-  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ SecShare)
-};
-
-} // namespace
-
 namespace llvm {
 namespace objcopy {
 
@@ -242,515 +75,9 @@ LLVM_ATTRIBUTE_NORETURN void reportError(StringRef File, Error E) {
 } // end namespace objcopy
 } // end namespace llvm
 
-static SectionFlag parseSectionRenameFlag(StringRef SectionName) {
-  return llvm::StringSwitch<SectionFlag>(SectionName)
-      .Case("alloc", SectionFlag::SecAlloc)
-      .Case("load", SectionFlag::SecLoad)
-      .Case("noload", SectionFlag::SecNoload)
-      .Case("readonly", SectionFlag::SecReadonly)
-      .Case("debug", SectionFlag::SecDebug)
-      .Case("code", SectionFlag::SecCode)
-      .Case("data", SectionFlag::SecData)
-      .Case("rom", SectionFlag::SecRom)
-      .Case("merge", SectionFlag::SecMerge)
-      .Case("strings", SectionFlag::SecStrings)
-      .Case("contents", SectionFlag::SecContents)
-      .Case("share", SectionFlag::SecShare)
-      .Default(SectionFlag::SecNone);
-}
-
-static SectionRename parseRenameSectionValue(StringRef FlagValue) {
-  if (!FlagValue.contains('='))
-    error("Bad format for --rename-section: missing '='");
-
-  // Initial split: ".foo" = ".bar,f1,f2,..."
-  auto Old2New = FlagValue.split('=');
-  SectionRename SR;
-  SR.OriginalName = Old2New.first;
-
-  // Flags split: ".bar" "f1" "f2" ...
-  SmallVector<StringRef, 6> NameAndFlags;
-  Old2New.second.split(NameAndFlags, ',');
-  SR.NewName = NameAndFlags[0];
-
-  if (NameAndFlags.size() > 1) {
-    SectionFlag Flags = SectionFlag::SecNone;
-    for (size_t I = 1, Size = NameAndFlags.size(); I < Size; ++I) {
-      SectionFlag Flag = parseSectionRenameFlag(NameAndFlags[I]);
-      if (Flag == SectionFlag::SecNone)
-        error("Unrecognized section flag '" + NameAndFlags[I] +
-              "'. Flags supported for GNU compatibility: alloc, load, noload, "
-              "readonly, debug, code, data, rom, share, contents, merge, "
-              "strings.");
-      Flags |= Flag;
-    }
-
-    SR.NewFlags = 0;
-    if (Flags & SectionFlag::SecAlloc)
-      *SR.NewFlags |= ELF::SHF_ALLOC;
-    if (!(Flags & SectionFlag::SecReadonly))
-      *SR.NewFlags |= ELF::SHF_WRITE;
-    if (Flags & SectionFlag::SecCode)
-      *SR.NewFlags |= ELF::SHF_EXECINSTR;
-    if (Flags & SectionFlag::SecMerge)
-      *SR.NewFlags |= ELF::SHF_MERGE;
-    if (Flags & SectionFlag::SecStrings)
-      *SR.NewFlags |= ELF::SHF_STRINGS;
-  }
-
-  return SR;
-}
-
-static bool isDebugSection(const SectionBase &Sec) {
-  return StringRef(Sec.Name).startswith(".debug") ||
-         StringRef(Sec.Name).startswith(".zdebug") || Sec.Name == ".gdb_index";
-}
-
-static bool isDWOSection(const SectionBase &Sec) {
-  return StringRef(Sec.Name).endswith(".dwo");
-}
-
-static bool onlyKeepDWOPred(const Object &Obj, const SectionBase &Sec) {
-  // We can't remove the section header string table.
-  if (&Sec == Obj.SectionNames)
-    return false;
-  // Short of keeping the string table we want to keep everything that is a DWO
-  // section and remove everything else.
-  return !isDWOSection(Sec);
-}
-
-static const StringMap<MachineInfo> ArchMap{
-    // Name, {EMachine, 64bit, LittleEndian}
-    {"aarch64", {EM_AARCH64, true, true}},
-    {"arm", {EM_ARM, false, true}},
-    {"i386", {EM_386, false, true}},
-    {"i386:x86-64", {EM_X86_64, true, true}},
-    {"powerpc:common64", {EM_PPC64, true, true}},
-    {"sparc", {EM_SPARC, false, true}},
-    {"x86-64", {EM_X86_64, true, true}},
-};
-
-static const MachineInfo &getMachineInfo(StringRef Arch) {
-  auto Iter = ArchMap.find(Arch);
-  if (Iter == std::end(ArchMap))
-    error("Invalid architecture: '" + Arch + "'");
-  return Iter->getValue();
-}
-
-static ElfType getOutputElfType(const Binary &Bin) {
-  // Infer output ELF type from the input ELF object
-  if (isa<ELFObjectFile<ELF32LE>>(Bin))
-    return ELFT_ELF32LE;
-  if (isa<ELFObjectFile<ELF64LE>>(Bin))
-    return ELFT_ELF64LE;
-  if (isa<ELFObjectFile<ELF32BE>>(Bin))
-    return ELFT_ELF32BE;
-  if (isa<ELFObjectFile<ELF64BE>>(Bin))
-    return ELFT_ELF64BE;
-  llvm_unreachable("Invalid ELFType");
-}
-
-static ElfType getOutputElfType(const MachineInfo &MI) {
-  // Infer output ELF type from the binary arch specified
-  if (MI.Is64Bit)
-    return MI.IsLittleEndian ? ELFT_ELF64LE : ELFT_ELF64BE;
-  else
-    return MI.IsLittleEndian ? ELFT_ELF32LE : ELFT_ELF32BE;
-}
-
-static std::unique_ptr<Writer> createWriter(const CopyConfig &Config,
-                                            Object &Obj, Buffer &Buf,
-                                            ElfType OutputElfType) {
-  if (Config.OutputFormat == "binary") {
-    return llvm::make_unique<BinaryWriter>(Obj, Buf);
-  }
-  // Depending on the initial ELFT and OutputFormat we need a different Writer.
-  switch (OutputElfType) {
-  case ELFT_ELF32LE:
-    return llvm::make_unique<ELFWriter<ELF32LE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  case ELFT_ELF64LE:
-    return llvm::make_unique<ELFWriter<ELF64LE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  case ELFT_ELF32BE:
-    return llvm::make_unique<ELFWriter<ELF32BE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  case ELFT_ELF64BE:
-    return llvm::make_unique<ELFWriter<ELF64BE>>(Obj, Buf,
-                                                 !Config.StripSections);
-  }
-  llvm_unreachable("Invalid output format");
-}
-
-static void splitDWOToFile(const CopyConfig &Config, const Reader &Reader,
-                           StringRef File, ElfType OutputElfType) {
-  auto DWOFile = Reader.create();
-  DWOFile->removeSections(
-      [&](const SectionBase &Sec) { return onlyKeepDWOPred(*DWOFile, Sec); });
-  FileBuffer FB(File);
-  auto Writer = createWriter(Config, *DWOFile, FB, OutputElfType);
-  Writer->finalize();
-  Writer->write();
-}
-
-static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
-                               Object &Obj) {
-  for (auto &Sec : Obj.sections()) {
-    if (Sec.Name == SecName) {
-      if (Sec.OriginalData.size() == 0)
-        return make_error<StringError>("Can't dump section \"" + SecName +
-                                           "\": it has no contents",
-                                       object_error::parse_failed);
-      Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
-          FileOutputBuffer::create(Filename, Sec.OriginalData.size());
-      if (!BufferOrErr)
-        return BufferOrErr.takeError();
-      std::unique_ptr<FileOutputBuffer> Buf = std::move(*BufferOrErr);
-      std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(),
-                Buf->getBufferStart());
-      if (Error E = Buf->commit())
-        return E;
-      return Error::success();
-    }
-  }
-  return make_error<StringError>("Section not found",
-                                 object_error::parse_failed);
-}
-
-static bool isCompressed(const SectionBase &Section) {
-  const char *Magic = "ZLIB";
-  return StringRef(Section.Name).startswith(".zdebug") ||
-         (Section.OriginalData.size() > strlen(Magic) &&
-          !strncmp(reinterpret_cast<const char *>(Section.OriginalData.data()),
-                   Magic, strlen(Magic))) ||
-         (Section.Flags & ELF::SHF_COMPRESSED);
-}
-
-static bool isCompressable(const SectionBase &Section) {
-  return !isCompressed(Section) && isDebugSection(Section) &&
-         Section.Name != ".gdb_index";
-}
-
-static void replaceDebugSections(
-    const CopyConfig &Config, Object &Obj, SectionPred &RemovePred,
-    function_ref<bool(const SectionBase &)> shouldReplace,
-    function_ref<SectionBase *(const SectionBase *)> addSection) {
-  SmallVector<SectionBase *, 13> ToReplace;
-  SmallVector<RelocationSection *, 13> RelocationSections;
-  for (auto &Sec : Obj.sections()) {
-    if (RelocationSection *R = dyn_cast<RelocationSection>(&Sec)) {
-      if (shouldReplace(*R->getSection()))
-        RelocationSections.push_back(R);
-      continue;
-    }
-
-    if (shouldReplace(Sec))
-      ToReplace.push_back(&Sec);
-  }
-
-  for (SectionBase *S : ToReplace) {
-    SectionBase *NewSection = addSection(S);
-
-    for (RelocationSection *RS : RelocationSections) {
-      if (RS->getSection() == S)
-        RS->setSection(NewSection);
-    }
-  }
-
-  RemovePred = [shouldReplace, RemovePred](const SectionBase &Sec) {
-    return shouldReplace(Sec) || RemovePred(Sec);
-  };
-}
-
-// This function handles the high level operations of GNU objcopy including
-// handling command line options. It's important to outline certain properties
-// we expect to hold of the command line operations. Any operation that "keeps"
-// should keep regardless of a remove. Additionally any removal should respect
-// any previous removals. Lastly whether or not something is removed shouldn't
-// depend a) on the order the options occur in or b) on some opaque priority
-// system. The only priority is that keeps/copies overrule removes.
-static void handleArgs(const CopyConfig &Config, Object &Obj,
-                       const Reader &Reader, ElfType OutputElfType) {
-
-  if (!Config.SplitDWO.empty()) {
-    splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType);
-  }
-
-  // TODO: update or remove symbols only if there is an option that affects
-  // them.
-  if (Obj.SymbolTable) {
-    Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
-      if ((Config.LocalizeHidden &&
-           (Sym.Visibility == STV_HIDDEN || Sym.Visibility == STV_INTERNAL)) ||
-          (!Config.SymbolsToLocalize.empty() &&
-           is_contained(Config.SymbolsToLocalize, Sym.Name)))
-        Sym.Binding = STB_LOCAL;
-
-      // Note: these two globalize flags have very similar names but different
-      // meanings:
-      //
-      // --globalize-symbol: promote a symbol to global
-      // --keep-global-symbol: all symbols except for these should be made local
-      //
-      // If --globalize-symbol is specified for a given symbol, it will be
-      // global in the output file even if it is not included via
-      // --keep-global-symbol. Because of that, make sure to check
-      // --globalize-symbol second.
-      if (!Config.SymbolsToKeepGlobal.empty() &&
-          !is_contained(Config.SymbolsToKeepGlobal, Sym.Name))
-        Sym.Binding = STB_LOCAL;
-
-      if (!Config.SymbolsToGlobalize.empty() &&
-          is_contained(Config.SymbolsToGlobalize, Sym.Name))
-        Sym.Binding = STB_GLOBAL;
-
-      if (!Config.SymbolsToWeaken.empty() &&
-          is_contained(Config.SymbolsToWeaken, Sym.Name) &&
-          Sym.Binding == STB_GLOBAL)
-        Sym.Binding = STB_WEAK;
-
-      if (Config.Weaken && Sym.Binding == STB_GLOBAL &&
-          Sym.getShndx() != SHN_UNDEF)
-        Sym.Binding = STB_WEAK;
-
-      const auto I = Config.SymbolsToRename.find(Sym.Name);
-      if (I != Config.SymbolsToRename.end())
-        Sym.Name = I->getValue();
-
-      if (!Config.SymbolsPrefix.empty() && Sym.Type != STT_SECTION)
-        Sym.Name = (Config.SymbolsPrefix + Sym.Name).str();
-    });
-
-    // The purpose of this loop is to mark symbols referenced by sections
-    // (like GroupSection or RelocationSection). This way, we know which
-    // symbols are still 'needed' and wich are not.
-    if (Config.StripUnneeded) {
-      for (auto &Section : Obj.sections())
-        Section.markSymbols();
-    }
-
-    Obj.removeSymbols([&](const Symbol &Sym) {
-      if ((!Config.SymbolsToKeep.empty() &&
-           is_contained(Config.SymbolsToKeep, Sym.Name)) ||
-          (Config.KeepFileSymbols && Sym.Type == STT_FILE))
-        return false;
-
-      if (Config.DiscardAll && Sym.Binding == STB_LOCAL &&
-          Sym.getShndx() != SHN_UNDEF && Sym.Type != STT_FILE &&
-          Sym.Type != STT_SECTION)
-        return true;
-
-      if (Config.StripAll || Config.StripAllGNU)
-        return true;
-
-      if (!Config.SymbolsToRemove.empty() &&
-          is_contained(Config.SymbolsToRemove, Sym.Name)) {
-        return true;
-      }
-
-      if (Config.StripUnneeded && !Sym.Referenced &&
-          (Sym.Binding == STB_LOCAL || Sym.getShndx() == SHN_UNDEF) &&
-          Sym.Type != STT_FILE && Sym.Type != STT_SECTION)
-        return true;
-
-      return false;
-    });
-  }
-
-  SectionPred RemovePred = [](const SectionBase &) { return false; };
-
-  // Removes:
-  if (!Config.ToRemove.empty()) {
-    RemovePred = [&Config](const SectionBase &Sec) {
-      return is_contained(Config.ToRemove, Sec.Name);
-    };
-  }
-
-  if (Config.StripDWO || !Config.SplitDWO.empty())
-    RemovePred = [RemovePred](const SectionBase &Sec) {
-      return isDWOSection(Sec) || RemovePred(Sec);
-    };
-
-  if (Config.ExtractDWO)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      return onlyKeepDWOPred(Obj, Sec) || RemovePred(Sec);
-    };
-
-  if (Config.StripAllGNU)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      if (RemovePred(Sec))
-        return true;
-      if ((Sec.Flags & SHF_ALLOC) != 0)
-        return false;
-      if (&Sec == Obj.SectionNames)
-        return false;
-      switch (Sec.Type) {
-      case SHT_SYMTAB:
-      case SHT_REL:
-      case SHT_RELA:
-      case SHT_STRTAB:
-        return true;
-      }
-      return isDebugSection(Sec);
-    };
-
-  if (Config.StripSections) {
-    RemovePred = [RemovePred](const SectionBase &Sec) {
-      return RemovePred(Sec) || (Sec.Flags & SHF_ALLOC) == 0;
-    };
-  }
-
-  if (Config.StripDebug) {
-    RemovePred = [RemovePred](const SectionBase &Sec) {
-      return RemovePred(Sec) || isDebugSection(Sec);
-    };
-  }
-
-  if (Config.StripNonAlloc)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      if (RemovePred(Sec))
-        return true;
-      if (&Sec == Obj.SectionNames)
-        return false;
-      return (Sec.Flags & SHF_ALLOC) == 0;
-    };
-
-  if (Config.StripAll)
-    RemovePred = [RemovePred, &Obj](const SectionBase &Sec) {
-      if (RemovePred(Sec))
-        return true;
-      if (&Sec == Obj.SectionNames)
-        return false;
-      if (StringRef(Sec.Name).startswith(".gnu.warning"))
-        return false;
-      return (Sec.Flags & SHF_ALLOC) == 0;
-    };
-
-  // Explicit copies:
-  if (!Config.OnlyKeep.empty()) {
-    RemovePred = [&Config, RemovePred, &Obj](const SectionBase &Sec) {
-      // Explicitly keep these sections regardless of previous removes.
-      if (is_contained(Config.OnlyKeep, Sec.Name))
-        return false;
-
-      // Allow all implicit removes.
-      if (RemovePred(Sec))
-        return true;
-
-      // Keep special sections.
-      if (Obj.SectionNames == &Sec)
-        return false;
-      if (Obj.SymbolTable == &Sec ||
-          (Obj.SymbolTable && Obj.SymbolTable->getStrTab() == &Sec))
-        return false;
-
-      // Remove everything else.
-      return true;
-    };
-  }
-
-  if (!Config.Keep.empty()) {
-    RemovePred = [Config, RemovePred](const SectionBase &Sec) {
-      // Explicitly keep these sections regardless of previous removes.
-      if (is_contained(Config.Keep, Sec.Name))
-        return false;
-      // Otherwise defer to RemovePred.
-      return RemovePred(Sec);
-    };
-  }
-
-  // This has to be the last predicate assignment.
-  // If the option --keep-symbol has been specified
-  // and at least one of those symbols is present
-  // (equivalently, the updated symbol table is not empty)
-  // the symbol table and the string table should not be removed.
-  if ((!Config.SymbolsToKeep.empty() || Config.KeepFileSymbols) &&
-      Obj.SymbolTable && !Obj.SymbolTable->empty()) {
-    RemovePred = [&Obj, RemovePred](const SectionBase &Sec) {
-      if (&Sec == Obj.SymbolTable || &Sec == Obj.SymbolTable->getStrTab())
-        return false;
-      return RemovePred(Sec);
-    };
-  }
-
-  if (Config.CompressionType != DebugCompressionType::None)
-    replaceDebugSections(Config, Obj, RemovePred, isCompressable,
-                         [&Config, &Obj](const SectionBase *S) {
-                           return &Obj.addSection<CompressedSection>(
-                               *S, Config.CompressionType);
-                         });
-  else if (Config.DecompressDebugSections)
-    replaceDebugSections(
-        Config, Obj, RemovePred,
-        [](const SectionBase &S) { return isa<CompressedSection>(&S); },
-        [&Obj](const SectionBase *S) {
-          auto CS = cast<CompressedSection>(S);
-          return &Obj.addSection<DecompressedSection>(*CS);
-        });
-
-  Obj.removeSections(RemovePred);
-
-  if (!Config.SectionsToRename.empty()) {
-    for (auto &Sec : Obj.sections()) {
-      const auto Iter = Config.SectionsToRename.find(Sec.Name);
-      if (Iter != Config.SectionsToRename.end()) {
-        const SectionRename &SR = Iter->second;
-        Sec.Name = SR.NewName;
-        if (SR.NewFlags.hasValue()) {
-          // Preserve some flags which should not be dropped when setting flags.
-          // Also, preserve anything OS/processor dependant.
-          const uint64_t PreserveMask = ELF::SHF_COMPRESSED | ELF::SHF_EXCLUDE |
-                                        ELF::SHF_GROUP | ELF::SHF_LINK_ORDER |
-                                        ELF::SHF_MASKOS | ELF::SHF_MASKPROC |
-                                        ELF::SHF_TLS | ELF::SHF_INFO_LINK;
-          Sec.Flags = (Sec.Flags & PreserveMask) |
-                      (SR.NewFlags.getValue() & ~PreserveMask);
-        }
-      }
-    }
-  }
-
-  if (!Config.AddSection.empty()) {
-    for (const auto &Flag : Config.AddSection) {
-      auto SecPair = Flag.split("=");
-      auto SecName = SecPair.first;
-      auto File = SecPair.second;
-      auto BufOrErr = MemoryBuffer::getFile(File);
-      if (!BufOrErr)
-        reportError(File, BufOrErr.getError());
-      auto Buf = std::move(*BufOrErr);
-      auto BufPtr = reinterpret_cast<const uint8_t *>(Buf->getBufferStart());
-      auto BufSize = Buf->getBufferSize();
-      Obj.addSection<OwnedDataSection>(SecName,
-                                       ArrayRef<uint8_t>(BufPtr, BufSize));
-    }
-  }
-
-  if (!Config.DumpSection.empty()) {
-    for (const auto &Flag : Config.DumpSection) {
-      std::pair<StringRef, StringRef> SecPair = Flag.split("=");
-      StringRef SecName = SecPair.first;
-      StringRef File = SecPair.second;
-      if (Error E = dumpSectionToFile(SecName, File, Obj))
-        reportError(Config.InputFilename, std::move(E));
-    }
-  }
-
-  if (!Config.AddGnuDebugLink.empty())
-    Obj.addSection<GnuDebugLinkSection>(Config.AddGnuDebugLink);
-}
-
-static void executeElfObjcopyOnBinary(const CopyConfig &Config, Reader &Reader,
-                                      Buffer &Out, ElfType OutputElfType) {
-  std::unique_ptr<Object> Obj = Reader.create();
-
-  handleArgs(Config, *Obj, Reader, OutputElfType);
-
-  std::unique_ptr<Writer> Writer =
-      createWriter(Config, *Obj, Out, OutputElfType);
-  Writer->finalize();
-  Writer->write();
-}
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::objcopy;
 
 // For regular archives this function simply calls llvm::writeArchive,
 // For thin archives it writes the archive file itself as well as its members.
@@ -781,8 +108,29 @@ static Error deepWriteArchive(StringRef ArcName,
   return Error::success();
 }
 
-static void executeElfObjcopyOnArchive(const CopyConfig &Config,
-                                       const Archive &Ar) {
+/// The function executeObjcopyOnRawBinary does the dispatch based on the format
+/// of the output specified by the command line options.
+static void executeObjcopyOnRawBinary(const CopyConfig &Config,
+                                      MemoryBuffer &In, Buffer &Out) {
+  // TODO: llvm-objcopy should parse CopyConfig.OutputFormat to recognize
+  // formats other than ELF / "binary" and invoke
+  // elf::executeObjcopyOnRawBinary, macho::executeObjcopyOnRawBinary or
+  // coff::executeObjcopyOnRawBinary accordingly.
+  return elf::executeObjcopyOnRawBinary(Config, In, Out);
+}
+
+/// The function executeObjcopyOnBinary does the dispatch based on the format
+/// of the input binary (ELF, MachO or COFF).
+static void executeObjcopyOnBinary(const CopyConfig &Config, object::Binary &In,
+                                   Buffer &Out) {
+  if (auto *ELFBinary = dyn_cast<object::ELFObjectFileBase>(&In))
+    return elf::executeObjcopyOnBinary(Config, *ELFBinary, Out);
+  else
+    error("Unsupported object file format");
+}
+
+static void executeObjcopyOnArchive(const CopyConfig &Config,
+                                    const Archive &Ar) {
   std::vector<NewArchiveMember> NewArchiveMembers;
   Error Err = Error::success();
   for (const Archive::Child &Child : Ar.children(Err)) {
@@ -796,11 +144,10 @@ static void executeElfObjcopyOnArchive(const CopyConfig &Config,
       reportError(Ar.getFileName(), ChildNameOrErr.takeError());
 
     MemBuffer MB(ChildNameOrErr.get());
-    ELFReader Reader(Bin);
-    executeElfObjcopyOnBinary(Config, Reader, MB, getOutputElfType(*Bin));
+    executeObjcopyOnBinary(Config, *Bin, MB);
 
     Expected<NewArchiveMember> Member =
-        NewArchiveMember::getOldMember(Child, true);
+        NewArchiveMember::getOldMember(Child, Config.DeterministicArchives);
     if (!Member)
       reportError(Ar.getFileName(), Member.takeError());
     Member->Buf = MB.releaseMemoryBuffer();
@@ -810,9 +157,9 @@ static void executeElfObjcopyOnArchive(const CopyConfig &Config,
 
   if (Err)
     reportError(Config.InputFilename, std::move(Err));
-  if (Error E =
-          deepWriteArchive(Config.OutputFilename, NewArchiveMembers,
-                           Ar.hasSymbolTable(), Ar.kind(), true, Ar.isThin()))
+  if (Error E = deepWriteArchive(Config.OutputFilename, NewArchiveMembers,
+                                 Ar.hasSymbolTable(), Ar.kind(),
+                                 Config.DeterministicArchives, Ar.isThin()))
     reportError(Config.OutputFilename, std::move(E));
 }
 
@@ -832,7 +179,10 @@ static void restoreDateOnFile(StringRef Filename,
     reportError(Filename, EC);
 }
 
-static void executeElfObjcopy(const CopyConfig &Config) {
+/// The function executeObjcopy does the higher level dispatch based on the type
+/// of input (raw binary, archive or single object file) and takes care of the
+/// format-agnostic modifications, i.e. preserving dates.
+static void executeObjcopy(const CopyConfig &Config) {
   sys::fs::file_status Stat;
   if (Config.PreserveDates)
     if (auto EC = sys::fs::status(Config.InputFilename, Stat))
@@ -842,11 +192,8 @@ static void executeElfObjcopy(const CopyConfig &Config) {
     auto BufOrErr = MemoryBuffer::getFile(Config.InputFilename);
     if (!BufOrErr)
       reportError(Config.InputFilename, BufOrErr.getError());
-
     FileBuffer FB(Config.OutputFilename);
-    BinaryReader Reader(Config.BinaryArch, BufOrErr->get());
-    executeElfObjcopyOnBinary(Config, Reader, FB,
-                              getOutputElfType(Config.BinaryArch));
+    executeObjcopyOnRawBinary(Config, *BufOrErr->get(), FB);
   } else {
     Expected<OwningBinary<llvm::object::Binary>> BinaryOrErr =
         createBinary(Config.InputFilename);
@@ -854,12 +201,10 @@ static void executeElfObjcopy(const CopyConfig &Config) {
       reportError(Config.InputFilename, BinaryOrErr.takeError());
 
     if (Archive *Ar = dyn_cast<Archive>(BinaryOrErr.get().getBinary())) {
-      executeElfObjcopyOnArchive(Config, *Ar);
+      executeObjcopyOnArchive(Config, *Ar);
     } else {
       FileBuffer FB(Config.OutputFilename);
-      Binary *Bin = BinaryOrErr.get().getBinary();
-      ELFReader Reader(Bin);
-      executeElfObjcopyOnBinary(Config, Reader, FB, getOutputElfType(*Bin));
+      executeObjcopyOnBinary(Config, *BinaryOrErr.get().getBinary(), FB);
     }
   }
 
@@ -870,246 +215,14 @@ static void executeElfObjcopy(const CopyConfig &Config) {
   }
 }
 
-static void addGlobalSymbolsFromFile(std::vector<std::string> &Symbols,
-                                     StringRef Filename) {
-  SmallVector<StringRef, 16> Lines;
-  auto BufOrErr = MemoryBuffer::getFile(Filename);
-  if (!BufOrErr)
-    reportError(Filename, BufOrErr.getError());
-
-  BufOrErr.get()->getBuffer().split(Lines, '\n');
-  for (StringRef Line : Lines) {
-    // Ignore everything after '#', trim whitespace, and only add the symbol if
-    // it's not empty.
-    auto TrimmedLine = Line.split('#').first.trim();
-    if (!TrimmedLine.empty())
-      Symbols.push_back(TrimmedLine.str());
-  }
-}
-
-// ParseObjcopyOptions returns the config and sets the input arguments. If a
-// help flag is set then ParseObjcopyOptions will print the help messege and
-// exit.
-static DriverConfig parseObjcopyOptions(ArrayRef<const char *> ArgsArr) {
-  ObjcopyOptTable T;
-  unsigned MissingArgumentIndex, MissingArgumentCount;
-  llvm::opt::InputArgList InputArgs =
-      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
-
-  if (InputArgs.size() == 0) {
-    T.PrintHelp(errs(), "llvm-objcopy input [output]", "objcopy tool");
-    exit(1);
-  }
-
-  if (InputArgs.hasArg(OBJCOPY_help)) {
-    T.PrintHelp(outs(), "llvm-objcopy input [output]", "objcopy tool");
-    exit(0);
-  }
-
-  if (InputArgs.hasArg(OBJCOPY_version)) {
-    cl::PrintVersionMessage();
-    exit(0);
-  }
-
-  SmallVector<const char *, 2> Positional;
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_UNKNOWN))
-    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_INPUT))
-    Positional.push_back(Arg->getValue());
-
-  if (Positional.empty())
-    error("No input file specified");
-
-  if (Positional.size() > 2)
-    error("Too many positional arguments");
-
-  CopyConfig Config;
-  Config.InputFilename = Positional[0];
-  Config.OutputFilename = Positional[Positional.size() == 1 ? 0 : 1];
-  Config.InputFormat = InputArgs.getLastArgValue(OBJCOPY_input_target);
-  Config.OutputFormat = InputArgs.getLastArgValue(OBJCOPY_output_target);
-  if (Config.InputFormat == "binary") {
-    auto BinaryArch = InputArgs.getLastArgValue(OBJCOPY_binary_architecture);
-    if (BinaryArch.empty())
-      error("Specified binary input without specifiying an architecture");
-    Config.BinaryArch = getMachineInfo(BinaryArch);
-  }
-
-  if (auto Arg = InputArgs.getLastArg(OBJCOPY_compress_debug_sections,
-                                      OBJCOPY_compress_debug_sections_eq)) {
-    Config.CompressionType = DebugCompressionType::Z;
-
-    if (Arg->getOption().getID() == OBJCOPY_compress_debug_sections_eq) {
-      Config.CompressionType =
-          StringSwitch<DebugCompressionType>(
-              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq))
-              .Case("zlib-gnu", DebugCompressionType::GNU)
-              .Case("zlib", DebugCompressionType::Z)
-              .Default(DebugCompressionType::None);
-      if (Config.CompressionType == DebugCompressionType::None)
-        error("Invalid or unsupported --compress-debug-sections format: " +
-              InputArgs.getLastArgValue(OBJCOPY_compress_debug_sections_eq));
-      if (!zlib::isAvailable())
-        error("LLVM was not compiled with LLVM_ENABLE_ZLIB: can not compress.");
-    }
-  }
-
-  Config.SplitDWO = InputArgs.getLastArgValue(OBJCOPY_split_dwo);
-  Config.AddGnuDebugLink = InputArgs.getLastArgValue(OBJCOPY_add_gnu_debuglink);
-  Config.SymbolsPrefix = InputArgs.getLastArgValue(OBJCOPY_prefix_symbols);
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_redefine_symbol)) {
-    if (!StringRef(Arg->getValue()).contains('='))
-      error("Bad format for --redefine-sym");
-    auto Old2New = StringRef(Arg->getValue()).split('=');
-    if (!Config.SymbolsToRename.insert(Old2New).second)
-      error("Multiple redefinition of symbol " + Old2New.first);
-  }
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_rename_section)) {
-    SectionRename SR = parseRenameSectionValue(StringRef(Arg->getValue()));
-    if (!Config.SectionsToRename.try_emplace(SR.OriginalName, SR).second)
-      error("Multiple renames of section " + SR.OriginalName);
-  }
-
-  for (auto Arg : InputArgs.filtered(OBJCOPY_remove_section))
-    Config.ToRemove.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep))
-    Config.Keep.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_only_keep))
-    Config.OnlyKeep.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_add_section))
-    Config.AddSection.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_dump_section))
-    Config.DumpSection.push_back(Arg->getValue());
-  Config.StripAll = InputArgs.hasArg(OBJCOPY_strip_all);
-  Config.StripAllGNU = InputArgs.hasArg(OBJCOPY_strip_all_gnu);
-  Config.StripDebug = InputArgs.hasArg(OBJCOPY_strip_debug);
-  Config.StripDWO = InputArgs.hasArg(OBJCOPY_strip_dwo);
-  Config.StripSections = InputArgs.hasArg(OBJCOPY_strip_sections);
-  Config.StripNonAlloc = InputArgs.hasArg(OBJCOPY_strip_non_alloc);
-  Config.StripUnneeded = InputArgs.hasArg(OBJCOPY_strip_unneeded);
-  Config.ExtractDWO = InputArgs.hasArg(OBJCOPY_extract_dwo);
-  Config.LocalizeHidden = InputArgs.hasArg(OBJCOPY_localize_hidden);
-  Config.Weaken = InputArgs.hasArg(OBJCOPY_weaken);
-  Config.DiscardAll = InputArgs.hasArg(OBJCOPY_discard_all);
-  Config.OnlyKeepDebug = InputArgs.hasArg(OBJCOPY_only_keep_debug);
-  Config.KeepFileSymbols = InputArgs.hasArg(OBJCOPY_keep_file_symbols);
-  Config.DecompressDebugSections =
-      InputArgs.hasArg(OBJCOPY_decompress_debug_sections);
-  for (auto Arg : InputArgs.filtered(OBJCOPY_localize_symbol))
-    Config.SymbolsToLocalize.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbol))
-    Config.SymbolsToKeepGlobal.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_global_symbols))
-    addGlobalSymbolsFromFile(Config.SymbolsToKeepGlobal, Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_globalize_symbol))
-    Config.SymbolsToGlobalize.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_weaken_symbol))
-    Config.SymbolsToWeaken.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_strip_symbol))
-    Config.SymbolsToRemove.push_back(Arg->getValue());
-  for (auto Arg : InputArgs.filtered(OBJCOPY_keep_symbol))
-    Config.SymbolsToKeep.push_back(Arg->getValue());
-
-  Config.PreserveDates = InputArgs.hasArg(OBJCOPY_preserve_dates);
-
-  DriverConfig DC;
-  DC.CopyConfigs.push_back(std::move(Config));
-  if (Config.DecompressDebugSections &&
-      Config.CompressionType != DebugCompressionType::None) {
-    error("Cannot specify --compress-debug-sections at the same time as "
-          "--decompress-debug-sections at the same time");
-  }
-
-  if (Config.DecompressDebugSections && !zlib::isAvailable())
-    error("LLVM was not compiled with LLVM_ENABLE_ZLIB: cannot decompress.");
-
-  return DC;
-}
-
-// ParseStripOptions returns the config and sets the input arguments. If a
-// help flag is set then ParseStripOptions will print the help messege and
-// exit.
-static DriverConfig parseStripOptions(ArrayRef<const char *> ArgsArr) {
-  StripOptTable T;
-  unsigned MissingArgumentIndex, MissingArgumentCount;
-  llvm::opt::InputArgList InputArgs =
-      T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
-
-  static const char Usage[] = "llvm-strip [options] file...";
-  if (InputArgs.size() == 0) {
-    T.PrintHelp(errs(), Usage, "strip tool");
-    exit(1);
-  }
-
-  if (InputArgs.hasArg(STRIP_help)) {
-    T.PrintHelp(outs(), Usage, "strip tool");
-    exit(0);
-  }
-
-  if (InputArgs.hasArg(STRIP_version)) {
-    cl::PrintVersionMessage();
-    exit(0);
-  }
-
-  SmallVector<const char *, 2> Positional;
-  for (auto Arg : InputArgs.filtered(STRIP_UNKNOWN))
-    error("unknown argument '" + Arg->getAsString(InputArgs) + "'");
-  for (auto Arg : InputArgs.filtered(STRIP_INPUT))
-    Positional.push_back(Arg->getValue());
-
-  if (Positional.empty())
-    error("No input file specified");
-
-  if (Positional.size() > 1 && InputArgs.hasArg(STRIP_output))
-    error("Multiple input files cannot be used in combination with -o");
-
-  CopyConfig Config;
-  Config.StripDebug = InputArgs.hasArg(STRIP_strip_debug);
-
-  Config.DiscardAll = InputArgs.hasArg(STRIP_discard_all);
-  Config.StripUnneeded = InputArgs.hasArg(STRIP_strip_unneeded);
-  Config.StripAll = InputArgs.hasArg(STRIP_strip_all);
-
-  if (!Config.StripDebug && !Config.StripUnneeded && !Config.DiscardAll)
-    Config.StripAll = true;
-
-  for (auto Arg : InputArgs.filtered(STRIP_remove_section))
-    Config.ToRemove.push_back(Arg->getValue());
-
-  for (auto Arg : InputArgs.filtered(STRIP_keep_symbol))
-    Config.SymbolsToKeep.push_back(Arg->getValue());
-
-  Config.PreserveDates = InputArgs.hasArg(STRIP_preserve_dates);
-
-  DriverConfig DC;
-  if (Positional.size() == 1) {
-    Config.InputFilename = Positional[0];
-    Config.OutputFilename =
-        InputArgs.getLastArgValue(STRIP_output, Positional[0]);
-    DC.CopyConfigs.push_back(std::move(Config));
-  } else {
-    for (const char *Filename : Positional) {
-      Config.InputFilename = Filename;
-      Config.OutputFilename = Filename;
-      DC.CopyConfigs.push_back(Config);
-    }
-  }
-
-  return DC;
-}
-
 int main(int argc, char **argv) {
   InitLLVM X(argc, argv);
   ToolName = argv[0];
   DriverConfig DriverConfig;
-  if (sys::path::stem(ToolName).endswith_lower("strip"))
+  if (sys::path::stem(ToolName).contains("strip"))
     DriverConfig = parseStripOptions(makeArrayRef(argv + 1, argc));
   else
     DriverConfig = parseObjcopyOptions(makeArrayRef(argv + 1, argc));
   for (const CopyConfig &CopyConfig : DriverConfig.CopyConfigs)
-    executeElfObjcopy(CopyConfig);
+    executeObjcopy(CopyConfig);
 }
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 64c3823295c8c25c7609154f4e19e9f75c901095..671e8a2c4f79243808e46bd81c62916e87278dcc 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -106,7 +106,11 @@ DisassembleFunctions("df",
 static StringSet<> DisasmFuncsSet;
 
 cl::opt<bool>
-llvm::Relocations("r", cl::desc("Display the relocation entries in the file"));
+llvm::Relocations("reloc",
+                  cl::desc("Display the relocation entries in the file"));
+static cl::alias RelocationsShort("r", cl::desc("Alias for --reloc"),
+                                  cl::NotHidden,
+                                  cl::aliasopt(llvm::Relocations));
 
 cl::opt<bool>
 llvm::DynamicRelocations("dynamic-reloc",
@@ -116,10 +120,16 @@ DynamicRelocationsd("R", cl::desc("Alias for --dynamic-reloc"),
              cl::aliasopt(DynamicRelocations));
 
 cl::opt<bool>
-llvm::SectionContents("s", cl::desc("Display the content of each section"));
+    llvm::SectionContents("full-contents",
+                          cl::desc("Display the content of each section"));
+static cl::alias SectionContentsShort("s",
+                                      cl::desc("Alias for --full-contents"),
+                                      cl::aliasopt(SectionContents));
 
-cl::opt<bool>
-llvm::SymbolTable("t", cl::desc("Display the symbol table"));
+cl::opt<bool> llvm::SymbolTable("syms", cl::desc("Display the symbol table"));
+static cl::alias SymbolTableShort("t", cl::desc("Alias for --syms"),
+                                  cl::NotHidden,
+                                  cl::aliasopt(llvm::SymbolTable));
 
 cl::opt<bool>
 llvm::ExportsTrie("exports-trie", cl::desc("Display mach-o exported symbols"));
@@ -2220,8 +2230,11 @@ static void printFileHeaders(const ObjectFile *o) {
   Expected<uint64_t> StartAddrOrErr = o->getStartAddress();
   if (!StartAddrOrErr)
     report_error(o->getFileName(), StartAddrOrErr.takeError());
+
+  StringRef Fmt = o->getBytesInAddress() > 4 ? "%016" PRIx64 : "%08" PRIx64;
+  uint64_t Address = StartAddrOrErr.get();
   outs() << "start address: "
-         << format("0x%0*x", o->getBytesInAddress(), StartAddrOrErr.get())
+         << "0x" << format(Fmt.data(), Address)
          << "\n";
 }
 
@@ -2302,8 +2315,8 @@ static void DumpObject(ObjectFile *o, const Archive *a = nullptr,
     outs() << ":\tfile format " << o->getFileFormatName() << "\n\n";
   }
 
-  if (ArchiveHeaders && !MachOOpt)
-    printArchiveChild(a->getFileName(), *c);
+  if (ArchiveHeaders && !MachOOpt && c)
+    printArchiveChild(ArchiveName, *c);
   if (Disassemble)
     DisassembleObject(o, Relocations);
   if (Relocations && !Disassemble)
@@ -2356,8 +2369,8 @@ static void DumpObject(const COFFImportFile *I, const Archive *A,
            << ":\tfile format COFF-import-file"
            << "\n\n";
 
-  if (ArchiveHeaders && !MachOOpt)
-    printArchiveChild(A->getFileName(), *C);
+  if (ArchiveHeaders && !MachOOpt && C)
+    printArchiveChild(ArchiveName, *C);
   if (SymbolTable)
     printCOFFSymbolTable(I);
 }
diff --git a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
index 98d5428ddd1acde8d4f240f0669638ad3b2faef1..57e75b1db9ece9e9e3a447bf1187d9bc0b493fae 100644
--- a/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
+++ b/tools/llvm-opt-fuzzer/llvm-opt-fuzzer.cpp
@@ -144,9 +144,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  bool Ok = PB.parsePassPipeline(MPM, PassPipeline, false, false);
-  assert(Ok && "Should have been checked during fuzzer initialization");
-  (void)Ok; // silence unused variable warning on release builds
+  auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false);
+  assert(!Err && "Should have been checked during fuzzer initialization");
+  // Only fail with assert above, otherwise ignore the parsing error.
+  consumeError(std::move(Err));
 
   // Run passes which we need to test
   //
@@ -235,8 +236,8 @@ extern "C" LLVM_ATTRIBUTE_USED int LLVMFuzzerInitialize(
 
   PassBuilder PB(TM.get());
   ModulePassManager MPM;
-  if (!PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
-    errs() << *argv[0] << ": can't parse pass pipeline\n";
+  if (auto Err = PB.parsePassPipeline(MPM, PassPipeline, false, false)) {
+    errs() << *argv[0] << ": " << toString(std::move(Err)) << "\n";
     exit(1);
   }
 
diff --git a/tools/llvm-opt-report/CMakeLists.txt b/tools/llvm-opt-report/CMakeLists.txt
index 777537a54c0ff47cbb3976bdabf4f32d7a7725ad..3aabc03ab3f21351b9176158d99159775efc845b 100644
--- a/tools/llvm-opt-report/CMakeLists.txt
+++ b/tools/llvm-opt-report/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS Core Demangle Object Support)
+set(LLVM_LINK_COMPONENTS Core Demangle Object OptRemarks Support)
 
 add_llvm_tool(llvm-opt-report
   OptReport.cpp
diff --git a/tools/llvm-opt-report/OptReport.cpp b/tools/llvm-opt-report/OptReport.cpp
index aa7966132c28791bdeb293c6ca8f65531c844f58..071f779a9e62c1ca97c83c867c639380424a994e 100644
--- a/tools/llvm-opt-report/OptReport.cpp
+++ b/tools/llvm-opt-report/OptReport.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/OptRemarks.h"
 #include <cstdlib>
 #include <map>
 #include <set>
@@ -142,104 +143,44 @@ typedef std::map<std::string, std::map<int, std::map<std::string, std::map<int,
           OptReportLocationInfo>>>> LocationInfoTy;
 } // anonymous namespace
 
-static void collectLocationInfo(yaml::Stream &Stream,
-                                LocationInfoTy &LocationInfo) {
-  SmallVector<char, 8> Tmp;
-
-  // Note: We're using the YAML parser here directly, instead of using the
-  // YAMLTraits implementation, because the YAMLTraits implementation does not
-  // support a way to handle only a subset of the input keys (it will error out
-  // if there is an input key that you don't map to your class), and
-  // furthermore, it does not provide a way to handle the Args sequence of
-  // key/value pairs, where the order must be captured and the 'String' key
-  // might be repeated.
-  for (auto &Doc : Stream) {
-    auto *Root = dyn_cast<yaml::MappingNode>(Doc.getRoot());
-    if (!Root)
-      continue;
+static bool readLocationInfo(LocationInfoTy &LocationInfo) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+      MemoryBuffer::getFile(InputFileName.c_str());
+  if (std::error_code EC = Buf.getError()) {
+    WithColor::error() << "Can't open file " << InputFileName << ": "
+                       << EC.message() << "\n";
+    return false;
+  }
 
-    bool Transformed = Root->getRawTag() == "!Passed";
-    std::string Pass, File, Function;
-    int Line = 0, Column = 1;
+  StringRef Buffer = (*Buf)->getBuffer();
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buffer.data(), Buffer.size());
+
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while ((Remark = LLVMOptRemarkParserGetNext(Parser))) {
+    bool Transformed =
+        StringRef(Remark->RemarkType.Str, Remark->RemarkType.Len) == "!Passed";
+    StringRef Pass(Remark->PassName.Str, Remark->PassName.Len);
+    StringRef File(Remark->DebugLoc.SourceFile.Str,
+                   Remark->DebugLoc.SourceFile.Len);
+    StringRef Function(Remark->FunctionName.Str, Remark->FunctionName.Len);
+    uint32_t Line = Remark->DebugLoc.SourceLineNumber;
+    uint32_t Column = Remark->DebugLoc.SourceColumnNumber;
+    ArrayRef<LLVMOptRemarkArg> Args(Remark->Args, Remark->NumArgs);
 
     int VectorizationFactor = 1;
     int InterleaveCount = 1;
     int UnrollCount = 1;
 
-    for (auto &RootChild : *Root) {
-      auto *Key = dyn_cast<yaml::ScalarNode>(RootChild.getKey());
-      if (!Key)
-        continue;
-      StringRef KeyName = Key->getValue(Tmp);
-      if (KeyName == "Pass") {
-        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
-        if (!Value)
-          continue;
-        Pass = Value->getValue(Tmp);
-      } else if (KeyName == "Function") {
-        auto *Value = dyn_cast<yaml::ScalarNode>(RootChild.getValue());
-        if (!Value)
-          continue;
-        Function = Value->getValue(Tmp);
-      } else if (KeyName == "DebugLoc") {
-        auto *DebugLoc = dyn_cast<yaml::MappingNode>(RootChild.getValue());
-        if (!DebugLoc)
-          continue;
-
-        for (auto &DLChild : *DebugLoc) {
-          auto *DLKey = dyn_cast<yaml::ScalarNode>(DLChild.getKey());
-          if (!DLKey)
-            continue;
-          StringRef DLKeyName = DLKey->getValue(Tmp);
-          if (DLKeyName == "File") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            File = Value->getValue(Tmp);
-          } else if (DLKeyName == "Line") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            Value->getValue(Tmp).getAsInteger(10, Line);
-          } else if (DLKeyName == "Column") {
-            auto *Value = dyn_cast<yaml::ScalarNode>(DLChild.getValue());
-            if (!Value)
-              continue;
-            Value->getValue(Tmp).getAsInteger(10, Column);
-          }
-        }
-      } else if (KeyName == "Args") {
-        auto *Args = dyn_cast<yaml::SequenceNode>(RootChild.getValue());
-        if (!Args)
-          continue;
-        for (auto &ArgChild : *Args) {
-          auto *ArgMap = dyn_cast<yaml::MappingNode>(&ArgChild);
-          if (!ArgMap)
-            continue;
-          for (auto &ArgKV : *ArgMap) {
-            auto *ArgKey = dyn_cast<yaml::ScalarNode>(ArgKV.getKey());
-            if (!ArgKey)
-              continue;
-            StringRef ArgKeyName = ArgKey->getValue(Tmp);
-            if (ArgKeyName == "VectorizationFactor") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, VectorizationFactor);
-            } else if (ArgKeyName == "InterleaveCount") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, InterleaveCount);
-            } else if (ArgKeyName == "UnrollCount") {
-              auto *Value = dyn_cast<yaml::ScalarNode>(ArgKV.getValue());
-              if (!Value)
-                continue;
-              Value->getValue(Tmp).getAsInteger(10, UnrollCount);
-            }
-          }
-        }
-      }
+    for (const LLVMOptRemarkArg &Arg : Args) {
+      StringRef ArgKeyName(Arg.Key.Str, Arg.Key.Len);
+      StringRef ArgValue(Arg.Value.Str, Arg.Value.Len);
+      if (ArgKeyName == "VectorizationFactor")
+        ArgValue.getAsInteger(10, VectorizationFactor);
+      else if (ArgKeyName == "InterleaveCount")
+        ArgValue.getAsInteger(10, InterleaveCount);
+      else if (ArgKeyName == "UnrollCount")
+        ArgValue.getAsInteger(10, UnrollCount);
     }
 
     if (Line < 1 || File.empty())
@@ -268,22 +209,13 @@ static void collectLocationInfo(yaml::Stream &Stream,
       UpdateLLII(LI.Vectorized);
     }
   }
-}
-
-static bool readLocationInfo(LocationInfoTy &LocationInfo) {
-  ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
-      MemoryBuffer::getFileOrSTDIN(InputFileName);
-  if (std::error_code EC = Buf.getError()) {
-    WithColor::error() << "Can't open file " << InputFileName << ": "
-                       << EC.message() << "\n";
-    return false;
-  }
 
-  SourceMgr SM;
-  yaml::Stream Stream(Buf.get()->getBuffer(), SM);
-  collectLocationInfo(Stream, LocationInfo);
+  bool HasError = LLVMOptRemarkParserHasError(Parser);
+  if (HasError)
+    WithColor::error() << LLVMOptRemarkParserGetErrorMessage(Parser) << "\n";
 
-  return true;
+  LLVMOptRemarkParserDispose(Parser);
+  return !HasError;
 }
 
 static bool writeReport(LocationInfoTy &LocationInfo) {
diff --git a/tools/llvm-pdbutil/DumpOutputStyle.cpp b/tools/llvm-pdbutil/DumpOutputStyle.cpp
index 7f309f48f2af39611ae6101a01c98101ed192f5a..e4f6aa7f6ec583d7582ea9e8e48b8b7f3e357e48 100644
--- a/tools/llvm-pdbutil/DumpOutputStyle.cpp
+++ b/tools/llvm-pdbutil/DumpOutputStyle.cpp
@@ -250,7 +250,7 @@ Error DumpOutputStyle::dumpFileSummary() {
 static StatCollection getSymbolStats(const SymbolGroup &SG,
                                      StatCollection &CumulativeStats) {
   StatCollection Stats;
-  if (SG.getFile().isPdb()) {
+  if (SG.getFile().isPdb() && SG.hasDebugStream()) {
     // For PDB files, all symbols are packed into one stream.
     for (const auto &S : SG.getPdbModuleStream().symbols(nullptr)) {
       Stats.update(S.kind(), S.length());
@@ -1420,19 +1420,21 @@ Error DumpOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
       P.formatLine("TI: {0}, Offset: {1}", IO.Type, fmtle(IO.Offset));
     }
 
-    P.NewLine();
-    P.formatLine("Hash Adjusters:");
-    auto &Adjusters = Stream.getHashAdjusters();
-    auto &Strings = Err(getPdb().getStringTable());
-    for (const auto &A : Adjusters) {
-      AutoIndent Indent2(P);
-      auto ExpectedStr = Strings.getStringForID(A.first);
-      TypeIndex TI(A.second);
-      if (ExpectedStr)
-        P.formatLine("`{0}` -> {1}", *ExpectedStr, TI);
-      else {
-        P.formatLine("unknown str id ({0}) -> {1}", A.first, TI);
-        consumeError(ExpectedStr.takeError());
+    if (getPdb().hasPDBStringTable()) {
+      P.NewLine();
+      P.formatLine("Hash Adjusters:");
+      auto &Adjusters = Stream.getHashAdjusters();
+      auto &Strings = Err(getPdb().getStringTable());
+      for (const auto &A : Adjusters) {
+        AutoIndent Indent2(P);
+        auto ExpectedStr = Strings.getStringForID(A.first);
+        TypeIndex TI(A.second);
+        if (ExpectedStr)
+          P.formatLine("`{0}` -> {1}", *ExpectedStr, TI);
+        else {
+          P.formatLine("unknown str id ({0}) -> {1}", A.first, TI);
+          consumeError(ExpectedStr.takeError());
+        }
       }
     }
   }
diff --git a/tools/llvm-pdbutil/InputFile.cpp b/tools/llvm-pdbutil/InputFile.cpp
index b2019642b2b0ae9b08be06742ffbb3e3bd93533d..8eb116cf0d80c01f2e6de58b79dc7f313e62c2d5 100644
--- a/tools/llvm-pdbutil/InputFile.cpp
+++ b/tools/llvm-pdbutil/InputFile.cpp
@@ -116,10 +116,6 @@ static std::string formatChecksumKind(FileChecksumKind Kind) {
   return formatUnknownEnum(Kind);
 }
 
-static const DebugStringTableSubsectionRef &extractStringTable(PDBFile &File) {
-  return cantFail(File.getStringTable()).getStringTable();
-}
-
 template <typename... Args>
 static void formatInternal(LinePrinter &Printer, bool Append, Args &&... args) {
   if (Append)
@@ -168,8 +164,13 @@ void SymbolGroup::initializeForPdb(uint32_t Modi) {
 
   // PDB always uses the same string table, but each module has its own
   // checksums.  So we only set the strings if they're not already set.
-  if (!SC.hasStrings())
-    SC.setStrings(extractStringTable(File->pdb()));
+  if (!SC.hasStrings()) {
+    auto StringTable = File->pdb().getStringTable();
+    if (StringTable)
+      SC.setStrings(StringTable->getStringTable());
+    else
+      consumeError(StringTable.takeError());
+  }
 
   SC.resetChecksums();
   auto MDS = getModuleDebugStream(File->pdb(), Name, Modi);
diff --git a/tools/llvm-pdbutil/InputFile.h b/tools/llvm-pdbutil/InputFile.h
index 552f3a3b21273f0b8776b96483eb52dbf0fc83b2..ee4e651c1e99893f3d092d203c6e577dde0d4549 100644
--- a/tools/llvm-pdbutil/InputFile.h
+++ b/tools/llvm-pdbutil/InputFile.h
@@ -110,6 +110,8 @@ public:
   const InputFile &getFile() const { return *File; }
   InputFile &getFile() { return *File; }
 
+  bool hasDebugStream() const { return DebugStream != nullptr; }
+
 private:
   void initializeForPdb(uint32_t Modi);
   void updatePdbModi(uint32_t Modi);
diff --git a/tools/llvm-pdbutil/PdbYaml.cpp b/tools/llvm-pdbutil/PdbYaml.cpp
index eb39708a27e94d98a5ac502561e5b8a6fc0963cf..3ea333608314e470d8c5af0bcafa377cd6dff493 100644
--- a/tools/llvm-pdbutil/PdbYaml.cpp
+++ b/tools/llvm-pdbutil/PdbYaml.cpp
@@ -110,6 +110,7 @@ void MappingTraits<PdbObject>::mapping(IO &IO, PdbObject &Obj) {
   IO.mapOptional("DbiStream", Obj.DbiStream);
   IO.mapOptional("TpiStream", Obj.TpiStream);
   IO.mapOptional("IpiStream", Obj.IpiStream);
+  IO.mapOptional("PublicsStream", Obj.PublicsStream);
 }
 
 void MappingTraits<MSFHeaders>::mapping(IO &IO, MSFHeaders &Obj) {
@@ -163,6 +164,11 @@ void MappingTraits<PdbTpiStream>::mapping(IO &IO,
   IO.mapRequired("Records", Obj.Records);
 }
 
+void MappingTraits<PdbPublicsStream>::mapping(
+    IO &IO, pdb::yaml::PdbPublicsStream &Obj) {
+  IO.mapRequired("Records", Obj.PubSyms);
+}
+
 void MappingTraits<NamedStreamMapping>::mapping(IO &IO,
                                                 NamedStreamMapping &Obj) {
   IO.mapRequired("Name", Obj.StreamName);
diff --git a/tools/llvm-pdbutil/PdbYaml.h b/tools/llvm-pdbutil/PdbYaml.h
index 91e054490a5f6b1d1dea06ce33141054eba89b3a..97ba87266cc67af85655a6fba5349dbec9c2b73a 100644
--- a/tools/llvm-pdbutil/PdbYaml.h
+++ b/tools/llvm-pdbutil/PdbYaml.h
@@ -92,6 +92,10 @@ struct PdbTpiStream {
   std::vector<CodeViewYAML::LeafRecord> Records;
 };
 
+struct PdbPublicsStream {
+  std::vector<CodeViewYAML::SymbolRecord> PubSyms;
+};
+
 struct PdbObject {
   explicit PdbObject(BumpPtrAllocator &Allocator) : Allocator(Allocator) {}
 
@@ -102,6 +106,7 @@ struct PdbObject {
   Optional<PdbDbiStream> DbiStream;
   Optional<PdbTpiStream> TpiStream;
   Optional<PdbTpiStream> IpiStream;
+  Optional<PdbPublicsStream> PublicsStream;
 
   Optional<std::vector<StringRef>> StringTable;
 
@@ -118,6 +123,7 @@ LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::StreamBlockList)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbInfoStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbTpiStream)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbPublicsStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::NamedStreamMapping)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbModiStream)
 LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiModuleInfo)
diff --git a/tools/llvm-pdbutil/PrettyCompilandDumper.cpp b/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
index 0d99c9b1245c38c859c19070df1bfaa7b8bb4e05..94a0b2d5e780af3abdc617134ffa5ceaf44002ff 100644
--- a/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
+++ b/tools/llvm-pdbutil/PrettyCompilandDumper.cpp
@@ -28,6 +28,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
@@ -216,3 +217,13 @@ void CompilandDumper::dump(const PDBSymbolUnknown &Symbol) {
   Printer.NewLine();
   Printer << "unknown (" << Symbol.getSymTag() << ")";
 }
+
+void CompilandDumper::dump(const PDBSymbolUsingNamespace &Symbol) {
+  if (Printer.IsSymbolExcluded(Symbol.getName()))
+    return;
+
+  Printer.NewLine();
+  Printer << "using namespace ";
+  std::string Name = Symbol.getName();
+  WithColor(Printer, PDB_ColorItem::Identifier).get() << Name;
+}
diff --git a/tools/llvm-pdbutil/PrettyCompilandDumper.h b/tools/llvm-pdbutil/PrettyCompilandDumper.h
index cae196e9d1343f075837b666d136f6d02fd0a6f0..1a840e49607c12a7e7689d211ba98f1696beacae 100644
--- a/tools/llvm-pdbutil/PrettyCompilandDumper.h
+++ b/tools/llvm-pdbutil/PrettyCompilandDumper.h
@@ -34,6 +34,7 @@ public:
   void dump(const PDBSymbolThunk &Symbol) override;
   void dump(const PDBSymbolTypeTypedef &Symbol) override;
   void dump(const PDBSymbolUnknown &Symbol) override;
+  void dump(const PDBSymbolUsingNamespace &Symbol) override;
 
 private:
   LinePrinter &Printer;
diff --git a/tools/llvm-pdbutil/YAMLOutputStyle.cpp b/tools/llvm-pdbutil/YAMLOutputStyle.cpp
index 521e27fc08979db6a547b761e00842b620a87bc1..62b5c428d41026e0bd1ac6d71377756a2ad796a4 100644
--- a/tools/llvm-pdbutil/YAMLOutputStyle.cpp
+++ b/tools/llvm-pdbutil/YAMLOutputStyle.cpp
@@ -18,10 +18,13 @@
 #include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 
 using namespace llvm;
@@ -68,6 +71,9 @@ Error YAMLOutputStyle::dump() {
   if (auto EC = dumpIpiStream())
     return EC;
 
+  if (auto EC = dumpPublics())
+    return EC;
+
   flush();
   return Error::success();
 }
@@ -326,6 +332,42 @@ Error YAMLOutputStyle::dumpIpiStream() {
   return Error::success();
 }
 
+Error YAMLOutputStyle::dumpPublics() {
+  if (!opts::pdb2yaml::PublicsStream)
+    return Error::success();
+
+  Obj.PublicsStream.emplace();
+  auto ExpectedPublics = File.getPDBPublicsStream();
+  if (!ExpectedPublics) {
+    llvm::consumeError(ExpectedPublics.takeError());
+    return Error::success();
+  }
+
+  PublicsStream &Publics = *ExpectedPublics;
+  const GSIHashTable &PublicsTable = Publics.getPublicsTable();
+
+  auto ExpectedSyms = File.getPDBSymbolStream();
+  if (!ExpectedSyms) {
+    llvm::consumeError(ExpectedSyms.takeError());
+    return Error::success();
+  }
+
+  BinaryStreamRef SymStream =
+      ExpectedSyms->getSymbolArray().getUnderlyingStream();
+  for (uint32_t PubSymOff : PublicsTable) {
+    Expected<CVSymbol> Sym = readSymbolFromStream(SymStream, PubSymOff);
+    if (!Sym)
+      return Sym.takeError();
+    auto ES = CodeViewYAML::SymbolRecord::fromCodeViewSymbol(*Sym);
+    if (!ES)
+      return ES.takeError();
+
+    Obj.PublicsStream->PubSyms.push_back(*ES);
+  }
+
+  return Error::success();
+}
+
 void YAMLOutputStyle::flush() {
   Out << Obj;
   outs().flush();
diff --git a/tools/llvm-pdbutil/YAMLOutputStyle.h b/tools/llvm-pdbutil/YAMLOutputStyle.h
index 3690e3529d4a1ebcf476f38b9e3c8d36458d6956..a5ad3355d2ab4d8b7996ba0bd5f9c23693ea72c2 100644
--- a/tools/llvm-pdbutil/YAMLOutputStyle.h
+++ b/tools/llvm-pdbutil/YAMLOutputStyle.h
@@ -35,6 +35,7 @@ private:
   Error dumpDbiStream();
   Error dumpTpiStream();
   Error dumpIpiStream();
+  Error dumpPublics();
 
   void flush();
 
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.cpp b/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 34618f6b762c1b7258e1b173b01b7dc187fbf94d..215bfbeb2060c63d6cbfeaf3d6534f87a267e093 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -663,6 +663,10 @@ cl::opt<bool> IpiStream("ipi-stream",
                         cl::desc("Dump the IPI Stream (Stream 5)"),
                         cl::sub(PdbToYamlSubcommand), cl::init(false));
 
+cl::opt<bool> PublicsStream("publics-stream",
+                            cl::desc("Dump the Publics Stream"),
+                            cl::sub(PdbToYamlSubcommand), cl::init(false));
+
 // MODULE & FILE OPTIONS
 cl::opt<bool> DumpModules("modules", cl::desc("dump compiland information"),
                           cl::cat(FileOptions), cl::sub(PdbToYamlSubcommand));
@@ -1495,6 +1499,7 @@ int main(int Argc, const char **Argv) {
       opts::pdb2yaml::DbiStream = true;
       opts::pdb2yaml::TpiStream = true;
       opts::pdb2yaml::IpiStream = true;
+      opts::pdb2yaml::PublicsStream = true;
       opts::pdb2yaml::DumpModules = true;
       opts::pdb2yaml::DumpModuleFiles = true;
       opts::pdb2yaml::DumpModuleSyms = true;
diff --git a/tools/llvm-pdbutil/llvm-pdbutil.h b/tools/llvm-pdbutil/llvm-pdbutil.h
index 1584dce52c59e0ef016727e39437e15b01e7ffb4..a57cc51d7fd79b028bc93b214601a2dbc9928c30 100644
--- a/tools/llvm-pdbutil/llvm-pdbutil.h
+++ b/tools/llvm-pdbutil/llvm-pdbutil.h
@@ -192,6 +192,7 @@ extern llvm::cl::opt<bool> PdbStream;
 extern llvm::cl::opt<bool> DbiStream;
 extern llvm::cl::opt<bool> TpiStream;
 extern llvm::cl::opt<bool> IpiStream;
+extern llvm::cl::opt<bool> PublicsStream;
 extern llvm::cl::list<std::string> InputFilename;
 extern llvm::cl::opt<bool> DumpModules;
 extern llvm::cl::opt<bool> DumpModuleFiles;
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.cpp b/tools/llvm-readobj/ARMWinEHPrinter.cpp
index a90840b22c8d86f1f3f3f6aa2ddbe2026fc93d58..eb575894db587bc6e630bc8cd07bc5d739d26b4f 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.cpp
+++ b/tools/llvm-readobj/ARMWinEHPrinter.cpp
@@ -118,31 +118,57 @@ const size_t Decoder::PDataEntrySize = sizeof(RuntimeFunction);
 
 // TODO name the uops more appropriately
 const Decoder::RingEntry Decoder::Ring[] = {
-  { 0x80, 0x00, &Decoder::opcode_0xxxxxxx },  // UOP_STACK_FREE (16-bit)
-  { 0xc0, 0x80, &Decoder::opcode_10Lxxxxx },  // UOP_POP (32-bit)
-  { 0xf0, 0xc0, &Decoder::opcode_1100xxxx },  // UOP_STACK_SAVE (16-bit)
-  { 0xf8, 0xd0, &Decoder::opcode_11010Lxx },  // UOP_POP (16-bit)
-  { 0xf8, 0xd8, &Decoder::opcode_11011Lxx },  // UOP_POP (32-bit)
-  { 0xf8, 0xe0, &Decoder::opcode_11100xxx },  // UOP_VPOP (32-bit)
-  { 0xfc, 0xe8, &Decoder::opcode_111010xx },  // UOP_STACK_FREE (32-bit)
-  { 0xfe, 0xec, &Decoder::opcode_1110110L },  // UOP_POP (16-bit)
-  { 0xff, 0xee, &Decoder::opcode_11101110 },  // UOP_MICROSOFT_SPECIFIC (16-bit)
+  { 0x80, 0x00, 1, &Decoder::opcode_0xxxxxxx },  // UOP_STACK_FREE (16-bit)
+  { 0xc0, 0x80, 2, &Decoder::opcode_10Lxxxxx },  // UOP_POP (32-bit)
+  { 0xf0, 0xc0, 1, &Decoder::opcode_1100xxxx },  // UOP_STACK_SAVE (16-bit)
+  { 0xf8, 0xd0, 1, &Decoder::opcode_11010Lxx },  // UOP_POP (16-bit)
+  { 0xf8, 0xd8, 1, &Decoder::opcode_11011Lxx },  // UOP_POP (32-bit)
+  { 0xf8, 0xe0, 1, &Decoder::opcode_11100xxx },  // UOP_VPOP (32-bit)
+  { 0xfc, 0xe8, 2, &Decoder::opcode_111010xx },  // UOP_STACK_FREE (32-bit)
+  { 0xfe, 0xec, 2, &Decoder::opcode_1110110L },  // UOP_POP (16-bit)
+  { 0xff, 0xee, 2, &Decoder::opcode_11101110 },  // UOP_MICROSOFT_SPECIFIC (16-bit)
                                               // UOP_PUSH_MACHINE_FRAME
                                               // UOP_PUSH_CONTEXT
                                               // UOP_PUSH_TRAP_FRAME
                                               // UOP_REDZONE_RESTORE_LR
-  { 0xff, 0xef, &Decoder::opcode_11101111 },  // UOP_LDRPC_POSTINC (32-bit)
-  { 0xff, 0xf5, &Decoder::opcode_11110101 },  // UOP_VPOP (32-bit)
-  { 0xff, 0xf6, &Decoder::opcode_11110110 },  // UOP_VPOP (32-bit)
-  { 0xff, 0xf7, &Decoder::opcode_11110111 },  // UOP_STACK_RESTORE (16-bit)
-  { 0xff, 0xf8, &Decoder::opcode_11111000 },  // UOP_STACK_RESTORE (16-bit)
-  { 0xff, 0xf9, &Decoder::opcode_11111001 },  // UOP_STACK_RESTORE (32-bit)
-  { 0xff, 0xfa, &Decoder::opcode_11111010 },  // UOP_STACK_RESTORE (32-bit)
-  { 0xff, 0xfb, &Decoder::opcode_11111011 },  // UOP_NOP (16-bit)
-  { 0xff, 0xfc, &Decoder::opcode_11111100 },  // UOP_NOP (32-bit)
-  { 0xff, 0xfd, &Decoder::opcode_11111101 },  // UOP_NOP (16-bit) / END
-  { 0xff, 0xfe, &Decoder::opcode_11111110 },  // UOP_NOP (32-bit) / END
-  { 0xff, 0xff, &Decoder::opcode_11111111 },  // UOP_END
+  { 0xff, 0xef, 2, &Decoder::opcode_11101111 },  // UOP_LDRPC_POSTINC (32-bit)
+  { 0xff, 0xf5, 2, &Decoder::opcode_11110101 },  // UOP_VPOP (32-bit)
+  { 0xff, 0xf6, 2, &Decoder::opcode_11110110 },  // UOP_VPOP (32-bit)
+  { 0xff, 0xf7, 3, &Decoder::opcode_11110111 },  // UOP_STACK_RESTORE (16-bit)
+  { 0xff, 0xf8, 4, &Decoder::opcode_11111000 },  // UOP_STACK_RESTORE (16-bit)
+  { 0xff, 0xf9, 3, &Decoder::opcode_11111001 },  // UOP_STACK_RESTORE (32-bit)
+  { 0xff, 0xfa, 4, &Decoder::opcode_11111010 },  // UOP_STACK_RESTORE (32-bit)
+  { 0xff, 0xfb, 1, &Decoder::opcode_11111011 },  // UOP_NOP (16-bit)
+  { 0xff, 0xfc, 1, &Decoder::opcode_11111100 },  // UOP_NOP (32-bit)
+  { 0xff, 0xfd, 1, &Decoder::opcode_11111101 },  // UOP_NOP (16-bit) / END
+  { 0xff, 0xfe, 1, &Decoder::opcode_11111110 },  // UOP_NOP (32-bit) / END
+  { 0xff, 0xff, 1, &Decoder::opcode_11111111 },  // UOP_END
+};
+
+
+// Unwind opcodes for ARM64.
+// https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
+const Decoder::RingEntry Decoder::Ring64[] = {
+  { 0xe0, 0x00, 1, &Decoder::opcode_alloc_s },
+  { 0xe0, 0x20, 1, &Decoder::opcode_save_r19r20_x },
+  { 0xc0, 0x40, 1, &Decoder::opcode_save_fplr },
+  { 0xc0, 0x80, 1, &Decoder::opcode_save_fplr_x },
+  { 0xf8, 0xc0, 2, &Decoder::opcode_alloc_m },
+  { 0xfc, 0xc8, 2, &Decoder::opcode_save_regp },
+  { 0xfc, 0xcc, 2, &Decoder::opcode_save_regp_x },
+  { 0xfc, 0xd0, 2, &Decoder::opcode_save_reg },
+  { 0xfe, 0xd4, 2, &Decoder::opcode_save_reg_x },
+  { 0xfe, 0xd6, 2, &Decoder::opcode_save_lrpair },
+  { 0xfe, 0xd8, 2, &Decoder::opcode_save_fregp },
+  { 0xfe, 0xda, 2, &Decoder::opcode_save_fregp_x },
+  { 0xfe, 0xdc, 2, &Decoder::opcode_save_freg },
+  { 0xff, 0xde, 2, &Decoder::opcode_save_freg_x },
+  { 0xff, 0xe0, 4, &Decoder::opcode_alloc_l },
+  { 0xff, 0xe1, 1, &Decoder::opcode_setfp },
+  { 0xff, 0xe2, 2, &Decoder::opcode_addfp },
+  { 0xff, 0xe3, 1, &Decoder::opcode_nop },
+  { 0xff, 0xe4, 1, &Decoder::opcode_end },
+  { 0xff, 0xe5, 1, &Decoder::opcode_end_c },
 };
 
 void Decoder::printRegisters(const std::pair<uint16_t, uint32_t> &RegisterMask) {
@@ -493,18 +519,291 @@ bool Decoder::opcode_11111111(const uint8_t *OC, unsigned &Offset,
   return true;
 }
 
+// ARM64 unwind codes start here.
+bool Decoder::opcode_alloc_s(const uint8_t *OC, unsigned &Offset,
+                             unsigned Length, bool Prologue) {
+  uint32_t NumBytes = (OC[Offset] & 0x1F) << 4;
+  SW.startLine() << format("0x%02x                ; %s sp, #%u\n", OC[Offset],
+                           static_cast<const char *>(Prologue ? "sub" : "add"),
+                           NumBytes);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_save_r19r20_x(const uint8_t *OC, unsigned &Offset,
+                                   unsigned Length, bool Prologue) {
+  uint32_t Off = (OC[Offset] & 0x1F) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x                ; stp x19, x20, [sp, #-%u]!\n", OC[Offset], Off);
+  else
+    SW.startLine() << format(
+        "0x%02x                ; ldp x19, x20, [sp], #%u\n", OC[Offset], Off);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_save_fplr(const uint8_t *OC, unsigned &Offset,
+                               unsigned Length, bool Prologue) {
+  uint32_t Off = (OC[Offset] & 0x3F) << 3;
+  SW.startLine() << format(
+      "0x%02x                ; %s x29, x30, [sp, #%u]\n", OC[Offset],
+      static_cast<const char *>(Prologue ? "stp" : "ldp"), Off);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_save_fplr_x(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Off = ((OC[Offset] & 0x3F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x                ; stp x29, x30, [sp, #-%u]!\n", OC[Offset], Off);
+  else
+    SW.startLine() << format(
+        "0x%02x                ; ldp x29, x30, [sp], #%u\n", OC[Offset], Off);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_alloc_m(const uint8_t *OC, unsigned &Offset,
+                             unsigned Length, bool Prologue) {
+  uint32_t NumBytes = ((OC[Offset] & 0x07) << 8);
+  NumBytes |= (OC[Offset + 1] & 0xFF);
+  NumBytes <<= 4;
+  SW.startLine() << format("0x%02x%02x              ; %s sp, #%u\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "sub" : "add"),
+                           NumBytes);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_regp(const uint8_t *OC, unsigned &Offset,
+                               unsigned Length, bool Prologue) {
+  uint32_t Reg = ((OC[Offset] & 0x03) << 8);
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 19;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format(
+      "0x%02x%02x              ; %s x%u, x%u, [sp, #%u]\n",
+      OC[Offset], OC[Offset + 1],
+      static_cast<const char *>(Prologue ? "stp" : "ldp"), Reg, Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_regp_x(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Reg = ((OC[Offset] & 0x03) << 8);
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 19;
+  uint32_t Off = ((OC[Offset + 1] & 0x3F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x%02x              ; stp x%u, x%u, [sp, #-%u]!\n",
+        OC[Offset], OC[Offset + 1], Reg,
+        Reg + 1, Off);
+  else
+    SW.startLine() << format(
+        "0x%02x%02x              ; ldp x%u, x%u, [sp], #%u\n",
+        OC[Offset], OC[Offset + 1], Reg,
+        Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_reg(const uint8_t *OC, unsigned &Offset,
+                              unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x03) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 19;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x              ; %s x%u, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "str" : "ldr"),
+                           Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_reg_x(const uint8_t *OC, unsigned &Offset,
+                                unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xE0);
+  Reg >>= 5;
+  Reg += 19;
+  uint32_t Off = ((OC[Offset + 1] & 0x1F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format("0x%02x%02x              ; str x%u, [sp, #%u]!\n",
+                             OC[Offset], OC[Offset + 1], Reg, Off);
+  else
+    SW.startLine() << format("0x%02x%02x              ; ldr x%u, [sp], #%u\n",
+                             OC[Offset], OC[Offset + 1], Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_lrpair(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg *= 2;
+  Reg += 19;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x              ; %s x%u, lr, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "stp" : "ldp"),
+                           Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_fregp(const uint8_t *OC, unsigned &Offset,
+                                unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 8;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x              ; %s d%u, d%u, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "stp" : "ldp"),
+                           Reg, Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_fregp_x(const uint8_t *OC, unsigned &Offset,
+                                  unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 8;
+  uint32_t Off = ((OC[Offset + 1] & 0x3F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x%02x              ; stp d%u, d%u, [sp, #-%u]!\n", OC[Offset],
+        OC[Offset + 1], Reg, Reg + 1, Off);
+  else
+    SW.startLine() << format(
+        "0x%02x%02x              ; ldp d%u, d%u, [sp], #%u\n", OC[Offset],
+        OC[Offset + 1], Reg, Reg + 1, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_freg(const uint8_t *OC, unsigned &Offset,
+                               unsigned Length, bool Prologue) {
+  uint32_t Reg = (OC[Offset] & 0x01) << 8;
+  Reg |= (OC[Offset + 1] & 0xC0);
+  Reg >>= 6;
+  Reg += 8;
+  uint32_t Off = (OC[Offset + 1] & 0x3F) << 3;
+  SW.startLine() << format("0x%02x%02x                ; %s d%u, [sp, #%u]\n",
+                           OC[Offset], OC[Offset + 1],
+                           static_cast<const char *>(Prologue ? "str" : "ldr"),
+                           Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_save_freg_x(const uint8_t *OC, unsigned &Offset,
+                                 unsigned Length, bool Prologue) {
+  uint32_t Reg = ((OC[Offset + 1] & 0xE0) >> 5) + 8;
+  uint32_t Off = ((OC[Offset + 1] & 0x1F) + 1) << 3;
+  if (Prologue)
+    SW.startLine() << format(
+        "0x%02x%02x              ; str d%u, [sp, #-%u]!\n", OC[Offset],
+        OC[Offset + 1], Reg, Off);
+  else
+    SW.startLine() << format(
+        "0x%02x%02x              ; ldr d%u, [sp], #%u\n", OC[Offset],
+        OC[Offset + 1], Reg, Off);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_alloc_l(const uint8_t *OC, unsigned &Offset,
+                             unsigned Length, bool Prologue) {
+  unsigned Off =
+      (OC[Offset + 1] << 16) | (OC[Offset + 2] << 8) | (OC[Offset + 3] << 0);
+  Off <<= 4;
+  SW.startLine() << format(
+      "0x%02x%02x%02x%02x          ; %s sp, #%u\n", OC[Offset], OC[Offset + 1],
+      OC[Offset + 2], OC[Offset + 3],
+      static_cast<const char *>(Prologue ? "sub" : "add"), Off);
+  Offset += 4;
+  return false;
+}
+
+bool Decoder::opcode_setfp(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                           bool Prologue) {
+  SW.startLine() << format("0x%02x                ; mov fp, sp\n", OC[Offset]);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_addfp(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                           bool Prologue) {
+  unsigned NumBytes = OC[Offset + 1] << 3;
+  SW.startLine() << format("0x%02x%02x              ; add fp, sp, #%u\n",
+                           OC[Offset], OC[Offset + 1], NumBytes);
+  Offset += 2;
+  return false;
+}
+
+bool Decoder::opcode_nop(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                         bool Prologue) {
+  SW.startLine() << format("0x%02x                ; nop\n", OC[Offset]);
+  ++Offset;
+  return false;
+}
+
+bool Decoder::opcode_end(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                         bool Prologue) {
+  SW.startLine() << format("0x%02x                ; end\n", OC[Offset]);
+  ++Offset;
+  return true;
+}
+
+bool Decoder::opcode_end_c(const uint8_t *OC, unsigned &Offset, unsigned Length,
+                           bool Prologue) {
+  SW.startLine() << format("0x%02x                ; end_c\n", OC[Offset]);
+  ++Offset;
+  return true;
+}
+
 void Decoder::decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
                             bool Prologue) {
   assert((!Prologue || Offset == 0) && "prologue should always use offset 0");
-
+  const RingEntry* DecodeRing = isAArch64 ? Ring64 : Ring;
   bool Terminated = false;
   for (unsigned OI = Offset, OE = Opcodes.size(); !Terminated && OI < OE; ) {
     for (unsigned DI = 0;; ++DI) {
-      if ((Opcodes[OI] & Ring[DI].Mask) == Ring[DI].Value) {
-        Terminated = (this->*Ring[DI].Routine)(Opcodes.data(), OI, 0, Prologue);
+      if ((isAArch64 && (DI >= array_lengthof(Ring64))) ||
+          (!isAArch64 && (DI >= array_lengthof(Ring)))) {
+        SW.startLine() << format("0x%02x                ; Bad opcode!\n",
+                                 Opcodes.data()[OI]);
+        ++OI;
+        break;
+      }
+
+      if ((Opcodes[OI] & DecodeRing[DI].Mask) == DecodeRing[DI].Value) {
+        if (OI + DecodeRing[DI].Length > OE) {
+          SW.startLine() << format("Opcode 0x%02x goes past the unwind data\n",
+                                    Opcodes[OI]);
+          OI += DecodeRing[DI].Length;
+          break;
+        }
+        Terminated =
+            (this->*DecodeRing[DI].Routine)(Opcodes.data(), OI, 0, Prologue);
         break;
       }
-      assert(DI < array_lengthof(Ring) && "unhandled opcode");
     }
   }
 }
@@ -520,22 +819,36 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
   uint64_t Offset = VA - SectionVA;
   const ulittle32_t *Data =
     reinterpret_cast<const ulittle32_t *>(Contents.data() + Offset);
-  const ExceptionDataRecord XData(Data);
 
+  // Sanity check to ensure that the .xdata header is present.
+  // A header is one or two words, followed by at least one word to describe
+  // the unwind codes. Applicable to both ARM and AArch64.
+  if (Contents.size() - Offset < 8)
+    report_fatal_error(".xdata must be at least 8 bytes in size");
+
+  const ExceptionDataRecord XData(Data, isAArch64);
   DictScope XRS(SW, "ExceptionData");
-  SW.printNumber("FunctionLength", XData.FunctionLength() << 1);
+  SW.printNumber("FunctionLength",
+                 isAArch64 ? XData.FunctionLengthInBytesAArch64() :
+                 XData.FunctionLengthInBytesARM());
   SW.printNumber("Version", XData.Vers());
   SW.printBoolean("ExceptionData", XData.X());
   SW.printBoolean("EpiloguePacked", XData.E());
-  SW.printBoolean("Fragment", XData.F());
+  if (!isAArch64)
+    SW.printBoolean("Fragment", XData.F());
   SW.printNumber(XData.E() ? "EpilogueOffset" : "EpilogueScopes",
                  XData.EpilogueCount());
-  SW.printNumber("ByteCodeLength",
-                 static_cast<uint64_t>(XData.CodeWords() * sizeof(uint32_t)));
+  uint64_t ByteCodeLength = XData.CodeWords() * sizeof(uint32_t);
+  SW.printNumber("ByteCodeLength", ByteCodeLength);
+
+  if ((int64_t)(Contents.size() - Offset - 4 * HeaderWords(XData) -
+                (XData.E() ? 0 : XData.EpilogueCount() * 4) -
+                (XData.X() ? 8 : 0)) < (int64_t)ByteCodeLength)
+    report_fatal_error("Malformed unwind data");
 
   if (XData.E()) {
     ArrayRef<uint8_t> UC = XData.UnwindByteCode();
-    if (!XData.F()) {
+    if (isAArch64 || !XData.F()) {
       ListScope PS(SW, "Prologue");
       decodeOpcodes(UC, 0, /*Prologue=*/true);
     }
@@ -544,16 +857,27 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
       decodeOpcodes(UC, XData.EpilogueCount(), /*Prologue=*/false);
     }
   } else {
+    {
+      ListScope PS(SW, "Prologue");
+      decodeOpcodes(XData.UnwindByteCode(), 0, /*Prologue=*/true);
+    }
     ArrayRef<ulittle32_t> EpilogueScopes = XData.EpilogueScopes();
     ListScope ESS(SW, "EpilogueScopes");
     for (const EpilogueScope ES : EpilogueScopes) {
       DictScope ESES(SW, "EpilogueScope");
       SW.printNumber("StartOffset", ES.EpilogueStartOffset());
-      SW.printNumber("Condition", ES.Condition());
-      SW.printNumber("EpilogueStartIndex", ES.EpilogueStartIndex());
+      if (!isAArch64)
+        SW.printNumber("Condition", ES.Condition());
+      SW.printNumber("EpilogueStartIndex",
+                     isAArch64 ? ES.EpilogueStartIndexAArch64()
+                               : ES.EpilogueStartIndexARM());
+      if (ES.ES & ~0xffc3ffff)
+        SW.printNumber("ReservedBits", (ES.ES >> 18) & 0xF);
 
       ListScope Opcodes(SW, "Opcodes");
-      decodeOpcodes(XData.UnwindByteCode(), ES.EpilogueStartIndex(),
+      decodeOpcodes(XData.UnwindByteCode(),
+                    isAArch64 ? ES.EpilogueStartIndexAArch64()
+                              : ES.EpilogueStartIndexARM(),
                     /*Prologue=*/false);
     }
   }
@@ -565,10 +889,15 @@ bool Decoder::dumpXDataRecord(const COFFObjectFile &COFF,
                                + (XData.E() ? 0 : XData.EpilogueCount())
                                + XData.CodeWords();
 
-    ErrorOr<SymbolRef> Symbol =
-      getRelocatedSymbol(COFF, Section, HandlerOffset * sizeof(uint32_t));
+    ErrorOr<SymbolRef> Symbol = getRelocatedSymbol(
+        COFF, Section, Offset + HandlerOffset * sizeof(uint32_t));
     if (!Symbol)
       Symbol = getSymbol(COFF, Address, /*FunctionOnly=*/true);
+    if (!Symbol) {
+      ListScope EHS(SW, "ExceptionHandler");
+      SW.printString("Routine", "(null)");
+      return true;
+    }
 
     Expected<StringRef> Name = Symbol->getName();
     if (!Name) {
@@ -628,10 +957,7 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     }
     FunctionAddress = *FunctionAddressOrErr;
   } else {
-    const pe32_header *PEHeader;
-    if (COFF.getPE32Header(PEHeader))
-      return false;
-    FunctionAddress = PEHeader->ImageBase + RF.BeginAddress;
+    FunctionAddress = COFF.getImageBase() + RF.BeginAddress;
   }
 
   SW.printString("Function", formatSymbol(FunctionName, FunctionAddress));
@@ -666,22 +992,18 @@ bool Decoder::dumpUnpackedEntry(const COFFObjectFile &COFF,
     }
     section_iterator SI = *SIOrErr;
 
-    return dumpXDataRecord(COFF, *SI, FunctionAddress, Address);
+    // FIXME: Do we need to add an offset from the relocation?
+    return dumpXDataRecord(COFF, *SI, FunctionAddress,
+                           RF.ExceptionInformationRVA());
   } else {
-    const pe32_header *PEHeader;
-    if (COFF.getPE32Header(PEHeader))
-      return false;
-
-    uint64_t Address = PEHeader->ImageBase + RF.ExceptionInformationRVA();
+    uint64_t Address = COFF.getImageBase() + RF.ExceptionInformationRVA();
     SW.printString("ExceptionRecord", formatSymbol("", Address));
 
-    ErrorOr<SectionRef> Section =
-      getSectionContaining(COFF, RF.ExceptionInformationRVA());
+    ErrorOr<SectionRef> Section = getSectionContaining(COFF, Address);
     if (!Section)
       return false;
 
-    return dumpXDataRecord(COFF, *Section, FunctionAddress,
-                           RF.ExceptionInformationRVA());
+    return dumpXDataRecord(COFF, *Section, FunctionAddress, Address);
   }
 }
 
@@ -725,8 +1047,9 @@ bool Decoder::dumpPackedEntry(const object::COFFObjectFile &COFF,
   }
 
   SW.printString("Function", formatSymbol(FunctionName, FunctionAddress));
-  SW.printBoolean("Fragment",
-                  RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment);
+  if (!isAArch64)
+    SW.printBoolean("Fragment",
+                    RF.Flag() == RuntimeFunctionFlag::RFF_PackedFragment);
   SW.printNumber("FunctionLength", RF.FunctionLength());
   SW.startLine() << "ReturnType: " << RF.Ret() << '\n';
   SW.printBoolean("HomedParameters", RF.H());
@@ -749,6 +1072,10 @@ bool Decoder::dumpProcedureDataEntry(const COFFObjectFile &COFF,
   DictScope RFS(SW, "RuntimeFunction");
   if (Entry.Flag() == RuntimeFunctionFlag::RFF_Unpacked)
     return dumpUnpackedEntry(COFF, Section, Offset, Index, Entry);
+  if (isAArch64) {
+    SW.startLine() << "Packed unwind data not yet supported for ARM64\n";
+    return true;
+  }
   return dumpPackedEntry(COFF, Section, Offset, Index, Entry);
 }
 
diff --git a/tools/llvm-readobj/ARMWinEHPrinter.h b/tools/llvm-readobj/ARMWinEHPrinter.h
index 95f521702268a89e56d3ef49d13219a4c4e0e5ee..e271a1e6fe77206962b5509cd45484411061470d 100644
--- a/tools/llvm-readobj/ARMWinEHPrinter.h
+++ b/tools/llvm-readobj/ARMWinEHPrinter.h
@@ -24,13 +24,16 @@ class Decoder {
 
   ScopedPrinter &SW;
   raw_ostream &OS;
+  bool isAArch64;
 
   struct RingEntry {
     uint8_t Mask;
     uint8_t Value;
+    uint8_t Length;
     bool (Decoder::*Routine)(const uint8_t *, unsigned &, unsigned, bool);
   };
   static const RingEntry Ring[];
+  static const RingEntry Ring64[];
 
   bool opcode_0xxxxxxx(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
@@ -75,6 +78,50 @@ class Decoder {
   bool opcode_11111111(const uint8_t *Opcodes, unsigned &Offset,
                        unsigned Length, bool Prologue);
 
+  // ARM64 unwind codes start here.
+  bool opcode_alloc_s(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                      bool Prologue);
+  bool opcode_save_r19r20_x(const uint8_t *Opcodes, unsigned &Offset,
+                            unsigned Length, bool Prologue);
+  bool opcode_save_fplr(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+  bool opcode_save_fplr_x(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_alloc_m(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                      bool Prologue);
+  bool opcode_save_regp(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+  bool opcode_save_regp_x(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_save_reg(const uint8_t *Opcodes, unsigned &Offset,
+                       unsigned Length, bool Prologue);
+  bool opcode_save_reg_x(const uint8_t *Opcodes, unsigned &Offset,
+                         unsigned Length, bool Prologue);
+  bool opcode_save_lrpair(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_save_fregp(const uint8_t *Opcodes, unsigned &Offset,
+                         unsigned Length, bool Prologue);
+  bool opcode_save_fregp_x(const uint8_t *Opcodes, unsigned &Offset,
+                           unsigned Length, bool Prologue);
+  bool opcode_save_freg(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+  bool opcode_save_freg_x(const uint8_t *Opcodes, unsigned &Offset,
+                          unsigned Length, bool Prologue);
+  bool opcode_alloc_l(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                      bool Prologue);
+  bool opcode_setfp(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                    bool Prologue);
+  bool opcode_addfp(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                    bool Prologue);
+  bool opcode_nop(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                  bool Prologue);
+  bool opcode_end(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                  bool Prologue);
+  bool opcode_end_c(const uint8_t *Opcodes, unsigned &Offset, unsigned Length,
+                    bool Prologue);
+  bool opcode_save_next(const uint8_t *Opcodes, unsigned &Offset,
+                        unsigned Length, bool Prologue);
+
   void decodeOpcodes(ArrayRef<uint8_t> Opcodes, unsigned Offset,
                      bool Prologue);
 
@@ -107,7 +154,9 @@ class Decoder {
                          const object::SectionRef Section);
 
 public:
-  Decoder(ScopedPrinter &SW) : SW(SW), OS(SW.getOStream()) {}
+  Decoder(ScopedPrinter &SW, bool isAArch64) : SW(SW),
+                                               OS(SW.getOStream()),
+                                               isAArch64(isAArch64) {}
   std::error_code dumpProcedureData(const object::COFFObjectFile &COFF);
 };
 }
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index fe31c36b6025f96e835f78a9bce90ff9341c573e..7f5907139930457e35cf1a9cf77deaa00f72513c 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -1248,7 +1248,9 @@ void COFFDumper::mergeCodeViewTypes(MergingTypeTableBuilder &CVIDs,
         error(object_error::parse_failed);
       }
       SmallVector<TypeIndex, 128> SourceToDest;
-      if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types))
+      Optional<EndPrecompRecord> EndPrecomp;
+      if (auto EC = mergeTypeAndIdRecords(CVIDs, CVTypes, SourceToDest, Types,
+                                          EndPrecomp))
         return error(std::move(EC));
     }
   }
@@ -1549,8 +1551,10 @@ void COFFDumper::printUnwindInfo() {
     Dumper.printData(Ctx);
     break;
   }
+  case COFF::IMAGE_FILE_MACHINE_ARM64:
   case COFF::IMAGE_FILE_MACHINE_ARMNT: {
-    ARM::WinEH::Decoder Decoder(W);
+    ARM::WinEH::Decoder Decoder(W, Obj->getMachine() ==
+                                       COFF::IMAGE_FILE_MACHINE_ARM64);
     Decoder.dumpProcedureData(*Obj);
     break;
   }
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 6f71d2d8b6b27ffc5b1f0d42df964f6888a24fee..c91d2c548bfb321619b38ef5fd286a2f113eaa6f 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -22,9 +22,9 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -43,6 +43,7 @@
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MathExtras.h"
@@ -390,6 +391,33 @@ private:
     return to_hexString(Value, false);
   }
 
+  template <typename T, typename TEnum>
+  std::string printFlags(T Value, ArrayRef<EnumEntry<TEnum>> EnumValues,
+                         TEnum EnumMask1 = {}, TEnum EnumMask2 = {},
+                         TEnum EnumMask3 = {}) {
+    std::string Str;
+    for (const auto &Flag : EnumValues) {
+      if (Flag.Value == 0)
+        continue;
+
+      TEnum EnumMask{};
+      if (Flag.Value & EnumMask1)
+        EnumMask = EnumMask1;
+      else if (Flag.Value & EnumMask2)
+        EnumMask = EnumMask2;
+      else if (Flag.Value & EnumMask3)
+        EnumMask = EnumMask3;
+      bool IsEnum = (Flag.Value & EnumMask) != 0;
+      if ((!IsEnum && (Value & Flag.Value) == Flag.Value) ||
+          (IsEnum && (Value & EnumMask) == Flag.Value)) {
+        if (!Str.empty())
+          Str += ", ";
+        Str += Flag.AltName;
+      }
+    }
+    return Str;
+  }
+
   formatted_raw_ostream &printField(struct Field F) {
     if (F.Column != 0)
       OS.PadToColumn(F.Column);
@@ -1167,6 +1195,7 @@ static const char *getElfSegmentType(unsigned Arch, unsigned Type) {
     switch (Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, PT_ARM_EXIDX);
     }
+    break;
   case ELF::EM_MIPS:
   case ELF::EM_MIPS_RS3_LE:
     switch (Type) {
@@ -1175,6 +1204,7 @@ static const char *getElfSegmentType(unsigned Arch, unsigned Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_OPTIONS);
     LLVM_READOBJ_ENUM_CASE(ELF, PT_MIPS_ABIFLAGS);
     }
+    break;
   }
 
   switch (Type) {
@@ -1221,7 +1251,7 @@ static std::string getElfPtType(unsigned Arch, unsigned Type) {
     case ELF::EM_ARM:
       if (Type == ELF::PT_ARM_EXIDX)
         return "EXIDX";
-      return "";
+      break;
     case ELF::EM_MIPS:
     case ELF::EM_MIPS_RS3_LE:
       switch (Type) {
@@ -1234,7 +1264,7 @@ static std::string getElfPtType(unsigned Arch, unsigned Type) {
       case PT_MIPS_ABIFLAGS:
         return "ABIFLAGS";
       }
-      return "";
+      break;
     }
   }
   return std::string("<unknown>: ") + to_string(format_hex(Type, 1));
@@ -1247,49 +1277,49 @@ static const EnumEntry<unsigned> ElfSegmentFlags[] = {
 };
 
 static const EnumEntry<unsigned> ElfHeaderMipsFlags[] = {
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_NOREORDER),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_PIC),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_CPIC),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_32BITMODE),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_FP64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_NAN2008),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_O32),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_O64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_EABI32),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ABI_EABI64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_3900),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4010),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4100),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4650),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4120),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_4111),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_SB1),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_OCTEON),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_XLR),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_OCTEON2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_OCTEON3),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_5400),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_5900),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_5500),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_9000),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_LS2E),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_LS2F),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MACH_LS3A),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_MICROMIPS),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_ASE_M16),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_ASE_MDMX),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_1),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_3),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_4),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_5),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_32),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_64),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_32R2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_64R2),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_32R6),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_MIPS_ARCH_64R6)
+  ENUM_ENT(EF_MIPS_NOREORDER, "noreorder"),
+  ENUM_ENT(EF_MIPS_PIC, "pic"),
+  ENUM_ENT(EF_MIPS_CPIC, "cpic"),
+  ENUM_ENT(EF_MIPS_ABI2, "abi2"),
+  ENUM_ENT(EF_MIPS_32BITMODE, "32bitmode"),
+  ENUM_ENT(EF_MIPS_FP64, "fp64"),
+  ENUM_ENT(EF_MIPS_NAN2008, "nan2008"),
+  ENUM_ENT(EF_MIPS_ABI_O32, "o32"),
+  ENUM_ENT(EF_MIPS_ABI_O64, "o64"),
+  ENUM_ENT(EF_MIPS_ABI_EABI32, "eabi32"),
+  ENUM_ENT(EF_MIPS_ABI_EABI64, "eabi64"),
+  ENUM_ENT(EF_MIPS_MACH_3900, "3900"),
+  ENUM_ENT(EF_MIPS_MACH_4010, "4010"),
+  ENUM_ENT(EF_MIPS_MACH_4100, "4100"),
+  ENUM_ENT(EF_MIPS_MACH_4650, "4650"),
+  ENUM_ENT(EF_MIPS_MACH_4120, "4120"),
+  ENUM_ENT(EF_MIPS_MACH_4111, "4111"),
+  ENUM_ENT(EF_MIPS_MACH_SB1, "sb1"),
+  ENUM_ENT(EF_MIPS_MACH_OCTEON, "octeon"),
+  ENUM_ENT(EF_MIPS_MACH_XLR, "xlr"),
+  ENUM_ENT(EF_MIPS_MACH_OCTEON2, "octeon2"),
+  ENUM_ENT(EF_MIPS_MACH_OCTEON3, "octeon3"),
+  ENUM_ENT(EF_MIPS_MACH_5400, "5400"),
+  ENUM_ENT(EF_MIPS_MACH_5900, "5900"),
+  ENUM_ENT(EF_MIPS_MACH_5500, "5500"),
+  ENUM_ENT(EF_MIPS_MACH_9000, "9000"),
+  ENUM_ENT(EF_MIPS_MACH_LS2E, "loongson-2e"),
+  ENUM_ENT(EF_MIPS_MACH_LS2F, "loongson-2f"),
+  ENUM_ENT(EF_MIPS_MACH_LS3A, "loongson-3a"),
+  ENUM_ENT(EF_MIPS_MICROMIPS, "micromips"),
+  ENUM_ENT(EF_MIPS_ARCH_ASE_M16, "mips16"),
+  ENUM_ENT(EF_MIPS_ARCH_ASE_MDMX, "mdmx"),
+  ENUM_ENT(EF_MIPS_ARCH_1, "mips1"),
+  ENUM_ENT(EF_MIPS_ARCH_2, "mips2"),
+  ENUM_ENT(EF_MIPS_ARCH_3, "mips3"),
+  ENUM_ENT(EF_MIPS_ARCH_4, "mips4"),
+  ENUM_ENT(EF_MIPS_ARCH_5, "mips5"),
+  ENUM_ENT(EF_MIPS_ARCH_32, "mips32"),
+  ENUM_ENT(EF_MIPS_ARCH_64, "mips64"),
+  ENUM_ENT(EF_MIPS_ARCH_32R2, "mips32r2"),
+  ENUM_ENT(EF_MIPS_ARCH_64R2, "mips64r2"),
+  ENUM_ENT(EF_MIPS_ARCH_32R6, "mips32r6"),
+  ENUM_ENT(EF_MIPS_ARCH_64R6, "mips64r6")
 };
 
 static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
@@ -1325,15 +1355,17 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX902),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904),
   LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK)
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909),
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK),
+  LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_SRAM_ECC)
 };
 
 static const EnumEntry<unsigned> ElfHeaderRISCVFlags[] = {
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_RVC),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_FLOAT_ABI_SINGLE),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_FLOAT_ABI_DOUBLE),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_FLOAT_ABI_QUAD),
-  LLVM_READOBJ_ENUM_ENT(ELF, EF_RISCV_RVE)
+  ENUM_ENT(EF_RISCV_RVC, "RVC"),
+  ENUM_ENT(EF_RISCV_FLOAT_ABI_SINGLE, "single-float ABI"),
+  ENUM_ENT(EF_RISCV_FLOAT_ABI_DOUBLE, "double-float ABI"),
+  ENUM_ENT(EF_RISCV_FLOAT_ABI_QUAD, "quad-float ABI"),
+  ENUM_ENT(EF_RISCV_RVE, "RVE")
 };
 
 static const EnumEntry<unsigned> ElfSymOtherFlags[] = {
@@ -1423,7 +1455,7 @@ ELFDumper<ELFT>::ELFDumper(const ELFFile<ELFT> *Obj, ScopedPrinter &Writer)
       break;
     case ELF::SHT_LLVM_CALL_GRAPH_PROFILE:
       if (DotCGProfileSec != nullptr)
-        reportError("Multiple .note.llvm.cgprofile");
+        reportError("Multiple .llvm.call-graph-profile");
       DotCGProfileSec = &Sec;
       break;
     case ELF::SHT_LLVM_ADDRSIG:
@@ -1610,29 +1642,32 @@ static const char *getTypeString(unsigned Arch, uint64_t Type) {
   case EM_HEXAGON:
     switch (Type) {
 #define HEXAGON_DYNAMIC_TAG(name, value)                                       \
-  case DT_##name:                                                              \
-    return #name;
+    case DT_##name:                                                            \
+      return #name;
 #include "llvm/BinaryFormat/DynamicTags.def"
 #undef HEXAGON_DYNAMIC_TAG
     }
+    break;
 
   case EM_MIPS:
     switch (Type) {
 #define MIPS_DYNAMIC_TAG(name, value)                                          \
-  case DT_##name:                                                              \
-    return #name;
+    case DT_##name:                                                            \
+      return #name;
 #include "llvm/BinaryFormat/DynamicTags.def"
 #undef MIPS_DYNAMIC_TAG
     }
+    break;
 
-    case EM_PPC64:
-      switch(Type) {
+  case EM_PPC64:
+    switch(Type) {
 #define PPC64_DYNAMIC_TAG(name, value)                                         \
     case DT_##name:                                                            \
       return #name;
 #include "llvm/BinaryFormat/DynamicTags.def"
 #undef PPC64_DYNAMIC_TAG
     }
+    break;
   }
 #undef DYNAMIC_TAG
   switch (Type) {
@@ -2517,7 +2552,17 @@ template <class ELFT> void GNUStyle<ELFT>::printFileHeaders(const ELFO *Obj) {
   printFields(OS, "Start of program headers:", Str);
   Str = to_string(e->e_shoff) + " (bytes into file)";
   printFields(OS, "Start of section headers:", Str);
+  std::string ElfFlags;
+  if (e->e_machine == EM_MIPS)
+    ElfFlags =
+        printFlags(e->e_flags, makeArrayRef(ElfHeaderMipsFlags),
+                   unsigned(ELF::EF_MIPS_ARCH), unsigned(ELF::EF_MIPS_ABI),
+                   unsigned(ELF::EF_MIPS_MACH));
+  else if (e->e_machine == EM_RISCV)
+    ElfFlags = printFlags(e->e_flags, makeArrayRef(ElfHeaderRISCVFlags));
   Str = "0x" + to_hexString(e->e_flags);
+  if (!ElfFlags.empty())
+    Str = Str + ", " + ElfFlags;
   printFields(OS, "Flags:", Str);
   Str = to_string(e->e_ehsize) + " (bytes)";
   printFields(OS, "Size of this header:", Str);
@@ -2791,11 +2836,13 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
     case SHT_ARM_OVERLAYSECTION:
       return "ARM_OVERLAYSECTION";
     }
+    break;
   case EM_X86_64:
     switch (Type) {
     case SHT_X86_64_UNWIND:
       return "X86_64_UNWIND";
     }
+    break;
   case EM_MIPS:
   case EM_MIPS_RS3_LE:
     switch (Type) {
@@ -2808,6 +2855,7 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
     case SHT_MIPS_DWARF:
       return "SHT_MIPS_DWARF";
     }
+    break;
   }
   switch (Type) {
   case SHT_NULL:
@@ -3607,40 +3655,41 @@ static std::string getAMDGPUNoteTypeName(const uint32_t NT) {
 }
 
 template <typename ELFT>
-static void printGNUProperty(raw_ostream &OS, uint32_t Type, uint32_t DataSize,
-                             ArrayRef<uint8_t> Data) {
+static std::string getGNUProperty(uint32_t Type, uint32_t DataSize,
+                                  ArrayRef<uint8_t> Data) {
+  std::string str;
+  raw_string_ostream OS(str);
   switch (Type) {
   default:
-    OS << format("    <application-specific type 0x%x>\n", Type);
-    return;
+    OS << format("<application-specific type 0x%x>", Type);
+    return OS.str();
   case GNU_PROPERTY_STACK_SIZE: {
-    OS << "    stack size: ";
+    OS << "stack size: ";
     if (DataSize == sizeof(typename ELFT::uint))
-      OS << format("0x%llx\n",
-                   (uint64_t)(*(const typename ELFT::Addr *)Data.data()));
+      OS << formatv("{0:x}",
+                    (uint64_t)(*(const typename ELFT::Addr *)Data.data()));
     else
-      OS << format("<corrupt length: 0x%x>\n", DataSize);
-    break;
+      OS << format("<corrupt length: 0x%x>", DataSize);
+    return OS.str();
   }
   case GNU_PROPERTY_NO_COPY_ON_PROTECTED:
-    OS << "    no copy on protected";
+    OS << "no copy on protected";
     if (DataSize)
       OS << format(" <corrupt length: 0x%x>", DataSize);
-    OS << "\n";
-    break;
+    return OS.str();
   case GNU_PROPERTY_X86_FEATURE_1_AND:
-    OS << "    X86 features: ";
+    OS << "X86 features: ";
     if (DataSize != 4 && DataSize != 8) {
-      OS << format("<corrupt length: 0x%x>\n", DataSize);
-      break;
+      OS << format("<corrupt length: 0x%x>", DataSize);
+      return OS.str();
     }
     uint64_t CFProtection =
         (DataSize == 4)
             ? support::endian::read32<ELFT::TargetEndianness>(Data.data())
             : support::endian::read64<ELFT::TargetEndianness>(Data.data());
     if (CFProtection == 0) {
-      OS << "none\n";
-      break;
+      OS << "none";
+      return OS.str();
     }
     if (CFProtection & GNU_PROPERTY_X86_FEATURE_1_IBT) {
       OS << "IBT";
@@ -3656,105 +3705,144 @@ static void printGNUProperty(raw_ostream &OS, uint32_t Type, uint32_t DataSize,
     }
     if (CFProtection)
       OS << format("<unknown flags: 0x%llx>", CFProtection);
-    OS << "\n";
-    break;
+    return OS.str();
   }
 }
 
 template <typename ELFT>
-static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
-                         ArrayRef<typename ELFT::Word> Words, size_t Size) {
+static SmallVector<std::string, 4>
+getGNUPropertyList(ArrayRef<typename ELFT::Word> Words) {
   using Elf_Word = typename ELFT::Word;
 
+  SmallVector<std::string, 4> Properties;
+  ArrayRef<uint8_t> Arr(reinterpret_cast<const uint8_t *>(Words.data()),
+                        Words.size());
+  while (Arr.size() >= 8) {
+    uint32_t Type = *reinterpret_cast<const Elf_Word *>(Arr.data());
+    uint32_t DataSize = *reinterpret_cast<const Elf_Word *>(Arr.data() + 4);
+    Arr = Arr.drop_front(8);
+
+    // Take padding size into account if present.
+    uint64_t PaddedSize = alignTo(DataSize, sizeof(typename ELFT::uint));
+    std::string str;
+    raw_string_ostream OS(str);
+    if (Arr.size() < PaddedSize) {
+      OS << format("<corrupt type (0x%x) datasz: 0x%x>", Type, DataSize);
+      Properties.push_back(OS.str());
+      break;
+    }
+    Properties.push_back(
+        getGNUProperty<ELFT>(Type, DataSize, Arr.take_front(PaddedSize)));
+    Arr = Arr.drop_front(PaddedSize);
+  }
+
+  if (!Arr.empty())
+    Properties.push_back("<corrupted GNU_PROPERTY_TYPE_0>");
+
+  return Properties;
+}
+
+struct GNUAbiTag {
+  std::string OSName;
+  std::string ABI;
+  bool IsValid;
+};
+
+template <typename ELFT>
+static GNUAbiTag getGNUAbiTag(ArrayRef<typename ELFT::Word> Words) {
+  if (Words.size() < 4)
+    return {"", "", /*IsValid=*/false};
+
+  static const char *OSNames[] = {
+      "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", "NaCl",
+  };
+  StringRef OSName = "Unknown";
+  if (Words[0] < array_lengthof(OSNames))
+    OSName = OSNames[Words[0]];
+  uint32_t Major = Words[1], Minor = Words[2], Patch = Words[3];
+  std::string str;
+  raw_string_ostream ABI(str);
+  ABI << Major << "." << Minor << "." << Patch;
+  return {OSName, ABI.str(), /*IsValid=*/true};
+}
+
+template <typename ELFT>
+static std::string getGNUBuildId(ArrayRef<typename ELFT::Word> Words) {
+  std::string str;
+  raw_string_ostream OS(str);
+  ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()),
+                       Words.size());
+  for (const auto &B : ID)
+    OS << format_hex_no_prefix(B, 2);
+  return OS.str();
+}
+
+template <typename ELFT>
+static StringRef getGNUGoldVersion(ArrayRef<typename ELFT::Word> Words) {
+  return StringRef(reinterpret_cast<const char *>(Words.data()), Words.size());
+}
+
+template <typename ELFT>
+static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
+                         ArrayRef<typename ELFT::Word> Words) {
   switch (NoteType) {
   default:
     return;
   case ELF::NT_GNU_ABI_TAG: {
-    static const char *OSNames[] = {
-        "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", "NaCl",
-    };
-
-    StringRef OSName = "Unknown";
-    if (Words[0] < array_lengthof(OSNames))
-      OSName = OSNames[Words[0]];
-    uint32_t Major = Words[1], Minor = Words[2], Patch = Words[3];
-
-    if (Words.size() < 4)
+    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Words);
+    if (!AbiTag.IsValid)
       OS << "    <corrupt GNU_ABI_TAG>";
     else
-      OS << "    OS: " << OSName << ", ABI: " << Major << "." << Minor << "."
-         << Patch;
+      OS << "    OS: " << AbiTag.OSName << ", ABI: " << AbiTag.ABI;
     break;
   }
   case ELF::NT_GNU_BUILD_ID: {
-    OS << "    Build ID: ";
-    ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()), Size);
-    for (const auto &B : ID)
-      OS << format_hex_no_prefix(B, 2);
+    OS << "    Build ID: " << getGNUBuildId<ELFT>(Words);
     break;
   }
   case ELF::NT_GNU_GOLD_VERSION:
-    OS << "    Version: "
-       << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
+    OS << "    Version: " << getGNUGoldVersion<ELFT>(Words);
     break;
   case ELF::NT_GNU_PROPERTY_TYPE_0:
     OS << "    Properties:";
-
-    ArrayRef<uint8_t> Arr(reinterpret_cast<const uint8_t *>(Words.data()),
-                          Size);
-    while (Arr.size() >= 8) {
-      uint32_t Type = *reinterpret_cast<const Elf_Word *>(Arr.data());
-      uint32_t DataSize = *reinterpret_cast<const Elf_Word *>(Arr.data() + 4);
-      Arr = Arr.drop_front(8);
-
-      // Take padding size into account if present.
-      uint64_t PaddedSize = alignTo(DataSize, sizeof(typename ELFT::uint));
-      if (Arr.size() < PaddedSize) {
-        OS << format("    <corrupt type (0x%x) datasz: 0x%x>\n", Type,
-                     DataSize);
-        break;
-      }
-      printGNUProperty<ELFT>(OS, Type, DataSize, Arr.take_front(PaddedSize));
-      Arr = Arr.drop_front(PaddedSize);
-    }
-
-    if (!Arr.empty())
-      OS << "    <corrupted GNU_PROPERTY_TYPE_0>";
+    for (const auto &Property : getGNUPropertyList<ELFT>(Words))
+      OS << "    " << Property << "\n";
     break;
   }
   OS << '\n';
 }
 
+struct AMDGPUNote {
+  std::string type;
+  std::string value;
+};
+
 template <typename ELFT>
-static void printAMDGPUNote(raw_ostream &OS, uint32_t NoteType,
-                            ArrayRef<typename ELFT::Word> Words, size_t Size) {
+static AMDGPUNote getAMDGPUNote(uint32_t NoteType,
+                                ArrayRef<typename ELFT::Word> Words) {
   switch (NoteType) {
   default:
-    return;
-    case ELF::NT_AMD_AMDGPU_HSA_METADATA:
-      OS << "    HSA Metadata:\n"
-         << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
-      break;
-    case ELF::NT_AMD_AMDGPU_ISA:
-      OS << "    ISA Version:\n"
-         << "        "
-         << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
-      break;
-    case ELF::NT_AMD_AMDGPU_PAL_METADATA:
-      const uint32_t *PALMetadataBegin = reinterpret_cast<const uint32_t *>(Words.data());
-      const uint32_t *PALMetadataEnd = PALMetadataBegin + Size;
-      std::vector<uint32_t> PALMetadata(PALMetadataBegin, PALMetadataEnd);
-      std::string PALMetadataString;
-      auto Error = AMDGPU::PALMD::toString(PALMetadata, PALMetadataString);
-      OS << "    PAL Metadata:\n";
-      if (Error) {
-        OS << "        Invalid";
-        return;
-      }
-      OS << PALMetadataString;
-      break;
+    return {"", ""};
+  case ELF::NT_AMD_AMDGPU_HSA_METADATA:
+    return {"HSA Metadata",
+            std::string(reinterpret_cast<const char *>(Words.data()),
+                        Words.size())};
+  case ELF::NT_AMD_AMDGPU_ISA:
+    return {"ISA Version",
+            std::string(reinterpret_cast<const char *>(Words.data()),
+                        Words.size())};
+  case ELF::NT_AMD_AMDGPU_PAL_METADATA:
+    const uint32_t *PALMetadataBegin =
+        reinterpret_cast<const uint32_t *>(Words.data());
+    const uint32_t *PALMetadataEnd = PALMetadataBegin + Words.size();
+    std::vector<uint32_t> PALMetadata(PALMetadataBegin, PALMetadataEnd);
+    std::string PALMetadataString;
+    auto Error = AMDGPU::PALMD::toString(PALMetadata, PALMetadataString);
+    if (Error) {
+      return {"PAL Metadata", "Invalid"};
+    }
+    return {"PAL Metadata", PALMetadataString};
   }
-  OS.flush();
 }
 
 template <class ELFT>
@@ -3779,12 +3867,14 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
 
     if (Name == "GNU") {
       OS << getGNUNoteTypeName(Type) << '\n';
-      printGNUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+      printGNUNote<ELFT>(OS, Type, Descriptor);
     } else if (Name == "FreeBSD") {
       OS << getFreeBSDNoteTypeName(Type) << '\n';
     } else if (Name == "AMD") {
       OS << getAMDGPUNoteTypeName(Type) << '\n';
-      printAMDGPUNote<ELFT>(OS, Type, Descriptor, Descriptor.size());
+      const AMDGPUNote N = getAMDGPUNote<ELFT>(Type, Descriptor);
+      if (!N.type.empty())
+        OS << "    " << N.type << ":\n        " << N.value << '\n';
     } else {
       OS << "Unknown note type: (" << format_hex(Type, 10) << ')';
     }
@@ -4388,9 +4478,98 @@ void LLVMStyle<ELFT>::printAddrsig(const ELFFile<ELFT> *Obj) {
   }
 }
 
+template <typename ELFT>
+static void printGNUNoteLLVMStyle(uint32_t NoteType,
+                                  ArrayRef<typename ELFT::Word> Words,
+                                  ScopedPrinter &W) {
+  switch (NoteType) {
+  default:
+    return;
+  case ELF::NT_GNU_ABI_TAG: {
+    const GNUAbiTag &AbiTag = getGNUAbiTag<ELFT>(Words);
+    if (!AbiTag.IsValid) {
+      W.printString("ABI", "<corrupt GNU_ABI_TAG>");
+    } else {
+      W.printString("OS", AbiTag.OSName);
+      W.printString("ABI", AbiTag.ABI);
+    }
+    break;
+  }
+  case ELF::NT_GNU_BUILD_ID: {
+    W.printString("Build ID", getGNUBuildId<ELFT>(Words));
+    break;
+  }
+  case ELF::NT_GNU_GOLD_VERSION:
+    W.printString("Version", getGNUGoldVersion<ELFT>(Words));
+    break;
+  case ELF::NT_GNU_PROPERTY_TYPE_0:
+    ListScope D(W, "Property");
+    for (const auto &Property : getGNUPropertyList<ELFT>(Words))
+      W.printString(Property);
+    break;
+  }
+}
+
 template <class ELFT>
 void LLVMStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
-  W.startLine() << "printNotes not implemented!\n";
+  ListScope L(W, "Notes");
+  const Elf_Ehdr *e = Obj->getHeader();
+  bool IsCore = e->e_type == ELF::ET_CORE;
+
+  auto PrintHeader = [&](const typename ELFT::Off Offset,
+                         const typename ELFT::Addr Size) {
+    W.printHex("Offset", Offset);
+    W.printHex("Size", Size);
+  };
+
+  auto ProcessNote = [&](const Elf_Note &Note) {
+    DictScope D2(W, "Note");
+    StringRef Name = Note.getName();
+    ArrayRef<Elf_Word> Descriptor = Note.getDesc();
+    Elf_Word Type = Note.getType();
+
+    W.printString("Owner", Name);
+    W.printHex("Data size", Descriptor.size());
+    if (Name == "GNU") {
+      W.printString("Type", getGNUNoteTypeName(Type));
+      printGNUNoteLLVMStyle<ELFT>(Type, Descriptor, W);
+    } else if (Name == "FreeBSD") {
+      W.printString("Type", getFreeBSDNoteTypeName(Type));
+    } else if (Name == "AMD") {
+      W.printString("Type", getAMDGPUNoteTypeName(Type));
+      const AMDGPUNote N = getAMDGPUNote<ELFT>(Type, Descriptor);
+      if (!N.type.empty())
+        W.printString(N.type, N.value);
+    } else {
+      W.getOStream() << "Unknown note type: (" << format_hex(Type, 10) << ')';
+    }
+  };
+
+  if (IsCore) {
+    for (const auto &P : unwrapOrError(Obj->program_headers())) {
+      if (P.p_type != PT_NOTE)
+        continue;
+      DictScope D(W, "NoteSection");
+      PrintHeader(P.p_offset, P.p_filesz);
+      Error Err = Error::success();
+      for (const auto &Note : Obj->notes(P, Err))
+        ProcessNote(Note);
+      if (Err)
+        error(std::move(Err));
+    }
+  } else {
+    for (const auto &S : unwrapOrError(Obj->sections())) {
+      if (S.sh_type != SHT_NOTE)
+        continue;
+      DictScope D(W, "NoteSection");
+      PrintHeader(S.sh_offset, S.sh_size);
+      Error Err = Error::success();
+      for (const auto &Note : Obj->notes(S, Err))
+        ProcessNote(Note);
+      if (Err)
+        error(std::move(Err));
+    }
+  }
 }
 
 template <class ELFT>
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 54db1ec113fc37c0e961f2722cd51228a5ed5117..6ef282365745a7d2122cdd976a0b613ca2f91ceb 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -88,25 +88,30 @@ CheckFiles("check",
            cl::desc("File containing RuntimeDyld verifier checks."),
            cl::ZeroOrMore);
 
-static cl::opt<uint64_t>
+// Tracking BUG: 19665
+// http://llvm.org/bugs/show_bug.cgi?id=19665
+//
+// Do not change these options to cl::opt<uint64_t> since this silently breaks
+// argument parsing.
+static cl::opt<unsigned long long>
 PreallocMemory("preallocate",
               cl::desc("Allocate memory upfront rather than on-demand"),
               cl::init(0));
 
-static cl::opt<uint64_t>
+static cl::opt<unsigned long long>
 TargetAddrStart("target-addr-start",
                 cl::desc("For -verify only: start of phony target address "
                          "range."),
                 cl::init(4096), // Start at "page 1" - no allocating at "null".
                 cl::Hidden);
 
-static cl::opt<uint64_t>
+static cl::opt<unsigned long long>
 TargetAddrEnd("target-addr-end",
               cl::desc("For -verify only: end of phony target address range."),
               cl::init(~0ULL),
               cl::Hidden);
 
-static cl::opt<uint64_t>
+static cl::opt<unsigned long long>
 TargetSectionSep("target-section-sep",
                  cl::desc("For -verify only: Separation between sections in "
                           "phony target address space."),
@@ -577,7 +582,11 @@ static void remapSectionsAndSymbols(const llvm::Triple &TargetTriple,
     if (LoadAddr &&
         *LoadAddr != static_cast<uint64_t>(
                        reinterpret_cast<uintptr_t>(Tmp->first))) {
-      AlreadyAllocated[*LoadAddr] = Tmp->second;
+      // A section will have a LoadAddr of 0 if it wasn't loaded for whatever
+      // reason (e.g. zero byte COFF sections). Don't include those sections in
+      // the allocation map.
+      if (*LoadAddr != 0)
+        AlreadyAllocated[*LoadAddr] = Tmp->second;
       Worklist.erase(Tmp);
     }
   }
diff --git a/tools/llvm-shlib/CMakeLists.txt b/tools/llvm-shlib/CMakeLists.txt
index dab1893c6879b447a4c464639b1a69a74802ae67..187066e5ded245f7d532d6d86accbf794afc59be 100644
--- a/tools/llvm-shlib/CMakeLists.txt
+++ b/tools/llvm-shlib/CMakeLists.txt
@@ -44,6 +44,7 @@ if(LLVM_BUILD_LLVM_DYLIB)
   list(REMOVE_DUPLICATES LIB_NAMES)
   if(("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux") OR (MINGW) OR (HAIKU)
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "FreeBSD")
+     OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "GNU")
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "OpenBSD")
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "Fuchsia")
      OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "DragonFly")
diff --git a/tools/llvm-size/llvm-size.cpp b/tools/llvm-size/llvm-size.cpp
index ed53bacc7c39317a33297a2348f8bb79b590f4a7..ad1aefcafcbdbfd9977f933658b42c867d9f5351 100644
--- a/tools/llvm-size/llvm-size.cpp
+++ b/tools/llvm-size/llvm-size.cpp
@@ -71,9 +71,11 @@ ArchFlags("arch", cl::desc("architecture(s) from a Mach-O file to dump"),
 static bool ArchAll = false;
 
 enum RadixTy { octal = 8, decimal = 10, hexadecimal = 16 };
-static cl::opt<unsigned int>
-Radix("radix", cl::desc("Print size in radix. Only 8, 10, and 16 are valid"),
-      cl::init(decimal));
+static cl::opt<RadixTy> Radix(
+    "radix", cl::desc("Print size in radix"), cl::init(decimal),
+    cl::values(clEnumValN(octal, "8", "Print size in octal"),
+               clEnumValN(decimal, "10", "Print size in decimal"),
+               clEnumValN(hexadecimal, "16", "Print size in hexadecimal")));
 
 static cl::opt<RadixTy>
 RadixShort(cl::desc("Print size in radix:"),
@@ -865,7 +867,7 @@ int main(int argc, char **argv) {
   if (OutputFormatShort.getNumOccurrences())
     OutputFormat = static_cast<OutputFormatTy>(OutputFormatShort);
   if (RadixShort.getNumOccurrences())
-    Radix = RadixShort;
+    Radix = RadixShort.getValue();
 
   for (unsigned i = 0; i < ArchFlags.size(); ++i) {
     if (ArchFlags[i] == "all") {
diff --git a/tools/llvm-strings/llvm-strings.cpp b/tools/llvm-strings/llvm-strings.cpp
index 8e2d213bcc73c126297d8558bf43b632ad595c2b..c355caf899d55b557313ba53b18ee2788a21a3f5 100644
--- a/tools/llvm-strings/llvm-strings.cpp
+++ b/tools/llvm-strings/llvm-strings.cpp
@@ -80,7 +80,7 @@ static void strings(raw_ostream &OS, StringRef FileName, StringRef Contents) {
   const char *B = Contents.begin();
   const char *P = nullptr, *E = nullptr, *S = nullptr;
   for (P = Contents.begin(), E = Contents.end(); P < E; ++P) {
-    if (std::isgraph(*P) || std::isblank(*P)) {
+    if (isPrint(*P) || *P == '\t') {
       if (S == nullptr)
         S = P;
     } else if (S) {
diff --git a/tools/llvm-xray/xray-account.cpp b/tools/llvm-xray/xray-account.cpp
index 93bb271b32805ba64b54049ef36bf94fd245457e..3f01605fd85242e5f3246d5ca1fe7a470cb27bdd 100644
--- a/tools/llvm-xray/xray-account.cpp
+++ b/tools/llvm-xray/xray-account.cpp
@@ -146,6 +146,10 @@ bool LatencyAccountant::accountRecord(const XRayRecord &Record) {
 
   auto &ThreadStack = PerThreadFunctionStack[Record.TId];
   switch (Record.Type) {
+  case RecordTypes::CUSTOM_EVENT:
+  case RecordTypes::TYPED_EVENT:
+    // TODO: Support custom and typed event accounting in the future.
+    return true;
   case RecordTypes::ENTER:
   case RecordTypes::ENTER_ARG: {
     ThreadStack.emplace_back(Record.FuncId, Record.TSC);
@@ -417,19 +421,25 @@ namespace llvm {
 template <> struct format_provider<llvm::xray::RecordTypes> {
   static void format(const llvm::xray::RecordTypes &T, raw_ostream &Stream,
                      StringRef Style) {
-    switch(T) {
-      case RecordTypes::ENTER:
-        Stream << "enter";
-        break;
-      case RecordTypes::ENTER_ARG:
-        Stream << "enter-arg";
-        break;
-      case RecordTypes::EXIT:
-        Stream << "exit";
-        break;
-      case RecordTypes::TAIL_EXIT:
-        Stream << "tail-exit";
-        break;
+    switch (T) {
+    case RecordTypes::ENTER:
+      Stream << "enter";
+      break;
+    case RecordTypes::ENTER_ARG:
+      Stream << "enter-arg";
+      break;
+    case RecordTypes::EXIT:
+      Stream << "exit";
+      break;
+    case RecordTypes::TAIL_EXIT:
+      Stream << "tail-exit";
+      break;
+    case RecordTypes::CUSTOM_EVENT:
+      Stream << "custom-event";
+      break;
+    case RecordTypes::TYPED_EVENT:
+      Stream << "typed-event";
+      break;
     }
   }
 };
diff --git a/tools/llvm-xray/xray-converter.cpp b/tools/llvm-xray/xray-converter.cpp
index 1faa49cf4314f4dd78342239ccb091781c76c9ea..3f153b99bc9366adfc710f3cc518bbcd678602f7 100644
--- a/tools/llvm-xray/xray-converter.cpp
+++ b/tools/llvm-xray/xray-converter.cpp
@@ -92,9 +92,10 @@ void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) {
     Trace.Records.push_back({R.RecordType, R.CPU, R.Type, R.FuncId,
                              Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId)
                                        : llvm::to_string(R.FuncId),
-                             R.TSC, R.TId, R.PId, R.CallArgs});
+                             R.TSC, R.TId, R.PId, R.CallArgs, R.Data});
   }
   Output Out(OS, nullptr, 0);
+  Out.setWriteDefaultValues(false);
   Out << Trace;
 }
 
@@ -123,21 +124,27 @@ void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
   // Then write out the rest of the records, still in an endian-appropriate
   // format.
   for (const auto &R : Records) {
-    Writer.write(R.RecordType);
-    // The on disk naive raw format uses 8 bit CPUs, but the record has 16.
-    // There's no choice but truncation.
-    Writer.write(static_cast<uint8_t>(R.CPU));
     switch (R.Type) {
     case RecordTypes::ENTER:
     case RecordTypes::ENTER_ARG:
+      Writer.write(R.RecordType);
+      Writer.write(static_cast<uint8_t>(R.CPU));
       Writer.write(uint8_t{0});
       break;
     case RecordTypes::EXIT:
+      Writer.write(R.RecordType);
+      Writer.write(static_cast<uint8_t>(R.CPU));
       Writer.write(uint8_t{1});
       break;
     case RecordTypes::TAIL_EXIT:
+      Writer.write(R.RecordType);
+      Writer.write(static_cast<uint8_t>(R.CPU));
       Writer.write(uint8_t{2});
       break;
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      // Skip custom and typed event records for v1 logs.
+      continue;
     }
     Writer.write(R.FuncId);
     Writer.write(R.TSC);
@@ -264,6 +271,10 @@ void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records,
     double EventTimestampUs = double(1000000) / CycleFreq * double(R.TSC);
     StackTrieNode *&StackCursor = StackCursorByThreadId[R.TId];
     switch (R.Type) {
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      // TODO: Support typed and custom event rendering on Chrome Trace Viewer.
+      break;
     case RecordTypes::ENTER:
     case RecordTypes::ENTER_ARG:
       StackCursor = findOrCreateStackNode(StackCursor, R.FuncId, R.TId,
diff --git a/tools/llvm-xray/xray-graph.cpp b/tools/llvm-xray/xray-graph.cpp
index c619bf86299bf807401b0a26732f82f5edb147d7..fe49cca20d577305ddea4f5dee57bfb49befc5cc 100644
--- a/tools/llvm-xray/xray-graph.cpp
+++ b/tools/llvm-xray/xray-graph.cpp
@@ -246,6 +246,10 @@ Error GraphRenderer::accountRecord(const XRayRecord &Record) {
     updateStat(G[Record.FuncId].S, D);
     break;
   }
+  case RecordTypes::CUSTOM_EVENT:
+  case RecordTypes::TYPED_EVENT:
+    // TODO: Support custom and typed events in the graph processing?
+    break;
   }
 
   return Error::success();
diff --git a/tools/llvm-xray/xray-stacks.cpp b/tools/llvm-xray/xray-stacks.cpp
index 1a6069780a31880ca04e54366c579df7984b9994..059940b7756ff2b428d85c352c050118bccce6dc 100644
--- a/tools/llvm-xray/xray-stacks.cpp
+++ b/tools/llvm-xray/xray-stacks.cpp
@@ -366,6 +366,9 @@ public:
                                     AccountRecordState *state) {
     auto &TS = ThreadStackMap[R.TId];
     switch (R.Type) {
+    case RecordTypes::CUSTOM_EVENT:
+    case RecordTypes::TYPED_EVENT:
+      return AccountRecordStatus::OK;
     case RecordTypes::ENTER:
     case RecordTypes::ENTER_ARG: {
       state->wasLastRecordExit = false;
diff --git a/tools/opt-remarks/CMakeLists.txt b/tools/opt-remarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a87beae1e8931505dfcb3e1a516dfb76db9fcd2f
--- /dev/null
+++ b/tools/opt-remarks/CMakeLists.txt
@@ -0,0 +1,22 @@
+set(LLVM_LINK_COMPONENTS
+  OptRemarks
+  )
+
+set(SOURCES
+  liboptremarks.cpp
+  )
+
+set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/OptRemarks.exports)
+
+add_llvm_library(OptRemarks SHARED ${SOURCES})
+
+install(FILES ${LLVM_MAIN_INCLUDE_DIR}/llvm-c/OptRemarks.h
+  DESTINATION include/llvm-c
+  COMPONENT OptRemarks)
+
+if (APPLE)
+  set(OPTREMARKS_VERSION ${LLVM_VERSION_MAJOR})
+  set_property(TARGET OptRemarks APPEND_STRING PROPERTY
+              LINK_FLAGS
+              " -compatibility_version 1 -current_version ${OPTREMARKS_VERSION}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}")
+endif()
diff --git a/tools/opt-remarks/OptRemarks.exports b/tools/opt-remarks/OptRemarks.exports
new file mode 100644
index 0000000000000000000000000000000000000000..c3f678d754fc0c54835187f9be75f39a3b3f40d3
--- /dev/null
+++ b/tools/opt-remarks/OptRemarks.exports
@@ -0,0 +1,6 @@
+LLVMOptRemarkParserCreate
+LLVMOptRemarkParserGetNext
+LLVMOptRemarkParserHasError
+LLVMOptRemarkParserGetErrorMessage
+LLVMOptRemarkParserDispose
+LLVMOptRemarkVersion
diff --git a/tools/opt-remarks/liboptremarks.cpp b/tools/opt-remarks/liboptremarks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..13acada06ac2c0015a7a3d7ee200c98120a268ab
--- /dev/null
+++ b/tools/opt-remarks/liboptremarks.cpp
@@ -0,0 +1,18 @@
+//===-liboptremarks.cpp - LLVM Opt-Remarks Shared Library -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Provide a library to work with optimization remarks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+
+extern uint32_t LLVMOptRemarkVersion(void) {
+  return OPT_REMARKS_API_VERSION;
+}
diff --git a/tools/opt/NewPMDriver.cpp b/tools/opt/NewPMDriver.cpp
index 55ca23cd6f32178bfff2a20bf659662dd987b6b0..e2f9a06523a81c608f6f14c5d5ff5bae361c34ea 100644
--- a/tools/opt/NewPMDriver.cpp
+++ b/tools/opt/NewPMDriver.cpp
@@ -108,24 +108,30 @@ static cl::opt<PGOKind> PGOKindFlag(
                           "Use sampled profile to guide PGO.")));
 static cl::opt<std::string> ProfileFile(
     "profile-file", cl::desc("Path to the profile."), cl::Hidden);
+static cl::opt<std::string>
+    ProfileRemappingFile("profile-remapping-file",
+                         cl::desc("Path to the profile remapping file."),
+                         cl::Hidden);
 static cl::opt<bool> DebugInfoForProfiling(
     "new-pm-debug-info-for-profiling", cl::init(false), cl::Hidden,
     cl::desc("Emit special debug info to enable PGO profile generation."));
 /// @}}
 
 template <typename PassManagerT>
-bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
-  if (PipelineText.empty())
+bool tryParsePipelineText(PassBuilder &PB,
+                          const cl::opt<std::string> &PipelineOpt) {
+  if (PipelineOpt.empty())
     return false;
 
   // Verify the pipeline is parseable:
   PassManagerT PM;
-  if (PB.parsePassPipeline(PM, PipelineText))
-    return true;
-
-  errs() << "Could not parse pipeline '" << PipelineText
-         << "'. I'm going to igore it.\n";
-  return false;
+  if (auto Err = PB.parsePassPipeline(PM, PipelineOpt)) {
+    errs() << "Could not parse -" << PipelineOpt.ArgStr
+           << " pipeline: " << toString(std::move(Err))
+           << "... I'm going to ignore it.\n";
+    return false;
+  }
+  return true;
 }
 
 /// If one of the EPPipeline command line options was given, register callbacks
@@ -133,50 +139,61 @@ bool tryParsePipelineText(PassBuilder &PB, StringRef PipelineText) {
 static void registerEPCallbacks(PassBuilder &PB, bool VerifyEachPass,
                                 bool DebugLogging) {
   if (tryParsePipelineText<FunctionPassManager>(PB, PeepholeEPPipeline))
-    PB.registerPeepholeEPCallback([&PB, VerifyEachPass, DebugLogging](
-        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerPeepholeEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse PeepholeEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, PeepholeEPPipeline, VerifyEachPass,
+                                   DebugLogging));
+        });
   if (tryParsePipelineText<LoopPassManager>(PB,
                                             LateLoopOptimizationsEPPipeline))
     PB.registerLateLoopOptimizationsEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
-                               VerifyEachPass, DebugLogging);
+          ExitOnError Err("Unable to parse LateLoopOptimizationsEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, LateLoopOptimizationsEPPipeline,
+                                   VerifyEachPass, DebugLogging));
         });
   if (tryParsePipelineText<LoopPassManager>(PB, LoopOptimizerEndEPPipeline))
-    PB.registerLoopOptimizerEndEPCallback([&PB, VerifyEachPass, DebugLogging](
-        LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerLoopOptimizerEndEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            LoopPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse LoopOptimizerEndEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, LoopOptimizerEndEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB,
                                                 ScalarOptimizerLateEPPipeline))
     PB.registerScalarOptimizerLateEPCallback(
         [&PB, VerifyEachPass, DebugLogging](
             FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-          PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
-                               VerifyEachPass, DebugLogging);
+          ExitOnError Err("Unable to parse ScalarOptimizerLateEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, ScalarOptimizerLateEPPipeline,
+                                   VerifyEachPass, DebugLogging));
         });
   if (tryParsePipelineText<CGSCCPassManager>(PB, CGSCCOptimizerLateEPPipeline))
-    PB.registerCGSCCOptimizerLateEPCallback([&PB, VerifyEachPass, DebugLogging](
-        CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerCGSCCOptimizerLateEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            CGSCCPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse CGSCCOptimizerLateEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, CGSCCOptimizerLateEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB, VectorizerStartEPPipeline))
-    PB.registerVectorizerStartEPCallback([&PB, VerifyEachPass, DebugLogging](
-        FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
-      PB.parsePassPipeline(PM, VectorizerStartEPPipeline, VerifyEachPass,
-                           DebugLogging);
-    });
+    PB.registerVectorizerStartEPCallback(
+        [&PB, VerifyEachPass, DebugLogging](
+            FunctionPassManager &PM, PassBuilder::OptimizationLevel Level) {
+          ExitOnError Err("Unable to parse VectorizerStartEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, VectorizerStartEPPipeline,
+                                   VerifyEachPass, DebugLogging));
+        });
   if (tryParsePipelineText<ModulePassManager>(PB, PipelineStartEPPipeline))
     PB.registerPipelineStartEPCallback(
         [&PB, VerifyEachPass, DebugLogging](ModulePassManager &PM) {
-          PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
-                               DebugLogging);
+          ExitOnError Err("Unable to parse PipelineStartEP pipeline: ");
+          Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline, VerifyEachPass,
+                                   DebugLogging));
         });
 }
 
@@ -200,17 +217,17 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   Optional<PGOOptions> P;
   switch (PGOKindFlag) {
     case InstrGen:
-      P = PGOOptions(ProfileFile, "", "", true);
+      P = PGOOptions(ProfileFile, "", "", "", true);
       break;
     case InstrUse:
-      P = PGOOptions("", ProfileFile, "", false);
+      P = PGOOptions("", ProfileFile, "", ProfileRemappingFile, false);
       break;
     case SampleUse:
-      P = PGOOptions("", "", ProfileFile, false);
+      P = PGOOptions("", "", ProfileFile, ProfileRemappingFile, false);
       break;
     case NoPGO:
       if (DebugInfoForProfiling)
-        P = PGOOptions("", "", "", false, true);
+        P = PGOOptions("", "", "", "", false, true);
       else
         P = None;
   }
@@ -254,8 +271,8 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   // Specially handle the alias analysis manager so that we can register
   // a custom pipeline of AA passes with it.
   AAManager AA;
-  if (!PB.parseAAPipeline(AA, AAPipeline)) {
-    errs() << Arg0 << ": unable to parse AA pipeline description.\n";
+  if (auto Err = PB.parseAAPipeline(AA, AAPipeline)) {
+    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
     return false;
   }
 
@@ -280,8 +297,9 @@ bool llvm::runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
   if (EnableDebugify)
     MPM.addPass(NewPMDebugifyPass());
 
-  if (!PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
-    errs() << Arg0 << ": unable to parse pass pipeline description.\n";
+  if (auto Err =
+          PB.parsePassPipeline(MPM, PassPipeline, VerifyEachPass, DebugPM)) {
+    errs() << Arg0 << ": " << toString(std::move(Err)) << "\n";
     return false;
   }
 
diff --git a/tools/xcode-toolchain/CMakeLists.txt b/tools/xcode-toolchain/CMakeLists.txt
index 0ae5e374fe9f6502e5b2700cb5885a6eddd002ba..6167f5f6bdd774613c2a579d33f7028833b51ee1 100644
--- a/tools/xcode-toolchain/CMakeLists.txt
+++ b/tools/xcode-toolchain/CMakeLists.txt
@@ -100,7 +100,7 @@ add_llvm_install_targets(install-xcode-toolchain
                          PREFIX ${LLVMToolchainDir}/usr/)
 
 if(LLVM_DISTRIBUTION_COMPONENTS)
-  if(CMAKE_CONFIGURATION_TYPES)
+  if(LLVM_ENABLE_IDE)
     message(FATAL_ERROR "LLVM_DISTRIBUTION_COMPONENTS cannot be specified with multi-configuration generators (i.e. Xcode or Visual Studio)")
   endif()
 
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 1212b45fb575d6dd0b8b2b92e6b63bb66f16adc1..b739e857849d8fae1ee83f61c8b2a7ef47a91e55 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -555,6 +555,36 @@ TEST(APFloatTest, MaxNum) {
   EXPECT_EQ(1.0, maxnum(nan, f1).convertToDouble());
 }
 
+TEST(APFloatTest, Minimum) {
+  APFloat f1(1.0);
+  APFloat f2(2.0);
+  APFloat zp(0.0);
+  APFloat zn(-0.0);
+  APFloat nan = APFloat::getNaN(APFloat::IEEEdouble());
+
+  EXPECT_EQ(1.0, minimum(f1, f2).convertToDouble());
+  EXPECT_EQ(1.0, minimum(f2, f1).convertToDouble());
+  EXPECT_EQ(-0.0, minimum(zp, zn).convertToDouble());
+  EXPECT_EQ(-0.0, minimum(zn, zp).convertToDouble());
+  EXPECT_TRUE(std::isnan(minimum(f1, nan).convertToDouble()));
+  EXPECT_TRUE(std::isnan(minimum(nan, f1).convertToDouble()));
+}
+
+TEST(APFloatTest, Maximum) {
+  APFloat f1(1.0);
+  APFloat f2(2.0);
+  APFloat zp(0.0);
+  APFloat zn(-0.0);
+  APFloat nan = APFloat::getNaN(APFloat::IEEEdouble());
+
+  EXPECT_EQ(2.0, maximum(f1, f2).convertToDouble());
+  EXPECT_EQ(2.0, maximum(f2, f1).convertToDouble());
+  EXPECT_EQ(0.0, maximum(zp, zn).convertToDouble());
+  EXPECT_EQ(0.0, maximum(zn, zp).convertToDouble());
+  EXPECT_TRUE(std::isnan(maximum(f1, nan).convertToDouble()));
+  EXPECT_TRUE(std::isnan(maximum(nan, f1).convertToDouble()));
+}
+
 TEST(APFloatTest, Denormal) {
   APFloat::roundingMode rdmd = APFloat::rmNearestTiesToEven;
 
diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index 87f22f6f403e27bdc48d816a3947db65a07052c1..ee9c5dd38000d0fa4759c188a39c1cede191a8e8 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp
@@ -362,6 +362,26 @@ int CountCopyAndMove::Move = 0;
 
 } // anonymous namespace
 
+// Test initializer list construction.
+TEST(DenseMapCustomTest, InitializerList) {
+  DenseMap<int, int> M({{0, 0}, {0, 1}, {1, 2}});
+  EXPECT_EQ(2u, M.size());
+  EXPECT_EQ(1u, M.count(0));
+  EXPECT_EQ(0, M[0]);
+  EXPECT_EQ(1u, M.count(1));
+  EXPECT_EQ(2, M[1]);
+}
+
+// Test initializer list construction.
+TEST(DenseMapCustomTest, EqualityComparison) {
+  DenseMap<int, int> M1({{0, 0}, {1, 2}});
+  DenseMap<int, int> M2({{0, 0}, {1, 2}});
+  DenseMap<int, int> M3({{0, 0}, {1, 3}});
+
+  EXPECT_EQ(M1, M2);
+  EXPECT_NE(M1, M3);
+}
+
 // Test for the default minimum size of a DenseMap
 TEST(DenseMapCustomTest, DefaultMinReservedSizeTest) {
   // IF THIS VALUE CHANGE, please update InitialSizeTest, InitFromIterator, and
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 0247f023dceabada9f4d998a01c0fe9bdcc26666..7368e2ed0e019d743fce2025deda00d722126bb3 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -80,6 +80,14 @@ TYPED_TEST(DenseSetTest, InitializerList) {
   EXPECT_EQ(0u, set.count(3));
 }
 
+TYPED_TEST(DenseSetTest, InitializerListWithNonPowerOfTwoLength) {
+  TypeParam set({1, 2, 3});
+  EXPECT_EQ(3u, set.size());
+  EXPECT_EQ(1u, set.count(1));
+  EXPECT_EQ(1u, set.count(2));
+  EXPECT_EQ(1u, set.count(3));
+}
+
 TYPED_TEST(DenseSetTest, ConstIteratorComparison) {
   TypeParam set({1});
   const TypeParam &cset = set;
@@ -121,6 +129,15 @@ TYPED_TEST(DenseSetTest, FindAsTest) {
   EXPECT_TRUE(set.find_as("d") == set.end());
 }
 
+TYPED_TEST(DenseSetTest, EqualityComparisonTest) {
+  TypeParam set1({1, 2, 3, 4});
+  TypeParam set2({4, 3, 2, 1});
+  TypeParam set3({2, 3, 4, 5});
+
+  EXPECT_EQ(set1, set2);
+  EXPECT_NE(set1, set3);
+}
+
 // Simple class that counts how many moves and copy happens when growing a map
 struct CountCopyAndMove {
   static int Move;
diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index 427d470b61d41a0535735ccb2039f59c2f4dc85a..e65e71fe4855dc83f509dbea2d7d4f17e60d2bb5 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -364,6 +364,23 @@ TEST(STLExtrasTest, ADLTest) {
   EXPECT_EQ(5, count);
 }
 
+TEST(STLExtrasTest, EmptyTest) {
+  std::vector<void*> V;
+  EXPECT_TRUE(llvm::empty(V));
+  V.push_back(nullptr);
+  EXPECT_FALSE(llvm::empty(V));
+
+  std::initializer_list<int> E = {};
+  std::initializer_list<int> NotE = {7, 13, 42};
+  EXPECT_TRUE(llvm::empty(E));
+  EXPECT_FALSE(llvm::empty(NotE));
+
+  auto R0 = make_range(V.begin(), V.begin());
+  EXPECT_TRUE(llvm::empty(R0));
+  auto R1 = make_range(V.begin(), V.end());
+  EXPECT_FALSE(llvm::empty(R1));
+}
+
 TEST(STLExtrasTest, EarlyIncrementTest) {
   std::list<int> L = {1, 2, 3, 4};
 
diff --git a/unittests/ADT/SmallSetTest.cpp b/unittests/ADT/SmallSetTest.cpp
index d78a72b38f8b9ed96c4a873782d08e7e4ad55db9..3391a5c83f5a129b04481235c0631d296f520e07 100644
--- a/unittests/ADT/SmallSetTest.cpp
+++ b/unittests/ADT/SmallSetTest.cpp
@@ -142,8 +142,4 @@ TEST(SmallSetTest, IteratorIncMoveCopy) {
   auto Iter2 = s1.begin();
   Iter = std::move(Iter2);
   EXPECT_EQ("str 0", *Iter);
-
-  auto Iter3 = s1.end();
-  Iter3 = Iter2;
-  EXPECT_EQ(Iter3, Iter2);
 }
diff --git a/unittests/ADT/SparseBitVectorTest.cpp b/unittests/ADT/SparseBitVectorTest.cpp
index 9d6f4f1665d54c102ac3d6c5f6d5c1dbbd7f6879..7675ddac14b670b57330b9de30b315cd7dfcfb88 100644
--- a/unittests/ADT/SparseBitVectorTest.cpp
+++ b/unittests/ADT/SparseBitVectorTest.cpp
@@ -31,6 +31,27 @@ TEST(SparseBitVectorTest, TrivialOperation) {
   EXPECT_TRUE(Vec.test(17));
   Vec.clear();
   EXPECT_FALSE(Vec.test(17));
+
+  Vec.set(5);
+  const SparseBitVector<> ConstVec = Vec;
+  EXPECT_TRUE(ConstVec.test(5));
+  EXPECT_FALSE(ConstVec.test(17));
+
+  Vec.set(1337);
+  EXPECT_TRUE(Vec.test(1337));
+  Vec = ConstVec;
+  EXPECT_FALSE(Vec.test(1337));
+
+  Vec.set(1337);
+  EXPECT_FALSE(Vec.empty());
+  SparseBitVector<> MovedVec(std::move(Vec));
+  EXPECT_TRUE(Vec.empty());
+  EXPECT_TRUE(MovedVec.test(5));
+  EXPECT_TRUE(MovedVec.test(1337));
+
+  Vec = std::move(MovedVec);
+  EXPECT_TRUE(MovedVec.empty());
+  EXPECT_FALSE(Vec.empty());
 }
 
 TEST(SparseBitVectorTest, IntersectWith) {
diff --git a/unittests/Analysis/AliasAnalysisTest.cpp b/unittests/Analysis/AliasAnalysisTest.cpp
index 0f0d44f6c78091e0e2ece672d3d35cd9a56e2c4f..42a4210feba26019aafac24cf398c4d50017fc76 100644
--- a/unittests/Analysis/AliasAnalysisTest.cpp
+++ b/unittests/Analysis/AliasAnalysisTest.cpp
@@ -55,8 +55,8 @@ struct AATestPass : FunctionPass {
 
     for (Value *P1 : Pointers)
       for (Value *P2 : Pointers)
-        (void)AA.alias(P1, MemoryLocation::UnknownSize, P2,
-                       MemoryLocation::UnknownSize);
+        (void)AA.alias(P1, LocationSize::unknown(), P2,
+                       LocationSize::unknown());
 
     return false;
   }
diff --git a/unittests/Analysis/AliasSetTrackerTest.cpp b/unittests/Analysis/AliasSetTrackerTest.cpp
index 886971c4d3a243b0306348eef78ca809c6bed689..57d21e2fcb86a236819550c272765456c914a1fd 100644
--- a/unittests/Analysis/AliasSetTrackerTest.cpp
+++ b/unittests/Analysis/AliasSetTrackerTest.cpp
@@ -78,6 +78,8 @@ TEST(AliasSetTracker, AliasUnknownInst) {
   for (auto &Inst : *Test->begin()) {
     bool FoundAS = false;
     for (AliasSet &AS : AST) {
+      if (!Inst.mayReadOrWriteMemory())
+        continue;
       if (!AS.aliasesUnknownInst(&Inst, AA))
         continue;
       ASSERT_NE(FoundAS, true);
diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
index cf1c072fdc32191b3114766270a719e5e1a2e435..7d4fd33716e03b933cfa5042448a454cd5404d57 100644
--- a/unittests/Analysis/CMakeLists.txt
+++ b/unittests/Analysis/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_unittest(AnalysisTests
   CallGraphTest.cpp
   CFGTest.cpp
   CGSCCPassManagerTest.cpp
+  DivergenceAnalysisTest.cpp
   GlobalsModRefTest.cpp
   ValueLatticeTest.cpp
   LazyCallGraphTest.cpp
diff --git a/unittests/Analysis/DivergenceAnalysisTest.cpp b/unittests/Analysis/DivergenceAnalysisTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97dbd18af51d52137ea153f2750eaa0d063e84a3
--- /dev/null
+++ b/unittests/Analysis/DivergenceAnalysisTest.cpp
@@ -0,0 +1,431 @@
+//===- DivergenceAnalysisTest.cpp - DivergenceAnalysis unit tests ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/SyncDependenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+namespace llvm {
+namespace {
+
+BasicBlock *GetBlockByName(StringRef BlockName, Function &F) {
+  for (auto &BB : F) {
+    if (BB.getName() != BlockName)
+      continue;
+    return &BB;
+  }
+  return nullptr;
+}
+
+// We use this fixture to ensure that we clean up DivergenceAnalysis before
+// deleting the PassManager.
+class DivergenceAnalysisTest : public testing::Test {
+protected:
+  LLVMContext Context;
+  Module M;
+  TargetLibraryInfoImpl TLII;
+  TargetLibraryInfo TLI;
+
+  std::unique_ptr<DominatorTree> DT;
+  std::unique_ptr<PostDominatorTree> PDT;
+  std::unique_ptr<LoopInfo> LI;
+  std::unique_ptr<SyncDependenceAnalysis> SDA;
+
+  DivergenceAnalysisTest() : M("", Context), TLII(), TLI(TLII) {}
+
+  DivergenceAnalysis buildDA(Function &F, bool IsLCSSA) {
+    DT.reset(new DominatorTree(F));
+    PDT.reset(new PostDominatorTree(F));
+    LI.reset(new LoopInfo(*DT));
+    SDA.reset(new SyncDependenceAnalysis(*DT, *PDT, *LI));
+    return DivergenceAnalysis(F, nullptr, *DT, *LI, *SDA, IsLCSSA);
+  }
+
+  void runWithDA(
+      Module &M, StringRef FuncName, bool IsLCSSA,
+      function_ref<void(Function &F, LoopInfo &LI, DivergenceAnalysis &DA)>
+          Test) {
+    auto *F = M.getFunction(FuncName);
+    ASSERT_NE(F, nullptr) << "Could not find " << FuncName;
+    DivergenceAnalysis DA = buildDA(*F, IsLCSSA);
+    Test(*F, *LI, DA);
+  }
+};
+
+// Simple initial state test
+TEST_F(DivergenceAnalysisTest, DAInitialState) {
+  IntegerType *IntTy = IntegerType::getInt32Ty(Context);
+  FunctionType *FTy =
+      FunctionType::get(Type::getVoidTy(Context), {IntTy}, false);
+  Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
+  ReturnInst::Create(Context, nullptr, BB);
+
+  DivergenceAnalysis DA = buildDA(*F, false);
+
+  // Whole function region
+  EXPECT_EQ(DA.getRegionLoop(), nullptr);
+
+  // No divergence in initial state
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  // No spurious divergence
+  DA.compute();
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  // Detected divergence after marking
+  Argument &arg = *F->arg_begin();
+  DA.markDivergent(arg);
+
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+  EXPECT_TRUE(DA.isDivergent(arg));
+
+  DA.compute();
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+  EXPECT_TRUE(DA.isDivergent(arg));
+}
+
+TEST_F(DivergenceAnalysisTest, DANoLCSSA) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define i32 @f_1(i8* nocapture %arr, i32 %n, i32* %A, i32* %B) "
+      "    local_unnamed_addr { "
+      "entry: "
+      "  br label %loop.ph "
+      " "
+      "loop.ph: "
+      "  br label %loop "
+      " "
+      "loop: "
+      "  %iv0 = phi i32 [ %iv0.inc, %loop ], [ 0, %loop.ph ] "
+      "  %iv1 = phi i32 [ %iv1.inc, %loop ], [ -2147483648, %loop.ph ] "
+      "  %iv0.inc = add i32 %iv0, 1 "
+      "  %iv1.inc = add i32 %iv1, 3 "
+      "  %cond.cont = icmp slt i32 %iv0, %n "
+      "  br i1 %cond.cont, label %loop, label %for.end.loopexit "
+      " "
+      "for.end.loopexit: "
+      "  ret i32 %iv0 "
+      "} ",
+      Err, C);
+
+  Function *F = M->getFunction("f_1");
+  DivergenceAnalysis DA = buildDA(*F, false);
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  auto ItArg = F->arg_begin();
+  ItArg++;
+  auto &NArg = *ItArg;
+
+  // Seed divergence in argument %n
+  DA.markDivergent(NArg);
+
+  DA.compute();
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+
+  // Verify that "ret %iv.0" is divergent
+  auto ItBlock = F->begin();
+  std::advance(ItBlock, 3);
+  auto &ExitBlock = *GetBlockByName("for.end.loopexit", *F);
+  auto &RetInst = *cast<ReturnInst>(ExitBlock.begin());
+  EXPECT_TRUE(DA.isDivergent(RetInst));
+}
+
+TEST_F(DivergenceAnalysisTest, DALCSSA) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define i32 @f_lcssa(i8* nocapture %arr, i32 %n, i32* %A, i32* %B) "
+      "    local_unnamed_addr { "
+      "entry: "
+      "  br label %loop.ph "
+      " "
+      "loop.ph: "
+      "  br label %loop "
+      " "
+      "loop: "
+      "  %iv0 = phi i32 [ %iv0.inc, %loop ], [ 0, %loop.ph ] "
+      "  %iv1 = phi i32 [ %iv1.inc, %loop ], [ -2147483648, %loop.ph ] "
+      "  %iv0.inc = add i32 %iv0, 1 "
+      "  %iv1.inc = add i32 %iv1, 3 "
+      "  %cond.cont = icmp slt i32 %iv0, %n "
+      "  br i1 %cond.cont, label %loop, label %for.end.loopexit "
+      " "
+      "for.end.loopexit: "
+      "  %val.ret = phi i32 [ %iv0, %loop ] "
+      "  br label %detached.return "
+      " "
+      "detached.return: "
+      "  ret i32 %val.ret "
+      "} ",
+      Err, C);
+
+  Function *F = M->getFunction("f_lcssa");
+  DivergenceAnalysis DA = buildDA(*F, true);
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  auto ItArg = F->arg_begin();
+  ItArg++;
+  auto &NArg = *ItArg;
+
+  // Seed divergence in argument %n
+  DA.markDivergent(NArg);
+
+  DA.compute();
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+
+  // Verify that "ret %iv.0" is divergent
+  auto ItBlock = F->begin();
+  std::advance(ItBlock, 4);
+  auto &ExitBlock = *GetBlockByName("detached.return", *F);
+  auto &RetInst = *cast<ReturnInst>(ExitBlock.begin());
+  EXPECT_TRUE(DA.isDivergent(RetInst));
+}
+
+TEST_F(DivergenceAnalysisTest, DAJoinDivergence) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define void @f_1(i1 %a, i1 %b, i1 %c) "
+      "    local_unnamed_addr { "
+      "A: "
+      "  br i1 %a, label %B, label %C "
+      " "
+      "B: "
+      "  br i1 %b, label %C, label %D "
+      " "
+      "C: "
+      "  %c.join = phi i32 [ 0, %A ], [ 1, %B ] "
+      "  br i1 %c, label %D, label %E "
+      " "
+      "D: "
+      "  %d.join = phi i32 [ 0, %B ], [ 1, %C ] "
+      "  br label %E "
+      " "
+      "E: "
+      "  %e.join = phi i32 [ 0, %C ], [ 1, %D ] "
+      "  ret void "
+      "} "
+      " "
+      "define void @f_2(i1 %a, i1 %b, i1 %c) "
+      "    local_unnamed_addr { "
+      "A: "
+      "  br i1 %a, label %B, label %E "
+      " "
+      "B: "
+      "  br i1 %b, label %C, label %D "
+      " "
+      "C: "
+      "  br label %D "
+      " "
+      "D: "
+      "  %d.join = phi i32 [ 0, %B ], [ 1, %C ] "
+      "  br label %E "
+      " "
+      "E: "
+      "  %e.join = phi i32 [ 0, %A ], [ 1, %D ] "
+      "  ret void "
+      "} "
+      " "
+      "define void @f_3(i1 %a, i1 %b, i1 %c)"
+      "    local_unnamed_addr { "
+      "A: "
+      "  br i1 %a, label %B, label %C "
+      " "
+      "B: "
+      "  br label %C "
+      " "
+      "C: "
+      "  %c.join = phi i32 [ 0, %A ], [ 1, %B ] "
+      "  br i1 %c, label %D, label %E "
+      " "
+      "D: "
+      "  br label %E "
+      " "
+      "E: "
+      "  %e.join = phi i32 [ 0, %C ], [ 1, %D ] "
+      "  ret void "
+      "} ",
+      Err, C);
+
+  // Maps divergent conditions to the basic blocks whose Phi nodes become
+  // divergent. Blocks need to be listed in IR order.
+  using SmallBlockVec = SmallVector<const BasicBlock *, 4>;
+  using InducedDivJoinMap = std::map<const Value *, SmallBlockVec>;
+
+  // Actual function performing the checks.
+  auto CheckDivergenceFunc = [this](Function &F,
+                                    InducedDivJoinMap &ExpectedDivJoins) {
+    for (auto &ItCase : ExpectedDivJoins) {
+      auto *DivVal = ItCase.first;
+      auto DA = buildDA(F, false);
+      DA.markDivergent(*DivVal);
+      DA.compute();
+
+      // List of basic blocks that shall host divergent Phi nodes.
+      auto ItDivJoins = ItCase.second.begin();
+
+      for (auto &BB : F) {
+        auto *Phi = dyn_cast<PHINode>(BB.begin());
+        if (!Phi)
+          continue;
+
+        if (ItDivJoins != ItCase.second.end() && &BB == *ItDivJoins) {
+          EXPECT_TRUE(DA.isDivergent(*Phi));
+          // Advance to next block with expected divergent PHI node.
+          ++ItDivJoins;
+        } else {
+          EXPECT_FALSE(DA.isDivergent(*Phi));
+        }
+      }
+    }
+  };
+
+  {
+    auto *F = M->getFunction("f_1");
+    auto ItBlocks = F->begin();
+    ItBlocks++; // Skip A
+    ItBlocks++; // Skip B
+    auto *C = &*ItBlocks++;
+    auto *D = &*ItBlocks++;
+    auto *E = &*ItBlocks;
+
+    auto ItArg = F->arg_begin();
+    auto *AArg = &*ItArg++;
+    auto *BArg = &*ItArg++;
+    auto *CArg = &*ItArg;
+
+    InducedDivJoinMap DivJoins;
+    DivJoins.emplace(AArg, SmallBlockVec({C, D, E}));
+    DivJoins.emplace(BArg, SmallBlockVec({D, E}));
+    DivJoins.emplace(CArg, SmallBlockVec({E}));
+
+    CheckDivergenceFunc(*F, DivJoins);
+  }
+
+  {
+    auto *F = M->getFunction("f_2");
+    auto ItBlocks = F->begin();
+    ItBlocks++; // Skip A
+    ItBlocks++; // Skip B
+    ItBlocks++; // Skip C
+    auto *D = &*ItBlocks++;
+    auto *E = &*ItBlocks;
+
+    auto ItArg = F->arg_begin();
+    auto *AArg = &*ItArg++;
+    auto *BArg = &*ItArg++;
+    auto *CArg = &*ItArg;
+
+    InducedDivJoinMap DivJoins;
+    DivJoins.emplace(AArg, SmallBlockVec({E}));
+    DivJoins.emplace(BArg, SmallBlockVec({D}));
+    DivJoins.emplace(CArg, SmallBlockVec({}));
+
+    CheckDivergenceFunc(*F, DivJoins);
+  }
+
+  {
+    auto *F = M->getFunction("f_3");
+    auto ItBlocks = F->begin();
+    ItBlocks++; // Skip A
+    ItBlocks++; // Skip B
+    auto *C = &*ItBlocks++;
+    ItBlocks++; // Skip D
+    auto *E = &*ItBlocks;
+
+    auto ItArg = F->arg_begin();
+    auto *AArg = &*ItArg++;
+    auto *BArg = &*ItArg++;
+    auto *CArg = &*ItArg;
+
+    InducedDivJoinMap DivJoins;
+    DivJoins.emplace(AArg, SmallBlockVec({C}));
+    DivJoins.emplace(BArg, SmallBlockVec({}));
+    DivJoins.emplace(CArg, SmallBlockVec({E}));
+
+    CheckDivergenceFunc(*F, DivJoins);
+  }
+}
+
+TEST_F(DivergenceAnalysisTest, DASwitchUnreachableDefault) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  std::unique_ptr<Module> M = parseAssemblyString(
+      "target datalayout = \"e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128\" "
+      " "
+      "define void @switch_unreachable_default(i32 %cond) local_unnamed_addr { "
+      "entry: "
+      "  switch i32 %cond, label %sw.default [ "
+      "    i32 0, label %sw.bb0 "
+      "    i32 1, label %sw.bb1 "
+      "  ] "
+      " "
+      "sw.bb0: "
+      "  br label %sw.epilog "
+      " "
+      "sw.bb1: "
+      "  br label %sw.epilog "
+      " "
+      "sw.default: "
+      "  unreachable "
+      " "
+      "sw.epilog: "
+      "  %div.dbl = phi double [ 0.0, %sw.bb0], [ -1.0, %sw.bb1 ] "
+      "  ret void "
+      "}",
+      Err, C);
+
+  auto *F = M->getFunction("switch_unreachable_default");
+  auto &CondArg = *F->arg_begin();
+  auto DA = buildDA(*F, false);
+
+  EXPECT_FALSE(DA.hasDetectedDivergence());
+
+  DA.markDivergent(CondArg);
+  DA.compute();
+
+  // Still %CondArg is divergent.
+  EXPECT_TRUE(DA.hasDetectedDivergence());
+
+  // The join uni.dbl is not divergent (see D52221)
+  auto &ExitBlock = *GetBlockByName("sw.epilog", *F);
+  auto &DivDblPhi = *cast<PHINode>(ExitBlock.begin());
+  EXPECT_TRUE(DA.isDivergent(DivDblPhi));
+}
+
+} // end anonymous namespace
+} // end namespace llvm
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index 98fc44e4923d27dddba4fe019072d9be5c364d2c..3da0614bb71c54918f349c9126abc4ec974d99a3 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -701,7 +701,7 @@ TEST_F(ScalarEvolutionsTest, SCEVZeroExtendExpr) {
     PN->addIncoming(Dec, IncBB);
     BranchInst::Create(CondBB, IncBB);
 
-    Accum = GetElementPtrInst::Create(I8Ty, Accum, Dec, "gep", EndBB);
+    Accum = GetElementPtrInst::Create(I8Ty, Accum, PN, "gep", EndBB);
 
     PrevBB = CondBB;
     CondBB = NextBB;
diff --git a/unittests/Analysis/TargetLibraryInfoTest.cpp b/unittests/Analysis/TargetLibraryInfoTest.cpp
index ec0f89a7e50080139466a4e6ff7ef0e59ac8c0ce..482d9d8d7c04f32750d680b8e517cbb4086c3aed 100644
--- a/unittests/Analysis/TargetLibraryInfoTest.cpp
+++ b/unittests/Analysis/TargetLibraryInfoTest.cpp
@@ -76,471 +76,480 @@ TEST_F(TargetLibraryInfoTest, InvalidProto) {
 // Check that we do accept know-correct prototypes.
 TEST_F(TargetLibraryInfoTest, ValidProto) {
   parseAssembly(
-    // These functions use a 64-bit size_t; use the appropriate datalayout.
-    "target datalayout = \"p:64:64:64\"\n"
-
-    // Struct pointers are replaced with an opaque pointer.
-    "%struct = type opaque\n"
-
-    // These functions were extracted as-is from the OS X headers.
-    "declare double @__cospi(double)\n"
-    "declare float @__cospif(float)\n"
-    "declare { double, double } @__sincospi_stret(double)\n"
-    "declare <2 x float> @__sincospif_stret(float)\n"
-    "declare double @__sinpi(double)\n"
-    "declare float @__sinpif(float)\n"
-    "declare i32 @abs(i32)\n"
-    "declare i32 @access(i8*, i32)\n"
-    "declare double @acos(double)\n"
-    "declare float @acosf(float)\n"
-    "declare double @acosh(double)\n"
-    "declare float @acoshf(float)\n"
-    "declare x86_fp80 @acoshl(x86_fp80)\n"
-    "declare x86_fp80 @acosl(x86_fp80)\n"
-    "declare double @asin(double)\n"
-    "declare float @asinf(float)\n"
-    "declare double @asinh(double)\n"
-    "declare float @asinhf(float)\n"
-    "declare x86_fp80 @asinhl(x86_fp80)\n"
-    "declare x86_fp80 @asinl(x86_fp80)\n"
-    "declare double @atan(double)\n"
-    "declare double @atan2(double, double)\n"
-    "declare float @atan2f(float, float)\n"
-    "declare x86_fp80 @atan2l(x86_fp80, x86_fp80)\n"
-    "declare float @atanf(float)\n"
-    "declare double @atanh(double)\n"
-    "declare float @atanhf(float)\n"
-    "declare x86_fp80 @atanhl(x86_fp80)\n"
-    "declare x86_fp80 @atanl(x86_fp80)\n"
-    "declare double @atof(i8*)\n"
-    "declare i32 @atoi(i8*)\n"
-    "declare i64 @atol(i8*)\n"
-    "declare i64 @atoll(i8*)\n"
-    "declare i32 @bcmp(i8*, i8*, i64)\n"
-    "declare void @bcopy(i8*, i8*, i64)\n"
-    "declare void @bzero(i8*, i64)\n"
-    "declare i8* @calloc(i64, i64)\n"
-    "declare double @cbrt(double)\n"
-    "declare float @cbrtf(float)\n"
-    "declare x86_fp80 @cbrtl(x86_fp80)\n"
-    "declare double @ceil(double)\n"
-    "declare float @ceilf(float)\n"
-    "declare x86_fp80 @ceill(x86_fp80)\n"
-    "declare i32 @chown(i8*, i32, i32)\n"
-    "declare void @clearerr(%struct*)\n"
-    "declare double @copysign(double, double)\n"
-    "declare float @copysignf(float, float)\n"
-    "declare x86_fp80 @copysignl(x86_fp80, x86_fp80)\n"
-    "declare double @cabs([2 x double])\n"
-    "declare float @cabsf([2 x float])\n"
-    "declare x86_fp80 @cabsl([2 x x86_fp80])\n"
-    "declare double @cos(double)\n"
-    "declare float @cosf(float)\n"
-    "declare double @cosh(double)\n"
-    "declare float @coshf(float)\n"
-    "declare x86_fp80 @coshl(x86_fp80)\n"
-    "declare x86_fp80 @cosl(x86_fp80)\n"
-    "declare i8* @ctermid(i8*)\n"
-    "declare double @exp(double)\n"
-    "declare double @exp2(double)\n"
-    "declare float @exp2f(float)\n"
-    "declare x86_fp80 @exp2l(x86_fp80)\n"
-    "declare float @expf(float)\n"
-    "declare x86_fp80 @expl(x86_fp80)\n"
-    "declare double @expm1(double)\n"
-    "declare float @expm1f(float)\n"
-    "declare x86_fp80 @expm1l(x86_fp80)\n"
-    "declare double @fabs(double)\n"
-    "declare float @fabsf(float)\n"
-    "declare x86_fp80 @fabsl(x86_fp80)\n"
-    "declare i32 @fclose(%struct*)\n"
-    "declare i32 @feof(%struct*)\n"
-    "declare i32 @ferror(%struct*)\n"
-    "declare i32 @fflush(%struct*)\n"
-    "declare i32 @ffs(i32)\n"
-    "declare i32 @ffsl(i64)\n"
-    "declare i32 @ffsll(i64)\n"
-    "declare i32 @fgetc(%struct*)\n"
-    "declare i32 @fgetc_unlocked(%struct*)\n"
-    "declare i32 @fgetpos(%struct*, i64*)\n"
-    "declare i8* @fgets(i8*, i32, %struct*)\n"
-    "declare i8* @fgets_unlocked(i8*, i32, %struct*)\n"
-    "declare i32 @fileno(%struct*)\n"
-    "declare void @flockfile(%struct*)\n"
-    "declare double @floor(double)\n"
-    "declare float @floorf(float)\n"
-    "declare x86_fp80 @floorl(x86_fp80)\n"
-    "declare i32 @fls(i32)\n"
-    "declare i32 @flsl(i64)\n"
-    "declare i32 @flsll(i64)\n"
-    "declare double @fmax(double, double)\n"
-    "declare float @fmaxf(float, float)\n"
-    "declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)\n"
-    "declare double @fmin(double, double)\n"
-    "declare float @fminf(float, float)\n"
-    "declare x86_fp80 @fminl(x86_fp80, x86_fp80)\n"
-    "declare double @fmod(double, double)\n"
-    "declare float @fmodf(float, float)\n"
-    "declare x86_fp80 @fmodl(x86_fp80, x86_fp80)\n"
-    "declare i32 @fprintf(%struct*, i8*, ...)\n"
-    "declare i32 @fputc(i32, %struct*)\n"
-    "declare i32 @fputc_unlocked(i32, %struct*)\n"
-    "declare i64 @fread(i8*, i64, i64, %struct*)\n"
-    "declare i64 @fread_unlocked(i8*, i64, i64, %struct*)\n"
-    "declare void @free(i8*)\n"
-    "declare double @frexp(double, i32*)\n"
-    "declare float @frexpf(float, i32*)\n"
-    "declare x86_fp80 @frexpl(x86_fp80, i32*)\n"
-    "declare i32 @fscanf(%struct*, i8*, ...)\n"
-    "declare i32 @fseek(%struct*, i64, i32)\n"
-    "declare i32 @fseeko(%struct*, i64, i32)\n"
-    "declare i32 @fsetpos(%struct*, i64*)\n"
-    "declare i32 @fstatvfs(i32, %struct*)\n"
-    "declare i64 @ftell(%struct*)\n"
-    "declare i64 @ftello(%struct*)\n"
-    "declare i32 @ftrylockfile(%struct*)\n"
-    "declare void @funlockfile(%struct*)\n"
-    "declare i32 @getc(%struct*)\n"
-    "declare i32 @getc_unlocked(%struct*)\n"
-    "declare i32 @getchar()\n"
-    "declare i32 @getchar_unlocked()\n"
-    "declare i8* @getenv(i8*)\n"
-    "declare i32 @getitimer(i32, %struct*)\n"
-    "declare i32 @getlogin_r(i8*, i64)\n"
-    "declare %struct* @getpwnam(i8*)\n"
-    "declare i8* @gets(i8*)\n"
-    "declare i32 @gettimeofday(%struct*, i8*)\n"
-    "declare i32 @_Z7isasciii(i32)\n"
-    "declare i32 @_Z7isdigiti(i32)\n"
-    "declare i64 @labs(i64)\n"
-    "declare double @ldexp(double, i32)\n"
-    "declare float @ldexpf(float, i32)\n"
-    "declare x86_fp80 @ldexpl(x86_fp80, i32)\n"
-    "declare i64 @llabs(i64)\n"
-    "declare double @log(double)\n"
-    "declare double @log10(double)\n"
-    "declare float @log10f(float)\n"
-    "declare x86_fp80 @log10l(x86_fp80)\n"
-    "declare double @log1p(double)\n"
-    "declare float @log1pf(float)\n"
-    "declare x86_fp80 @log1pl(x86_fp80)\n"
-    "declare double @log2(double)\n"
-    "declare float @log2f(float)\n"
-    "declare x86_fp80 @log2l(x86_fp80)\n"
-    "declare double @logb(double)\n"
-    "declare float @logbf(float)\n"
-    "declare x86_fp80 @logbl(x86_fp80)\n"
-    "declare float @logf(float)\n"
-    "declare x86_fp80 @logl(x86_fp80)\n"
-    "declare i8* @malloc(i64)\n"
-    "declare i8* @memccpy(i8*, i8*, i32, i64)\n"
-    "declare i8* @memchr(i8*, i32, i64)\n"
-    "declare i32 @memcmp(i8*, i8*, i64)\n"
-    "declare i8* @memcpy(i8*, i8*, i64)\n"
-    "declare i8* @memmove(i8*, i8*, i64)\n"
-    "declare i8* @memset(i8*, i32, i64)\n"
-    "declare void @memset_pattern16(i8*, i8*, i64)\n"
-    "declare i32 @mkdir(i8*, i16)\n"
-    "declare double @modf(double, double*)\n"
-    "declare float @modff(float, float*)\n"
-    "declare x86_fp80 @modfl(x86_fp80, x86_fp80*)\n"
-    "declare double @nearbyint(double)\n"
-    "declare float @nearbyintf(float)\n"
-    "declare x86_fp80 @nearbyintl(x86_fp80)\n"
-    "declare i32 @pclose(%struct*)\n"
-    "declare void @perror(i8*)\n"
-    "declare i32 @posix_memalign(i8**, i64, i64)\n"
-    "declare double @pow(double, double)\n"
-    "declare float @powf(float, float)\n"
-    "declare x86_fp80 @powl(x86_fp80, x86_fp80)\n"
-    "declare i32 @printf(i8*, ...)\n"
-    "declare i32 @putc(i32, %struct*)\n"
-    "declare i32 @putc_unlocked(i32, %struct*)\n"
-    "declare i32 @putchar(i32)\n"
-    "declare i32 @putchar_unlocked(i32)\n"
-    "declare i32 @puts(i8*)\n"
-    "declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)*)\n"
-    "declare i64 @readlink(i8*, i8*, i64)\n"
-    "declare i8* @realloc(i8*, i64)\n"
-    "declare i8* @reallocf(i8*, i64)\n"
-    "declare i32 @remove(i8*)\n"
-    "declare i32 @rename(i8*, i8*)\n"
-    "declare void @rewind(%struct*)\n"
-    "declare double @rint(double)\n"
-    "declare float @rintf(float)\n"
-    "declare x86_fp80 @rintl(x86_fp80)\n"
-    "declare i32 @rmdir(i8*)\n"
-    "declare double @round(double)\n"
-    "declare float @roundf(float)\n"
-    "declare x86_fp80 @roundl(x86_fp80)\n"
-    "declare i32 @scanf(i8*, ...)\n"
-    "declare void @setbuf(%struct*, i8*)\n"
-    "declare i32 @setitimer(i32, %struct*, %struct*)\n"
-    "declare i32 @setvbuf(%struct*, i8*, i32, i64)\n"
-    "declare double @sin(double)\n"
-    "declare float @sinf(float)\n"
-    "declare double @sinh(double)\n"
-    "declare float @sinhf(float)\n"
-    "declare x86_fp80 @sinhl(x86_fp80)\n"
-    "declare x86_fp80 @sinl(x86_fp80)\n"
-    "declare i32 @snprintf(i8*, i64, i8*, ...)\n"
-    "declare i32 @sprintf(i8*, i8*, ...)\n"
-    "declare double @sqrt(double)\n"
-    "declare float @sqrtf(float)\n"
-    "declare x86_fp80 @sqrtl(x86_fp80)\n"
-    "declare i32 @sscanf(i8*, i8*, ...)\n"
-    "declare i32 @statvfs(i8*, %struct*)\n"
-    "declare i8* @stpcpy(i8*, i8*)\n"
-    "declare i8* @stpncpy(i8*, i8*, i64)\n"
-    "declare i32 @strcasecmp(i8*, i8*)\n"
-    "declare i8* @strcat(i8*, i8*)\n"
-    "declare i8* @strchr(i8*, i32)\n"
-    "declare i32 @strcmp(i8*, i8*)\n"
-    "declare i32 @strcoll(i8*, i8*)\n"
-    "declare i8* @strcpy(i8*, i8*)\n"
-    "declare i64 @strcspn(i8*, i8*)\n"
-    "declare i8* @strdup(i8*)\n"
-    "declare i64 @strlen(i8*)\n"
-    "declare i32 @strncasecmp(i8*, i8*, i64)\n"
-    "declare i8* @strncat(i8*, i8*, i64)\n"
-    "declare i32 @strncmp(i8*, i8*, i64)\n"
-    "declare i8* @strncpy(i8*, i8*, i64)\n"
-    "declare i8* @strndup(i8*, i64)\n"
-    "declare i64 @strnlen(i8*, i64)\n"
-    "declare i8* @strpbrk(i8*, i8*)\n"
-    "declare i8* @strrchr(i8*, i32)\n"
-    "declare i64 @strspn(i8*, i8*)\n"
-    "declare i8* @strstr(i8*, i8*)\n"
-    "declare i8* @strtok(i8*, i8*)\n"
-    "declare i8* @strtok_r(i8*, i8*, i8**)\n"
-    "declare i64 @strtol(i8*, i8**, i32)\n"
-    "declare x86_fp80 @strtold(i8*, i8**)\n"
-    "declare i64 @strtoll(i8*, i8**, i32)\n"
-    "declare i64 @strtoul(i8*, i8**, i32)\n"
-    "declare i64 @strtoull(i8*, i8**, i32)\n"
-    "declare i64 @strxfrm(i8*, i8*, i64)\n"
-    "declare double @tan(double)\n"
-    "declare float @tanf(float)\n"
-    "declare double @tanh(double)\n"
-    "declare float @tanhf(float)\n"
-    "declare x86_fp80 @tanhl(x86_fp80)\n"
-    "declare x86_fp80 @tanl(x86_fp80)\n"
-    "declare i64 @times(%struct*)\n"
-    "declare %struct* @tmpfile()\n"
-    "declare i32 @_Z7toasciii(i32)\n"
-    "declare double @trunc(double)\n"
-    "declare float @truncf(float)\n"
-    "declare x86_fp80 @truncl(x86_fp80)\n"
-    "declare i32 @uname(%struct*)\n"
-    "declare i32 @ungetc(i32, %struct*)\n"
-    "declare i32 @unlink(i8*)\n"
-    "declare i32 @utime(i8*, %struct*)\n"
-    "declare i32 @utimes(i8*, %struct*)\n"
-    "declare i8* @valloc(i64)\n"
-    "declare i32 @vfprintf(%struct*, i8*, %struct*)\n"
-    "declare i32 @vfscanf(%struct*, i8*, %struct*)\n"
-    "declare i32 @vprintf(i8*, %struct*)\n"
-    "declare i32 @vscanf(i8*, %struct*)\n"
-    "declare i32 @vsnprintf(i8*, i64, i8*, %struct*)\n"
-    "declare i32 @vsprintf(i8*, i8*, %struct*)\n"
-    "declare i32 @vsscanf(i8*, i8*, %struct*)\n"
-    "declare i64 @wcslen(i32*)\n"
-
-    // These functions were also extracted from the OS X headers, but they are
-    // available with a special name on darwin.
-    // This test uses the default TLI name instead.
-    "declare i32 @chmod(i8*, i16)\n"
-    "declare i32 @closedir(%struct*)\n"
-    "declare %struct* @fdopen(i32, i8*)\n"
-    "declare %struct* @fopen(i8*, i8*)\n"
-    "declare i32 @fputs(i8*, %struct*)\n"
-    "declare i32 @fputs_unlocked(i8*, %struct*)\n"
-    "declare i32 @fstat(i32, %struct*)\n"
-    "declare i64 @fwrite(i8*, i64, i64, %struct*)\n"
-    "declare i64 @fwrite_unlocked(i8*, i64, i64, %struct*)\n"
-    "declare i32 @lchown(i8*, i32, i32)\n"
-    "declare i32 @lstat(i8*, %struct*)\n"
-    "declare i64 @mktime(%struct*)\n"
-    "declare i32 @open(i8*, i32, ...)\n"
-    "declare %struct* @opendir(i8*)\n"
-    "declare %struct* @popen(i8*, i8*)\n"
-    "declare i64 @pread(i32, i8*, i64, i64)\n"
-    "declare i64 @pwrite(i32, i8*, i64, i64)\n"
-    "declare i64 @read(i32, i8*, i64)\n"
-    "declare i8* @realpath(i8*, i8*)\n"
-    "declare i32 @stat(i8*, %struct*)\n"
-    "declare double @strtod(i8*, i8**)\n"
-    "declare float @strtof(i8*, i8**)\n"
-    "declare i32 @system(i8*)\n"
-    "declare i32 @unsetenv(i8*)\n"
-    "declare i64 @write(i32, i8*, i64)\n"
-
-    // These functions are available on Linux but not Darwin; they only differ
-    // from their non-64 counterparts in the struct type.
-    // Use the same prototype as the non-64 variant.
-    "declare %struct* @fopen64(i8*, i8*)\n"
-    "declare i32 @fstat64(i32, %struct*)\n"
-    "declare i32 @fstatvfs64(i32, %struct*)\n"
-    "declare i32 @lstat64(i8*, %struct*)\n"
-    "declare i32 @open64(i8*, i32, ...)\n"
-    "declare i32 @stat64(i8*, %struct*)\n"
-    "declare i32 @statvfs64(i8*, %struct*)\n"
-    "declare %struct* @tmpfile64()\n"
-
-    // These functions are also -64 variants, but do differ in the type of the
-    // off_t (vs off64_t) parameter.  The non-64 variants declared above used
-    // a 64-bit off_t, so, in practice, they are also equivalent.
-    "declare i32 @fseeko64(%struct*, i64, i32)\n"
-    "declare i64 @ftello64(%struct*)\n"
-
-    "declare void @_ZdaPv(i8*)\n"
-    "declare void @_ZdaPvRKSt9nothrow_t(i8*, %struct*)\n"
-    "declare void @_ZdaPvSt11align_val_t(i8*, i64)\n"
-    "declare void @_ZdaPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
-    "declare void @_ZdaPvj(i8*, i32)\n"
-    "declare void @_ZdaPvm(i8*, i64)\n"
-    "declare void @_ZdlPv(i8*)\n"
-    "declare void @_ZdlPvRKSt9nothrow_t(i8*, %struct*)\n"
-    "declare void @_ZdlPvSt11align_val_t(i8*, i64)\n"
-    "declare void @_ZdlPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
-    "declare void @_ZdlPvj(i8*, i32)\n"
-    "declare void @_ZdlPvm(i8*, i64)\n"
-    "declare i8* @_Znaj(i32)\n"
-    "declare i8* @_ZnajRKSt9nothrow_t(i32, %struct*)\n"
-    "declare i8* @_ZnajSt11align_val_t(i32, i32)\n"
-    "declare i8* @_ZnajSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
-    "declare i8* @_Znam(i64)\n"
-    "declare i8* @_ZnamRKSt9nothrow_t(i64, %struct*)\n"
-    "declare i8* @_ZnamSt11align_val_t(i64, i64)\n"
-    "declare i8* @_ZnamSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
-    "declare i8* @_Znwj(i32)\n"
-    "declare i8* @_ZnwjRKSt9nothrow_t(i32, %struct*)\n"
-    "declare i8* @_ZnwjSt11align_val_t(i32, i32)\n"
-    "declare i8* @_ZnwjSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
-    "declare i8* @_Znwm(i64)\n"
-    "declare i8* @_ZnwmRKSt9nothrow_t(i64, %struct*)\n"
-    "declare i8* @_ZnwmSt11align_val_t(i64, i64)\n"
-    "declare i8* @_ZnwmSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
-
-    "declare void @\"??3@YAXPEAX@Z\"(i8*)\n"
-    "declare void @\"??3@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??3@YAXPEAX_K@Z\"(i8*, i64)\n"
-    "declare void @\"??_V@YAXPEAX@Z\"(i8*)\n"
-    "declare void @\"??_V@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??_V@YAXPEAX_K@Z\"(i8*, i64)\n"
-    "declare i8* @\"??2@YAPAXI@Z\"(i32)\n"
-    "declare i8* @\"??2@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
-    "declare i8* @\"??2@YAPEAX_K@Z\"(i64)\n"
-    "declare i8* @\"??2@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
-    "declare i8* @\"??_U@YAPAXI@Z\"(i32)\n"
-    "declare i8* @\"??_U@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
-    "declare i8* @\"??_U@YAPEAX_K@Z\"(i64)\n"
-    "declare i8* @\"??_U@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
-
-    "declare void @\"??3@YAXPAX@Z\"(i8*)\n"
-    "declare void @\"??3@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??3@YAXPAXI@Z\"(i8*, i32)\n"
-    "declare void @\"??_V@YAXPAX@Z\"(i8*)\n"
-    "declare void @\"??_V@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
-    "declare void @\"??_V@YAXPAXI@Z\"(i8*, i32)\n"
-
-    // These other functions were derived from the .def C declaration.
-    "declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)\n"
-    "declare void @__cxa_guard_abort(%struct*)\n"
-    "declare i32 @__cxa_guard_acquire(%struct*)\n"
-    "declare void @__cxa_guard_release(%struct*)\n"
-
-    "declare i32 @__nvvm_reflect(i8*)\n"
-
-    "declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
-    "declare i8* @__memmove_chk(i8*, i8*, i64, i64)\n"
-    "declare i8* @__memset_chk(i8*, i32, i64, i64)\n"
-    "declare i8* @__stpcpy_chk(i8*, i8*, i64)\n"
-    "declare i8* @__stpncpy_chk(i8*, i8*, i64, i64)\n"
-    "declare i8* @__strcpy_chk(i8*, i8*, i64)\n"
-    "declare i8* @__strncpy_chk(i8*, i8*, i64, i64)\n"
-
-    "declare i8* @memalign(i64, i64)\n"
-    "declare i8* @mempcpy(i8*, i8*, i64)\n"
-    "declare i8* @memrchr(i8*, i32, i64)\n"
-
-    // These are similar to the FILE* fgetc/fputc.
-    "declare i32 @_IO_getc(%struct*)\n"
-    "declare i32 @_IO_putc(i32, %struct*)\n"
-
-    "declare i32 @__isoc99_scanf(i8*, ...)\n"
-    "declare i32 @__isoc99_sscanf(i8*, i8*, ...)\n"
-    "declare i8* @__strdup(i8*)\n"
-    "declare i8* @__strndup(i8*, i64)\n"
-    "declare i8* @__strtok_r(i8*, i8*, i8**)\n"
-
-    "declare double @__sqrt_finite(double)\n"
-    "declare float @__sqrtf_finite(float)\n"
-    "declare x86_fp80 @__sqrtl_finite(x86_fp80)\n"
-    "declare double @exp10(double)\n"
-    "declare float @exp10f(float)\n"
-    "declare x86_fp80 @exp10l(x86_fp80)\n"
-
-    // These printf variants have the same prototype as the non-'i' versions.
-    "declare i32 @fiprintf(%struct*, i8*, ...)\n"
-    "declare i32 @iprintf(i8*, ...)\n"
-    "declare i32 @siprintf(i8*, i8*, ...)\n"
-
-    "declare i32 @htonl(i32)\n"
-    "declare i16 @htons(i16)\n"
-    "declare i32 @ntohl(i32)\n"
-    "declare i16 @ntohs(i16)\n"
-
-    "declare i32 @isascii(i32)\n"
-    "declare i32 @isdigit(i32)\n"
-    "declare i32 @toascii(i32)\n"
-
-    // These functions were extracted from math-finite.h which provides
-    // functions similar to those in math.h, but optimized for handling
-    // finite values only.
-    "declare double @__acos_finite(double)\n"
-    "declare float @__acosf_finite(float)\n"
-    "declare x86_fp80 @__acosl_finite(x86_fp80)\n"
-    "declare double @__acosh_finite(double)\n"
-    "declare float @__acoshf_finite(float)\n"
-    "declare x86_fp80 @__acoshl_finite(x86_fp80)\n"
-    "declare double @__asin_finite(double)\n"
-    "declare float @__asinf_finite(float)\n"
-    "declare x86_fp80 @__asinl_finite(x86_fp80)\n"
-    "declare double @__atan2_finite(double, double)\n"
-    "declare float @__atan2f_finite(float, float)\n"
-    "declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)\n"
-    "declare double @__atanh_finite(double)\n"
-    "declare float @__atanhf_finite(float)\n"
-    "declare x86_fp80 @__atanhl_finite(x86_fp80)\n"
-    "declare double @__cosh_finite(double)\n"
-    "declare float @__coshf_finite(float)\n"
-    "declare x86_fp80 @__coshl_finite(x86_fp80)\n"
-    "declare double @__exp10_finite(double)\n"
-    "declare float @__exp10f_finite(float)\n"
-    "declare x86_fp80 @__exp10l_finite(x86_fp80)\n"
-    "declare double @__exp2_finite(double)\n"
-    "declare float @__exp2f_finite(float)\n"
-    "declare x86_fp80 @__exp2l_finite(x86_fp80)\n"
-    "declare double @__exp_finite(double)\n"
-    "declare float @__expf_finite(float)\n"
-    "declare x86_fp80 @__expl_finite(x86_fp80)\n"     
-    "declare double @__log10_finite(double)\n"
-    "declare float @__log10f_finite(float)\n"
-    "declare x86_fp80 @__log10l_finite(x86_fp80)\n"
-    "declare double @__log2_finite(double)\n"
-    "declare float @__log2f_finite(float)\n"
-    "declare x86_fp80 @__log2l_finite(x86_fp80)\n"
-    "declare double @__log_finite(double)\n"
-    "declare float @__logf_finite(float)\n"
-    "declare x86_fp80 @__logl_finite(x86_fp80)\n"
-    "declare double @__pow_finite(double, double)\n"
-    "declare float @__powf_finite(float, float)\n"
-    "declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)\n"
-    "declare double @__sinh_finite(double)\n"
-    "declare float @__sinhf_finite(float)\n"
-    "declare x86_fp80 @__sinhl_finite(x86_fp80)\n"
-    );
+      // These functions use a 64-bit size_t; use the appropriate datalayout.
+      "target datalayout = \"p:64:64:64\"\n"
+
+      // Struct pointers are replaced with an opaque pointer.
+      "%struct = type opaque\n"
+
+      // These functions were extracted as-is from the OS X headers.
+      "declare double @__cospi(double)\n"
+      "declare float @__cospif(float)\n"
+      "declare { double, double } @__sincospi_stret(double)\n"
+      "declare <2 x float> @__sincospif_stret(float)\n"
+      "declare double @__sinpi(double)\n"
+      "declare float @__sinpif(float)\n"
+      "declare i32 @abs(i32)\n"
+      "declare i32 @access(i8*, i32)\n"
+      "declare double @acos(double)\n"
+      "declare float @acosf(float)\n"
+      "declare double @acosh(double)\n"
+      "declare float @acoshf(float)\n"
+      "declare x86_fp80 @acoshl(x86_fp80)\n"
+      "declare x86_fp80 @acosl(x86_fp80)\n"
+      "declare double @asin(double)\n"
+      "declare float @asinf(float)\n"
+      "declare double @asinh(double)\n"
+      "declare float @asinhf(float)\n"
+      "declare x86_fp80 @asinhl(x86_fp80)\n"
+      "declare x86_fp80 @asinl(x86_fp80)\n"
+      "declare double @atan(double)\n"
+      "declare double @atan2(double, double)\n"
+      "declare float @atan2f(float, float)\n"
+      "declare x86_fp80 @atan2l(x86_fp80, x86_fp80)\n"
+      "declare float @atanf(float)\n"
+      "declare double @atanh(double)\n"
+      "declare float @atanhf(float)\n"
+      "declare x86_fp80 @atanhl(x86_fp80)\n"
+      "declare x86_fp80 @atanl(x86_fp80)\n"
+      "declare double @atof(i8*)\n"
+      "declare i32 @atoi(i8*)\n"
+      "declare i64 @atol(i8*)\n"
+      "declare i64 @atoll(i8*)\n"
+      "declare i32 @bcmp(i8*, i8*, i64)\n"
+      "declare void @bcopy(i8*, i8*, i64)\n"
+      "declare void @bzero(i8*, i64)\n"
+      "declare i8* @calloc(i64, i64)\n"
+      "declare double @cbrt(double)\n"
+      "declare float @cbrtf(float)\n"
+      "declare x86_fp80 @cbrtl(x86_fp80)\n"
+      "declare double @ceil(double)\n"
+      "declare float @ceilf(float)\n"
+      "declare x86_fp80 @ceill(x86_fp80)\n"
+      "declare i32 @chown(i8*, i32, i32)\n"
+      "declare void @clearerr(%struct*)\n"
+      "declare double @copysign(double, double)\n"
+      "declare float @copysignf(float, float)\n"
+      "declare x86_fp80 @copysignl(x86_fp80, x86_fp80)\n"
+      "declare double @cabs([2 x double])\n"
+      "declare float @cabsf([2 x float])\n"
+      "declare x86_fp80 @cabsl([2 x x86_fp80])\n"
+      "declare double @cos(double)\n"
+      "declare float @cosf(float)\n"
+      "declare double @cosh(double)\n"
+      "declare float @coshf(float)\n"
+      "declare x86_fp80 @coshl(x86_fp80)\n"
+      "declare x86_fp80 @cosl(x86_fp80)\n"
+      "declare i8* @ctermid(i8*)\n"
+      "declare double @exp(double)\n"
+      "declare double @exp2(double)\n"
+      "declare float @exp2f(float)\n"
+      "declare x86_fp80 @exp2l(x86_fp80)\n"
+      "declare float @expf(float)\n"
+      "declare x86_fp80 @expl(x86_fp80)\n"
+      "declare double @expm1(double)\n"
+      "declare float @expm1f(float)\n"
+      "declare x86_fp80 @expm1l(x86_fp80)\n"
+      "declare double @fabs(double)\n"
+      "declare float @fabsf(float)\n"
+      "declare x86_fp80 @fabsl(x86_fp80)\n"
+      "declare i32 @fclose(%struct*)\n"
+      "declare i32 @feof(%struct*)\n"
+      "declare i32 @ferror(%struct*)\n"
+      "declare i32 @fflush(%struct*)\n"
+      "declare i32 @ffs(i32)\n"
+      "declare i32 @ffsl(i64)\n"
+      "declare i32 @ffsll(i64)\n"
+      "declare i32 @fgetc(%struct*)\n"
+      "declare i32 @fgetc_unlocked(%struct*)\n"
+      "declare i32 @fgetpos(%struct*, i64*)\n"
+      "declare i8* @fgets(i8*, i32, %struct*)\n"
+      "declare i8* @fgets_unlocked(i8*, i32, %struct*)\n"
+      "declare i32 @fileno(%struct*)\n"
+      "declare void @flockfile(%struct*)\n"
+      "declare double @floor(double)\n"
+      "declare float @floorf(float)\n"
+      "declare x86_fp80 @floorl(x86_fp80)\n"
+      "declare i32 @fls(i32)\n"
+      "declare i32 @flsl(i64)\n"
+      "declare i32 @flsll(i64)\n"
+      "declare double @fmax(double, double)\n"
+      "declare float @fmaxf(float, float)\n"
+      "declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)\n"
+      "declare double @fmin(double, double)\n"
+      "declare float @fminf(float, float)\n"
+      "declare x86_fp80 @fminl(x86_fp80, x86_fp80)\n"
+      "declare double @fmod(double, double)\n"
+      "declare float @fmodf(float, float)\n"
+      "declare x86_fp80 @fmodl(x86_fp80, x86_fp80)\n"
+      "declare i32 @fprintf(%struct*, i8*, ...)\n"
+      "declare i32 @fputc(i32, %struct*)\n"
+      "declare i32 @fputc_unlocked(i32, %struct*)\n"
+      "declare i64 @fread(i8*, i64, i64, %struct*)\n"
+      "declare i64 @fread_unlocked(i8*, i64, i64, %struct*)\n"
+      "declare void @free(i8*)\n"
+      "declare double @frexp(double, i32*)\n"
+      "declare float @frexpf(float, i32*)\n"
+      "declare x86_fp80 @frexpl(x86_fp80, i32*)\n"
+      "declare i32 @fscanf(%struct*, i8*, ...)\n"
+      "declare i32 @fseek(%struct*, i64, i32)\n"
+      "declare i32 @fseeko(%struct*, i64, i32)\n"
+      "declare i32 @fsetpos(%struct*, i64*)\n"
+      "declare i32 @fstatvfs(i32, %struct*)\n"
+      "declare i64 @ftell(%struct*)\n"
+      "declare i64 @ftello(%struct*)\n"
+      "declare i32 @ftrylockfile(%struct*)\n"
+      "declare void @funlockfile(%struct*)\n"
+      "declare i32 @getc(%struct*)\n"
+      "declare i32 @getc_unlocked(%struct*)\n"
+      "declare i32 @getchar()\n"
+      "declare i32 @getchar_unlocked()\n"
+      "declare i8* @getenv(i8*)\n"
+      "declare i32 @getitimer(i32, %struct*)\n"
+      "declare i32 @getlogin_r(i8*, i64)\n"
+      "declare %struct* @getpwnam(i8*)\n"
+      "declare i8* @gets(i8*)\n"
+      "declare i32 @gettimeofday(%struct*, i8*)\n"
+      "declare i32 @_Z7isasciii(i32)\n"
+      "declare i32 @_Z7isdigiti(i32)\n"
+      "declare i64 @labs(i64)\n"
+      "declare double @ldexp(double, i32)\n"
+      "declare float @ldexpf(float, i32)\n"
+      "declare x86_fp80 @ldexpl(x86_fp80, i32)\n"
+      "declare i64 @llabs(i64)\n"
+      "declare double @log(double)\n"
+      "declare double @log10(double)\n"
+      "declare float @log10f(float)\n"
+      "declare x86_fp80 @log10l(x86_fp80)\n"
+      "declare double @log1p(double)\n"
+      "declare float @log1pf(float)\n"
+      "declare x86_fp80 @log1pl(x86_fp80)\n"
+      "declare double @log2(double)\n"
+      "declare float @log2f(float)\n"
+      "declare x86_fp80 @log2l(x86_fp80)\n"
+      "declare double @logb(double)\n"
+      "declare float @logbf(float)\n"
+      "declare x86_fp80 @logbl(x86_fp80)\n"
+      "declare float @logf(float)\n"
+      "declare x86_fp80 @logl(x86_fp80)\n"
+      "declare i8* @malloc(i64)\n"
+      "declare i8* @memccpy(i8*, i8*, i32, i64)\n"
+      "declare i8* @memchr(i8*, i32, i64)\n"
+      "declare i32 @memcmp(i8*, i8*, i64)\n"
+      "declare i8* @memcpy(i8*, i8*, i64)\n"
+      "declare i8* @memmove(i8*, i8*, i64)\n"
+      "declare i8* @memset(i8*, i32, i64)\n"
+      "declare void @memset_pattern16(i8*, i8*, i64)\n"
+      "declare i32 @mkdir(i8*, i16)\n"
+      "declare double @modf(double, double*)\n"
+      "declare float @modff(float, float*)\n"
+      "declare x86_fp80 @modfl(x86_fp80, x86_fp80*)\n"
+      "declare double @nearbyint(double)\n"
+      "declare float @nearbyintf(float)\n"
+      "declare x86_fp80 @nearbyintl(x86_fp80)\n"
+      "declare i32 @pclose(%struct*)\n"
+      "declare void @perror(i8*)\n"
+      "declare i32 @posix_memalign(i8**, i64, i64)\n"
+      "declare double @pow(double, double)\n"
+      "declare float @powf(float, float)\n"
+      "declare x86_fp80 @powl(x86_fp80, x86_fp80)\n"
+      "declare i32 @printf(i8*, ...)\n"
+      "declare i32 @putc(i32, %struct*)\n"
+      "declare i32 @putc_unlocked(i32, %struct*)\n"
+      "declare i32 @putchar(i32)\n"
+      "declare i32 @putchar_unlocked(i32)\n"
+      "declare i32 @puts(i8*)\n"
+      "declare void @qsort(i8*, i64, i64, i32 (i8*, i8*)*)\n"
+      "declare i64 @readlink(i8*, i8*, i64)\n"
+      "declare i8* @realloc(i8*, i64)\n"
+      "declare i8* @reallocf(i8*, i64)\n"
+      "declare i32 @remove(i8*)\n"
+      "declare i32 @rename(i8*, i8*)\n"
+      "declare void @rewind(%struct*)\n"
+      "declare double @rint(double)\n"
+      "declare float @rintf(float)\n"
+      "declare x86_fp80 @rintl(x86_fp80)\n"
+      "declare i32 @rmdir(i8*)\n"
+      "declare double @round(double)\n"
+      "declare float @roundf(float)\n"
+      "declare x86_fp80 @roundl(x86_fp80)\n"
+      "declare i32 @scanf(i8*, ...)\n"
+      "declare void @setbuf(%struct*, i8*)\n"
+      "declare i32 @setitimer(i32, %struct*, %struct*)\n"
+      "declare i32 @setvbuf(%struct*, i8*, i32, i64)\n"
+      "declare double @sin(double)\n"
+      "declare float @sinf(float)\n"
+      "declare double @sinh(double)\n"
+      "declare float @sinhf(float)\n"
+      "declare x86_fp80 @sinhl(x86_fp80)\n"
+      "declare x86_fp80 @sinl(x86_fp80)\n"
+      "declare i32 @snprintf(i8*, i64, i8*, ...)\n"
+      "declare i32 @sprintf(i8*, i8*, ...)\n"
+      "declare double @sqrt(double)\n"
+      "declare float @sqrtf(float)\n"
+      "declare x86_fp80 @sqrtl(x86_fp80)\n"
+      "declare i32 @sscanf(i8*, i8*, ...)\n"
+      "declare i32 @statvfs(i8*, %struct*)\n"
+      "declare i8* @stpcpy(i8*, i8*)\n"
+      "declare i8* @stpncpy(i8*, i8*, i64)\n"
+      "declare i32 @strcasecmp(i8*, i8*)\n"
+      "declare i8* @strcat(i8*, i8*)\n"
+      "declare i8* @strchr(i8*, i32)\n"
+      "declare i32 @strcmp(i8*, i8*)\n"
+      "declare i32 @strcoll(i8*, i8*)\n"
+      "declare i8* @strcpy(i8*, i8*)\n"
+      "declare i64 @strcspn(i8*, i8*)\n"
+      "declare i8* @strdup(i8*)\n"
+      "declare i64 @strlen(i8*)\n"
+      "declare i32 @strncasecmp(i8*, i8*, i64)\n"
+      "declare i8* @strncat(i8*, i8*, i64)\n"
+      "declare i32 @strncmp(i8*, i8*, i64)\n"
+      "declare i8* @strncpy(i8*, i8*, i64)\n"
+      "declare i8* @strndup(i8*, i64)\n"
+      "declare i64 @strnlen(i8*, i64)\n"
+      "declare i8* @strpbrk(i8*, i8*)\n"
+      "declare i8* @strrchr(i8*, i32)\n"
+      "declare i64 @strspn(i8*, i8*)\n"
+      "declare i8* @strstr(i8*, i8*)\n"
+      "declare i8* @strtok(i8*, i8*)\n"
+      "declare i8* @strtok_r(i8*, i8*, i8**)\n"
+      "declare i64 @strtol(i8*, i8**, i32)\n"
+      "declare x86_fp80 @strtold(i8*, i8**)\n"
+      "declare i64 @strtoll(i8*, i8**, i32)\n"
+      "declare i64 @strtoul(i8*, i8**, i32)\n"
+      "declare i64 @strtoull(i8*, i8**, i32)\n"
+      "declare i64 @strxfrm(i8*, i8*, i64)\n"
+      "declare double @tan(double)\n"
+      "declare float @tanf(float)\n"
+      "declare double @tanh(double)\n"
+      "declare float @tanhf(float)\n"
+      "declare x86_fp80 @tanhl(x86_fp80)\n"
+      "declare x86_fp80 @tanl(x86_fp80)\n"
+      "declare i64 @times(%struct*)\n"
+      "declare %struct* @tmpfile()\n"
+      "declare i32 @_Z7toasciii(i32)\n"
+      "declare double @trunc(double)\n"
+      "declare float @truncf(float)\n"
+      "declare x86_fp80 @truncl(x86_fp80)\n"
+      "declare i32 @uname(%struct*)\n"
+      "declare i32 @ungetc(i32, %struct*)\n"
+      "declare i32 @unlink(i8*)\n"
+      "declare i32 @utime(i8*, %struct*)\n"
+      "declare i32 @utimes(i8*, %struct*)\n"
+      "declare i8* @valloc(i64)\n"
+      "declare i32 @vfprintf(%struct*, i8*, %struct*)\n"
+      "declare i32 @vfscanf(%struct*, i8*, %struct*)\n"
+      "declare i32 @vprintf(i8*, %struct*)\n"
+      "declare i32 @vscanf(i8*, %struct*)\n"
+      "declare i32 @vsnprintf(i8*, i64, i8*, %struct*)\n"
+      "declare i32 @vsprintf(i8*, i8*, %struct*)\n"
+      "declare i32 @vsscanf(i8*, i8*, %struct*)\n"
+      "declare i64 @wcslen(i32*)\n"
+      "declare i32 @fork()\n"
+      "declare i32 @execl(i8*, i8*, ...)\n"
+      "declare i32 @execle(i8*, i8*, ...)\n"
+      "declare i32 @execlp(i8*, i8*, ...)\n"
+      "declare i32 @execv(i8*, i8**)\n"
+      "declare i32 @execvP(i8*, i8*, i8**)\n"
+      "declare i32 @execve(i8*, i8**, i8**)\n"
+      "declare i32 @execvp(i8*, i8**)\n"
+      "declare i32 @execvpe(i8*, i8**, i8**)\n"
+
+      // These functions were also extracted from the OS X headers, but they are
+      // available with a special name on darwin.
+      // This test uses the default TLI name instead.
+      "declare i32 @chmod(i8*, i16)\n"
+      "declare i32 @closedir(%struct*)\n"
+      "declare %struct* @fdopen(i32, i8*)\n"
+      "declare %struct* @fopen(i8*, i8*)\n"
+      "declare i32 @fputs(i8*, %struct*)\n"
+      "declare i32 @fputs_unlocked(i8*, %struct*)\n"
+      "declare i32 @fstat(i32, %struct*)\n"
+      "declare i64 @fwrite(i8*, i64, i64, %struct*)\n"
+      "declare i64 @fwrite_unlocked(i8*, i64, i64, %struct*)\n"
+      "declare i32 @lchown(i8*, i32, i32)\n"
+      "declare i32 @lstat(i8*, %struct*)\n"
+      "declare i64 @mktime(%struct*)\n"
+      "declare i32 @open(i8*, i32, ...)\n"
+      "declare %struct* @opendir(i8*)\n"
+      "declare %struct* @popen(i8*, i8*)\n"
+      "declare i64 @pread(i32, i8*, i64, i64)\n"
+      "declare i64 @pwrite(i32, i8*, i64, i64)\n"
+      "declare i64 @read(i32, i8*, i64)\n"
+      "declare i8* @realpath(i8*, i8*)\n"
+      "declare i32 @stat(i8*, %struct*)\n"
+      "declare double @strtod(i8*, i8**)\n"
+      "declare float @strtof(i8*, i8**)\n"
+      "declare i32 @system(i8*)\n"
+      "declare i32 @unsetenv(i8*)\n"
+      "declare i64 @write(i32, i8*, i64)\n"
+
+      // These functions are available on Linux but not Darwin; they only differ
+      // from their non-64 counterparts in the struct type.
+      // Use the same prototype as the non-64 variant.
+      "declare %struct* @fopen64(i8*, i8*)\n"
+      "declare i32 @fstat64(i32, %struct*)\n"
+      "declare i32 @fstatvfs64(i32, %struct*)\n"
+      "declare i32 @lstat64(i8*, %struct*)\n"
+      "declare i32 @open64(i8*, i32, ...)\n"
+      "declare i32 @stat64(i8*, %struct*)\n"
+      "declare i32 @statvfs64(i8*, %struct*)\n"
+      "declare %struct* @tmpfile64()\n"
+
+      // These functions are also -64 variants, but do differ in the type of the
+      // off_t (vs off64_t) parameter.  The non-64 variants declared above used
+      // a 64-bit off_t, so, in practice, they are also equivalent.
+      "declare i32 @fseeko64(%struct*, i64, i32)\n"
+      "declare i64 @ftello64(%struct*)\n"
+
+      "declare void @_ZdaPv(i8*)\n"
+      "declare void @_ZdaPvRKSt9nothrow_t(i8*, %struct*)\n"
+      "declare void @_ZdaPvSt11align_val_t(i8*, i64)\n"
+      "declare void @_ZdaPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
+      "declare void @_ZdaPvj(i8*, i32)\n"
+      "declare void @_ZdaPvm(i8*, i64)\n"
+      "declare void @_ZdlPv(i8*)\n"
+      "declare void @_ZdlPvRKSt9nothrow_t(i8*, %struct*)\n"
+      "declare void @_ZdlPvSt11align_val_t(i8*, i64)\n"
+      "declare void @_ZdlPvSt11align_val_tRKSt9nothrow_t(i8*, i64, %struct*)\n"
+      "declare void @_ZdlPvj(i8*, i32)\n"
+      "declare void @_ZdlPvm(i8*, i64)\n"
+      "declare i8* @_Znaj(i32)\n"
+      "declare i8* @_ZnajRKSt9nothrow_t(i32, %struct*)\n"
+      "declare i8* @_ZnajSt11align_val_t(i32, i32)\n"
+      "declare i8* @_ZnajSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
+      "declare i8* @_Znam(i64)\n"
+      "declare i8* @_ZnamRKSt9nothrow_t(i64, %struct*)\n"
+      "declare i8* @_ZnamSt11align_val_t(i64, i64)\n"
+      "declare i8* @_ZnamSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
+      "declare i8* @_Znwj(i32)\n"
+      "declare i8* @_ZnwjRKSt9nothrow_t(i32, %struct*)\n"
+      "declare i8* @_ZnwjSt11align_val_t(i32, i32)\n"
+      "declare i8* @_ZnwjSt11align_val_tRKSt9nothrow_t(i32, i32, %struct*)\n"
+      "declare i8* @_Znwm(i64)\n"
+      "declare i8* @_ZnwmRKSt9nothrow_t(i64, %struct*)\n"
+      "declare i8* @_ZnwmSt11align_val_t(i64, i64)\n"
+      "declare i8* @_ZnwmSt11align_val_tRKSt9nothrow_t(i64, i64, %struct*)\n"
+
+      "declare void @\"??3@YAXPEAX@Z\"(i8*)\n"
+      "declare void @\"??3@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??3@YAXPEAX_K@Z\"(i8*, i64)\n"
+      "declare void @\"??_V@YAXPEAX@Z\"(i8*)\n"
+      "declare void @\"??_V@YAXPEAXAEBUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??_V@YAXPEAX_K@Z\"(i8*, i64)\n"
+      "declare i8* @\"??2@YAPAXI@Z\"(i32)\n"
+      "declare i8* @\"??2@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
+      "declare i8* @\"??2@YAPEAX_K@Z\"(i64)\n"
+      "declare i8* @\"??2@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
+      "declare i8* @\"??_U@YAPAXI@Z\"(i32)\n"
+      "declare i8* @\"??_U@YAPAXIABUnothrow_t@std@@@Z\"(i32, %struct*)\n"
+      "declare i8* @\"??_U@YAPEAX_K@Z\"(i64)\n"
+      "declare i8* @\"??_U@YAPEAX_KAEBUnothrow_t@std@@@Z\"(i64, %struct*)\n"
+
+      "declare void @\"??3@YAXPAX@Z\"(i8*)\n"
+      "declare void @\"??3@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??3@YAXPAXI@Z\"(i8*, i32)\n"
+      "declare void @\"??_V@YAXPAX@Z\"(i8*)\n"
+      "declare void @\"??_V@YAXPAXABUnothrow_t@std@@@Z\"(i8*, %struct*)\n"
+      "declare void @\"??_V@YAXPAXI@Z\"(i8*, i32)\n"
+
+      // These other functions were derived from the .def C declaration.
+      "declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*)\n"
+      "declare void @__cxa_guard_abort(%struct*)\n"
+      "declare i32 @__cxa_guard_acquire(%struct*)\n"
+      "declare void @__cxa_guard_release(%struct*)\n"
+
+      "declare i32 @__nvvm_reflect(i8*)\n"
+
+      "declare i8* @__memcpy_chk(i8*, i8*, i64, i64)\n"
+      "declare i8* @__memmove_chk(i8*, i8*, i64, i64)\n"
+      "declare i8* @__memset_chk(i8*, i32, i64, i64)\n"
+      "declare i8* @__stpcpy_chk(i8*, i8*, i64)\n"
+      "declare i8* @__stpncpy_chk(i8*, i8*, i64, i64)\n"
+      "declare i8* @__strcpy_chk(i8*, i8*, i64)\n"
+      "declare i8* @__strncpy_chk(i8*, i8*, i64, i64)\n"
+
+      "declare i8* @memalign(i64, i64)\n"
+      "declare i8* @mempcpy(i8*, i8*, i64)\n"
+      "declare i8* @memrchr(i8*, i32, i64)\n"
+
+      // These are similar to the FILE* fgetc/fputc.
+      "declare i32 @_IO_getc(%struct*)\n"
+      "declare i32 @_IO_putc(i32, %struct*)\n"
+
+      "declare i32 @__isoc99_scanf(i8*, ...)\n"
+      "declare i32 @__isoc99_sscanf(i8*, i8*, ...)\n"
+      "declare i8* @__strdup(i8*)\n"
+      "declare i8* @__strndup(i8*, i64)\n"
+      "declare i8* @__strtok_r(i8*, i8*, i8**)\n"
+
+      "declare double @__sqrt_finite(double)\n"
+      "declare float @__sqrtf_finite(float)\n"
+      "declare x86_fp80 @__sqrtl_finite(x86_fp80)\n"
+      "declare double @exp10(double)\n"
+      "declare float @exp10f(float)\n"
+      "declare x86_fp80 @exp10l(x86_fp80)\n"
+
+      // These printf variants have the same prototype as the non-'i' versions.
+      "declare i32 @fiprintf(%struct*, i8*, ...)\n"
+      "declare i32 @iprintf(i8*, ...)\n"
+      "declare i32 @siprintf(i8*, i8*, ...)\n"
+
+      "declare i32 @htonl(i32)\n"
+      "declare i16 @htons(i16)\n"
+      "declare i32 @ntohl(i32)\n"
+      "declare i16 @ntohs(i16)\n"
+
+      "declare i32 @isascii(i32)\n"
+      "declare i32 @isdigit(i32)\n"
+      "declare i32 @toascii(i32)\n"
+
+      // These functions were extracted from math-finite.h which provides
+      // functions similar to those in math.h, but optimized for handling
+      // finite values only.
+      "declare double @__acos_finite(double)\n"
+      "declare float @__acosf_finite(float)\n"
+      "declare x86_fp80 @__acosl_finite(x86_fp80)\n"
+      "declare double @__acosh_finite(double)\n"
+      "declare float @__acoshf_finite(float)\n"
+      "declare x86_fp80 @__acoshl_finite(x86_fp80)\n"
+      "declare double @__asin_finite(double)\n"
+      "declare float @__asinf_finite(float)\n"
+      "declare x86_fp80 @__asinl_finite(x86_fp80)\n"
+      "declare double @__atan2_finite(double, double)\n"
+      "declare float @__atan2f_finite(float, float)\n"
+      "declare x86_fp80 @__atan2l_finite(x86_fp80, x86_fp80)\n"
+      "declare double @__atanh_finite(double)\n"
+      "declare float @__atanhf_finite(float)\n"
+      "declare x86_fp80 @__atanhl_finite(x86_fp80)\n"
+      "declare double @__cosh_finite(double)\n"
+      "declare float @__coshf_finite(float)\n"
+      "declare x86_fp80 @__coshl_finite(x86_fp80)\n"
+      "declare double @__exp10_finite(double)\n"
+      "declare float @__exp10f_finite(float)\n"
+      "declare x86_fp80 @__exp10l_finite(x86_fp80)\n"
+      "declare double @__exp2_finite(double)\n"
+      "declare float @__exp2f_finite(float)\n"
+      "declare x86_fp80 @__exp2l_finite(x86_fp80)\n"
+      "declare double @__exp_finite(double)\n"
+      "declare float @__expf_finite(float)\n"
+      "declare x86_fp80 @__expl_finite(x86_fp80)\n"
+      "declare double @__log10_finite(double)\n"
+      "declare float @__log10f_finite(float)\n"
+      "declare x86_fp80 @__log10l_finite(x86_fp80)\n"
+      "declare double @__log2_finite(double)\n"
+      "declare float @__log2f_finite(float)\n"
+      "declare x86_fp80 @__log2l_finite(x86_fp80)\n"
+      "declare double @__log_finite(double)\n"
+      "declare float @__logf_finite(float)\n"
+      "declare x86_fp80 @__logl_finite(x86_fp80)\n"
+      "declare double @__pow_finite(double, double)\n"
+      "declare float @__powf_finite(float, float)\n"
+      "declare x86_fp80 @__powl_finite(x86_fp80, x86_fp80)\n"
+      "declare double @__sinh_finite(double)\n"
+      "declare float @__sinhf_finite(float)\n"
+      "declare x86_fp80 @__sinhl_finite(x86_fp80)\n"
+      );
 
   for (unsigned FI = 0; FI != LibFunc::NumLibFuncs; ++FI) {
     LibFunc LF = (LibFunc)FI;
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index f391ca12e55208e220f809c87b51f2682c923b9c..c4adde4abe345b537b4e93635e6ca3e3793787bd 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -149,7 +149,205 @@ TEST_F(MatchSelectPatternTest, FMinConstantZeroNsz) {
   expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
-TEST_F(MatchSelectPatternTest, VectorFMinNaN) {
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero1) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float -0.0, %a\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero2) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, -0.0\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero3) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float 0.0, %a\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero4) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, 0.0\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_NAN, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero5) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float -0.0, %a\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero6) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, -0.0\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero7) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float 0.0, %a\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZero8) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, 0.0\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero1) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float -0.0, %a\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero2) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, -0.0\n"
+      "  %A = select i1 %1, float 0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero3) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float 0.0, %a\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero4) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float %a, 0.0\n"
+      "  %A = select i1 %1, float -0.0, float %a\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_NAN, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero5) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float -0.0, %a\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero6) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, -0.0\n"
+      "  %A = select i1 %1, float %a, float 0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero7) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp olt float 0.0, %a\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZero8) {
+  parseAssembly(
+      "define float @test(float %a) {\n"
+      "  %1 = fcmp ogt float %a, 0.0\n"
+      "  %A = select i1 %1, float %a, float -0.0\n"
+      "  ret float %A\n"
+      "}\n");
+  // The sign of zero doesn't matter in fcmp.
+  expectPattern({SPF_FMAXNUM, SPNB_RETURNS_OTHER, true});
+}
+
+TEST_F(MatchSelectPatternTest, FMinMismatchConstantZeroVecUndef) {
+  parseAssembly(
+      "define <2 x float> @test(<2 x float> %a) {\n"
+      "  %1 = fcmp ogt <2 x float> %a, <float -0.0, float -0.0>\n"
+      "  %A = select <2 x i1> %1, <2 x float> <float undef, float 0.0>, <2 x float> %a\n"
+      "  ret <2 x float> %A\n"
+      "}\n");
+  // An undef in a vector constant can not be back-propagated for this analysis.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, FMaxMismatchConstantZeroVecUndef) {
+  parseAssembly(
+      "define <2 x float> @test(<2 x float> %a) {\n"
+      "  %1 = fcmp ogt <2 x float> %a, zeroinitializer\n"
+      "  %A = select <2 x i1> %1, <2 x float> %a, <2 x float> <float -0.0, float undef>\n"
+      "  ret <2 x float> %A\n"
+      "}\n");
+  // An undef in a vector constant can not be back-propagated for this analysis.
+  expectPattern({SPF_UNKNOWN, SPNB_NA, false});
+}
+
+TEST_F(MatchSelectPatternTest, VectorFMinimum) {
   parseAssembly(
       "define <4 x float> @test(<4 x float> %a) {\n"
       "  %1 = fcmp ule <4 x float> %a, \n"
@@ -177,7 +375,7 @@ TEST_F(MatchSelectPatternTest, VectorFMinOtherOrdered) {
   expectPattern({SPF_FMINNUM, SPNB_RETURNS_OTHER, true});
 }
 
-TEST_F(MatchSelectPatternTest, VectorNotFMinNaN) {
+TEST_F(MatchSelectPatternTest, VectorNotFMinimum) {
   parseAssembly(
       "define <4 x float> @test(<4 x float> %a) {\n"
       "  %1 = fcmp ule <4 x float> %a, \n"
@@ -318,6 +516,48 @@ TEST(ValueTracking, ComputeNumSignBits_PR32045) {
   EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
 }
 
+// No guarantees for canonical IR in this analysis, so this just bails out. 
+TEST(ValueTracking, ComputeNumSignBits_Shuffle) {
+  StringRef Assembly = "define <2 x i32> @f() { "
+                       "  %val = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 0> "
+                       "  ret <2 x i32> %val "
+                       "} ";
+
+  LLVMContext Context;
+  SMDiagnostic Error;
+  auto M = parseAssemblyString(Assembly, Error, Context);
+  assert(M && "Bad assembly?");
+
+  auto *F = M->getFunction("f");
+  assert(F && "Bad assembly?");
+
+  auto *RVal =
+      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
+  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+}
+
+// No guarantees for canonical IR in this analysis, so a shuffle element that
+// references an undef value means this can't return any extra information. 
+TEST(ValueTracking, ComputeNumSignBits_Shuffle2) {
+  StringRef Assembly = "define <2 x i32> @f(<2 x i1> %x) { "
+                       "  %sext = sext <2 x i1> %x to <2 x i32> "
+                       "  %val = shufflevector <2 x i32> %sext, <2 x i32> undef, <2 x i32> <i32 0, i32 2> "
+                       "  ret <2 x i32> %val "
+                       "} ";
+
+  LLVMContext Context;
+  SMDiagnostic Error;
+  auto M = parseAssemblyString(Assembly, Error, Context);
+  assert(M && "Bad assembly?");
+
+  auto *F = M->getFunction("f");
+  assert(F && "Bad assembly?");
+
+  auto *RVal =
+      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
+  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+}
+
 TEST(ValueTracking, ComputeKnownBits) {
   StringRef Assembly = "define i32 @f(i32 %a, i32 %b) { "
                        "  %ash = mul i32 %a, 8 "
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index bc41ab66a23b6939724595e6c259191b8fc90a46..5dba2de4a88b5e8b5f9159f13a7f34b3c94f07a3 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -26,6 +26,7 @@ add_subdirectory(MI)
 add_subdirectory(Object)
 add_subdirectory(ObjectYAML)
 add_subdirectory(Option)
+add_subdirectory(OptRemarks)
 add_subdirectory(Passes)
 add_subdirectory(ProfileData)
 add_subdirectory(Support)
diff --git a/unittests/CodeGen/AArch64SelectionDAGTest.cpp b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
index 620dfc8d234ba1f7140879ff11c5ea360715f68b..0c184d371875c5260c863a889c9f25fd1ddb03c0 100644
--- a/unittests/CodeGen/AArch64SelectionDAGTest.cpp
+++ b/unittests/CodeGen/AArch64SelectionDAGTest.cpp
@@ -42,8 +42,9 @@ protected:
       return;
 
     TargetOptions Options;
-    TM = std::unique_ptr<TargetMachine>(T->createTargetMachine(
-        "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive));
+    TM = std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+        T->createTargetMachine("AArch64", "", "", Options, None, None,
+                               CodeGenOpt::Aggressive)));
     if (!TM)
       return;
 
@@ -70,7 +71,7 @@ protected:
   }
 
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = nullptr;
+  std::unique_ptr<LLVMTargetMachine> TM;
   std::unique_ptr<Module> M;
   Function *F;
   std::unique_ptr<MachineFunction> MF;
@@ -86,7 +87,7 @@ TEST_F(AArch64SelectionDAGTest, computeKnownBits_ZERO_EXTEND_VECTOR_INREG) {
   auto InVecVT = EVT::getVectorVT(Context, Int8VT, 4);
   auto OutVecVT = EVT::getVectorVT(Context, Int16VT, 2);
   auto InVec = DAG->getConstant(0, Loc, InVecVT);
-  auto Op = DAG->getZeroExtendVectorInReg(InVec, Loc, OutVecVT);
+  auto Op = DAG->getNode(ISD::ZERO_EXTEND_VECTOR_INREG, Loc, OutVecVT, InVec);
   auto DemandedElts = APInt(4, 15);
   KnownBits Known;
   DAG->computeKnownBits(Op, Known, DemandedElts);
@@ -118,7 +119,7 @@ TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_SIGN_EXTEND_VECTOR_INREG) {
   auto InVecVT = EVT::getVectorVT(Context, Int8VT, 4);
   auto OutVecVT = EVT::getVectorVT(Context, Int16VT, 2);
   auto InVec = DAG->getConstant(1, Loc, InVecVT);
-  auto Op = DAG->getSignExtendVectorInReg(InVec, Loc, OutVecVT);
+  auto Op = DAG->getNode(ISD::SIGN_EXTEND_VECTOR_INREG, Loc, OutVecVT, InVec);
   auto DemandedElts = APInt(4, 15);
   EXPECT_EQ(DAG->ComputeNumSignBits(Op, DemandedElts), 15u);
 }
diff --git a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
index ca1aed544d20b63bd4556218f1235145d1b03e03..28af811e1f1701740196dbcec4805b3886399fa0 100644
--- a/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
+++ b/unittests/CodeGen/GlobalISel/LegalizerHelperTest.h
@@ -44,7 +44,7 @@ void initLLVM() {
 
 /// Create a TargetMachine. As we lack a dedicated always available target for
 /// unittests, we go for "AArch64".
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   Triple TargetTriple("aarch64--");
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -52,8 +52,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
     return nullptr;
 
   TargetOptions Options;
-  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
-      "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      T->createTargetMachine("AArch64", "", "", Options, None, None,
+                             CodeGenOpt::Aggressive)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
@@ -79,7 +80,7 @@ std::unique_ptr<Module> parseMIR(LLVMContext &Context,
 }
 
 std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>>
-createDummyModule(LLVMContext &Context, const TargetMachine &TM,
+createDummyModule(LLVMContext &Context, const LLVMTargetMachine &TM,
                   StringRef MIRFunc) {
   SmallString<512> S;
   StringRef MIRString = (Twine(R"MIR(
@@ -136,7 +137,7 @@ protected:
     B.setInsertPt(*EntryMBB, EntryMBB->end());
   }
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM;
+  std::unique_ptr<LLVMTargetMachine> TM;
   MachineFunction *MF;
   std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>>
       ModuleMMIPair;
diff --git a/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
index 8f17b1991df5a0aa1695782e88ec16571d3844c9..1f3a690ad0155a7d8380f7132ba9d084a2cda439 100644
--- a/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
+++ b/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp
@@ -43,7 +43,7 @@ void initLLVM() {
 
 /// Create a TargetMachine. As we lack a dedicated always available target for
 /// unittests, we go for "AArch64".
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   Triple TargetTriple("aarch64--");
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -51,8 +51,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
     return nullptr;
 
   TargetOptions Options;
-  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
-      "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      T->createTargetMachine("AArch64", "", "", Options, None, None,
+                             CodeGenOpt::Aggressive)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
@@ -78,7 +79,7 @@ std::unique_ptr<Module> parseMIR(LLVMContext &Context,
 }
 
 std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>>
-createDummyModule(LLVMContext &Context, const TargetMachine &TM,
+createDummyModule(LLVMContext &Context, const LLVMTargetMachine &TM,
                   StringRef MIRFunc) {
   SmallString<512> S;
   StringRef MIRString = (Twine(R"MIR(
@@ -122,7 +123,7 @@ static void collectCopies(SmallVectorImpl<unsigned> &Copies,
 
 TEST(PatternMatchInstr, MatchIntConstant) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -143,7 +144,7 @@ TEST(PatternMatchInstr, MatchIntConstant) {
 
 TEST(PatternMatchInstr, MatchBinaryOp) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -270,7 +271,7 @@ TEST(PatternMatchInstr, MatchBinaryOp) {
 
 TEST(PatternMatchInstr, MatchFPUnaryOp) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -341,7 +342,7 @@ TEST(PatternMatchInstr, MatchFPUnaryOp) {
 
 TEST(PatternMatchInstr, MatchExtendsTrunc) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -397,7 +398,7 @@ TEST(PatternMatchInstr, MatchExtendsTrunc) {
 
 TEST(PatternMatchInstr, MatchSpecificType) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
@@ -444,7 +445,7 @@ TEST(PatternMatchInstr, MatchSpecificType) {
 
 TEST(PatternMatchInstr, MatchCombinators) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   if (!TM)
     return;
   auto ModuleMMIPair = createDummyModule(Context, *TM, "");
diff --git a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
index 1be0363adb00604e51553458dfd6a2c3b2fa48e5..ffbde2df2bc9f216707a092429c1e17ad6de560c 100644
--- a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
+++ b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
@@ -1227,6 +1227,10 @@ TEST(DWARFDebugInfo, TestRelations) {
   EXPECT_THAT(std::vector<DWARFDie>(A.rbegin(), A.rend()),
               testing::ElementsAre(D, C, B));
 
+  // Make sure conversion from reverse iterator works as expected.
+  EXPECT_EQ(A.rbegin().base(), A.end());
+  EXPECT_EQ(A.rend().base(), A.begin());
+
   // Make sure iterator is bidirectional.
   {
     auto Begin = A.begin();
diff --git a/unittests/DebugInfo/PDB/PDBApiTest.cpp b/unittests/DebugInfo/PDB/PDBApiTest.cpp
index 948bde1bf726b7ed377cdc4cd41f52ce21ef3d7e..007ea9040856be9716595b23d82fcb45b3438c08 100644
--- a/unittests/DebugInfo/PDB/PDBApiTest.cpp
+++ b/unittests/DebugInfo/PDB/PDBApiTest.cpp
@@ -159,6 +159,10 @@ class MockSession : public IPDBSession {
   std::unique_ptr<IPDBEnumSectionContribs> getSectionContribs() const override {
     return nullptr;
   }
+
+  std::unique_ptr<IPDBEnumFrameData> getFrameData() const override {
+    return nullptr;
+  }
 };
 
 class MockRawSymbol : public IPDBRawSymbol {
diff --git a/unittests/Demangle/CMakeLists.txt b/unittests/Demangle/CMakeLists.txt
index 48d959c085265d827bc72a0dab04f0e9239f6a09..2f9d71a37e9f2e1817c8243261875d3d9881c52d 100644
--- a/unittests/Demangle/CMakeLists.txt
+++ b/unittests/Demangle/CMakeLists.txt
@@ -1,8 +1,10 @@
 set(LLVM_LINK_COMPONENTS
   Demangle
+  Support
 )
 
 add_llvm_unittest(DemangleTests
+  ItaniumDemangleTest.cpp
   PartialDemangleTest.cpp
   FindTypesInMangledNameTest.cpp
 )
diff --git a/unittests/Demangle/ItaniumDemangleTest.cpp b/unittests/Demangle/ItaniumDemangleTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..abb690c626a4518219fd48fd05965de004a4c258
--- /dev/null
+++ b/unittests/Demangle/ItaniumDemangleTest.cpp
@@ -0,0 +1,54 @@
+//===------------------ ItaniumDemangleTest.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Demangle/ItaniumDemangle.h"
+#include "llvm/Support/Allocator.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <cstdlib>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::itanium_demangle;
+
+namespace {
+class TestAllocator {
+  BumpPtrAllocator Alloc;
+
+public:
+  void reset() { Alloc.Reset(); }
+
+  template <typename T, typename... Args> T *makeNode(Args &&... args) {
+    return new (Alloc.Allocate(sizeof(T), alignof(T)))
+        T(std::forward<Args>(args)...);
+  }
+
+  void *allocateNodeArray(size_t sz) {
+    return Alloc.Allocate(sizeof(Node *) * sz, alignof(Node *));
+  }
+};
+} // namespace
+
+TEST(ItaniumDemangle, MethodOverride) {
+  struct TestParser : AbstractManglingParser<TestParser, TestAllocator> {
+    std::vector<char> Types;
+
+    TestParser(const char *Str)
+        : AbstractManglingParser(Str, Str + strlen(Str)) {}
+
+    Node *parseType() {
+      Types.push_back(*First);
+      return AbstractManglingParser<TestParser, TestAllocator>::parseType();
+    }
+  };
+
+  TestParser Parser("_Z1fIiEjl");
+  ASSERT_NE(nullptr, Parser.parse());
+  EXPECT_THAT(Parser.Types, testing::ElementsAre('i', 'j', 'l'));
+}
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt
index 8b0d5fc243544e5346c18578a108d5de6c83749c..019437d4ad5eeb7ad0ec5fc47854b677354b43cf 100644
--- a/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -10,7 +10,6 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_unittest(OrcJITTests
-  CompileOnDemandLayerTest.cpp
   CoreAPIsTest.cpp
   IndirectionUtilsTest.cpp
   GlobalMappingLayerTest.cpp
@@ -18,6 +17,8 @@ add_llvm_unittest(OrcJITTests
   LazyCallThroughAndReexportsTest.cpp
   LazyEmittingLayerTest.cpp
   LegacyAPIInteropTest.cpp
+  LegacyCompileOnDemandLayerTest.cpp
+  LegacyRTDyldObjectLinkingLayerTest.cpp
   ObjectTransformLayerTest.cpp
   OrcCAPITest.cpp
   OrcTestCommon.cpp
@@ -25,7 +26,6 @@ add_llvm_unittest(OrcJITTests
   RemoteObjectLayerTest.cpp
   RPCUtilsTest.cpp
   RTDyldObjectLinkingLayerTest.cpp
-  RTDyldObjectLinkingLayer2Test.cpp
   SymbolStringPoolTest.cpp
   ThreadSafeModuleTest.cpp
   )
diff --git a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index cd742187ffb02b6dde65968c0dd070cbf6740adb..22be76a2eb6d27dabe2eb59aadad0ec30a364909 100644
--- a/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -48,7 +48,8 @@ TEST_F(CoreAPIsStandardTest, BasicSuccessfulLookup) {
         FooMR = std::make_shared<MaterializationResponsibility>(std::move(R));
       })));
 
-  ES.lookup({&JD}, {Foo}, OnResolution, OnReady, NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, OnResolution, OnReady,
+            NoDependenciesToRegister);
 
   EXPECT_FALSE(OnResolutionRun) << "Should not have been resolved yet";
   EXPECT_FALSE(OnReadyRun) << "Should not have been marked ready yet";
@@ -101,7 +102,8 @@ TEST_F(CoreAPIsStandardTest, EmptyLookup) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, {}, OnResolution, OnReady, NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {}, OnResolution, OnReady,
+            NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved was not run for empty query";
   EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query";
@@ -148,7 +150,7 @@ TEST_F(CoreAPIsStandardTest, RemoveSymbolsTest) {
 
   bool OnResolvedRun = false;
   bool OnReadyRun = false;
-  ES.lookup({&JD}, {Foo, Baz},
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo, Baz},
             [&](Expected<SymbolMap> Result) {
               EXPECT_TRUE(!!Result) << "OnResolved failed unexpectedly";
               consumeError(Result.takeError());
@@ -220,6 +222,26 @@ TEST_F(CoreAPIsStandardTest, ChainedJITDylibLookup) {
   EXPECT_TRUE(OnReadyRun) << "OnReady was not run for empty query";
 }
 
+TEST_F(CoreAPIsStandardTest, LookupWithHiddenSymbols) {
+  auto BarHiddenFlags = BarSym.getFlags() & ~JITSymbolFlags::Exported;
+  auto BarHiddenSym = JITEvaluatedSymbol(BarSym.getAddress(), BarHiddenFlags);
+
+  cantFail(JD.define(absoluteSymbols({{Foo, FooSym}, {Bar, BarHiddenSym}})));
+
+  auto &JD2 = ES.createJITDylib("JD2");
+  cantFail(JD2.define(absoluteSymbols({{Bar, QuxSym}})));
+
+  /// Try a blocking lookup.
+  auto Result = cantFail(
+      ES.lookup(JITDylibSearchList({{&JD, false}, {&JD2, false}}), {Foo, Bar}));
+
+  EXPECT_EQ(Result.size(), 2U) << "Unexpected number of results";
+  EXPECT_EQ(Result.count(Foo), 1U) << "Missing result for \"Foo\"";
+  EXPECT_EQ(Result.count(Bar), 1U) << "Missing result for \"Bar\"";
+  EXPECT_EQ(Result[Bar].getAddress(), QuxSym.getAddress())
+      << "Wrong result for \"Bar\"";
+}
+
 TEST_F(CoreAPIsStandardTest, LookupFlagsTest) {
   // Test that lookupFlags works on a predefined symbol, and does not trigger
   // materialization of a lazy symbol. Make the lazy symbol weak to test that
@@ -257,7 +279,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicAliases) {
                                     {Qux, {Bar, JITSymbolFlags::Weak}}})));
   cantFail(JD.define(absoluteSymbols({{Qux, QuxSym}})));
 
-  auto Result = lookup({&JD}, {Baz, Qux});
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), {Baz, Qux});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
   EXPECT_EQ(Result->count(Qux), 1U) << "No result for \"qux\"";
@@ -272,7 +294,7 @@ TEST_F(CoreAPIsStandardTest, TestChainedAliases) {
   cantFail(JD.define(symbolAliases(
       {{Baz, {Bar, BazSym.getFlags()}}, {Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = lookup({&JD}, {Bar, Baz});
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), {Bar, Baz});
   EXPECT_TRUE(!!Result) << "Unexpected lookup failure";
   EXPECT_EQ(Result->count(Bar), 1U) << "No result for \"bar\"";
   EXPECT_EQ(Result->count(Baz), 1U) << "No result for \"baz\"";
@@ -291,7 +313,7 @@ TEST_F(CoreAPIsStandardTest, TestBasicReExports) {
 
   cantFail(JD2.define(reexports(JD, {{Bar, {Foo, BarSym.getFlags()}}})));
 
-  auto Result = cantFail(lookup({&JD2}, Bar));
+  auto Result = cantFail(ES.lookup(JITDylibSearchList({{&JD2, false}}), Bar));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Bar for symbol Foo should match FooSym's address";
 }
@@ -317,30 +339,28 @@ TEST_F(CoreAPIsStandardTest, TestThatReExportsDontUnnecessarilyMaterialize) {
   cantFail(JD2.define(reexports(
       JD, {{Baz, {Foo, BazSym.getFlags()}}, {Qux, {Bar, QuxSym.getFlags()}}})));
 
-  auto Result = cantFail(lookup({&JD2}, Baz));
+  auto Result = cantFail(ES.lookup(JITDylibSearchList({{&JD2, false}}), Baz));
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Re-export Baz for symbol Foo should match FooSym's address";
 
   EXPECT_FALSE(BarMaterialized) << "Bar should not have been materialized";
 }
 
-TEST_F(CoreAPIsStandardTest, TestReexportsFallbackGenerator) {
-  // Test that a re-exports fallback generator can dynamically generate
-  // reexports.
+TEST_F(CoreAPIsStandardTest, TestReexportsGenerator) {
+  // Test that a re-exports generator can dynamically generate reexports.
 
   auto &JD2 = ES.createJITDylib("JD2");
   cantFail(JD2.define(absoluteSymbols({{Foo, FooSym}, {Bar, BarSym}})));
 
   auto Filter = [this](SymbolStringPtr Name) { return Name != Bar; };
 
-  JD.setFallbackDefinitionGenerator(
-      ReexportsFallbackDefinitionGenerator(JD2, Filter));
+  JD.setGenerator(ReexportsGenerator(JD2, false, Filter));
 
   auto Flags = JD.lookupFlags({Foo, Bar, Baz});
   EXPECT_EQ(Flags.size(), 1U) << "Unexpected number of results";
   EXPECT_EQ(Flags[Foo], FooSym.getFlags()) << "Unexpected flags for Foo";
 
-  auto Result = cantFail(lookup({&JD}, Foo));
+  auto Result = cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   EXPECT_EQ(Result.getAddress(), FooSym.getAddress())
       << "Incorrect reexported symbol address";
@@ -361,8 +381,8 @@ TEST_F(CoreAPIsStandardTest, TestTrivialCircularDependency) {
     FooReady = true;
   };
 
-  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   FooR->resolve({{Foo, FooSym}});
   FooR->emit();
@@ -418,7 +438,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
 
   // Issue a lookup for Foo. Use NoDependenciesToRegister: We're going to add
   // the dependencies manually below.
-  ES.lookup({&JD}, {Foo}, std::move(OnFooResolution), std::move(OnFooReady),
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo},
+            std::move(OnFooResolution), std::move(OnFooReady),
             NoDependenciesToRegister);
 
   bool BarResolved = false;
@@ -433,7 +454,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BarReady = true;
   };
 
-  ES.lookup({&JD}, {Bar}, std::move(OnBarResolution), std::move(OnBarReady),
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Bar},
+            std::move(OnBarResolution), std::move(OnBarReady),
             NoDependenciesToRegister);
 
   bool BazResolved = false;
@@ -449,7 +471,8 @@ TEST_F(CoreAPIsStandardTest, TestCircularDependenceInOneJITDylib) {
     BazReady = true;
   };
 
-  ES.lookup({&JD}, {Baz}, std::move(OnBazResolution), std::move(OnBazReady),
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Baz},
+            std::move(OnBazResolution), std::move(OnBazReady),
             NoDependenciesToRegister);
 
   // Add a circular dependency: Foo -> Bar, Bar -> Baz, Baz -> Foo.
@@ -572,8 +595,8 @@ TEST_F(CoreAPIsStandardTest, AddAndMaterializeLazySymbol) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, Names, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), Names, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   EXPECT_TRUE(FooMaterialized) << "Foo was not materialized";
   EXPECT_TRUE(BarDiscarded) << "Bar was not discarded";
@@ -621,8 +644,8 @@ TEST_F(CoreAPIsStandardTest, TestBasicWeakSymbolMaterialization) {
     OnReadyRun = true;
   };
 
-  ES.lookup({&JD}, {Bar}, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Bar}, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   EXPECT_TRUE(OnResolvedRun) << "OnResolved not run";
   EXPECT_TRUE(OnReadyRun) << "OnReady not run";
@@ -650,27 +673,27 @@ TEST_F(CoreAPIsStandardTest, DefineMaterializingSymbol) {
       });
 
   cantFail(JD.define(MU));
-  cantFail(lookup({&JD}, Foo));
+  cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   // Assert that materialization is complete by now.
   ExpectNoMoreMaterialization = true;
 
   // Look up bar to verify that no further materialization happens.
-  auto BarResult = cantFail(lookup({&JD}, Bar));
+  auto BarResult = cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Bar));
   EXPECT_EQ(BarResult.getAddress(), BarSym.getAddress())
       << "Expected Bar == BarSym";
 }
 
-TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) {
+TEST_F(CoreAPIsStandardTest, GeneratorTest) {
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  JD.setFallbackDefinitionGenerator(
-      [&](JITDylib &JD2, const SymbolNameSet &Names) {
-        cantFail(JD2.define(absoluteSymbols({{Bar, BarSym}})));
-        return SymbolNameSet({Bar});
-      });
+  JD.setGenerator([&](JITDylib &JD2, const SymbolNameSet &Names) {
+    cantFail(JD2.define(absoluteSymbols({{Bar, BarSym}})));
+    return SymbolNameSet({Bar});
+  });
 
-  auto Result = cantFail(lookup({&JD}, {Foo, Bar}));
+  auto Result =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo, Bar}));
 
   EXPECT_EQ(Result.count(Bar), 1U) << "Expected to find fallback def for 'bar'";
   EXPECT_EQ(Result[Bar].getAddress(), BarSym.getAddress())
@@ -679,14 +702,14 @@ TEST_F(CoreAPIsStandardTest, FallbackDefinitionGeneratorTest) {
 
 TEST_F(CoreAPIsStandardTest, FailResolution) {
   auto MU = llvm::make_unique<SimpleMaterializationUnit>(
-      SymbolFlagsMap(
-          {{Foo, JITSymbolFlags::Weak}, {Bar, JITSymbolFlags::Weak}}),
+      SymbolFlagsMap({{Foo, JITSymbolFlags::Exported | JITSymbolFlags::Weak},
+                      {Bar, JITSymbolFlags::Exported | JITSymbolFlags::Weak}}),
       [&](MaterializationResponsibility R) { R.failMaterialization(); });
 
   cantFail(JD.define(MU));
 
   SymbolNameSet Names({Foo, Bar});
-  auto Result = lookup({&JD}, Names);
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), Names);
 
   EXPECT_FALSE(!!Result) << "Expected failure";
   if (!Result) {
@@ -718,7 +741,8 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithUnthreadedMaterialization) {
 
   cantFail(JD.define(MU));
 
-  auto FooLookupResult = cantFail(lookup({&JD}, Foo));
+  auto FooLookupResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -739,7 +763,8 @@ TEST_F(CoreAPIsStandardTest, TestLookupWithThreadedMaterialization) {
 
   cantFail(JD.define(absoluteSymbols({{Foo, FooSym}})));
 
-  auto FooLookupResult = cantFail(lookup({&JD}, Foo));
+  auto FooLookupResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
 
   EXPECT_EQ(FooLookupResult.getAddress(), FooSym.getAddress())
       << "lookup returned an incorrect address";
@@ -787,14 +812,16 @@ TEST_F(CoreAPIsStandardTest, TestGetRequestedSymbolsAndReplace) {
   EXPECT_FALSE(FooMaterialized) << "Foo should not be materialized yet";
   EXPECT_FALSE(BarMaterialized) << "Bar should not be materialized yet";
 
-  auto FooSymResult = cantFail(lookup({&JD}, Foo));
+  auto FooSymResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Foo));
   EXPECT_EQ(FooSymResult.getAddress(), FooSym.getAddress())
       << "Address mismatch for Foo";
 
   EXPECT_TRUE(FooMaterialized) << "Foo should be materialized now";
   EXPECT_FALSE(BarMaterialized) << "Bar still should not be materialized";
 
-  auto BarSymResult = cantFail(lookup({&JD}, Bar));
+  auto BarSymResult =
+      cantFail(ES.lookup(JITDylibSearchList({{&JD, false}}), Bar));
   EXPECT_EQ(BarSymResult.getAddress(), BarSym.getAddress())
       << "Address mismatch for Bar";
   EXPECT_TRUE(BarMaterialized) << "Bar should be materialized now";
@@ -814,7 +841,7 @@ TEST_F(CoreAPIsStandardTest, TestMaterializationResponsibilityDelegation) {
 
   cantFail(JD.define(MU));
 
-  auto Result = lookup({&JD}, {Foo, Bar});
+  auto Result = ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo, Bar});
 
   EXPECT_TRUE(!!Result) << "Result should be a success value";
   EXPECT_EQ(Result->count(Foo), 1U) << "\"Foo\" entry missing";
@@ -846,8 +873,8 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) {
 
   auto OnReady = [](Error Err) { cantFail(std::move(Err)); };
 
-  ES.lookup({&JD}, {Foo}, std::move(OnResolution), std::move(OnReady),
-            NoDependenciesToRegister);
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, std::move(OnResolution),
+            std::move(OnReady), NoDependenciesToRegister);
 
   auto MU2 = llvm::make_unique<SimpleMaterializationUnit>(
       SymbolFlagsMap({{Foo, JITSymbolFlags::Exported}}),
@@ -865,14 +892,4 @@ TEST_F(CoreAPIsStandardTest, TestMaterializeWeakSymbol) {
   FooResponsibility->emit();
 }
 
-TEST_F(CoreAPIsStandardTest, TestMainJITDylibAndDefaultLookupOrder) {
-  cantFail(ES.getMainJITDylib().define(absoluteSymbols({{Foo, FooSym}})));
-  auto Results = cantFail(ES.lookup({Foo}));
-
-  EXPECT_EQ(Results.size(), 1U) << "Incorrect number of results";
-  EXPECT_EQ(Results.count(Foo), 1U) << "Expected result for 'Foo'";
-  EXPECT_EQ(Results[Foo].getAddress(), FooSym.getAddress())
-      << "Expected result address to match Foo's address";
-}
-
 } // namespace
diff --git a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp b/unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp
similarity index 95%
rename from unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp
rename to unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp
index 9aa4437550b221c4e6d58f5215f185b1c3c69e10..38f7a654571ea98bfd870c839c0390d348429897 100644
--- a/unittests/ExecutionEngine/Orc/CompileOnDemandLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/LegacyCompileOnDemandLayerTest.cpp
@@ -54,7 +54,7 @@ public:
   }
 };
 
-TEST(CompileOnDemandLayerTest, FindSymbol) {
+TEST(LegacyCompileOnDemandLayerTest, FindSymbol) {
   MockBaseLayer<int, std::shared_ptr<Module>> TestBaseLayer;
   TestBaseLayer.findSymbolImpl =
     [](const std::string &Name, bool) {
@@ -76,7 +76,7 @@ TEST(CompileOnDemandLayerTest, FindSymbol) {
     llvm_unreachable("Should never be called");
   };
 
-  llvm::orc::CompileOnDemandLayer<decltype(TestBaseLayer)> COD(
+  llvm::orc::LegacyCompileOnDemandLayer<decltype(TestBaseLayer)> COD(
       ES, TestBaseLayer, GetResolver, SetResolver,
       [](Function &F) { return std::set<Function *>{&F}; }, CallbackMgr,
       [] { return llvm::make_unique<DummyStubsManager>(); }, true);
diff --git a/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c9c958cc42adb366d972054c3aab9d6381fe665
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/LegacyRTDyldObjectLinkingLayerTest.cpp
@@ -0,0 +1,282 @@
+//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "OrcTestCommon.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/Legacy.h"
+#include "llvm/ExecutionEngine/Orc/NullResolver.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace {
+
+class LegacyRTDyldObjectLinkingLayerExecutionTest : public testing::Test,
+                                              public OrcExecutionTest {
+
+};
+
+class SectionMemoryManagerWrapper : public SectionMemoryManager {
+public:
+  int FinalizationCount = 0;
+  int NeedsToReserveAllocationSpaceCount = 0;
+
+  bool needsToReserveAllocationSpace() override {
+    ++NeedsToReserveAllocationSpaceCount;
+    return SectionMemoryManager::needsToReserveAllocationSpace();
+  }
+
+  bool finalizeMemory(std::string *ErrMsg = nullptr) override {
+    ++FinalizationCount;
+    return SectionMemoryManager::finalizeMemory(ErrMsg);
+  }
+};
+
+TEST(LegacyRTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
+  class MemoryManagerWrapper : public SectionMemoryManager {
+  public:
+    MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
+    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                 unsigned SectionID,
+                                 StringRef SectionName,
+                                 bool IsReadOnly) override {
+      if (SectionName == ".debug_str")
+        DebugSeen = true;
+      return SectionMemoryManager::allocateDataSection(Size, Alignment,
+                                                         SectionID,
+                                                         SectionName,
+                                                         IsReadOnly);
+    }
+  private:
+    bool &DebugSeen;
+  };
+
+  bool DebugSectionSeen = false;
+  auto MM = std::make_shared<MemoryManagerWrapper>(DebugSectionSeen);
+
+  ExecutionSession ES;
+
+  LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey) {
+    return LegacyRTDyldObjectLinkingLayer::Resources{
+        MM, std::make_shared<NullResolver>()};
+  });
+
+  LLVMContext Context;
+  auto M = llvm::make_unique<Module>("", Context);
+  M->setTargetTriple("x86_64-unknown-linux-gnu");
+  Type *Int32Ty = IntegerType::get(Context, 32);
+  GlobalVariable *GV =
+    new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+                         ConstantInt::get(Int32Ty, 42), "foo");
+
+  GV->setSection(".debug_str");
+
+
+  // Initialize the native target in case this is the first unit test
+  // to try to build a TM.
+  OrcNativeTarget::initialize();
+  std::unique_ptr<TargetMachine> TM(
+    EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "",
+                                 SmallVector<std::string, 1>()));
+  if (!TM)
+    return;
+
+  auto Obj = SimpleCompiler(*TM)(*M);
+
+  {
+    // Test with ProcessAllSections = false (the default).
+    auto K = ES.allocateVModule();
+    cantFail(ObjLayer.addObject(
+        K, MemoryBuffer::getMemBufferCopy(Obj->getBuffer())));
+    cantFail(ObjLayer.emitAndFinalize(K));
+    EXPECT_EQ(DebugSectionSeen, false)
+      << "Unexpected debug info section";
+    cantFail(ObjLayer.removeObject(K));
+  }
+
+  {
+    // Test with ProcessAllSections = true.
+    ObjLayer.setProcessAllSections(true);
+    auto K = ES.allocateVModule();
+    cantFail(ObjLayer.addObject(K, std::move(Obj)));
+    cantFail(ObjLayer.emitAndFinalize(K));
+    EXPECT_EQ(DebugSectionSeen, true)
+      << "Expected debug info section not seen";
+    cantFail(ObjLayer.removeObject(K));
+  }
+}
+
+TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
+  if (!SupportsJIT)
+    return;
+
+  ExecutionSession ES;
+
+  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+
+  std::map<orc::VModuleKey, std::shared_ptr<orc::SymbolResolver>> Resolvers;
+
+  LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&](VModuleKey K) {
+    auto I = Resolvers.find(K);
+    assert(I != Resolvers.end() && "Missing resolver");
+    auto R = std::move(I->second);
+    Resolvers.erase(I);
+    return LegacyRTDyldObjectLinkingLayer::Resources{MM, std::move(R)};
+  });
+  SimpleCompiler Compile(*TM);
+
+  // Create a pair of modules that will trigger recursive finalization:
+  // Module 1:
+  //   int bar() { return 42; }
+  // Module 2:
+  //   int bar();
+  //   int foo() { return bar(); }
+  //
+  // Verify that the memory manager is only finalized once (for Module 2).
+  // Failure suggests that finalize is being called on the inner RTDyld
+  // instance (for Module 1) which is unsafe, as it will prevent relocation of
+  // Module 2.
+
+  ModuleBuilder MB1(Context, "", "dummy");
+  {
+    MB1.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("bar");
+    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
+    IRBuilder<> Builder(BarEntry);
+    IntegerType *Int32Ty = IntegerType::get(Context, 32);
+    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
+    Builder.CreateRet(FourtyTwo);
+  }
+
+  auto Obj1 = Compile(*MB1.getModule());
+
+  ModuleBuilder MB2(Context, "", "dummy");
+  {
+    MB2.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarDecl = MB2.createFunctionDecl<int32_t(void)>("bar");
+    Function *FooImpl = MB2.createFunctionDecl<int32_t(void)>("foo");
+    BasicBlock *FooEntry = BasicBlock::Create(Context, "entry", FooImpl);
+    IRBuilder<> Builder(FooEntry);
+    Builder.CreateRet(Builder.CreateCall(BarDecl));
+  }
+  auto Obj2 = Compile(*MB2.getModule());
+
+  auto K1 = ES.allocateVModule();
+  Resolvers[K1] = std::make_shared<NullResolver>();
+  cantFail(ObjLayer.addObject(K1, std::move(Obj1)));
+
+  auto K2 = ES.allocateVModule();
+  auto LegacyLookup = [&](const std::string &Name) {
+    return ObjLayer.findSymbol(Name, true);
+  };
+
+  Resolvers[K2] = createSymbolResolver(
+      [&](const SymbolNameSet &Symbols) {
+        return cantFail(
+            getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup));
+      },
+      [&](std::shared_ptr<AsynchronousSymbolQuery> Query,
+          const SymbolNameSet &Symbols) {
+        return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup);
+      });
+
+  cantFail(ObjLayer.addObject(K2, std::move(Obj2)));
+  cantFail(ObjLayer.emitAndFinalize(K2));
+  cantFail(ObjLayer.removeObject(K2));
+
+  // Finalization of module 2 should trigger finalization of module 1.
+  // Verify that finalize on SMMW is only called once.
+  EXPECT_EQ(MM->FinalizationCount, 1)
+      << "Extra call to finalize";
+}
+
+TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
+  if (!SupportsJIT)
+    return;
+
+  ExecutionSession ES;
+
+  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+
+  LegacyRTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey K) {
+    return LegacyRTDyldObjectLinkingLayer::Resources{
+        MM, std::make_shared<NullResolver>()};
+  });
+  SimpleCompiler Compile(*TM);
+
+  // Create a pair of unrelated modules:
+  //
+  // Module 1:
+  //   int foo() { return 42; }
+  // Module 2:
+  //   int bar() { return 7; }
+  //
+  // Both modules will share a memory manager. We want to verify that the
+  // second object is not loaded before the first one is finalized. To do this
+  // in a portable way, we abuse the
+  // RuntimeDyld::MemoryManager::needsToReserveAllocationSpace hook, which is
+  // called once per object before any sections are allocated.
+
+  ModuleBuilder MB1(Context, "", "dummy");
+  {
+    MB1.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("foo");
+    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
+    IRBuilder<> Builder(BarEntry);
+    IntegerType *Int32Ty = IntegerType::get(Context, 32);
+    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
+    Builder.CreateRet(FourtyTwo);
+  }
+
+  auto Obj1 = Compile(*MB1.getModule());
+
+  ModuleBuilder MB2(Context, "", "dummy");
+  {
+    MB2.getModule()->setDataLayout(TM->createDataLayout());
+    Function *BarImpl = MB2.createFunctionDecl<int32_t(void)>("bar");
+    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
+    IRBuilder<> Builder(BarEntry);
+    IntegerType *Int32Ty = IntegerType::get(Context, 32);
+    Value *Seven = ConstantInt::getSigned(Int32Ty, 7);
+    Builder.CreateRet(Seven);
+  }
+  auto Obj2 = Compile(*MB2.getModule());
+
+  auto K = ES.allocateVModule();
+  cantFail(ObjLayer.addObject(K, std::move(Obj1)));
+  cantFail(ObjLayer.addObject(ES.allocateVModule(), std::move(Obj2)));
+  cantFail(ObjLayer.emitAndFinalize(K));
+  cantFail(ObjLayer.removeObject(K));
+
+  // Only one call to needsToReserveAllocationSpace should have been made.
+  EXPECT_EQ(MM->NeedsToReserveAllocationSpaceCount, 1)
+      << "More than one call to needsToReserveAllocationSpace "
+         "(multiple unrelated objects loaded prior to finalization)";
+}
+
+TEST_F(LegacyRTDyldObjectLinkingLayerExecutionTest, TestNotifyLoadedSignature) {
+  ExecutionSession ES;
+  LegacyRTDyldObjectLinkingLayer ObjLayer(
+      ES,
+      [](VModuleKey) {
+        return LegacyRTDyldObjectLinkingLayer::Resources{
+            nullptr, std::make_shared<NullResolver>()};
+      },
+      [](VModuleKey, const object::ObjectFile &obj,
+         const RuntimeDyld::LoadedObjectInfo &info) {});
+}
+
+} // end anonymous namespace
diff --git a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
index 6ad3c19ada95f6732d50b844ae30969db32c17fa..1c530247a7c0dd903d0b2a474196e1e3b6f2e7df 100644
--- a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
@@ -175,19 +175,19 @@ private:
   }
 };
 
-// Test each operation on ObjectTransformLayer.
-TEST(ObjectTransformLayerTest, Main) {
+// Test each operation on LegacyObjectTransformLayer.
+TEST(LegacyObjectTransformLayerTest, Main) {
   MockBaseLayer M;
 
   ExecutionSession ES(std::make_shared<SymbolStringPool>());
 
   // Create one object transform layer using a transform (as a functor)
   // that allocates new objects, and deals in unique pointers.
-  ObjectTransformLayer<MockBaseLayer, AllocatingTransform> T1(M);
+  LegacyObjectTransformLayer<MockBaseLayer, AllocatingTransform> T1(M);
 
   // Create a second object transform layer using a transform (as a lambda)
   // that mutates objects in place, and deals in naked pointers
-  ObjectTransformLayer<MockBaseLayer,
+  LegacyObjectTransformLayer<MockBaseLayer,
                          std::function<std::shared_ptr<MockObjectFile>(
                            std::shared_ptr<MockObjectFile>)>>
     T2(M, [](std::shared_ptr<MockObjectFile> Obj) {
@@ -257,9 +257,9 @@ TEST(ObjectTransformLayerTest, Main) {
   if (!RunStaticChecks)
     return;
 
-  // Make sure that ObjectTransformLayer implements the object layer concept
+  // Make sure that LegacyObjectTransformLayer implements the object layer concept
   // correctly by sandwitching one between an ObjectLinkingLayer and an
-  // IRCompileLayer, verifying that it compiles if we have a call to the
+  // LegacyIRCompileLayer, verifying that it compiles if we have a call to the
   // IRComileLayer's addModule that should call the transform layer's
   // addObject, and also calling the other public transform layer methods
   // directly to make sure the methods they intend to forward to exist on
@@ -282,8 +282,8 @@ TEST(ObjectTransformLayerTest, Main) {
   };
 
   // Construct the jit layers.
-  RTDyldObjectLinkingLayer BaseLayer(ES, [](VModuleKey) {
-    return RTDyldObjectLinkingLayer::Resources{
+  LegacyRTDyldObjectLinkingLayer BaseLayer(ES, [](VModuleKey) {
+    return LegacyRTDyldObjectLinkingLayer::Resources{
         std::make_shared<llvm::SectionMemoryManager>(),
         std::make_shared<NullResolver>()};
   });
@@ -291,20 +291,20 @@ TEST(ObjectTransformLayerTest, Main) {
   auto IdentityTransform = [](std::unique_ptr<llvm::MemoryBuffer> Obj) {
     return Obj;
   };
-  ObjectTransformLayer<decltype(BaseLayer), decltype(IdentityTransform)>
+  LegacyObjectTransformLayer<decltype(BaseLayer), decltype(IdentityTransform)>
       TransformLayer(BaseLayer, IdentityTransform);
   auto NullCompiler = [](llvm::Module &) {
     return std::unique_ptr<llvm::MemoryBuffer>(nullptr);
   };
-  IRCompileLayer<decltype(TransformLayer), decltype(NullCompiler)>
+  LegacyIRCompileLayer<decltype(TransformLayer), decltype(NullCompiler)>
     CompileLayer(TransformLayer, NullCompiler);
 
-  // Make sure that the calls from IRCompileLayer to ObjectTransformLayer
+  // Make sure that the calls from LegacyIRCompileLayer to LegacyObjectTransformLayer
   // compile.
   cantFail(CompileLayer.addModule(ES.allocateVModule(),
                                   std::unique_ptr<llvm::Module>()));
 
-  // Make sure that the calls from ObjectTransformLayer to ObjectLinkingLayer
+  // Make sure that the calls from LegacyObjectTransformLayer to ObjectLinkingLayer
   // compile.
   VModuleKey DummyKey = ES.allocateVModule();
   cantFail(TransformLayer.emitAndFinalize(DummyKey));
diff --git a/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
index 284a1e37f103af26ad5502ce2c290f0f3942eac9..e76d2fae5e319ab8c73c97050903e82e96d3996a 100644
--- a/unittests/ExecutionEngine/Orc/OrcTestCommon.h
+++ b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
@@ -97,7 +97,7 @@ public:
       orc::SymbolFlagsMap SymbolFlags, MaterializeFunction Materialize,
       DiscardFunction Discard = DiscardFunction(),
       DestructorFunction Destructor = DestructorFunction())
-      : MaterializationUnit(std::move(SymbolFlags)),
+      : MaterializationUnit(std::move(SymbolFlags), orc::VModuleKey()),
         Materialize(std::move(Materialize)), Discard(std::move(Discard)),
         Destructor(std::move(Destructor)) {}
 
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp
deleted file mode 100644
index 1dbd48b597258c2c9879b7bcb917ce83b470aa8e..0000000000000000000000000000000000000000
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayer2Test.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-//===--- RTDyldObjectLinkingLayer2Test.cpp - RTDyld linking layer tests ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "OrcTestCommon.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/Legacy.h"
-#include "llvm/ExecutionEngine/Orc/NullResolver.h"
-#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/LLVMContext.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace llvm::orc;
-
-namespace {
-
-class RTDyldObjectLinkingLayer2ExecutionTest : public testing::Test,
-                                               public OrcExecutionTest {};
-
-// Adds an object with a debug section to RuntimeDyld and then returns whether
-// the debug section was passed to the memory manager.
-static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
-                                      bool ProcessAllSections) {
-  class MemoryManagerWrapper : public SectionMemoryManager {
-  public:
-    MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
-    uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID, StringRef SectionName,
-                                 bool IsReadOnly) override {
-      if (SectionName == ".debug_str")
-        DebugSeen = true;
-      return SectionMemoryManager::allocateDataSection(
-          Size, Alignment, SectionID, SectionName, IsReadOnly);
-    }
-
-  private:
-    bool &DebugSeen;
-  };
-
-  bool DebugSectionSeen = false;
-
-  ExecutionSession ES;
-  auto &JD = ES.createJITDylib("main");
-  auto Foo = ES.intern("foo");
-
-  RTDyldObjectLinkingLayer2 ObjLayer(ES, [&DebugSectionSeen](VModuleKey) {
-    return llvm::make_unique<MemoryManagerWrapper>(DebugSectionSeen);
-  });
-
-  auto OnResolveDoNothing = [](Expected<SymbolMap> R) {
-    cantFail(std::move(R));
-  };
-
-  auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); };
-
-  ObjLayer.setProcessAllSections(ProcessAllSections);
-  auto K = ES.allocateVModule();
-  cantFail(ObjLayer.add(JD, K, std::move(Obj)));
-  ES.lookup({&JD}, {Foo}, OnResolveDoNothing, OnReadyDoNothing,
-            NoDependenciesToRegister);
-  return DebugSectionSeen;
-}
-
-TEST(RTDyldObjectLinkingLayer2Test, TestSetProcessAllSections) {
-  LLVMContext Context;
-  auto M = llvm::make_unique<Module>("", Context);
-  M->setTargetTriple("x86_64-unknown-linux-gnu");
-  Type *Int32Ty = IntegerType::get(Context, 32);
-  GlobalVariable *GV =
-      new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
-                         ConstantInt::get(Int32Ty, 42), "foo");
-
-  GV->setSection(".debug_str");
-
-  // Initialize the native target in case this is the first unit test
-  // to try to build a TM.
-  OrcNativeTarget::initialize();
-  std::unique_ptr<TargetMachine> TM(EngineBuilder().selectTarget(
-      Triple(M->getTargetTriple()), "", "", SmallVector<std::string, 1>()));
-  if (!TM)
-    return;
-
-  auto Obj = SimpleCompiler(*TM)(*M);
-
-  EXPECT_FALSE(testSetProcessAllSections(
-      MemoryBuffer::getMemBufferCopy(Obj->getBuffer()), false))
-      << "Debug section seen despite ProcessAllSections being false";
-  EXPECT_TRUE(testSetProcessAllSections(std::move(Obj), true))
-      << "Expected to see debug section when ProcessAllSections is true";
-}
-
-TEST(RTDyldObjectLinkingLayer2Test, TestOverrideObjectFlags) {
-
-  OrcNativeTarget::initialize();
-
-  std::unique_ptr<TargetMachine> TM(
-      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
-                                   SmallVector<std::string, 1>()));
-
-  if (!TM)
-    return;
-
-  // Our compiler is going to modify symbol visibility settings without telling
-  // ORC. This will test our ability to override the flags later.
-  class FunkySimpleCompiler : public SimpleCompiler {
-  public:
-    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
-
-    CompileResult operator()(Module &M) {
-      auto *Foo = M.getFunction("foo");
-      assert(Foo && "Expected function Foo not found");
-      Foo->setVisibility(GlobalValue::HiddenVisibility);
-      return SimpleCompiler::operator()(M);
-    }
-  };
-
-  // Create a module with two void() functions: foo and bar.
-  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
-  ThreadSafeModule M;
-  {
-    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
-    MB.getModule()->setDataLayout(TM->createDataLayout());
-
-    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
-    BasicBlock *FooEntry =
-        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
-    IRBuilder<> B1(FooEntry);
-    B1.CreateRetVoid();
-
-    Function *BarImpl = MB.createFunctionDecl<void()>("bar");
-    BasicBlock *BarEntry =
-        BasicBlock::Create(*TSCtx.getContext(), "entry", BarImpl);
-    IRBuilder<> B2(BarEntry);
-    B2.CreateRetVoid();
-
-    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
-  }
-
-  // Create a simple stack and set the override flags option.
-  ExecutionSession ES;
-  auto &JD = ES.createJITDylib("main");
-  auto Foo = ES.intern("foo");
-  RTDyldObjectLinkingLayer2 ObjLayer(
-      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
-  IRCompileLayer2 CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
-
-  ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
-
-  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
-            [](Error Err) { cantFail(std::move(Err)); },
-            NoDependenciesToRegister);
-}
-
-TEST(RTDyldObjectLinkingLayer2Test, TestAutoClaimResponsibilityForSymbols) {
-
-  OrcNativeTarget::initialize();
-
-  std::unique_ptr<TargetMachine> TM(
-      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
-                                   SmallVector<std::string, 1>()));
-
-  if (!TM)
-    return;
-
-  // Our compiler is going to add a new symbol without telling ORC.
-  // This will test our ability to auto-claim responsibility later.
-  class FunkySimpleCompiler : public SimpleCompiler {
-  public:
-    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
-
-    CompileResult operator()(Module &M) {
-      Function *BarImpl =
-          Function::Create(TypeBuilder<void(), false>::get(M.getContext()),
-                           GlobalValue::ExternalLinkage, "bar", &M);
-      BasicBlock *BarEntry =
-          BasicBlock::Create(M.getContext(), "entry", BarImpl);
-      IRBuilder<> B(BarEntry);
-      B.CreateRetVoid();
-
-      return SimpleCompiler::operator()(M);
-    }
-  };
-
-  // Create a module with two void() functions: foo and bar.
-  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
-  ThreadSafeModule M;
-  {
-    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
-    MB.getModule()->setDataLayout(TM->createDataLayout());
-
-    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
-    BasicBlock *FooEntry =
-        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
-    IRBuilder<> B(FooEntry);
-    B.CreateRetVoid();
-
-    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
-  }
-
-  // Create a simple stack and set the override flags option.
-  ExecutionSession ES;
-  auto &JD = ES.createJITDylib("main");
-  auto Foo = ES.intern("foo");
-  RTDyldObjectLinkingLayer2 ObjLayer(
-      ES, [](VModuleKey) { return llvm::make_unique<SectionMemoryManager>(); });
-  IRCompileLayer2 CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
-
-  ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
-
-  cantFail(CompileLayer.add(JD, ES.allocateVModule(), std::move(M)));
-  ES.lookup({&JD}, {Foo}, [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
-            [](Error Err) { cantFail(std::move(Err)); },
-            NoDependenciesToRegister);
-}
-
-} // end anonymous namespace
diff --git a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index 62c6b7dfa3119b397012ab94ee4c7d48591b9bad..6b1dbe93d5e35b3adb1ddb0a84ea9b43a087eb0c 100644
--- a/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -1,4 +1,4 @@
-//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===//
+//===--- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer tests ---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "OrcTestCommon.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
+#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
 #include "llvm/ExecutionEngine/Orc/Legacy.h"
 #include "llvm/ExecutionEngine/Orc/NullResolver.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
@@ -25,258 +26,204 @@ using namespace llvm::orc;
 namespace {
 
 class RTDyldObjectLinkingLayerExecutionTest : public testing::Test,
-                                              public OrcExecutionTest {
-
-};
-
-class SectionMemoryManagerWrapper : public SectionMemoryManager {
-public:
-  int FinalizationCount = 0;
-  int NeedsToReserveAllocationSpaceCount = 0;
-
-  bool needsToReserveAllocationSpace() override {
-    ++NeedsToReserveAllocationSpaceCount;
-    return SectionMemoryManager::needsToReserveAllocationSpace();
-  }
-
-  bool finalizeMemory(std::string *ErrMsg = nullptr) override {
-    ++FinalizationCount;
-    return SectionMemoryManager::finalizeMemory(ErrMsg);
-  }
-};
+                                               public OrcExecutionTest {};
 
-TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
+// Adds an object with a debug section to RuntimeDyld and then returns whether
+// the debug section was passed to the memory manager.
+static bool testSetProcessAllSections(std::unique_ptr<MemoryBuffer> Obj,
+                                      bool ProcessAllSections) {
   class MemoryManagerWrapper : public SectionMemoryManager {
   public:
     MemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
     uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID,
-                                 StringRef SectionName,
+                                 unsigned SectionID, StringRef SectionName,
                                  bool IsReadOnly) override {
       if (SectionName == ".debug_str")
         DebugSeen = true;
-      return SectionMemoryManager::allocateDataSection(Size, Alignment,
-                                                         SectionID,
-                                                         SectionName,
-                                                         IsReadOnly);
+      return SectionMemoryManager::allocateDataSection(
+          Size, Alignment, SectionID, SectionName, IsReadOnly);
     }
+
   private:
     bool &DebugSeen;
   };
 
   bool DebugSectionSeen = false;
-  auto MM = std::make_shared<MemoryManagerWrapper>(DebugSectionSeen);
 
   ExecutionSession ES;
+  auto &JD = ES.createJITDylib("main");
+  auto Foo = ES.intern("foo");
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey) {
-    return RTDyldObjectLinkingLayer::Resources{
-        MM, std::make_shared<NullResolver>()};
+  RTDyldObjectLinkingLayer ObjLayer(ES, [&DebugSectionSeen]() {
+    return llvm::make_unique<MemoryManagerWrapper>(DebugSectionSeen);
   });
 
+  auto OnResolveDoNothing = [](Expected<SymbolMap> R) {
+    cantFail(std::move(R));
+  };
+
+  auto OnReadyDoNothing = [](Error Err) { cantFail(std::move(Err)); };
+
+  ObjLayer.setProcessAllSections(ProcessAllSections);
+  cantFail(ObjLayer.add(JD, std::move(Obj), ES.allocateVModule()));
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo}, OnResolveDoNothing,
+            OnReadyDoNothing, NoDependenciesToRegister);
+  return DebugSectionSeen;
+}
+
+TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
   LLVMContext Context;
   auto M = llvm::make_unique<Module>("", Context);
   M->setTargetTriple("x86_64-unknown-linux-gnu");
   Type *Int32Ty = IntegerType::get(Context, 32);
   GlobalVariable *GV =
-    new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+      new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
                          ConstantInt::get(Int32Ty, 42), "foo");
 
   GV->setSection(".debug_str");
 
-
   // Initialize the native target in case this is the first unit test
   // to try to build a TM.
   OrcNativeTarget::initialize();
-  std::unique_ptr<TargetMachine> TM(
-    EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "",
-                                 SmallVector<std::string, 1>()));
+  std::unique_ptr<TargetMachine> TM(EngineBuilder().selectTarget(
+      Triple(M->getTargetTriple()), "", "", SmallVector<std::string, 1>()));
   if (!TM)
     return;
 
   auto Obj = SimpleCompiler(*TM)(*M);
 
-  {
-    // Test with ProcessAllSections = false (the default).
-    auto K = ES.allocateVModule();
-    cantFail(ObjLayer.addObject(
-        K, MemoryBuffer::getMemBufferCopy(Obj->getBuffer())));
-    cantFail(ObjLayer.emitAndFinalize(K));
-    EXPECT_EQ(DebugSectionSeen, false)
-      << "Unexpected debug info section";
-    cantFail(ObjLayer.removeObject(K));
-  }
-
-  {
-    // Test with ProcessAllSections = true.
-    ObjLayer.setProcessAllSections(true);
-    auto K = ES.allocateVModule();
-    cantFail(ObjLayer.addObject(K, std::move(Obj)));
-    cantFail(ObjLayer.emitAndFinalize(K));
-    EXPECT_EQ(DebugSectionSeen, true)
-      << "Expected debug info section not seen";
-    cantFail(ObjLayer.removeObject(K));
-  }
+  EXPECT_FALSE(testSetProcessAllSections(
+      MemoryBuffer::getMemBufferCopy(Obj->getBuffer()), false))
+      << "Debug section seen despite ProcessAllSections being false";
+  EXPECT_TRUE(testSetProcessAllSections(std::move(Obj), true))
+      << "Expected to see debug section when ProcessAllSections is true";
 }
 
-TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
-  if (!SupportsJIT)
-    return;
+TEST(RTDyldObjectLinkingLayerTest, TestOverrideObjectFlags) {
 
-  ExecutionSession ES;
+  OrcNativeTarget::initialize();
 
-  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+  std::unique_ptr<TargetMachine> TM(
+      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
+                                   SmallVector<std::string, 1>()));
 
-  std::map<orc::VModuleKey, std::shared_ptr<orc::SymbolResolver>> Resolvers;
+  if (!TM)
+    return;
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&](VModuleKey K) {
-    auto I = Resolvers.find(K);
-    assert(I != Resolvers.end() && "Missing resolver");
-    auto R = std::move(I->second);
-    Resolvers.erase(I);
-    return RTDyldObjectLinkingLayer::Resources{MM, std::move(R)};
-  });
-  SimpleCompiler Compile(*TM);
-
-  // Create a pair of modules that will trigger recursive finalization:
-  // Module 1:
-  //   int bar() { return 42; }
-  // Module 2:
-  //   int bar();
-  //   int foo() { return bar(); }
-  //
-  // Verify that the memory manager is only finalized once (for Module 2).
-  // Failure suggests that finalize is being called on the inner RTDyld
-  // instance (for Module 1) which is unsafe, as it will prevent relocation of
-  // Module 2.
-
-  ModuleBuilder MB1(Context, "", "dummy");
-  {
-    MB1.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("bar");
-    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
-    IRBuilder<> Builder(BarEntry);
-    IntegerType *Int32Ty = IntegerType::get(Context, 32);
-    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
-    Builder.CreateRet(FourtyTwo);
-  }
+  // Our compiler is going to modify symbol visibility settings without telling
+  // ORC. This will test our ability to override the flags later.
+  class FunkySimpleCompiler : public SimpleCompiler {
+  public:
+    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
 
-  auto Obj1 = Compile(*MB1.getModule());
+    CompileResult operator()(Module &M) {
+      auto *Foo = M.getFunction("foo");
+      assert(Foo && "Expected function Foo not found");
+      Foo->setVisibility(GlobalValue::HiddenVisibility);
+      return SimpleCompiler::operator()(M);
+    }
+  };
 
-  ModuleBuilder MB2(Context, "", "dummy");
+  // Create a module with two void() functions: foo and bar.
+  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
+  ThreadSafeModule M;
   {
-    MB2.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarDecl = MB2.createFunctionDecl<int32_t(void)>("bar");
-    Function *FooImpl = MB2.createFunctionDecl<int32_t(void)>("foo");
-    BasicBlock *FooEntry = BasicBlock::Create(Context, "entry", FooImpl);
-    IRBuilder<> Builder(FooEntry);
-    Builder.CreateRet(Builder.CreateCall(BarDecl));
+    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
+    MB.getModule()->setDataLayout(TM->createDataLayout());
+
+    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
+    BasicBlock *FooEntry =
+        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
+    IRBuilder<> B1(FooEntry);
+    B1.CreateRetVoid();
+
+    Function *BarImpl = MB.createFunctionDecl<void()>("bar");
+    BasicBlock *BarEntry =
+        BasicBlock::Create(*TSCtx.getContext(), "entry", BarImpl);
+    IRBuilder<> B2(BarEntry);
+    B2.CreateRetVoid();
+
+    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
   }
-  auto Obj2 = Compile(*MB2.getModule());
 
-  auto K1 = ES.allocateVModule();
-  Resolvers[K1] = std::make_shared<NullResolver>();
-  cantFail(ObjLayer.addObject(K1, std::move(Obj1)));
+  // Create a simple stack and set the override flags option.
+  ExecutionSession ES;
+  auto &JD = ES.createJITDylib("main");
+  auto Foo = ES.intern("foo");
+  RTDyldObjectLinkingLayer ObjLayer(
+      ES, []() { return llvm::make_unique<SectionMemoryManager>(); });
+  IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
 
-  auto K2 = ES.allocateVModule();
-  auto LegacyLookup = [&](const std::string &Name) {
-    return ObjLayer.findSymbol(Name, true);
-  };
+  ObjLayer.setOverrideObjectFlagsWithResponsibilityFlags(true);
 
-  Resolvers[K2] = createSymbolResolver(
-      [&](const SymbolNameSet &Symbols) {
-        return cantFail(
-            getResponsibilitySetWithLegacyFn(Symbols, LegacyLookup));
-      },
-      [&](std::shared_ptr<AsynchronousSymbolQuery> Query,
-          const SymbolNameSet &Symbols) {
-        return lookupWithLegacyFn(ES, *Query, Symbols, LegacyLookup);
-      });
-
-  cantFail(ObjLayer.addObject(K2, std::move(Obj2)));
-  cantFail(ObjLayer.emitAndFinalize(K2));
-  cantFail(ObjLayer.removeObject(K2));
-
-  // Finalization of module 2 should trigger finalization of module 1.
-  // Verify that finalize on SMMW is only called once.
-  EXPECT_EQ(MM->FinalizationCount, 1)
-      << "Extra call to finalize";
+  cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo},
+            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+            [](Error Err) { cantFail(std::move(Err)); },
+            NoDependenciesToRegister);
 }
 
-TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
-  if (!SupportsJIT)
-    return;
+TEST(RTDyldObjectLinkingLayerTest, TestAutoClaimResponsibilityForSymbols) {
 
-  ExecutionSession ES;
+  OrcNativeTarget::initialize();
 
-  auto MM = std::make_shared<SectionMemoryManagerWrapper>();
+  std::unique_ptr<TargetMachine> TM(
+      EngineBuilder().selectTarget(Triple("x86_64-unknown-linux-gnu"), "", "",
+                                   SmallVector<std::string, 1>()));
 
-  RTDyldObjectLinkingLayer ObjLayer(ES, [&MM](VModuleKey K) {
-    return RTDyldObjectLinkingLayer::Resources{
-        MM, std::make_shared<NullResolver>()};
-  });
-  SimpleCompiler Compile(*TM);
-
-  // Create a pair of unrelated modules:
-  //
-  // Module 1:
-  //   int foo() { return 42; }
-  // Module 2:
-  //   int bar() { return 7; }
-  //
-  // Both modules will share a memory manager. We want to verify that the
-  // second object is not loaded before the first one is finalized. To do this
-  // in a portable way, we abuse the
-  // RuntimeDyld::MemoryManager::needsToReserveAllocationSpace hook, which is
-  // called once per object before any sections are allocated.
-
-  ModuleBuilder MB1(Context, "", "dummy");
-  {
-    MB1.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarImpl = MB1.createFunctionDecl<int32_t(void)>("foo");
-    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
-    IRBuilder<> Builder(BarEntry);
-    IntegerType *Int32Ty = IntegerType::get(Context, 32);
-    Value *FourtyTwo = ConstantInt::getSigned(Int32Ty, 42);
-    Builder.CreateRet(FourtyTwo);
-  }
+  if (!TM)
+    return;
 
-  auto Obj1 = Compile(*MB1.getModule());
+  // Our compiler is going to add a new symbol without telling ORC.
+  // This will test our ability to auto-claim responsibility later.
+  class FunkySimpleCompiler : public SimpleCompiler {
+  public:
+    FunkySimpleCompiler(TargetMachine &TM) : SimpleCompiler(TM) {}
+
+    CompileResult operator()(Module &M) {
+      Function *BarImpl =
+          Function::Create(TypeBuilder<void(), false>::get(M.getContext()),
+                           GlobalValue::ExternalLinkage, "bar", &M);
+      BasicBlock *BarEntry =
+          BasicBlock::Create(M.getContext(), "entry", BarImpl);
+      IRBuilder<> B(BarEntry);
+      B.CreateRetVoid();
+
+      return SimpleCompiler::operator()(M);
+    }
+  };
 
-  ModuleBuilder MB2(Context, "", "dummy");
+  // Create a module with two void() functions: foo and bar.
+  ThreadSafeContext TSCtx(llvm::make_unique<LLVMContext>());
+  ThreadSafeModule M;
   {
-    MB2.getModule()->setDataLayout(TM->createDataLayout());
-    Function *BarImpl = MB2.createFunctionDecl<int32_t(void)>("bar");
-    BasicBlock *BarEntry = BasicBlock::Create(Context, "entry", BarImpl);
-    IRBuilder<> Builder(BarEntry);
-    IntegerType *Int32Ty = IntegerType::get(Context, 32);
-    Value *Seven = ConstantInt::getSigned(Int32Ty, 7);
-    Builder.CreateRet(Seven);
+    ModuleBuilder MB(*TSCtx.getContext(), TM->getTargetTriple().str(), "dummy");
+    MB.getModule()->setDataLayout(TM->createDataLayout());
+
+    Function *FooImpl = MB.createFunctionDecl<void()>("foo");
+    BasicBlock *FooEntry =
+        BasicBlock::Create(*TSCtx.getContext(), "entry", FooImpl);
+    IRBuilder<> B(FooEntry);
+    B.CreateRetVoid();
+
+    M = ThreadSafeModule(MB.takeModule(), std::move(TSCtx));
   }
-  auto Obj2 = Compile(*MB2.getModule());
-
-  auto K = ES.allocateVModule();
-  cantFail(ObjLayer.addObject(K, std::move(Obj1)));
-  cantFail(ObjLayer.addObject(ES.allocateVModule(), std::move(Obj2)));
-  cantFail(ObjLayer.emitAndFinalize(K));
-  cantFail(ObjLayer.removeObject(K));
-
-  // Only one call to needsToReserveAllocationSpace should have been made.
-  EXPECT_EQ(MM->NeedsToReserveAllocationSpaceCount, 1)
-      << "More than one call to needsToReserveAllocationSpace "
-         "(multiple unrelated objects loaded prior to finalization)";
-}
 
-TEST_F(RTDyldObjectLinkingLayerExecutionTest, TestNotifyLoadedSignature) {
+  // Create a simple stack and set the override flags option.
   ExecutionSession ES;
+  auto &JD = ES.createJITDylib("main");
+  auto Foo = ES.intern("foo");
   RTDyldObjectLinkingLayer ObjLayer(
-      ES,
-      [](VModuleKey) {
-        return RTDyldObjectLinkingLayer::Resources{
-            nullptr, std::make_shared<NullResolver>()};
-      },
-      [](VModuleKey, const object::ObjectFile &obj,
-         const RuntimeDyld::LoadedObjectInfo &info) {});
+      ES, []() { return llvm::make_unique<SectionMemoryManager>(); });
+  IRCompileLayer CompileLayer(ES, ObjLayer, FunkySimpleCompiler(*TM));
+
+  ObjLayer.setAutoClaimResponsibilityForObjectSymbols(true);
+
+  cantFail(CompileLayer.add(JD, std::move(M), ES.allocateVModule()));
+  ES.lookup(JITDylibSearchList({{&JD, false}}), {Foo},
+            [](Expected<SymbolMap> R) { cantFail(std::move(R)); },
+            [](Error Err) { cantFail(std::move(Err)); },
+            NoDependenciesToRegister);
 }
 
 } // end anonymous namespace
diff --git a/unittests/IR/CMakeLists.txt b/unittests/IR/CMakeLists.txt
index 211ab109131cf38ae7a07adc6b8d7a57f4c05b3a..7498983b2609bb25b1f82ba2a892a090f844767d 100644
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@@ -40,3 +40,5 @@ add_llvm_unittest(IRTests
   VerifierTest.cpp
   WaymarkTest.cpp
   )
+
+target_link_libraries(IRTests PRIVATE LLVMTestingSupport)
diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index cf81623d0d1730594bf05f2768f5af43652c98aa..7539bbc860bd11329f00bbf0013b558bedd1196b 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@@ -301,7 +301,7 @@ TEST(DominatorTree, NonUniqueEdges) {
         BasicBlock *BB1 = &*FI++;
         BasicBlock *BB2 = &*FI++;
 
-        const TerminatorInst *TI = BB0->getTerminator();
+        const Instruction *TI = BB0->getTerminator();
         assert(TI->getNumSuccessors() == 3 && "Switch has three successors");
 
         BasicBlockEdge Edge_BB0_BB2(BB0, TI->getSuccessor(0));
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index 42c0393d38213dcf534274768fde5ca937e5ca55..be29b41309a440ffcbefa77de37192047ccf8a34 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -68,6 +68,14 @@ TEST_F(IRBuilderTest, Intrinsics) {
   II = cast<IntrinsicInst>(Call);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maxnum);
 
+  Call = Builder.CreateMinimum(V, V);
+  II = cast<IntrinsicInst>(Call);
+  EXPECT_EQ(II->getIntrinsicID(), Intrinsic::minimum);
+
+  Call = Builder.CreateMaximum(V, V);
+  II = cast<IntrinsicInst>(Call);
+  EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maximum);
+
   Call = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {});
   II = cast<IntrinsicInst>(Call);
   EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter);
@@ -152,7 +160,7 @@ TEST_F(IRBuilderTest, CreateCondBr) {
   BasicBlock *FBB = BasicBlock::Create(Ctx, "", F);
 
   BranchInst *BI = Builder.CreateCondBr(Builder.getTrue(), TBB, FBB);
-  TerminatorInst *TI = BB->getTerminator();
+  Instruction *TI = BB->getTerminator();
   EXPECT_EQ(BI, TI);
   EXPECT_EQ(2u, TI->getNumSuccessors());
   EXPECT_EQ(TBB, TI->getSuccessor(0));
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 83b166a263cfcfc5965168043e268e3a83aecd80..100c4ed5e159078a7b2ca49631286160450544b4 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -402,6 +402,27 @@ TEST_F(MDNodeTest, PrintFromMetadataAsValue) {
   EXPECT_PRINTER_EQ("metadata !0", MAV0->printAsOperand(OS, true, MST));
   EXPECT_PRINTER_EQ("metadata !1", MAV1->printAsOperand(OS, true, MST));
 }
+
+TEST_F(MDNodeTest, PrintWithDroppedCallOperand) {
+  Module M("test", Context);
+
+  auto *FTy = FunctionType::get(Type::getVoidTy(Context), false);
+  auto *F0 = Function::Create(FTy, GlobalValue::ExternalLinkage, "F0", &M);
+  auto *F1 = Function::Create(FTy, GlobalValue::ExternalLinkage, "F1", &M);
+  auto *BB0 = BasicBlock::Create(Context, "entry", F0);
+
+  CallInst *CI0 = CallInst::Create(F1, "", BB0);
+  CI0->dropAllReferences();
+
+  auto *R0 = ReturnInst::Create(Context, BB0);
+  auto *N0 = MDNode::getDistinct(Context, None);
+  R0->setMetadata("md", N0);
+
+  // Printing the metadata node would previously result in a failed assertion
+  // due to the call instruction's dropped function operand.
+  ModuleSlotTracker MST(&M);
+  EXPECT_PRINTER_EQ("!0 = distinct !{}", N0->print(OS, MST));
+}
 #undef EXPECT_PRINTER_EQ
 
 TEST_F(MDNodeTest, NullOperand) {
diff --git a/unittests/IR/PassBuilderCallbacksTest.cpp b/unittests/IR/PassBuilderCallbacksTest.cpp
index 97bbb81a6b013b58babc701e0accc86552ecf3d4..20c47b045e75cbc67bb11e8f3eb3e46d24192a8d 100644
--- a/unittests/IR/PassBuilderCallbacksTest.cpp
+++ b/unittests/IR/PassBuilderCallbacksTest.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Testing/Support/Error.h"
 #include <functional>
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -460,7 +461,7 @@ TEST_F(ModuleCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -494,7 +495,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -525,7 +526,7 @@ TEST_F(ModuleCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
 
   PM.run(*M, AM);
@@ -537,7 +538,7 @@ TEST_F(FunctionCallbacksTest, Passes) {
       .WillOnce(Invoke(getAnalysisResult));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -571,7 +572,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -604,7 +605,7 @@ TEST_F(FunctionCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -615,7 +616,7 @@ TEST_F(LoopCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -650,7 +651,7 @@ TEST_F(LoopCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -682,7 +683,7 @@ TEST_F(LoopCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -693,7 +694,7 @@ TEST_F(CGSCCCallbacksTest, Passes) {
       .WillOnce(WithArgs<0, 1, 2>(Invoke(getAnalysisResult)));
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -727,7 +728,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedPasses) {
       .InSequence(PISequence);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -759,7 +760,7 @@ TEST_F(CGSCCCallbacksTest, InstrumentedSkippedPasses) {
       .Times(0);
 
   StringRef PipelineText = "test-transform";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -774,7 +775,7 @@ TEST_F(ModuleCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("<string>"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -784,7 +785,7 @@ TEST_F(CGSCCCallbacksTest, PassUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("(foo)"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -794,7 +795,7 @@ TEST_F(FunctionCallbacksTest, AnalysisUtilities) {
   EXPECT_CALL(AnalysisHandle, invalidate(HasName("foo"), _, _));
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -805,7 +806,7 @@ TEST_F(LoopCallbacksTest, PassUtilities) {
 
   StringRef PipelineText = "require<test-analysis>,invalidate<test-analysis>";
 
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 }
@@ -845,13 +846,13 @@ TEST_F(ModuleCallbacksTest, ParseTopLevelPipeline) {
 
   StringRef PipelineText =
       "another-pipeline(test-transform,invalidate<test-analysis>)";
-  ASSERT_TRUE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Succeeded())
       << "Pipeline was: " << PipelineText;
   PM.run(*M, AM);
 
   /// Test the negative case
   PipelineText = "another-pipeline(instcombine)";
-  ASSERT_FALSE(PB.parsePassPipeline(PM, PipelineText, true))
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, PipelineText, true), Failed())
       << "Pipeline was: " << PipelineText;
 }
 } // end anonymous namespace
diff --git a/unittests/MI/LiveIntervalTest.cpp b/unittests/MI/LiveIntervalTest.cpp
index a39fd7f73cf4349117f9f85099276f2dff96ddd0..5ee9d13dbd9234881ec45a8ebb3b613c5e92acac 100644
--- a/unittests/MI/LiveIntervalTest.cpp
+++ b/unittests/MI/LiveIntervalTest.cpp
@@ -35,7 +35,7 @@ void initLLVM() {
 /// Create a TargetMachine. As we lack a dedicated always available target for
 /// unittests, we go for "AMDGPU" to be able to test normal and subregister
 /// liveranges.
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   Triple TargetTriple("amdgcn--");
   std::string Error;
   const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -43,13 +43,14 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
     return nullptr;
 
   TargetOptions Options;
-  return std::unique_ptr<TargetMachine>(T->createTargetMachine(
-      "AMDGPU", "", "", Options, None, None, CodeGenOpt::Aggressive));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      T->createTargetMachine("AMDGPU", "", "", Options, None, None,
+                             CodeGenOpt::Aggressive)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
     legacy::PassManagerBase &PM, std::unique_ptr<MIRParser> &MIR,
-    const TargetMachine &TM, StringRef MIRCode, const char *FuncName) {
+    const LLVMTargetMachine &TM, StringRef MIRCode, const char *FuncName) {
   SMDiagnostic Diagnostic;
   std::unique_ptr<MemoryBuffer> MBuffer = MemoryBuffer::getMemBuffer(MIRCode);
   MIR = createMIRParser(std::move(MBuffer), Context);
@@ -128,7 +129,7 @@ static void testHandleMove(MachineFunction &MF, LiveIntervals &LIS,
 
 static void liveIntervalTest(StringRef MIRFunc, LiveIntervalTest T) {
   LLVMContext Context;
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   // This test is designed for the X86 backend; stop if it is not available.
   if (!TM)
     return;
diff --git a/unittests/OptRemarks/CMakeLists.txt b/unittests/OptRemarks/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..94c74867cc4047d9c13140ac72bc50db9d741153
--- /dev/null
+++ b/unittests/OptRemarks/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(LLVM_LINK_COMPONENTS
+  OptRemarks
+  Support
+  )
+
+add_llvm_unittest(OptRemarksTests
+  OptRemarksParsingTest.cpp
+  )
diff --git a/unittests/OptRemarks/OptRemarksParsingTest.cpp b/unittests/OptRemarks/OptRemarksParsingTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3b28f038b5033c87ca0431abee84d0e0c253b09
--- /dev/null
+++ b/unittests/OptRemarks/OptRemarksParsingTest.cpp
@@ -0,0 +1,433 @@
+//===- unittest/Support/OptRemarksParsingTest.cpp - OptTable tests --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/OptRemarks.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+template <size_t N> bool tryParse(const char (&Buf)[N]) {
+  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
+    EXPECT_TRUE(Remark == nullptr); // Only one remark per test.
+    Remark = NewRemark;
+  }
+  EXPECT_TRUE(Remark != nullptr); // We need *exactly* one remark per test.
+  bool HasError = LLVMOptRemarkParserHasError(Parser);
+  LLVMOptRemarkParserDispose(Parser);
+  return !HasError;
+}
+
+template <size_t N>
+bool parseExpectError(const char (&Buf)[N], const char *Error) {
+  LLVMOptRemarkParserRef Parser = LLVMOptRemarkParserCreate(Buf, N - 1);
+  LLVMOptRemarkEntry *Remark = nullptr;
+  while (LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser)) {
+    EXPECT_FALSE(NewRemark);
+  }
+  EXPECT_TRUE(Remark == nullptr); // We are parsing only one malformed remark.
+  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
+  bool MatchesError =
+      StringRef(LLVMOptRemarkParserGetErrorMessage(Parser)).contains(Error);
+  LLVMOptRemarkParserDispose(Parser);
+
+  return MatchesError;
+}
+
+TEST(OptRemarks, OptRemarksParsingEmpty) {
+  StringRef Buf = "\n"
+                  "\n";
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
+  LLVMOptRemarkEntry *NewRemark = LLVMOptRemarkParserGetNext(Parser);
+  EXPECT_TRUE(NewRemark == nullptr); // No remark expected.
+  EXPECT_TRUE(LLVMOptRemarkParserHasError(Parser));
+  EXPECT_TRUE(StringRef(LLVMOptRemarkParserGetErrorMessage(Parser))
+                  .contains("document root is not of mapping type."));
+  LLVMOptRemarkParserDispose(Parser);
+}
+
+TEST(OptRemarks, OptRemarksParsingGood) {
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+""));
+
+  // No debug loc should also pass.
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+""));
+
+  // No args is also ok.
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n"
+"Function: foo\n"
+""));
+
+  // Different order.
+  EXPECT_TRUE(tryParse("\n"
+"--- !Missed\n"
+"DebugLoc: { Line: 3, Column: 12, File: file.c }\n"
+"Function: foo\n"
+"Name: NoDefinition\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+"Pass: inline\n"
+""));
+}
+
+// Mandatory common part of a remark.
+#define COMMON_REMARK "\nPass: inline\nName: NoDefinition\nFunction: foo\n\n"
+// Test all the types.
+TEST(OptRemarks, OptRemarksParsingTypes) {
+  // Type: Passed
+  EXPECT_TRUE(tryParse("--- !Passed" COMMON_REMARK));
+  // Type: Missed
+  EXPECT_TRUE(tryParse("--- !Missed" COMMON_REMARK));
+  // Type: Analysis
+  EXPECT_TRUE(tryParse("--- !Analysis" COMMON_REMARK));
+  // Type: AnalysisFPCompute
+  EXPECT_TRUE(tryParse("--- !AnalysisFPCompute" COMMON_REMARK));
+  // Type: AnalysisAliasing
+  EXPECT_TRUE(tryParse("--- !AnalysisAliasing" COMMON_REMARK));
+  // Type: Failure
+  EXPECT_TRUE(tryParse("--- !Failure" COMMON_REMARK));
+}
+#undef COMMON_REMARK
+
+TEST(OptRemarks, OptRemarksParsingMissingFields) {
+  // No type.
+  EXPECT_TRUE(parseExpectError("\n"
+"---\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // No pass.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // No name.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Function: foo\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // No function.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"",
+                               "error: Type, Pass, Name or Function missing."));
+  // Debug loc but no file.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { Line: 3, Column: 12 }\n"
+"",
+                               "DebugLoc node incomplete."));
+  // Debug loc but no line.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12 }\n"
+"",
+                               "DebugLoc node incomplete."));
+  // Debug loc but no column.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Line: 3 }\n"
+"",
+                               "DebugLoc node incomplete."));
+}
+
+TEST(OptRemarks, OptRemarksParsingWrongTypes) {
+  // Wrong debug loc type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: foo\n"
+"",
+                               "expected a value of mapping type."));
+  // Wrong line type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Line: b, Column: 12 }\n"
+"",
+                               "expected a value of integer type."));
+  // Wrong column type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Line: 3, Column: c }\n"
+"",
+                               "expected a value of integer type."));
+  // Wrong args type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args: foo\n"
+"",
+                               "wrong value type for key."));
+  // Wrong key type.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"{ A: a }: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "key is not a string."));
+  // Debug loc with unknown entry.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12, Unknown: 12 }\n"
+"",
+                               "unknown entry in DebugLoc map."));
+  // Unknown entry.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Unknown: inline\n"
+"",
+                               "unknown key."));
+  // Not a scalar.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: { File: a, Line: 1, Column: 2 }\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a string file in debug loc.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: { a: b }, Column: 12, Line: 12 }\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a integer column in debug loc.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: { a: b }, Line: 12 }\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a integer line in debug loc.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12, Line: { a: b } }\n"
+"",
+                               "expected a value of scalar type."));
+  // Not a mapping type value for args.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"DebugLoc: { File: file.c, Column: 12, Line: { a: b } }\n"
+"",
+                               "expected a value of scalar type."));
+}
+
+TEST(OptRemarks, OptRemarksParsingWrongArgs) {
+  // Multiple debug locs per arg.
+  EXPECT_TRUE(
+      parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Str: string\n"
+"    DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"    DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                       "only one DebugLoc entry is allowed per argument."));
+  // Multiple strings per arg.
+  EXPECT_TRUE(
+      parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Str: string\n"
+"    Str2: string\n"
+"    DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                       "only one string entry is allowed per argument."));
+  // No arg value.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: ''\n"
+"  - DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                               "argument value is missing."));
+  // No arg value.
+  EXPECT_TRUE(parseExpectError("\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"Function: foo\n"
+"Args:\n"
+"  - DebugLoc: { File: a, Line: 1, Column: 2 }\n"
+"",
+                               "argument key is missing."));
+
+}
+
+TEST(OptRemarks, OptRemarksGoodStruct) {
+  StringRef Buf = "\n"
+"--- !Missed\n"
+"Pass: inline\n"
+"Name: NoDefinition\n"
+"DebugLoc: { File: file.c, Line: 3, Column: 12 }\n"
+"Function: foo\n"
+"Args:\n"
+"  - Callee: bar\n"
+"  - String: ' will not be inlined into '\n"
+"  - Caller: foo\n"
+"    DebugLoc: { File: file.c, Line: 2, Column: 0 }\n"
+"  - String: ' because its definition is unavailable'\n"
+"\n";
+
+  LLVMOptRemarkParserRef Parser =
+      LLVMOptRemarkParserCreate(Buf.data(), Buf.size());
+  LLVMOptRemarkEntry *Remark = LLVMOptRemarkParserGetNext(Parser);
+  EXPECT_FALSE(Remark == nullptr);
+  EXPECT_EQ(StringRef(Remark->RemarkType.Str, 7), "!Missed");
+  EXPECT_EQ(Remark->RemarkType.Len, 7U);
+  EXPECT_EQ(StringRef(Remark->PassName.Str, 6), "inline");
+  EXPECT_EQ(Remark->PassName.Len, 6U);
+  EXPECT_EQ(StringRef(Remark->RemarkName.Str, 12), "NoDefinition");
+  EXPECT_EQ(Remark->RemarkName.Len, 12U);
+  EXPECT_EQ(StringRef(Remark->FunctionName.Str, 3), "foo");
+  EXPECT_EQ(Remark->FunctionName.Len, 3U);
+  EXPECT_EQ(StringRef(Remark->DebugLoc.SourceFile.Str, 6), "file.c");
+  EXPECT_EQ(Remark->DebugLoc.SourceFile.Len, 6U);
+  EXPECT_EQ(Remark->DebugLoc.SourceLineNumber, 3U);
+  EXPECT_EQ(Remark->DebugLoc.SourceColumnNumber, 12U);
+  EXPECT_EQ(Remark->Hotness, 0U);
+  EXPECT_EQ(Remark->NumArgs, 4U);
+  // Arg 0
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[0];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Callee");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "bar");
+    EXPECT_EQ(Arg.Value.Len, 3U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 1
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[1];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 26), " will not be inlined into ");
+    EXPECT_EQ(Arg.Value.Len, 26U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 2
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[2];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "Caller");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 3), "foo");
+    EXPECT_EQ(Arg.Value.Len, 3U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 6), "file.c");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 6U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 2U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+  // Arg 3
+  {
+    LLVMOptRemarkArg &Arg = Remark->Args[3];
+    EXPECT_EQ(StringRef(Arg.Key.Str, 6), "String");
+    EXPECT_EQ(Arg.Key.Len, 6U);
+    EXPECT_EQ(StringRef(Arg.Value.Str, 38),
+              " because its definition is unavailable");
+    EXPECT_EQ(Arg.Value.Len, 38U);
+    EXPECT_EQ(StringRef(Arg.DebugLoc.SourceFile.Str, 0), "");
+    EXPECT_EQ(Arg.DebugLoc.SourceFile.Len, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceLineNumber, 0U);
+    EXPECT_EQ(Arg.DebugLoc.SourceColumnNumber, 0U);
+  }
+
+  EXPECT_EQ(LLVMOptRemarkParserGetNext(Parser), nullptr);
+
+  EXPECT_FALSE(LLVMOptRemarkParserHasError(Parser));
+  LLVMOptRemarkParserDispose(Parser);
+}
diff --git a/unittests/Passes/CMakeLists.txt b/unittests/Passes/CMakeLists.txt
index d90df209d4ec5aa52a3aa9c9bdf954833db3e6a1..415f3a71734b053e1c2215864b4773d94a52a7b8 100644
--- a/unittests/Passes/CMakeLists.txt
+++ b/unittests/Passes/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_unittest(PluginsTests
   PluginsTest.cpp
   )
 export_executable_symbols(PluginsTests)
+target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport)
 
 set(LLVM_LINK_COMPONENTS)
 add_llvm_loadable_module(TestPlugin
diff --git a/unittests/Passes/PluginsTest.cpp b/unittests/Passes/PluginsTest.cpp
index 726978714e87d5c697e36eab6c29133c20b7f6a8..abb7b57ee0c60fdb972836e497db0d47b513e327 100644
--- a/unittests/Passes/PluginsTest.cpp
+++ b/unittests/Passes/PluginsTest.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Testing/Support/Error.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "gtest/gtest.h"
 
@@ -54,8 +55,8 @@ TEST(PluginsTests, LoadPlugin) {
 
   PassBuilder PB;
   ModulePassManager PM;
-  ASSERT_FALSE(PB.parsePassPipeline(PM, "plugin-pass"));
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Failed());
 
   Plugin->registerPassBuilderCallbacks(PB);
-  ASSERT_TRUE(PB.parsePassPipeline(PM, "plugin-pass"));
+  ASSERT_THAT_ERROR(PB.parsePassPipeline(PM, "plugin-pass"), Succeeded());
 }
diff --git a/unittests/ProfileData/InstrProfTest.cpp b/unittests/ProfileData/InstrProfTest.cpp
index 0c99f7fde654de33476936698fab97ce9f4c43d0..2d915d4459824227c36ba5c44f488f895629f496 100644
--- a/unittests/ProfileData/InstrProfTest.cpp
+++ b/unittests/ProfileData/InstrProfTest.cpp
@@ -42,8 +42,10 @@ struct InstrProfTest : ::testing::Test {
 
   void SetUp() { Writer.setOutputSparse(false); }
 
-  void readProfile(std::unique_ptr<MemoryBuffer> Profile) {
-    auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile));
+  void readProfile(std::unique_ptr<MemoryBuffer> Profile,
+                   std::unique_ptr<MemoryBuffer> Remapping = nullptr) {
+    auto ReaderOrErr = IndexedInstrProfReader::create(std::move(Profile),
+                                                      std::move(Remapping));
     EXPECT_THAT_ERROR(ReaderOrErr.takeError(), Succeeded());
     Reader = std::move(ReaderOrErr.get());
   }
@@ -990,6 +992,44 @@ TEST_P(MaybeSparseInstrProfTest, instr_prof_symtab_compression_test) {
   }
 }
 
+TEST_P(MaybeSparseInstrProfTest, remapping_test) {
+  Writer.addRecord({"_Z3fooi", 0x1234, {1, 2, 3, 4}}, Err);
+  Writer.addRecord({"file:_Z3barf", 0x567, {5, 6, 7}}, Err);
+  auto Profile = Writer.writeBuffer();
+  readProfile(std::move(Profile), llvm::MemoryBuffer::getMemBuffer(R"(
+    type i l
+    name 3bar 4quux
+  )"));
+
+  std::vector<uint64_t> Counts;
+  for (StringRef FooName : {"_Z3fooi", "_Z3fool"}) {
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(FooName, 0x1234, Counts),
+                      Succeeded());
+    ASSERT_EQ(4u, Counts.size());
+    EXPECT_EQ(1u, Counts[0]);
+    EXPECT_EQ(2u, Counts[1]);
+    EXPECT_EQ(3u, Counts[2]);
+    EXPECT_EQ(4u, Counts[3]);
+  }
+
+  for (StringRef BarName : {"file:_Z3barf", "file:_Z4quuxf"}) {
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(BarName, 0x567, Counts),
+                      Succeeded());
+    ASSERT_EQ(3u, Counts.size());
+    EXPECT_EQ(5u, Counts[0]);
+    EXPECT_EQ(6u, Counts[1]);
+    EXPECT_EQ(7u, Counts[2]);
+  }
+
+  for (StringRef BadName : {"_Z3foof", "_Z4quuxi", "_Z3barl", "", "_ZZZ",
+                            "_Z3barf", "otherfile:_Z4quuxf"}) {
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(BadName, 0x1234, Counts),
+                      Failed());
+    EXPECT_THAT_ERROR(Reader->getFunctionCounts(BadName, 0x567, Counts),
+                      Failed());
+  }
+}
+
 TEST_F(SparseInstrProfTest, preserve_no_records) {
   Writer.addRecord({"foo", 0x1234, {0}}, Err);
   Writer.addRecord({"bar", 0x4321, {0, 0}}, Err);
diff --git a/unittests/ProfileData/SampleProfTest.cpp b/unittests/ProfileData/SampleProfTest.cpp
index 73e8088b638642bc8768e2239d43d5ae8db76ad3..67e6e9fc95b958634ae4e04aa17c104cbddae5e3 100644
--- a/unittests/ProfileData/SampleProfTest.cpp
+++ b/unittests/ProfileData/SampleProfTest.cpp
@@ -58,7 +58,7 @@ struct SampleProfTest : ::testing::Test {
     Reader->collectFuncsToUse(M);
   }
 
-  void testRoundTrip(SampleProfileFormat Format) {
+  void testRoundTrip(SampleProfileFormat Format, bool Remap) {
     SmallVector<char, 128> ProfilePath;
     ASSERT_TRUE(NoError(llvm::sys::fs::createTemporaryFile("profile", "", ProfilePath)));
     StringRef Profile(ProfilePath.data(), ProfilePath.size());
@@ -108,22 +108,35 @@ struct SampleProfTest : ::testing::Test {
     EC = Reader->read();
     ASSERT_TRUE(NoError(EC));
 
-    StringMap<FunctionSamples> &ReadProfiles = Reader->getProfiles();
-    ASSERT_EQ(2u, ReadProfiles.size());
-
-    std::string FooGUID;
-    StringRef FooRep = getRepInFormat(FooName, Format, FooGUID);
-    FunctionSamples &ReadFooSamples = ReadProfiles[FooRep];
-    ASSERT_EQ(7711u, ReadFooSamples.getTotalSamples());
-    ASSERT_EQ(610u, ReadFooSamples.getHeadSamples());
-
-    std::string BarGUID;
-    StringRef BarRep = getRepInFormat(BarName, Format, BarGUID);
-    FunctionSamples &ReadBarSamples = ReadProfiles[BarRep];
-    ASSERT_EQ(20301u, ReadBarSamples.getTotalSamples());
-    ASSERT_EQ(1437u, ReadBarSamples.getHeadSamples());
+    if (Remap) {
+      auto MemBuffer = llvm::MemoryBuffer::getMemBuffer(R"(
+        # Types 'int' and 'long' are equivalent
+        type i l
+        # Function names 'foo' and 'faux' are equivalent
+        name 3foo 4faux
+      )");
+      Reader.reset(new SampleProfileReaderItaniumRemapper(
+          std::move(MemBuffer), Context, std::move(Reader)));
+      FooName = "_Z4fauxi";
+      BarName = "_Z3barl";
+
+      EC = Reader->read();
+      ASSERT_TRUE(NoError(EC));
+    }
+
+    ASSERT_EQ(2u, Reader->getProfiles().size());
+
+    FunctionSamples *ReadFooSamples = Reader->getSamplesFor(FooName);
+    ASSERT_TRUE(ReadFooSamples != nullptr);
+    ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples());
+    ASSERT_EQ(610u, ReadFooSamples->getHeadSamples());
+
+    FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName);
+    ASSERT_TRUE(ReadBarSamples != nullptr);
+    ASSERT_EQ(20301u, ReadBarSamples->getTotalSamples());
+    ASSERT_EQ(1437u, ReadBarSamples->getHeadSamples());
     ErrorOr<SampleRecord::CallTargetMap> CTMap =
-        ReadBarSamples.findCallTargetMapAt(1, 0);
+        ReadBarSamples->findCallTargetMapAt(1, 0);
     ASSERT_FALSE(CTMap.getError());
 
     std::string MconstructGUID;
@@ -184,15 +197,23 @@ struct SampleProfTest : ::testing::Test {
 };
 
 TEST_F(SampleProfTest, roundtrip_text_profile) {
-  testRoundTrip(SampleProfileFormat::SPF_Text);
+  testRoundTrip(SampleProfileFormat::SPF_Text, false);
 }
 
 TEST_F(SampleProfTest, roundtrip_raw_binary_profile) {
-  testRoundTrip(SampleProfileFormat::SPF_Binary);
+  testRoundTrip(SampleProfileFormat::SPF_Binary, false);
 }
 
 TEST_F(SampleProfTest, roundtrip_compact_binary_profile) {
-  testRoundTrip(SampleProfileFormat::SPF_Compact_Binary);
+  testRoundTrip(SampleProfileFormat::SPF_Compact_Binary, false);
+}
+
+TEST_F(SampleProfTest, remap_text_profile) {
+  testRoundTrip(SampleProfileFormat::SPF_Text, true);
+}
+
+TEST_F(SampleProfTest, remap_raw_binary_profile) {
+  testRoundTrip(SampleProfileFormat::SPF_Binary, true);
 }
 
 TEST_F(SampleProfTest, sample_overflow_saturation) {
diff --git a/unittests/Support/JSONTest.cpp b/unittests/Support/JSONTest.cpp
index 64a2bb97bd8bef9ec6688c25290b5182f6d4fa01..9f2d47b9aa980be765cb60254eb417654fefafaa 100644
--- a/unittests/Support/JSONTest.cpp
+++ b/unittests/Support/JSONTest.cpp
@@ -47,6 +47,8 @@ TEST(JSONTest, Constructors) {
             s(Object{{"A", Object{{"B", Object{{"X", "Y"}}}}}}));
   EXPECT_EQ("null", s(llvm::Optional<double>()));
   EXPECT_EQ("2.5", s(llvm::Optional<double>(2.5)));
+  EXPECT_EQ("[[2.5,null]]", s(std::vector<std::vector<llvm::Optional<double>>>{
+                                 {2.5, llvm::None}}));
 }
 
 TEST(JSONTest, StringOwnership) {
diff --git a/unittests/Support/VirtualFileSystemTest.cpp b/unittests/Support/VirtualFileSystemTest.cpp
index 58d928516f9d39fbc7ee2bf44eb2f6e419336628..466cd117a507fbabc52598622b38c7567e8b6c8b 100644
--- a/unittests/Support/VirtualFileSystemTest.cpp
+++ b/unittests/Support/VirtualFileSystemTest.cpp
@@ -478,6 +478,85 @@ TEST(VirtualFileSystemTest, BasicRealFSRecursiveIteration) {
   EXPECT_EQ(1, Counts[3]); // d
 }
 
+TEST(VirtualFileSystemTest, BasicRealFSRecursiveIterationNoPush) {
+  ScopedDir TestDirectory("virtual-file-system-test", /*Unique*/ true);
+
+  ScopedDir _a(TestDirectory + "/a");
+  ScopedDir _ab(TestDirectory + "/a/b");
+  ScopedDir _c(TestDirectory + "/c");
+  ScopedDir _cd(TestDirectory + "/c/d");
+  ScopedDir _e(TestDirectory + "/e");
+  ScopedDir _ef(TestDirectory + "/e/f");
+  ScopedDir _g(TestDirectory + "/g");
+
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = vfs::getRealFileSystem();
+
+  // Test that calling no_push on entries without subdirectories has no effect.
+  {
+    std::error_code EC;
+    auto I = vfs::recursive_directory_iterator(*FS, Twine(TestDirectory), EC);
+    ASSERT_FALSE(EC);
+
+    std::vector<std::string> Contents;
+    for (auto E = vfs::recursive_directory_iterator(); !EC && I != E;
+         I.increment(EC)) {
+      Contents.push_back(I->path());
+      char last = I->path().back();
+      switch (last) {
+      case 'b':
+      case 'd':
+      case 'f':
+      case 'g':
+        I.no_push();
+        break;
+      default:
+        break;
+      }
+    }
+    EXPECT_EQ(7U, Contents.size());
+  }
+
+  // Test that calling no_push skips subdirectories.
+  {
+    std::error_code EC;
+    auto I = vfs::recursive_directory_iterator(*FS, Twine(TestDirectory), EC);
+    ASSERT_FALSE(EC);
+
+    std::vector<std::string> Contents;
+    for (auto E = vfs::recursive_directory_iterator(); !EC && I != E;
+         I.increment(EC)) {
+      Contents.push_back(I->path());
+      char last = I->path().back();
+      switch (last) {
+      case 'a':
+      case 'c':
+      case 'e':
+        I.no_push();
+        break;
+      default:
+        break;
+      }
+    }
+
+    // Check contents, which may be in any order
+    EXPECT_EQ(4U, Contents.size());
+    int Counts[7] = {0, 0, 0, 0, 0, 0, 0};
+    for (const std::string &Name : Contents) {
+      ASSERT_FALSE(Name.empty());
+      int Index = Name[Name.size() - 1] - 'a';
+      ASSERT_TRUE(Index >= 0 && Index < 7);
+      Counts[Index]++;
+    }
+    EXPECT_EQ(1, Counts[0]); // a
+    EXPECT_EQ(0, Counts[1]); // b
+    EXPECT_EQ(1, Counts[2]); // c
+    EXPECT_EQ(0, Counts[3]); // d
+    EXPECT_EQ(1, Counts[4]); // e
+    EXPECT_EQ(0, Counts[5]); // f
+    EXPECT_EQ(1, Counts[6]); // g
+  }
+}
+
 #ifdef LLVM_ON_UNIX
 TEST(VirtualFileSystemTest, BrokenSymlinkRealFSRecursiveIteration) {
   ScopedDir TestDirectory("virtual-file-system-test", /*Unique*/ true);
@@ -806,6 +885,17 @@ TEST_F(InMemoryFileSystemTest, WorkingDirectory) {
             getPosixPath(NormalizedFS.getCurrentWorkingDirectory().get()));
 }
 
+TEST_F(InMemoryFileSystemTest, IsLocal) {
+  FS.setCurrentWorkingDirectory("/b");
+  FS.addFile("c", 0, MemoryBuffer::getMemBuffer(""));
+
+  std::error_code EC;
+  bool IsLocal = true;
+  EC = FS.isLocal("c", IsLocal);
+  ASSERT_FALSE(EC);
+  ASSERT_FALSE(IsLocal);
+}
+
 #if !defined(_WIN32)
 TEST_F(InMemoryFileSystemTest, GetRealPath) {
   SmallString<16> Path;
@@ -1599,3 +1689,89 @@ TEST_F(VFSFromYAMLTest, RelativePaths) {
 
   EXPECT_EQ(3, NumDiagnostics);
 }
+
+TEST_F(VFSFromYAMLTest, NonFallthroughDirectoryIteration) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//root/");
+  Lower->addRegularFile("//root/a");
+  Lower->addRegularFile("//root/b");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'fallthrough': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'c',\n"
+      "                  'external-contents': '//root/a'\n"
+      "                }\n"
+      "              ]\n"
+      "}\n"
+      "]\n"
+      "}",
+      Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  std::error_code EC;
+  checkContents(FS->dir_begin("//root/", EC),
+                {"//root/c"});
+}
+
+TEST_F(VFSFromYAMLTest, DirectoryIterationWithDuplicates) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//root/");
+  Lower->addRegularFile("//root/a");
+  Lower->addRegularFile("//root/b");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'a',\n"
+      "                  'external-contents': '//root/a'\n"
+      "                }\n"
+      "              ]\n"
+      "}\n"
+      "]\n"
+      "}",
+	  Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  std::error_code EC;
+  checkContents(FS->dir_begin("//root/", EC),
+                {"//root/a", "//root/b"});
+}
+
+TEST_F(VFSFromYAMLTest, DirectoryIterationErrorInVFSLayer) {
+  IntrusiveRefCntPtr<DummyFileSystem> Lower(new DummyFileSystem());
+  Lower->addDirectory("//root/");
+  Lower->addDirectory("//root/foo");
+  Lower->addRegularFile("//root/foo/a");
+  Lower->addRegularFile("//root/foo/b");
+  IntrusiveRefCntPtr<vfs::FileSystem> FS = getFromYAMLString(
+      "{ 'use-external-names': false,\n"
+      "  'roots': [\n"
+      "{\n"
+      "  'type': 'directory',\n"
+      "  'name': '//root/',\n"
+      "  'contents': [ {\n"
+      "                  'type': 'file',\n"
+      "                  'name': 'bar/a',\n"
+      "                  'external-contents': '//root/foo/a'\n"
+      "                }\n"
+      "              ]\n"
+      "}\n"
+      "]\n"
+      "}",
+      Lower);
+  ASSERT_TRUE(FS.get() != nullptr);
+
+  std::error_code EC;
+  checkContents(FS->dir_begin("//root/foo", EC),
+                {"//root/foo/a", "//root/foo/b"});
+}
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index 4530482ec8090a83cd80014b02f03f02fea62bc8..94e9874147f1cb90ec3b8bc6e069b792a2138f96 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -2543,7 +2543,9 @@ TEST(YAMLIO, TestEscaped) {
   // Single quote
   TestEscaped("@abc@", "'@abc@'");
   // No quote
-  TestEscaped("abc/", "abc/");
+  TestEscaped("abc", "abc");
+  // Forward slash quoted
+  TestEscaped("abc/", "'abc/'");
   // Double quote non-printable
   TestEscaped("\01@abc@", "\"\\x01@abc@\"");
   // Double quote inside single quote
diff --git a/unittests/Target/AArch64/InstSizes.cpp b/unittests/Target/AArch64/InstSizes.cpp
index e58df0a45cc95f61eb0c6f3cd89955b30a370cb2..a70f43c43796bfdd3bc743eef67e6d0599990608 100644
--- a/unittests/Target/AArch64/InstSizes.cpp
+++ b/unittests/Target/AArch64/InstSizes.cpp
@@ -10,7 +10,7 @@
 using namespace llvm;
 
 namespace {
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   auto TT(Triple::normalize("aarch64--"));
   std::string CPU("generic");
   std::string FS("");
@@ -22,8 +22,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
   std::string Error;
   const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
 
-  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-      TT, CPU, FS, TargetOptions(), None, None, CodeGenOpt::Default));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      TheTarget->createTargetMachine(TT, CPU, FS, TargetOptions(), None, None,
+                                     CodeGenOpt::Default)));
 }
 
 std::unique_ptr<AArch64InstrInfo> createInstrInfo(TargetMachine *TM) {
@@ -37,7 +38,7 @@ std::unique_ptr<AArch64InstrInfo> createInstrInfo(TargetMachine *TM) {
 /// TODO: Some of this might be useful for other architectures as well - extract
 ///       the platform-independent parts somewhere they can be reused.
 void runChecks(
-    TargetMachine *TM, AArch64InstrInfo *II, const StringRef InputIRSnippet,
+    LLVMTargetMachine *TM, AArch64InstrInfo *II, const StringRef InputIRSnippet,
     const StringRef InputMIRSnippet,
     std::function<void(AArch64InstrInfo &, MachineFunction &)> Checks) {
   LLVMContext Context;
@@ -78,7 +79,7 @@ void runChecks(
 } // anonymous namespace
 
 TEST(InstSizes, STACKMAP) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
   std::unique_ptr<AArch64InstrInfo> II = createInstrInfo(TM.get());
 
@@ -93,7 +94,7 @@ TEST(InstSizes, STACKMAP) {
 }
 
 TEST(InstSizes, PATCHPOINT) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   std::unique_ptr<AArch64InstrInfo> II = createInstrInfo(TM.get());
 
   runChecks(TM.get(), II.get(), "",
@@ -108,7 +109,7 @@ TEST(InstSizes, PATCHPOINT) {
 }
 
 TEST(InstSizes, TLSDESC_CALLSEQ) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   std::unique_ptr<AArch64InstrInfo> II = createInstrInfo(TM.get());
 
   runChecks(
diff --git a/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
index 599f2e7f10fccacbf87e3d66e48c65d7d3d51358..095ee0665e1a1870fc6aea443faff5723a2815be 100644
--- a/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
+++ b/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
 
 namespace {
 
-std::unique_ptr<TargetMachine> createTargetMachine() {
+std::unique_ptr<LLVMTargetMachine> createTargetMachine() {
   auto TT(Triple::normalize("wasm32-unknown-unknown"));
   std::string CPU("");
   std::string FS("");
@@ -35,8 +35,9 @@ std::unique_ptr<TargetMachine> createTargetMachine() {
   const Target *TheTarget = TargetRegistry::lookupTarget(TT, Error);
   assert(TheTarget);
 
-  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
-      TT, CPU, FS, TargetOptions(), None, None, CodeGenOpt::Default));
+  return std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine*>(
+      TheTarget->createTargetMachine(TT, CPU, FS, TargetOptions(), None, None,
+                                     CodeGenOpt::Default)));
 }
 
 std::unique_ptr<Module> parseMIR(LLVMContext &Context,
@@ -64,7 +65,7 @@ std::unique_ptr<Module> parseMIR(LLVMContext &Context,
 } // namespace
 
 TEST(WebAssemblyExceptionInfoTest, TEST0) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
 
   StringRef MIRString = R"MIR(
@@ -227,7 +228,7 @@ body: |
 }
 
 TEST(WebAssemblyExceptionInfoTest, TEST1) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
 
   StringRef MIRString = R"MIR(
@@ -418,7 +419,7 @@ body: |
 
 // Terminate pad test
 TEST(WebAssemblyExceptionInfoTest, TEST2) {
-  std::unique_ptr<TargetMachine> TM = createTargetMachine();
+  std::unique_ptr<LLVMTargetMachine> TM = createTargetMachine();
   ASSERT_TRUE(TM);
 
   StringRef MIRString = R"MIR(
diff --git a/unittests/Transforms/Utils/CodeExtractorTest.cpp b/unittests/Transforms/Utils/CodeExtractorTest.cpp
index c229be6d6952818af0c3718d62b4f795bda05cf9..c53b3152a7df142f70af0d23046eaeb8a88cde42 100644
--- a/unittests/Transforms/Utils/CodeExtractorTest.cpp
+++ b/unittests/Transforms/Utils/CodeExtractorTest.cpp
@@ -21,7 +21,7 @@
 using namespace llvm;
 
 namespace {
-TEST(CodeExtractor, ExitStub) {
+TEST(CodeExtractor, DISABLED_ExitStub) {
   LLVMContext Ctx;
   SMDiagnostic Err;
   std::unique_ptr<Module> M(parseAssemblyString(R"invalid(
@@ -46,6 +46,25 @@ TEST(CodeExtractor, ExitStub) {
   )invalid",
                                                 Err, Ctx));
 
+  // CodeExtractor miscompiles this function. There appear to be some issues
+  // with the handling of outlined regions with live output values.
+  //
+  // In the original function, CE adds two reloads in the codeReplacer block:
+  //
+  //   codeRepl:                                         ; preds = %header
+  //     call void @foo_header.split(i32 %z, i32 %x, i32 %y, i32* %.loc, i32* %.loc1)
+  //     %.reload = load i32, i32* %.loc
+  //     %.reload2 = load i32, i32* %.loc1
+  //     br label %notExtracted
+  //
+  // These reloads must flow into the notExtracted block:
+  //
+  //   notExtracted:                                     ; preds = %codeRepl
+  //     %0 = phi i32 [ %.reload, %codeRepl ], [ %.reload2, %body2 ]
+  //
+  // The problem is that the PHI node in notExtracted now has an incoming
+  // value from a BasicBlock that's in a different function.
+
   Function *Func = M->getFunction("foo");
   SmallVector<BasicBlock *, 3> Candidates;
   for (auto &BB : *Func) {
diff --git a/unittests/XRay/FDRProducerConsumerTest.cpp b/unittests/XRay/FDRProducerConsumerTest.cpp
index 838e6ca9bf19e8edc49027b4f6d8d4a441c184ce..09ec44db26ee92d5d14450a12a6725d94f199b64 100644
--- a/unittests/XRay/FDRProducerConsumerTest.cpp
+++ b/unittests/XRay/FDRProducerConsumerTest.cpp
@@ -54,7 +54,7 @@ template <> std::unique_ptr<Record> MakeRecord<WallclockRecord>() {
 }
 
 template <> std::unique_ptr<Record> MakeRecord<CustomEventRecord>() {
-  return make_unique<CustomEventRecord>(4, 1, "data");
+  return make_unique<CustomEventRecord>(4, 1, 2, "data");
 }
 
 template <> std::unique_ptr<Record> MakeRecord<CallArgRecord>() {
diff --git a/unittests/XRay/FDRRecordPrinterTest.cpp b/unittests/XRay/FDRRecordPrinterTest.cpp
index 339d4b0d428dc17cda8b1526820a9c0b1f19176a..a0ec3f22bf5bdc8e4b896a66ba1a8669e776249b 100644
--- a/unittests/XRay/FDRRecordPrinterTest.cpp
+++ b/unittests/XRay/FDRRecordPrinterTest.cpp
@@ -55,11 +55,11 @@ template <> struct Helper<TSCWrapRecord> {
 
 template <> struct Helper<CustomEventRecord> {
   static std::unique_ptr<Record> construct() {
-    return make_unique<CustomEventRecord>(4, 1, "data");
+    return make_unique<CustomEventRecord>(4, 1, 2, "data");
   }
 
   static const char *expected() {
-    return "<Custom Event: tsc = 1, size = 4, data = 'data'>";
+    return "<Custom Event: tsc = 1, cpu = 2, size = 4, data = 'data'>";
   }
 };
 
@@ -132,7 +132,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordEnter) {
   FunctionRecord R(RecordTypes::ENTER, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Enter: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Enter: #1 delta = +2>"));
 }
 
 TEST(FDRRecordPrinterTest, WriteFunctionRecordExit) {
@@ -142,7 +142,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordExit) {
   FunctionRecord R(RecordTypes::EXIT, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Exit: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Exit: #1 delta = +2>"));
 }
 
 TEST(FDRRecordPrinterTest, WriteFunctionRecordTailExit) {
@@ -152,7 +152,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordTailExit) {
   FunctionRecord R(RecordTypes::TAIL_EXIT, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Tail Exit: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Tail Exit: #1 delta = +2>"));
 }
 
 TEST(FDRRecordPrinterTest, WriteFunctionRecordEnterArg) {
@@ -162,7 +162,7 @@ TEST(FDRRecordPrinterTest, WriteFunctionRecordEnterArg) {
   FunctionRecord R(RecordTypes::ENTER_ARG, 1, 2);
   ASSERT_FALSE(errorToBool(R.apply(P)));
   OS.flush();
-  EXPECT_THAT(Data, Eq("<Function Enter With Arg: #1 delta = +1>"));
+  EXPECT_THAT(Data, Eq("<Function Enter With Arg: #1 delta = +2>"));
 }
 
 } // namespace
diff --git a/unittests/XRay/FDRRecordsTest.cpp b/unittests/XRay/FDRRecordsTest.cpp
index 1cce1c2b2c17e01dfbdc4a347371b21ee04b420b..86b478a5a4588eabca303272de206e2844284e4d 100644
--- a/unittests/XRay/FDRRecordsTest.cpp
+++ b/unittests/XRay/FDRRecordsTest.cpp
@@ -34,6 +34,8 @@ TEST(XRayFDRTest, BuilderAndBlockIndexer) {
                     .add<PIDRecord>(1)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block1 = LogBuilder()
                     .add<BufferExtents>(100)
@@ -42,6 +44,8 @@ TEST(XRayFDRTest, BuilderAndBlockIndexer) {
                     .add<PIDRecord>(1)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block2 = LogBuilder()
                     .add<BufferExtents>(100)
@@ -50,6 +54,8 @@ TEST(XRayFDRTest, BuilderAndBlockIndexer) {
                     .add<PIDRecord>(1)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   BlockIndexer::Index Index;
   BlockIndexer Indexer(Index);
@@ -92,6 +98,8 @@ TEST(XRayFDRTest, IndexAndVerifyBlocks) {
                     .add<NewCPUIDRecord>(1, 2)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block1 = LogBuilder()
                     .add<BufferExtents>(64)
@@ -101,6 +109,8 @@ TEST(XRayFDRTest, IndexAndVerifyBlocks) {
                     .add<NewCPUIDRecord>(1, 2)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
   auto Block2 = LogBuilder()
                     .add<BufferExtents>(64)
@@ -110,6 +120,8 @@ TEST(XRayFDRTest, IndexAndVerifyBlocks) {
                     .add<NewCPUIDRecord>(1, 2)
                     .add<FunctionRecord>(RecordTypes::ENTER, 1, 1)
                     .add<FunctionRecord>(RecordTypes::EXIT, 1, 100)
+                    .add<CustomEventRecordV5>(1, 4, "XRAY")
+                    .add<TypedEventRecord>(1, 4, 2, "XRAY")
                     .consume();
 
   // First, index the records in different blocks.
diff --git a/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp b/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
index 1f394ae2efd4831f676adeda9b4a342ab9f64706..8a519bb2e7b5fd90d8b11189f4fd549be1bd5153 100644
--- a/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp
@@ -9,6 +9,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeAArch64ExegesisTarget();
@@ -60,3 +61,4 @@ TEST_F(AArch64TargetTest, SetRegToConstant) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp b/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp
index db8b9dfc3b745d7d54eac7442582ce47bf004655..a20fa5556bb501d7763126c5af62e21ac7f34f2f 100644
--- a/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp
+++ b/unittests/tools/llvm-exegesis/ARM/AssemblerTest.cpp
@@ -10,6 +10,7 @@
 #include "../Common/AssemblerUtils.h"
 #include "ARMInstrInfo.h"
 
+namespace llvm {
 namespace exegesis {
 namespace {
 
@@ -47,3 +48,4 @@ TEST_F(ARMMachineFunctionGeneratorTest, DISABLED_JitFunctionADDrr) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp b/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp
index 05b36a31b9e7c4c40beb0f697f2b3bfa2df77222..c518491063afd1326a286a882dd0ddbefabfa594 100644
--- a/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp
+++ b/unittests/tools/llvm-exegesis/BenchmarkRunnerTest.cpp
@@ -11,6 +11,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -29,3 +30,4 @@ TEST(ScratchSpaceTest, Works) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/ClusteringTest.cpp b/unittests/tools/llvm-exegesis/ClusteringTest.cpp
index e1bffd6345401d7ffd1188247d4bc1b52cb8930b..8ea77dcbddec14f1a469f13e4ac4330be2a1d808 100644
--- a/unittests/tools/llvm-exegesis/ClusteringTest.cpp
+++ b/unittests/tools/llvm-exegesis/ClusteringTest.cpp
@@ -14,6 +14,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -104,3 +105,4 @@ TEST(ClusteringTest, Ordering) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h b/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h
index cc00cee58e3ddb8422d633ab399c14d7a0ed6c67..8a144e5c26f0daf7be721931d9148693e3ef0142 100644
--- a/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h
+++ b/unittests/tools/llvm-exegesis/Common/AssemblerUtils.h
@@ -24,6 +24,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 class MachineFunctionGeneratorBaseTest : public ::testing::Test {
@@ -89,5 +90,6 @@ private:
 };
 
 } // namespace exegesis
+} // namespace llvm
 
 #endif
diff --git a/unittests/tools/llvm-exegesis/PerfHelperTest.cpp b/unittests/tools/llvm-exegesis/PerfHelperTest.cpp
index a8205f9e3eb274175eaf43e94415ba3531f91fd7..91ed4a609674071535f3db9dab509273de6e8308 100644
--- a/unittests/tools/llvm-exegesis/PerfHelperTest.cpp
+++ b/unittests/tools/llvm-exegesis/PerfHelperTest.cpp
@@ -12,6 +12,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 namespace pfm {
 namespace {
@@ -45,3 +46,4 @@ TEST(PerfHelperTest, FunctionalTest) {
 } // namespace
 } // namespace pfm
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/RegisterValueTest.cpp b/unittests/tools/llvm-exegesis/RegisterValueTest.cpp
index 4ade990382de786e2a0c0c5f92034028351d707c..8453720dc70846e3a2da82e7c7a13a933971d19b 100644
--- a/unittests/tools/llvm-exegesis/RegisterValueTest.cpp
+++ b/unittests/tools/llvm-exegesis/RegisterValueTest.cpp
@@ -11,6 +11,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 
 namespace {
@@ -69,3 +70,4 @@ TEST(RegisterValueTest, Double) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp b/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
index d2d4c152d79ee596ec6d400b53e761cae12a9c62..00ac6290aed358d7b0f3dff1cf2d8e2d90490e97 100644
--- a/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/AnalysisTest.cpp
@@ -8,6 +8,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 namespace {
 
@@ -100,3 +101,4 @@ TEST_F(AnalysisTest, ComputeIdealizedProcResPressure_1P1_1P05_2P0156) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp b/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp
index 8e81106db8dc0538ff3d6da1da7ba36103fe5f61..451c3f67e75ed0d5162f962a3a39cb4a707784b5 100644
--- a/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/AssemblerTest.cpp
@@ -10,6 +10,7 @@
 #include "../Common/AssemblerUtils.h"
 #include "X86InstrInfo.h"
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeX86ExegesisTarget();
@@ -63,3 +64,4 @@ TEST_F(X86MachineFunctionGeneratorTest, DISABLED_JitFunctionMOV32ri) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp b/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
index b17ae1caff3b437d3da94d6fe7930f3d0b5ec4a5..f069c21b364f00fbb89666a264c94cc3cb40eb7e 100644
--- a/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
@@ -25,6 +25,7 @@ using ::testing::get;
 using ::testing::Pointwise;
 using ::testing::Property;
 
+namespace llvm {
 namespace exegesis {
 
 bool operator==(const BenchmarkMeasure &A, const BenchmarkMeasure &B) {
@@ -136,3 +137,4 @@ TEST(BenchmarkResultTest, PerInstructionStats) {
 }
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp b/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
index 12f76541d4d6afe41cf800f32074d7fecf46b96b..007b0156b1ffca66d42ae00bf50c219a7e17813c 100644
--- a/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/RegisterAliasingTest.cpp
@@ -9,6 +9,7 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+namespace llvm {
 namespace exegesis {
 namespace {
 
@@ -89,3 +90,4 @@ TEST_F(RegisterAliasingTest, TrackRegisterClassCache) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
index 9685c730b8b58a8b1e49e261342f92fdec9139c3..1689defded838e13e330dbb9e0bb0ff36b33636e 100644
--- a/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/SnippetGeneratorTest.cpp
@@ -17,6 +17,7 @@
 
 #include <unordered_set>
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeX86ExegesisTarget();
@@ -25,6 +26,7 @@ namespace {
 
 using testing::AnyOf;
 using testing::ElementsAre;
+using testing::Gt;
 using testing::HasSubstr;
 using testing::Not;
 using testing::SizeIs;
@@ -57,9 +59,10 @@ class SnippetGeneratorTest : public X86SnippetGeneratorTest {
 protected:
   SnippetGeneratorTest() : Generator(State) {}
 
-  CodeTemplate checkAndGetCodeTemplate(unsigned Opcode) {
+  std::vector<CodeTemplate> checkAndGetCodeTemplates(unsigned Opcode) {
     randomGenerator().seed(0); // Initialize seed.
-    auto CodeTemplateOrError = Generator.generateCodeTemplate(Opcode);
+    const Instruction &Instr = State.getIC().getInstr(Opcode);
+    auto CodeTemplateOrError = Generator.generateCodeTemplates(Instr);
     EXPECT_FALSE(CodeTemplateOrError.takeError()); // Valid configuration.
     return std::move(CodeTemplateOrError.get());
   }
@@ -72,21 +75,25 @@ using LatencySnippetGeneratorTest =
 
 using UopsSnippetGeneratorTest = SnippetGeneratorTest<UopsSnippetGenerator>;
 
-TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependency) {
-  // ADC16i16 self alias because of implicit use and def.
-
-  // explicit use 0       : imm
-  // implicit def         : AX
-  // implicit def         : EFLAGS
-  // implicit use         : AX
-  // implicit use         : EFLAGS
+TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughImplicitReg) {
+  // - ADC16i16
+  // - Op0 Explicit Use Immediate
+  // - Op1 Implicit Def Reg(AX)
+  // - Op2 Implicit Def Reg(EFLAGS)
+  // - Op3 Implicit Use Reg(AX)
+  // - Op4 Implicit Use Reg(EFLAGS)
+  // - Var0 [Op0]
+  // - hasAliasingImplicitRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::ADC16i16;
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitDefs()[0], llvm::X86::AX);
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitDefs()[1], llvm::X86::EFLAGS);
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitUses()[0], llvm::X86::AX);
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitUses()[1], llvm::X86::EFLAGS);
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("implicit"));
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::ALWAYS_SERIAL_IMPLICIT_REGS_ALIAS);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -94,63 +101,105 @@ TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependency) {
   EXPECT_THAT(IT.VariableValues[0], IsInvalid()) << "Immediate is not set";
 }
 
-TEST_F(LatencySnippetGeneratorTest, ExplicitSelfDependency) {
-  // ADD16ri self alias because Op0 and Op1 are tied together.
-
-  // explicit def 0       : reg RegClass=GR16
-  // explicit use 1       : reg RegClass=GR16 | TIED_TO:0
-  // explicit use 2       : imm
-  // implicit def         : EFLAGS
+TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughTiedRegs) {
+  // - ADD16ri
+  // - Op0 Explicit Def RegClass(GR16)
+  // - Op1 Explicit Use RegClass(GR16) TiedToOp0
+  // - Op2 Explicit Use Immediate
+  // - Op3 Implicit Def Reg(EFLAGS)
+  // - Var0 [Op0,Op1]
+  // - Var1 [Op2]
+  // - hasTiedRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::ADD16ri;
   EXPECT_THAT(MCInstrInfo.get(Opcode).getImplicitDefs()[0], llvm::X86::EFLAGS);
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("explicit"));
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::ALWAYS_SERIAL_TIED_REGS_ALIAS);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
   ASSERT_THAT(IT.VariableValues, SizeIs(2));
-  EXPECT_THAT(IT.VariableValues[0], IsReg()) << "Operand 0 and 1";
+  EXPECT_THAT(IT.VariableValues[0], IsInvalid()) << "Operand 1 is not set";
   EXPECT_THAT(IT.VariableValues[1], IsInvalid()) << "Operand 2 is not set";
 }
 
-TEST_F(LatencySnippetGeneratorTest, DependencyThroughOtherOpcode) {
-  // CMP64rr
-  // explicit use 0       : reg RegClass=GR64
-  // explicit use 1       : reg RegClass=GR64
-  // implicit def         : EFLAGS
-
-  const unsigned Opcode = llvm::X86::CMP64rr;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("cycle through"));
-  ASSERT_THAT(CT.Instructions, SizeIs(2));
+TEST_F(LatencySnippetGeneratorTest, ImplicitSelfDependencyThroughExplicitRegs) {
+  // - VXORPSrr
+  // - Op0 Explicit Def RegClass(VR128)
+  // - Op1 Explicit Use RegClass(VR128)
+  // - Op2 Explicit Use RegClass(VR128)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasAliasingRegisters
+  const unsigned Opcode = llvm::X86::VXORPSrr;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
+  EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_EXPLICIT_REGS);
+  ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.VariableValues, SizeIs(2));
-  EXPECT_THAT(IT.VariableValues, AnyOf(ElementsAre(IsReg(), IsInvalid()),
-                                       ElementsAre(IsInvalid(), IsReg())));
-  EXPECT_THAT(CT.Instructions[1].getOpcode(), Not(Opcode));
-  // TODO: check that the two instructions alias each other.
+  ASSERT_THAT(IT.VariableValues, SizeIs(3));
+  EXPECT_THAT(IT.VariableValues,
+              AnyOf(ElementsAre(IsReg(), IsInvalid(), IsReg()),
+                    ElementsAre(IsReg(), IsReg(), IsInvalid())))
+      << "Op0 is either set to Op1 or to Op2";
+}
+
+TEST_F(LatencySnippetGeneratorTest, DependencyThroughOtherOpcode) {
+  // - CMP64rr
+  // - Op0 Explicit Use RegClass(GR64)
+  // - Op1 Explicit Use RegClass(GR64)
+  // - Op2 Implicit Def Reg(EFLAGS)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  const unsigned Opcode = llvm::X86::CMP64rr;
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(Gt(1U))) << "Many templates are available";
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR);
+    ASSERT_THAT(CT.Instructions, SizeIs(2));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.VariableValues, SizeIs(2));
+    EXPECT_THAT(IT.VariableValues, AnyOf(ElementsAre(IsReg(), IsInvalid()),
+                                         ElementsAre(IsInvalid(), IsReg())));
+    EXPECT_THAT(CT.Instructions[1].getOpcode(), Not(Opcode));
+    // TODO: check that the two instructions alias each other.
+  }
 }
 
 TEST_F(LatencySnippetGeneratorTest, LAHF) {
+  // - LAHF
+  // - Op0 Implicit Def Reg(AH)
+  // - Op1 Implicit Use Reg(EFLAGS)
   const unsigned Opcode = llvm::X86::LAHF;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
-  EXPECT_THAT(CT.Info, HasSubstr("cycle through"));
-  ASSERT_THAT(CT.Instructions, SizeIs(2));
-  const InstructionTemplate &IT = CT.Instructions[0];
-  EXPECT_THAT(IT.getOpcode(), Opcode);
-  ASSERT_THAT(IT.VariableValues, SizeIs(0));
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(Gt(1U))) << "Many templates are available";
+  for (const auto &CT : CodeTemplates) {
+    EXPECT_THAT(CT.Execution, ExecutionMode::SERIAL_VIA_NON_MEMORY_INSTR);
+    ASSERT_THAT(CT.Instructions, SizeIs(2));
+    const InstructionTemplate &IT = CT.Instructions[0];
+    EXPECT_THAT(IT.getOpcode(), Opcode);
+    ASSERT_THAT(IT.VariableValues, SizeIs(0));
+  }
 }
 
 TEST_F(UopsSnippetGeneratorTest, ParallelInstruction) {
-  // BNDCL32rr is parallel no matter what.
-
-  // explicit use 0       : reg RegClass=BNDR
-  // explicit use 1       : reg RegClass=GR32
-
+  // - BNDCL32rr
+  // - Op0 Explicit Use RegClass(BNDR)
+  // - Op1 Explicit Use RegClass(GR32)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
   const unsigned Opcode = llvm::X86::BNDCL32rr;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("parallel"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -160,14 +209,18 @@ TEST_F(UopsSnippetGeneratorTest, ParallelInstruction) {
 }
 
 TEST_F(UopsSnippetGeneratorTest, SerialInstruction) {
-  // CDQ is serial no matter what.
-
-  // implicit def         : EAX
-  // implicit def         : EDX
-  // implicit use         : EAX
+  // - CDQ
+  // - Op0 Implicit Def Reg(EAX)
+  // - Op1 Implicit Def Reg(EDX)
+  // - Op2 Implicit Use Reg(EAX)
+  // - hasAliasingImplicitRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::CDQ;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("serial"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -178,13 +231,21 @@ TEST_F(UopsSnippetGeneratorTest, StaticRenaming) {
   // CMOVA32rr has tied variables, we enumerate the possible values to execute
   // as many in parallel as possible.
 
-  // explicit def 0       : reg RegClass=GR32
-  // explicit use 1       : reg RegClass=GR32 | TIED_TO:0
-  // explicit use 2       : reg RegClass=GR32
-  // implicit use         : EFLAGS
+  // - CMOVA32rr
+  // - Op0 Explicit Def RegClass(GR32)
+  // - Op1 Explicit Use RegClass(GR32) TiedToOp0
+  // - Op2 Explicit Use RegClass(GR32)
+  // - Op3 Implicit Use Reg(EFLAGS)
+  // - Var0 [Op0,Op1]
+  // - Var1 [Op2]
+  // - hasTiedRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::CMOVA32rr;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("static renaming"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   constexpr const unsigned kInstructionCount = 15;
   ASSERT_THAT(CT.Instructions, SizeIs(kInstructionCount));
   std::unordered_set<unsigned> AllDefRegisters;
@@ -200,14 +261,23 @@ TEST_F(UopsSnippetGeneratorTest, NoTiedVariables) {
   // CMOV_GR32 has no tied variables, we make sure def and use are different
   // from each other.
 
-  // explicit def 0       : reg RegClass=GR32
-  // explicit use 1       : reg RegClass=GR32
-  // explicit use 2       : reg RegClass=GR32
-  // explicit use 3       : imm
-  // implicit use         : EFLAGS
+  // - CMOV_GR32
+  // - Op0 Explicit Def RegClass(GR32)
+  // - Op1 Explicit Use RegClass(GR32)
+  // - Op2 Explicit Use RegClass(GR32)
+  // - Op3 Explicit Use Immediate
+  // - Op4 Implicit Use Reg(EFLAGS)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - Var3 [Op3]
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::CMOV_GR32;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions, SizeIs(1));
   const InstructionTemplate &IT = CT.Instructions[0];
   EXPECT_THAT(IT.getOpcode(), Opcode);
@@ -221,9 +291,27 @@ TEST_F(UopsSnippetGeneratorTest, NoTiedVariables) {
 
 TEST_F(UopsSnippetGeneratorTest, MemoryUse) {
   // Mov32rm reads from memory.
+  // - MOV32rm
+  // - Op0 Explicit Def RegClass(GR32)
+  // - Op1 Explicit Use Memory RegClass(GR8)
+  // - Op2 Explicit Use Memory
+  // - Op3 Explicit Use Memory RegClass(GRH8)
+  // - Op4 Explicit Use Memory
+  // - Op5 Explicit Use Memory RegClass(SEGMENT_REG)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - Var3 [Op3]
+  // - Var4 [Op4]
+  // - Var5 [Op5]
+  // - hasMemoryOperands
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::MOV32rm;
-  const CodeTemplate CT = checkAndGetCodeTemplate(Opcode);
+  const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
+  ASSERT_THAT(CodeTemplates, SizeIs(1));
+  const auto &CT = CodeTemplates[0];
   EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
+  EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
   ASSERT_THAT(CT.Instructions,
               SizeIs(UopsSnippetGenerator::kMinNumDifferentAddresses));
   const InstructionTemplate &IT = CT.Instructions[0];
@@ -237,8 +325,24 @@ TEST_F(UopsSnippetGeneratorTest, MemoryUse) {
 
 TEST_F(UopsSnippetGeneratorTest, MemoryUse_Movsb) {
   // MOVSB writes to scratch memory register.
+  // - MOVSB
+  // - Op0 Explicit Use Memory RegClass(GR8)
+  // - Op1 Explicit Use Memory RegClass(GR8)
+  // - Op2 Explicit Use Memory RegClass(SEGMENT_REG)
+  // - Op3 Implicit Def Reg(EDI)
+  // - Op4 Implicit Def Reg(ESI)
+  // - Op5 Implicit Use Reg(EDI)
+  // - Op6 Implicit Use Reg(ESI)
+  // - Op7 Implicit Use Reg(DF)
+  // - Var0 [Op0]
+  // - Var1 [Op1]
+  // - Var2 [Op2]
+  // - hasMemoryOperands
+  // - hasAliasingImplicitRegisters (execution is always serial)
+  // - hasAliasingRegisters
   const unsigned Opcode = llvm::X86::MOVSB;
-  auto Error = Generator.generateCodeTemplate(Opcode).takeError();
+  const Instruction &Instr = State.getIC().getInstr(Opcode);
+  auto Error = Generator.generateCodeTemplates(Instr).takeError();
   EXPECT_TRUE((bool)Error);
   llvm::consumeError(std::move(Error));
 }
@@ -248,12 +352,12 @@ public:
   FakeSnippetGenerator(const LLVMState &State) : SnippetGenerator(State) {}
 
   Instruction createInstruction(unsigned Opcode) {
-    return Instruction(State.getInstrInfo().get(Opcode), RATC);
+    return State.getIC().getInstr(Opcode);
   }
 
 private:
-  llvm::Expected<CodeTemplate>
-  generateCodeTemplate(unsigned Opcode) const override {
+  llvm::Expected<std::vector<CodeTemplate>>
+  generateCodeTemplates(const Instruction &Instr) const override {
     return llvm::make_error<llvm::StringError>("not implemented",
                                                llvm::inconvertibleErrorCode());
   }
@@ -310,3 +414,4 @@ TEST_F(FakeSnippetGeneratorTest, ComputeRegisterInitialValuesAdd64rr) {
 
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
index 6e7554c84452e8e3117638524d650989456a04c1..2d9d7bcd5598e60da9893ba9e5276ed9241b1656 100644
--- a/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
+++ b/unittests/tools/llvm-exegesis/X86/TargetTest.cpp
@@ -35,6 +35,7 @@ bool operator==(const MCInst &a, const MCInst &b) {
 
 } // namespace llvm
 
+namespace llvm {
 namespace exegesis {
 
 void InitializeX86ExegesisTarget();
@@ -296,12 +297,17 @@ TEST_F(Core2Avx512TargetTest, SetRegToVR512Value) {
            IsStackDeallocate(64)}));
 }
 
+// Note: We always put 80 bits on the stack independently of the size of the
+// value. This uses a bit more space but makes the code simpler.
+
 TEST_F(Core2TargetTest, SetRegToST0_32Bits) {
   EXPECT_THAT(
       setRegTo(llvm::X86::ST0, APInt(32, 0x11112222ULL)),
-      ElementsAre(IsStackAllocate(4),
+      ElementsAre(IsStackAllocate(10),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
-                  OpcodeIs(llvm::X86::LD_F32m), IsStackDeallocate(4)));
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_F80m), IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToST1_32Bits) {
@@ -309,19 +315,22 @@ TEST_F(Core2TargetTest, SetRegToST1_32Bits) {
       llvm::MCInstBuilder(llvm::X86::ST_Frr).addReg(llvm::X86::ST1);
   EXPECT_THAT(
       setRegTo(llvm::X86::ST1, APInt(32, 0x11112222ULL)),
-      ElementsAre(IsStackAllocate(4),
+      ElementsAre(IsStackAllocate(10),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
-                  OpcodeIs(llvm::X86::LD_F32m), CopySt0ToSt1,
-                  IsStackDeallocate(4)));
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_F80m), CopySt0ToSt1,
+                  IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToST0_64Bits) {
   EXPECT_THAT(
       setRegTo(llvm::X86::ST0, APInt(64, 0x1111222233334444ULL)),
-      ElementsAre(IsStackAllocate(8),
+      ElementsAre(IsStackAllocate(10),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x33334444UL, 0),
                   IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 4),
-                  OpcodeIs(llvm::X86::LD_F64m), IsStackDeallocate(8)));
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_F80m), IsStackDeallocate(10)));
 }
 
 TEST_F(Core2TargetTest, SetRegToST0_80Bits) {
@@ -334,5 +343,38 @@ TEST_F(Core2TargetTest, SetRegToST0_80Bits) {
                   OpcodeIs(llvm::X86::LD_F80m), IsStackDeallocate(10)));
 }
 
+TEST_F(Core2TargetTest, SetRegToFP0_80Bits) {
+  EXPECT_THAT(
+      setRegTo(llvm::X86::FP0, APInt(80, "11112222333344445555", 16)),
+      ElementsAre(IsStackAllocate(10),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x44445555UL, 0),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x22223333UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x1111UL, 8),
+                  OpcodeIs(llvm::X86::LD_Fp80m), IsStackDeallocate(10)));
+}
+
+TEST_F(Core2TargetTest, SetRegToFP1_32Bits) {
+  EXPECT_THAT(
+      setRegTo(llvm::X86::FP1, APInt(32, 0x11112222ULL)),
+      ElementsAre(IsStackAllocate(10),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x11112222UL, 0),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_Fp80m),
+                  IsStackDeallocate(10)));
+}
+
+TEST_F(Core2TargetTest, SetRegToFP1_4Bits) {
+  EXPECT_THAT(
+      setRegTo(llvm::X86::FP1, APInt(4, 0x1ULL)),
+      ElementsAre(IsStackAllocate(10),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000001UL, 0),
+                  IsMovValueToStack(llvm::X86::MOV32mi, 0x00000000UL, 4),
+                  IsMovValueToStack(llvm::X86::MOV16mi, 0x0000UL, 8),
+                  OpcodeIs(llvm::X86::LD_Fp80m),
+                  IsStackDeallocate(10)));
+}
+
 } // namespace
 } // namespace exegesis
+} // namespace llvm
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 8ce1d9cbd0900556ba8801370f12eeb37275fbdc..967d22f12b6464b47ac1c9d6af027903e3240cf0 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -18,6 +18,7 @@
 
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/FileCheck.h"
 using namespace llvm;
@@ -108,8 +109,13 @@ static void DumpCommandLine(int argc, char **argv) {
 }
 
 int main(int argc, char **argv) {
+  // Enable use of ANSI color codes because FileCheck is using them to
+  // highlight text.
+  llvm::sys::Process::UseANSIEscapeCodes(true);
+
   InitLLVM X(argc, argv);
-  cl::ParseCommandLineOptions(argc, argv);
+  cl::ParseCommandLineOptions(argc, argv, /*Overview*/ "", /*Errs*/ nullptr,
+                              "FILECHECK_OPTS");
 
   FileCheckRequest Req;
   for (auto Prefix : CheckPrefixes)
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index e808661b7a51fd193bca5ea9c4cf054c82473125..5b4229e6468240335f49c45dfc8e9b59dfe11ebd 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -2415,10 +2415,9 @@ static void emitOperandMatchErrorDiagStrings(AsmMatcherInfo &Info, raw_ostream &
 static void emitRegisterMatchErrorFunc(AsmMatcherInfo &Info, raw_ostream &OS) {
   OS << "static unsigned getDiagKindFromRegisterClass(MatchClassKind "
         "RegisterClass) {\n";
-  if (std::none_of(Info.Classes.begin(), Info.Classes.end(),
-                   [](const ClassInfo &CI) {
-                     return CI.isRegisterClass() && !CI.DiagnosticType.empty();
-                   })) {
+  if (none_of(Info.Classes, [](const ClassInfo &CI) {
+        return CI.isRegisterClass() && !CI.DiagnosticType.empty();
+      })) {
     OS << "  return MCTargetAsmParser::Match_InvalidOperand;\n";
   } else {
     OS << "  switch (RegisterClass) {\n";
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 5ac3eca4c68714d2bcf3bdc5eb18d9eb782d3618..c88365a2b8cecfc8a58d0d49cfb63a9386beeaf3 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -21,6 +21,7 @@ add_tablegen(llvm-tblgen LLVM
   DAGISelMatcher.cpp
   DFAPacketizerEmitter.cpp
   DisassemblerEmitter.cpp
+  ExegesisEmitter.cpp
   FastISelEmitter.cpp
   FixedLenDecoderEmitter.cpp
   GlobalISelEmitter.cpp
@@ -46,7 +47,6 @@ add_tablegen(llvm-tblgen LLVM
   X86ModRMFilters.cpp
   X86RecognizableInstr.cpp
   WebAssemblyDisassemblerEmitter.cpp
-  WebAssemblyStackifierEmitter.cpp
   CTagsEmitter.cpp
   )
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index 881f1a813f2a9ff95dfedc283a2576ff585d2797..a9a36a87ef3fa9beab44c4e189e0b6214f335011 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -350,7 +350,7 @@ processSTIPredicate(STIPredicateFunction &Fn,
         unsigned OpcodeIdx = Opcode2Index[Opcode];
         if (OpcodeMasks[OpcodeIdx].first[ProcIndex]) {
           std::string Message =
-              "Opcode " + Opcode->getName().str() + 
+              "Opcode " + Opcode->getName().str() +
               " used by multiple InstructionEquivalenceClass definitions.";
           PrintFatalError(EC->getLoc(), Message);
         }
@@ -487,9 +487,6 @@ void CodeGenSchedModels::collectOptionalProcessorInfo() {
   // Collect processor RetireControlUnit descriptors if available.
   collectRetireControlUnits();
 
-  // Find pfm counter definitions for each processor.
-  collectPfmCounters();
-
   checkCompleteness();
 }
 
@@ -1759,42 +1756,33 @@ void CodeGenSchedModels::collectRegisterFiles() {
     CodeGenProcModel &PM = getProcModel(RF->getValueAsDef("SchedModel"));
     PM.RegisterFiles.emplace_back(CodeGenRegisterFile(RF->getName(),RF));
     CodeGenRegisterFile &CGRF = PM.RegisterFiles.back();
+    CGRF.MaxMovesEliminatedPerCycle =
+        RF->getValueAsInt("MaxMovesEliminatedPerCycle");
+    CGRF.AllowZeroMoveEliminationOnly =
+        RF->getValueAsBit("AllowZeroMoveEliminationOnly");
 
     // Now set the number of physical registers as well as the cost of registers
     // in each register class.
     CGRF.NumPhysRegs = RF->getValueAsInt("NumPhysRegs");
+    if (!CGRF.NumPhysRegs) {
+      PrintFatalError(RF->getLoc(),
+                      "Invalid RegisterFile with zero physical registers");
+    }
+
     RecVec RegisterClasses = RF->getValueAsListOfDefs("RegClasses");
     std::vector<int64_t> RegisterCosts = RF->getValueAsListOfInts("RegCosts");
+    ListInit *MoveElimInfo = RF->getValueAsListInit("AllowMoveElimination");
     for (unsigned I = 0, E = RegisterClasses.size(); I < E; ++I) {
       int Cost = RegisterCosts.size() > I ? RegisterCosts[I] : 1;
-      CGRF.Costs.emplace_back(RegisterClasses[I], Cost);
-    }
-  }
-}
 
-// Collect all the RegisterFile definitions available in this target.
-void CodeGenSchedModels::collectPfmCounters() {
-  for (Record *Def : Records.getAllDerivedDefinitions("PfmIssueCounter")) {
-    CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
-    PM.PfmIssueCounterDefs.emplace_back(Def);
-  }
-  for (Record *Def : Records.getAllDerivedDefinitions("PfmCycleCounter")) {
-    CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
-    if (PM.PfmCycleCounterDef) {
-      PrintFatalError(Def->getLoc(),
-                      "multiple cycle counters for " +
-                          Def->getValueAsDef("SchedModel")->getName());
-    }
-    PM.PfmCycleCounterDef = Def;
-  }
-  for (Record *Def : Records.getAllDerivedDefinitions("PfmUopsCounter")) {
-    CodeGenProcModel &PM = getProcModel(Def->getValueAsDef("SchedModel"));
-    if (PM.PfmUopsCounterDef) {
-      PrintFatalError(Def->getLoc(),
-                      "multiple uops counters for " +
-                          Def->getValueAsDef("SchedModel")->getName());
-    }
-    PM.PfmUopsCounterDef = Def;
+      bool AllowMoveElim = false;
+      if (MoveElimInfo->size() > I) {
+        BitInit *Val = cast<BitInit>(MoveElimInfo->getElement(I));
+        AllowMoveElim = Val->getValue();
+      }
+
+      CGRF.Costs.emplace_back(RegisterClasses[I], Cost, AllowMoveElim);
+    }
   }
 }
 
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index c2af28bbaa0360cf0d630703690ca3b0d6de1926..9bde5f4e75946710cbe7b022f3155773c6b114f8 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -167,8 +167,9 @@ struct CodeGenSchedClass {
 struct CodeGenRegisterCost {
   Record *RCDef;
   unsigned Cost;
-  CodeGenRegisterCost(Record *RC, unsigned RegisterCost)
-      : RCDef(RC), Cost(RegisterCost) {}
+  bool AllowMoveElimination;
+  CodeGenRegisterCost(Record *RC, unsigned RegisterCost, bool AllowMoveElim = false)
+      : RCDef(RC), Cost(RegisterCost), AllowMoveElimination(AllowMoveElim) {}
   CodeGenRegisterCost(const CodeGenRegisterCost &) = default;
   CodeGenRegisterCost &operator=(const CodeGenRegisterCost &) = delete;
 };
@@ -181,12 +182,18 @@ struct CodeGenRegisterCost {
 struct CodeGenRegisterFile {
   std::string Name;
   Record *RegisterFileDef;
+  unsigned MaxMovesEliminatedPerCycle;
+  bool AllowZeroMoveEliminationOnly;
 
   unsigned NumPhysRegs;
   std::vector<CodeGenRegisterCost> Costs;
 
-  CodeGenRegisterFile(StringRef name, Record *def)
-      : Name(name), RegisterFileDef(def), NumPhysRegs(0) {}
+  CodeGenRegisterFile(StringRef name, Record *def, unsigned MaxMoveElimPerCy = 0,
+                      bool AllowZeroMoveElimOnly = false)
+      : Name(name), RegisterFileDef(def),
+        MaxMovesEliminatedPerCycle(MaxMoveElimPerCy),
+        AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly),
+        NumPhysRegs(0) {}
 
   bool hasDefaultCosts() const { return Costs.empty(); }
 };
@@ -239,11 +246,6 @@ struct CodeGenProcModel {
   // Optional Retire Control Unit definition.
   Record *RetireControlUnit;
 
-  // List of PfmCounters.
-  RecVec PfmIssueCounterDefs;
-  Record *PfmCycleCounterDef = nullptr;
-  Record *PfmUopsCounterDef = nullptr;
-
   CodeGenProcModel(unsigned Idx, std::string Name, Record *MDef,
                    Record *IDef) :
     Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
@@ -258,10 +260,7 @@ struct CodeGenProcModel {
   }
 
   bool hasExtraProcessorInfo() const {
-    return RetireControlUnit || !RegisterFiles.empty() ||
-        !PfmIssueCounterDefs.empty() ||
-        PfmCycleCounterDef != nullptr ||
-        PfmUopsCounterDef != nullptr;
+    return RetireControlUnit || !RegisterFiles.empty();
   }
 
   unsigned getProcResourceIdx(Record *PRDef) const;
@@ -586,8 +585,6 @@ private:
 
   void collectRegisterFiles();
 
-  void collectPfmCounters();
-
   void collectOptionalProcessorInfo();
 
   std::string createSchedClassName(Record *ItinClassDef,
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 2766fcca16165aed242eee8b17d37c775675be47..305d2d19ff4dbfaebbe6585f9ba017875a38a173 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -711,4 +711,3 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
   // Sort the argument attributes for later benefit.
   llvm::sort(ArgumentAttributes);
 }
-
diff --git a/utils/TableGen/ExegesisEmitter.cpp b/utils/TableGen/ExegesisEmitter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..083d7439451404f5450e3ce132e685a25df88604
--- /dev/null
+++ b/utils/TableGen/ExegesisEmitter.cpp
@@ -0,0 +1,212 @@
+//===- ExegesisEmitter.cpp - Generate exegesis target data ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This tablegen backend emits llvm-exegesis information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "exegesis-emitter"
+
+namespace {
+
+class ExegesisEmitter {
+public:
+  ExegesisEmitter(RecordKeeper &RK);
+
+  void run(raw_ostream &OS) const;
+
+private:
+  unsigned getPfmCounterId(llvm::StringRef Name) const {
+    const auto It = PfmCounterNameTable.find(Name);
+    if (It == PfmCounterNameTable.end())
+      PrintFatalError("no pfm counter id for " + Name);
+    return It->second;
+  }
+
+  // Collects all the ProcPfmCounters definitions available in this target.
+  void emitPfmCounters(raw_ostream &OS) const;
+
+  void emitPfmCountersInfo(const Record &Def,
+                           unsigned &IssueCountersTableOffset,
+                           raw_ostream &OS) const;
+
+  void emitPfmCountersLookupTable(raw_ostream &OS) const;
+
+  RecordKeeper &Records;
+  std::string Target;
+
+  // Table of counter name -> counter index.
+  const std::map<llvm::StringRef, unsigned> PfmCounterNameTable;
+};
+
+static std::map<llvm::StringRef, unsigned>
+collectPfmCounters(const RecordKeeper &Records) {
+  std::map<llvm::StringRef, unsigned> PfmCounterNameTable;
+  const auto AddPfmCounterName = [&PfmCounterNameTable](
+                                     const Record *PfmCounterDef) {
+    const llvm::StringRef Counter = PfmCounterDef->getValueAsString("Counter");
+    if (!Counter.empty())
+      PfmCounterNameTable.emplace(Counter, 0);
+  };
+  for (Record *Def : Records.getAllDerivedDefinitions("ProcPfmCounters")) {
+    // Check that ResourceNames are unique.
+    llvm::SmallSet<llvm::StringRef, 16> Seen;
+    for (const Record *IssueCounter :
+         Def->getValueAsListOfDefs("IssueCounters")) {
+      const llvm::StringRef ResourceName =
+          IssueCounter->getValueAsString("ResourceName");
+      if (ResourceName.empty())
+        PrintFatalError(IssueCounter->getLoc(), "invalid empty ResourceName");
+      if (!Seen.insert(ResourceName).second)
+        PrintFatalError(IssueCounter->getLoc(),
+                        "duplicate ResourceName " + ResourceName);
+      AddPfmCounterName(IssueCounter);
+    }
+    AddPfmCounterName(Def->getValueAsDef("CycleCounter"));
+    AddPfmCounterName(Def->getValueAsDef("UopsCounter"));
+  }
+  unsigned Index = 0;
+  for (auto &NameAndIndex : PfmCounterNameTable)
+    NameAndIndex.second = Index++;
+  return PfmCounterNameTable;
+}
+
+ExegesisEmitter::ExegesisEmitter(RecordKeeper &RK)
+    : Records(RK), PfmCounterNameTable(collectPfmCounters(RK)) {
+  std::vector<Record *> Targets = Records.getAllDerivedDefinitions("Target");
+  if (Targets.size() == 0)
+    PrintFatalError("ERROR: No 'Target' subclasses defined!");
+  if (Targets.size() != 1)
+    PrintFatalError("ERROR: Multiple subclasses of Target defined!");
+  Target = Targets[0]->getName();
+}
+
+void ExegesisEmitter::emitPfmCountersInfo(const Record &Def,
+                                          unsigned &IssueCountersTableOffset,
+                                          raw_ostream &OS) const {
+  const auto CycleCounter =
+      Def.getValueAsDef("CycleCounter")->getValueAsString("Counter");
+  const auto UopsCounter =
+      Def.getValueAsDef("UopsCounter")->getValueAsString("Counter");
+  const size_t NumIssueCounters =
+      Def.getValueAsListOfDefs("IssueCounters").size();
+
+  // This is the default, do not emit.
+  if (CycleCounter.empty() && UopsCounter.empty() && NumIssueCounters == 0)
+    return;
+
+  OS << "\nstatic const PfmCountersInfo " << Target << Def.getName()
+     << " = {\n";
+
+  // Cycle Counter.
+  if (CycleCounter.empty())
+    OS << "  nullptr,  // No cycle counter.\n";
+  else
+    OS << "  " << Target << "PfmCounterNames[" << getPfmCounterId(CycleCounter)
+       << "],  // Cycle counter\n";
+
+  // Uops Counter.
+  if (UopsCounter.empty())
+    OS << "  nullptr,  // No uops counter.\n";
+  else
+    OS << "  " << Target << "PfmCounterNames[" << getPfmCounterId(UopsCounter)
+       << "],  // Uops counter\n";
+
+  // Issue Counters
+  if (NumIssueCounters == 0)
+    OS << "  nullptr,  // No issue counters.\n  0\n";
+  else
+    OS << "  " << Target << "PfmIssueCounters + " << IssueCountersTableOffset
+       << ", " << NumIssueCounters << " // Issue counters.\n";
+
+  OS << "};\n";
+  IssueCountersTableOffset += NumIssueCounters;
+}
+
+void ExegesisEmitter::emitPfmCounters(raw_ostream &OS) const {
+  // Emit the counter name table.
+  OS << "\nstatic const char* " << Target << "PfmCounterNames[] = {\n";
+  for (const auto &NameAndIndex : PfmCounterNameTable)
+    OS << "  \"" << NameAndIndex.first << "\", // " << NameAndIndex.second
+       << "\n";
+  OS << "};\n\n";
+
+  // Emit the IssueCounters table.
+  const auto PfmCounterDefs =
+      Records.getAllDerivedDefinitions("ProcPfmCounters");
+  OS << "static const PfmCountersInfo::IssueCounter " << Target
+     << "PfmIssueCounters[] = {\n";
+  for (const Record *Def : PfmCounterDefs) {
+    for (const Record *ICDef : Def->getValueAsListOfDefs("IssueCounters"))
+      OS << "  { " << Target << "PfmCounterNames["
+         << getPfmCounterId(ICDef->getValueAsString("Counter")) << "], \""
+         << ICDef->getValueAsString("ResourceName") << "\"},\n";
+  }
+
+  OS << "};\n";
+
+  // Now generate the PfmCountersInfo.
+  unsigned IssueCountersTableOffset = 0;
+  for (const Record *Def : PfmCounterDefs)
+    emitPfmCountersInfo(*Def, IssueCountersTableOffset, OS);
+
+  OS << "\n";
+}
+
+void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const {
+  std::vector<Record *> Bindings =
+      Records.getAllDerivedDefinitions("PfmCountersBinding");
+  llvm::sort(Bindings, [](const Record *L, const Record *R) {
+    return L->getValueAsString("CpuName") < R->getValueAsString("CpuName");
+  });
+
+  OS << "// Sorted (by CpuName) array of pfm counters.\n"
+     << "static const CpuAndPfmCounters " << Target << "CpuPfmCounters[] = {\n";
+  for (Record *Binding : Bindings) {
+    // Emit as { "cpu", procinit },
+    OS << "  { \""                                                        //
+       << Binding->getValueAsString("CpuName") << "\","                   //
+       << " &" << Target << Binding->getValueAsDef("Counters")->getName() //
+       << " },\n";
+  }
+  OS << "};\n\n";
+}
+
+void ExegesisEmitter::run(raw_ostream &OS) const {
+  emitSourceFileHeader("Exegesis Tables", OS);
+  emitPfmCounters(OS);
+  emitPfmCountersLookupTable(OS);
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+
+void EmitExegesis(RecordKeeper &RK, raw_ostream &OS) {
+  ExegesisEmitter(RK).run(OS);
+}
+
+} // end namespace llvm
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index 76ba1c001092a7bc9e5233c3ea36f18041ffa2a0..44cf6eadcb003e9f88a1383ff7001fc0e9b0a5fd 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -2067,21 +2067,59 @@ static bool populateInstruction(CodeGenTarget &Target,
 // using the VS compiler. It has a bug which causes the function
 // to be optimized out in some circustances. See llvm.org/pr38292
 static void emitFieldFromInstruction(formatted_raw_ostream &OS) {
-  OS << "// Helper function for extracting fields from encoded instructions.\n"
+  OS << "// Helper functions for extracting fields from encoded instructions.\n"
+     << "// InsnType must either be integral or an APInt-like object that "
+        "must:\n"
+     << "// * Have a static const max_size_in_bits equal to the number of bits "
+        "in the\n"
+     << "//   encoding.\n"
+     << "// * be default-constructible and copy-constructible\n"
+     << "// * be constructible from a uint64_t\n"
+     << "// * be constructible from an APInt (this can be private)\n"
+     << "// * Support getBitsSet(loBit, hiBit)\n"
+     << "// * be convertible to uint64_t\n"
+     << "// * Support the ~, &, ==, !=, and |= operators with other objects of "
+        "the same type\n"
+     << "// * Support shift (<<, >>) with signed and unsigned integers on the "
+        "RHS\n"
+     << "// * Support put (<<) to raw_ostream&\n"
      << "template<typename InsnType>\n"
      << "#if defined(_MSC_VER) && !defined(__clang__)\n"
      << "__declspec(noinline)\n"
      << "#endif\n"
-     << "static InsnType fieldFromInstruction(InsnType insn, unsigned startBit,\n"
+     << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
+        "startBit,\n"
+     << "                                     unsigned numBits, "
+        "std::true_type) {\n"
+     << "  assert(startBit + numBits <= 64 && \"Cannot support >64-bit "
+        "extractions!\");\n"
+     << "  assert(startBit + numBits <= (sizeof(InsnType) * 8) &&\n"
+     << "         \"Instruction field out of bounds!\");\n"
+     << "  InsnType fieldMask;\n"
+     << "  if (numBits == sizeof(InsnType) * 8)\n"
+     << "    fieldMask = (InsnType)(-1LL);\n"
+     << "  else\n"
+     << "    fieldMask = (((InsnType)1 << numBits) - 1) << startBit;\n"
+     << "  return (insn & fieldMask) >> startBit;\n"
+     << "}\n"
+     << "\n"
+     << "template<typename InsnType>\n"
+     << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
+        "startBit,\n"
+     << "                                     unsigned numBits, "
+        "std::false_type) {\n"
+     << "  assert(startBit + numBits <= InsnType::max_size_in_bits && "
+        "\"Instruction field out of bounds!\");\n"
+     << "  InsnType fieldMask = InsnType::getBitsSet(0, numBits);\n"
+     << "  return (insn >> startBit) & fieldMask;\n"
+     << "}\n"
+     << "\n"
+     << "template<typename InsnType>\n"
+     << "static InsnType fieldFromInstruction(InsnType insn, unsigned "
+        "startBit,\n"
      << "                                     unsigned numBits) {\n"
-     << "    assert(startBit + numBits <= (sizeof(InsnType)*8) &&\n"
-     << "           \"Instruction field out of bounds!\");\n"
-     << "    InsnType fieldMask;\n"
-     << "    if (numBits == sizeof(InsnType)*8)\n"
-     << "      fieldMask = (InsnType)(-1LL);\n"
-     << "    else\n"
-     << "      fieldMask = (((InsnType)1 << numBits) - 1) << startBit;\n"
-     << "    return (insn & fieldMask) >> startBit;\n"
+     << "  return fieldFromInstruction(insn, startBit, numBits, "
+        "std::is_integral<InsnType>());\n"
      << "}\n\n";
 }
 
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index ef8c849e25f4ab6bc8a60ec563f5f6f2ee5ff5af..55b6f192c2f0c504f2a5ca9bc7becee84957bae2 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -66,7 +66,8 @@ private:
   /// This method is used to custom expand TIIPredicate definitions.
   /// See file llvm/Target/TargetInstPredicates.td for a description of what is
   /// a TIIPredicate and how to use it.
-  void emitTIIHelperMethods(raw_ostream &OS, StringRef TargetName);
+  void emitTIIHelperMethods(raw_ostream &OS, StringRef TargetName,
+                            bool ExpandDefinition = true);
 
   /// Expand TIIPredicate definitions to functions that accept a const MCInst
   /// reference.
@@ -400,7 +401,8 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
 }
 
 void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
-                                            StringRef TargetName) {
+                                            StringRef TargetName,
+                                            bool ExpandDefinition) {
   RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate");
   if (TIIPredicates.empty())
     return;
@@ -410,8 +412,17 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
   PE.setIndentLevel(2);
 
   for (const Record *Rec : TIIPredicates) {
-    OS << "\n  static bool " << Rec->getValueAsString("FunctionName");
-    OS << "(const MachineInstr &MI) {\n";
+    OS << "\n  " << (ExpandDefinition ? "" : "static ") << "bool ";
+    if (ExpandDefinition)
+      OS << TargetName << "InstrInfo::";
+    OS << Rec->getValueAsString("FunctionName");
+    OS << "(const MachineInstr &MI)";
+    if (!ExpandDefinition) {
+      OS << ";\n";
+      continue;
+    }
+
+    OS << " {\n";
 
     OS.indent(PE.getIndentLevel() * 2);
     PE.expandStatement(OS, Rec->getValueAsDef("Body"));
@@ -517,12 +528,21 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
      << "(int CFSetupOpcode = -1, int CFDestroyOpcode = -1, int CatchRetOpcode = -1, int ReturnOpcode = -1);\n"
      << "  ~" << ClassName << "() override = default;\n";
 
-  emitTIIHelperMethods(OS, TargetName);
 
   OS << "\n};\n} // end llvm namespace\n";
 
   OS << "#endif // GET_INSTRINFO_HEADER\n\n";
 
+  OS << "#ifdef GET_TII_HELPER_DECLS\n";
+  OS << "#undef GET_TII_HELPER_DECLS\n";
+  emitTIIHelperMethods(OS, TargetName, /* ExpandDefintion = */false);
+  OS << "#endif // GET_TII_HELPER_DECLS\n\n";
+
+  OS << "#ifdef GET_TII_HELPERS\n";
+  OS << "#undef GET_TII_HELPERS\n";
+  emitTIIHelperMethods(OS, TargetName, /* ExpandDefintion = */true);
+  OS << "#endif // GET_TTI_HELPERS\n\n";
+
   OS << "#ifdef GET_INSTRINFO_CTOR_DTOR\n";
   OS << "#undef GET_INSTRINFO_CTOR_DTOR\n";
 
diff --git a/utils/TableGen/PredicateExpander.cpp b/utils/TableGen/PredicateExpander.cpp
index 83f67c023e51d00c8acb53bc86dd821b4d5ca42a..ad7bf60caab9f4f586c3d470cc187337fe1b388c 100644
--- a/utils/TableGen/PredicateExpander.cpp
+++ b/utils/TableGen/PredicateExpander.cpp
@@ -20,23 +20,43 @@ void PredicateExpander::expandTrue(raw_ostream &OS) { OS << "true"; }
 void PredicateExpander::expandFalse(raw_ostream &OS) { OS << "false"; }
 
 void PredicateExpander::expandCheckImmOperand(raw_ostream &OS, int OpIndex,
-                                              int ImmVal) {
+                                              int ImmVal,
+                                              StringRef FunctionMapper) {
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
-     << ").getImm() " << (shouldNegate() ? "!= " : "== ") << ImmVal;
+     << ").getImm()";
+  OS << (FunctionMapper.empty() ? " " : ") ");
+  OS << (shouldNegate() ? "!= " : "== ") << ImmVal;
 }
 
 void PredicateExpander::expandCheckImmOperand(raw_ostream &OS, int OpIndex,
-                                              StringRef ImmVal) {
+                                              StringRef ImmVal,
+                                              StringRef FunctionMapper) {
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
-     << ").getImm() " << (shouldNegate() ? "!= " : "== ") << ImmVal;
+     << ").getImm()";
+
+  OS << (FunctionMapper.empty() ? "" : ")");
+  if (ImmVal.empty())
+    return;
+  OS << (shouldNegate() ? " != " : " == ") << ImmVal;
 }
 
 void PredicateExpander::expandCheckRegOperand(raw_ostream &OS, int OpIndex,
-                                              const Record *Reg) {
+                                              const Record *Reg,
+                                              StringRef FunctionMapper) {
   assert(Reg->isSubClassOf("Register") && "Expected a register Record!");
 
+  if (!FunctionMapper.empty())
+    OS << FunctionMapper << "(";
   OS << "MI" << (isByRef() ? "." : "->") << "getOperand(" << OpIndex
-     << ").getReg() " << (shouldNegate() ? "!= " : "== ");
+     << ").getReg()";
+  OS << (FunctionMapper.empty() ? "" : ")");
+  if (!Reg)
+    return;
+  OS << (shouldNegate() ? " != " : " == ");
   const StringRef Str = Reg->getValueAsString("Namespace");
   if (!Str.empty())
     OS << Str << "::";
@@ -137,7 +157,7 @@ void PredicateExpander::expandPredicateSequence(raw_ostream &OS,
 void PredicateExpander::expandTIIFunctionCall(raw_ostream &OS,
                                               StringRef MethodName) {
   OS << (shouldNegate() ? "!" : "");
-  OS << TargetName << (shouldExpandForMC() ? "_MC::" : "GenInstrInfo::");
+  OS << TargetName << (shouldExpandForMC() ? "_MC::" : "InstrInfo::");
   OS << MethodName << (isByRef() ? "(MI)" : "(*MI)");
 }
 
@@ -266,18 +286,30 @@ void PredicateExpander::expandPredicate(raw_ostream &OS, const Record *Rec) {
 
   if (Rec->isSubClassOf("CheckRegOperand"))
     return expandCheckRegOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 Rec->getValueAsDef("Reg"));
+                                 Rec->getValueAsDef("Reg"),
+                                 Rec->getValueAsString("FunctionMapper"));
+
+  if (Rec->isSubClassOf("CheckRegOperandSimple"))
+    return expandCheckRegOperand(OS, Rec->getValueAsInt("OpIndex"),
+                                 nullptr,
+                                 Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckInvalidRegOperand"))
     return expandCheckInvalidRegOperand(OS, Rec->getValueAsInt("OpIndex"));
 
   if (Rec->isSubClassOf("CheckImmOperand"))
     return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 Rec->getValueAsInt("ImmVal"));
+                                 Rec->getValueAsInt("ImmVal"),
+                                 Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckImmOperand_s"))
     return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"),
-                                 Rec->getValueAsString("ImmVal"));
+                                 Rec->getValueAsString("ImmVal"),
+                                 Rec->getValueAsString("FunctionMapper"));
+
+  if (Rec->isSubClassOf("CheckImmOperandSimple"))
+    return expandCheckImmOperand(OS, Rec->getValueAsInt("OpIndex"), "", 
+                                 Rec->getValueAsString("FunctionMapper"));
 
   if (Rec->isSubClassOf("CheckSameRegOperand"))
     return expandCheckSameRegOperand(OS, Rec->getValueAsInt("FirstIndex"),
diff --git a/utils/TableGen/PredicateExpander.h b/utils/TableGen/PredicateExpander.h
index 255e40c499889fdecf2acdcfd64486909b161277..0f3ee6867e658880e07ec28c2fc4a3f3fa3d15c3 100644
--- a/utils/TableGen/PredicateExpander.h
+++ b/utils/TableGen/PredicateExpander.h
@@ -56,9 +56,16 @@ public:
   using RecVec = std::vector<Record *>;
   void expandTrue(raw_ostream &OS);
   void expandFalse(raw_ostream &OS);
-  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, int ImmVal);
-  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, StringRef ImmVal);
-  void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg);
+  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, int ImmVal,
+                             StringRef FunctionMapper);
+  void expandCheckImmOperand(raw_ostream &OS, int OpIndex, StringRef ImmVal,
+                             StringRef FunctionMapperer);
+  void expandCheckImmOperandSimple(raw_ostream &OS, int OpIndex,
+                                   StringRef FunctionMapper);
+  void expandCheckRegOperand(raw_ostream &OS, int OpIndex, const Record *Reg,
+                             StringRef FunctionMapper);
+  void expandCheckRegOperandSimple(raw_ostream &OS, int OpIndex,
+                                   StringRef FunctionMapper);
   void expandCheckSameRegOperand(raw_ostream &OS, int First, int Second);
   void expandCheckNumOperands(raw_ostream &OS, int NumOps);
   void expandCheckOpcode(raw_ostream &OS, const Record *Inst);
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 7c38dc55e817917312b93164d5606d8321dcb498..ded54c828bcd8a062c54bb9ff35ca92bbed664f8 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -296,7 +296,7 @@ EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
            PSetE = PSetIDs.end(); PSetI != PSetE; ++PSetI) {
       PSets[i].push_back(RegBank.getRegPressureSet(*PSetI).Order);
     }
-    llvm::sort(PSets[i].begin(), PSets[i].end());
+    llvm::sort(PSets[i]);
     PSetsSeqs.add(PSets[i]);
   }
 
diff --git a/utils/TableGen/SearchableTableEmitter.cpp b/utils/TableGen/SearchableTableEmitter.cpp
index 61c918bd014c1e7c13d750b130d569164f10fd1f..f98a7c74bf0c28720ac186cdc39e82471c8b0b24 100644
--- a/utils/TableGen/SearchableTableEmitter.cpp
+++ b/utils/TableGen/SearchableTableEmitter.cpp
@@ -155,17 +155,15 @@ private:
     } else if (BitsRecTy *BI = dyn_cast<BitsRecTy>(Field.RecType)) {
       unsigned NumBits = BI->getNumBits();
       if (NumBits <= 8)
-        NumBits = 8;
-      else if (NumBits <= 16)
-        NumBits = 16;
-      else if (NumBits <= 32)
-        NumBits = 32;
-      else if (NumBits <= 64)
-        NumBits = 64;
-      else
-        PrintFatalError(Twine("bitfield '") + Field.Name +
-                        "' too large to search");
-      return "uint" + utostr(NumBits) + "_t";
+        return "uint8_t";
+      if (NumBits <= 16)
+        return "uint16_t";
+      if (NumBits <= 32)
+        return "uint32_t";
+      if (NumBits <= 64)
+        return "uint64_t";
+      PrintFatalError(Twine("bitfield '") + Field.Name +
+                      "' too large to search");
     } else if (Field.Enum || Field.IsIntrinsic || Field.IsInstruction)
       return "unsigned";
     PrintFatalError(Twine("Field '") + Field.Name + "' has unknown type '" +
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index ef0428eeed0f81ac29af636328c6426fea2a2b5f..4ff52b3e44e8cbad9b1ef721090ec5408ab0502e 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -653,7 +653,7 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
     return 0;
 
   // Print the RegisterCost table first.
-  OS << "\n// {RegisterClassID, Register Cost}\n";
+  OS << "\n// {RegisterClassID, Register Cost, AllowMoveElimination }\n";
   OS << "static const llvm::MCRegisterCostEntry " << ProcModel.ModelName
      << "RegisterCosts"
      << "[] = {\n";
@@ -668,24 +668,28 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
       Record *Rec = RC.RCDef;
       if (Rec->getValue("Namespace"))
         OS << Rec->getValueAsString("Namespace") << "::";
-      OS << Rec->getName() << "RegClassID, " << RC.Cost << "},\n";
+      OS << Rec->getName() << "RegClassID, " << RC.Cost << ", "
+         << RC.AllowMoveElimination << "},\n";
     }
   }
   OS << "};\n";
 
   // Now generate a table with register file info.
-  OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl}\n";
+  OS << "\n // {Name, #PhysRegs, #CostEntries, IndexToCostTbl, "
+     << "MaxMovesEliminatedPerCycle, AllowZeroMoveEliminationOnly }\n";
   OS << "static const llvm::MCRegisterFileDesc " << ProcModel.ModelName
      << "RegisterFiles"
      << "[] = {\n"
-     << "  { \"InvalidRegisterFile\", 0, 0, 0 },\n";
+     << "  { \"InvalidRegisterFile\", 0, 0, 0, 0, 0 },\n";
   unsigned CostTblIndex = 0;
 
   for (const CodeGenRegisterFile &RD : ProcModel.RegisterFiles) {
     OS << "  { ";
     OS << '"' << RD.Name << '"' << ", " << RD.NumPhysRegs << ", ";
     unsigned NumCostEntries = RD.Costs.size();
-    OS << NumCostEntries << ", " << CostTblIndex << "},\n";
+    OS << NumCostEntries << ", " << CostTblIndex << ", "
+       << RD.MaxMovesEliminatedPerCycle << ", "
+       << RD.AllowZeroMoveEliminationOnly << "},\n";
     CostTblIndex += NumCostEntries;
   }
   OS << "};\n";
@@ -693,80 +697,12 @@ SubtargetEmitter::EmitRegisterFileTables(const CodeGenProcModel &ProcModel,
   return CostTblIndex;
 }
 
-static bool EmitPfmIssueCountersTable(const CodeGenProcModel &ProcModel,
-                                      raw_ostream &OS) {
-  unsigned NumCounterDefs = 1 + ProcModel.ProcResourceDefs.size();
-  std::vector<const Record *> CounterDefs(NumCounterDefs);
-  bool HasCounters = false;
-  for (const Record *CounterDef : ProcModel.PfmIssueCounterDefs) {
-    const Record *&CD = CounterDefs[ProcModel.getProcResourceIdx(
-        CounterDef->getValueAsDef("Resource"))];
-    if (CD) {
-      PrintFatalError(CounterDef->getLoc(),
-                      "multiple issue counters for " +
-                          CounterDef->getValueAsDef("Resource")->getName());
-    }
-    CD = CounterDef;
-    HasCounters = true;
-  }
-  if (!HasCounters) {
-    return false;
-  }
-  OS << "\nstatic const char* " << ProcModel.ModelName
-     << "PfmIssueCounters[] = {\n";
-  for (unsigned i = 0; i != NumCounterDefs; ++i) {
-    const Record *CounterDef = CounterDefs[i];
-    if (CounterDef) {
-      const auto PfmCounters = CounterDef->getValueAsListOfStrings("Counters");
-      if (PfmCounters.empty())
-        PrintFatalError(CounterDef->getLoc(), "empty counter list");
-      OS << "  \"" << PfmCounters[0];
-      for (unsigned p = 1, e = PfmCounters.size(); p != e; ++p)
-        OS << ",\" \"" << PfmCounters[p];
-      OS << "\",  // #" << i << " = ";
-      OS << CounterDef->getValueAsDef("Resource")->getName() << "\n";
-    } else {
-      OS << "  nullptr, // #" << i << "\n";
-    }
-  }
-  OS << "};\n";
-  return true;
-}
-
-static void EmitPfmCounters(const CodeGenProcModel &ProcModel,
-                            const bool HasPfmIssueCounters, raw_ostream &OS) {
-  OS << "  {\n";
-  // Emit the cycle counter.
-  if (ProcModel.PfmCycleCounterDef)
-    OS << "    \"" << ProcModel.PfmCycleCounterDef->getValueAsString("Counter")
-       << "\",  // Cycle counter.\n";
-  else
-    OS << "    nullptr,  // No cycle counter.\n";
-
-  // Emit the uops counter.
-  if (ProcModel.PfmUopsCounterDef)
-    OS << "    \"" << ProcModel.PfmUopsCounterDef->getValueAsString("Counter")
-       << "\",  // Uops counter.\n";
-  else
-    OS << "    nullptr,  // No uops counter.\n";
-
-  // Emit a reference to issue counters table.
-  if (HasPfmIssueCounters)
-    OS << "    " << ProcModel.ModelName << "PfmIssueCounters\n";
-  else
-    OS << "    nullptr  // No issue counters.\n";
-  OS << "  }\n";
-}
-
 void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
                                               raw_ostream &OS) {
   // Generate a table of register file descriptors (one entry per each user
   // defined register file), and a table of register costs.
   unsigned NumCostEntries = EmitRegisterFileTables(ProcModel, OS);
 
-  // Generate a table of ProcRes counter names.
-  const bool HasPfmIssueCounters = EmitPfmIssueCountersTable(ProcModel, OS);
-
   // Now generate a table for the extra processor info.
   OS << "\nstatic const llvm::MCExtraProcessorInfo " << ProcModel.ModelName
      << "ExtraInfo = {\n  ";
@@ -779,8 +715,6 @@ void SubtargetEmitter::EmitExtraProcessorInfo(const CodeGenProcModel &ProcModel,
   EmitRegisterFileInfo(ProcModel, ProcModel.RegisterFiles.size(),
                        NumCostEntries, OS);
 
-  EmitPfmCounters(ProcModel, HasPfmIssueCounters, OS);
-
   OS << "};\n";
 }
 
@@ -1406,7 +1340,7 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
 }
 
 //
-// EmitProcessorLookup - generate cpu name to itinerary lookup table.
+// EmitProcessorLookup - generate cpu name to sched model lookup tables.
 //
 void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
   // Gather and sort processor information
@@ -1414,12 +1348,11 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
                           Records.getAllDerivedDefinitions("Processor");
   llvm::sort(ProcessorList, LessRecordFieldName());
 
-  // Begin processor table
+  // Begin processor->sched model table
   OS << "\n";
-  OS << "// Sorted (by key) array of itineraries for CPU subtype.\n"
-     << "extern const llvm::SubtargetInfoKV "
-     << Target << "ProcSchedKV[] = {\n";
-
+  OS << "// Sorted (by key) array of sched model for CPU subtype.\n"
+     << "extern const llvm::SubtargetInfoKV " << Target
+     << "ProcSchedKV[] = {\n";
   // For each processor
   for (Record *Processor : ProcessorList) {
     StringRef Name = Processor->getValueAsString("Name");
@@ -1429,8 +1362,7 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
     // Emit as { "cpu", procinit },
     OS << "  { \"" << Name << "\", (const void *)&" << ProcModelName << " },\n";
   }
-
-  // End processor table
+  // End processor->sched model table
   OS << "};\n";
 }
 
@@ -1671,7 +1603,7 @@ void SubtargetEmitter::EmitSchedModelHelpers(const std::string &ClassName,
 
   // Emit target predicates.
   emitSchedModelHelpersImpl(OS);
-  
+
   OS << "} // " << ClassName << "::resolveSchedClass\n\n";
 
   OS << "unsigned " << ClassName
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 9e526b6d8f5503a9c97a9dab35babe5d64e450f2..d5b6a3c12647e49e2562796702310eb91ef91c4e 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -53,7 +53,7 @@ enum ActionType {
   GenX86EVEX2VEXTables,
   GenX86FoldTables,
   GenRegisterBank,
-  GenWebAssemblyStackifier,
+  GenExegesis,
 };
 
 namespace {
@@ -119,8 +119,8 @@ namespace {
                                "Generate X86 fold tables"),
                     clEnumValN(GenRegisterBank, "gen-register-bank",
                                "Generate registers bank descriptions"),
-                    clEnumValN(GenWebAssemblyStackifier, "gen-wasm-stackifier",
-                               "Generate WebAssembly stackification cases")));
+                    clEnumValN(GenExegesis, "gen-exegesis",
+                               "Generate llvm-exegesis tables")));
 
   cl::OptionCategory PrintEnumsCat("Options for -print-enums");
   cl::opt<std::string>
@@ -234,8 +234,8 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenX86FoldTables:
     EmitX86FoldTables(Records, OS);
     break;
-  case GenWebAssemblyStackifier:
-    EmitWebAssemblyStackifier(Records, OS);
+  case GenExegesis:
+    EmitExegesis(Records, OS);
     break;
   }
 
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index f7ed5cc87d3e5705de63f601f5b4fc34c4f7c7ef..f4f2909f8e888d89eae2c2f0a5a620e1651c52cf 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -89,7 +89,7 @@ void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
-void EmitWebAssemblyStackifier(RecordKeeper &RK, raw_ostream &OS);
+void EmitExegesis(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
 
diff --git a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
index f9c3cb12f8595aacf6b809c16c985285b33720b4..a8edfdc623f437397d16d8470bc7c56eebbb0565 100644
--- a/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
+++ b/utils/TableGen/WebAssemblyDisassemblerEmitter.cpp
@@ -42,12 +42,13 @@ void emitWebAssemblyDisassemblerTables(
     auto Prefix = Opc >> 8;
     Opc = Opc & 0xFF;
     auto &CGIP = OpcodeTable[Prefix][Opc];
-    // All wasm instructions have a StackBased fieldof type bit, we only want
-    // the instructions for which this is 1.
-    auto Bit = Def.getValue("StackBased")->getValue()->
-                 getCastTo(BitRecTy::get());
-    auto IsStackBased = Bit && reinterpret_cast<const BitInit *>(Bit)
-                                 ->getValue();
+    // All wasm instructions have a StackBased field of type string, we only
+    // want the instructions for which this is "true".
+    auto StackString =
+        Def.getValue("StackBased")->getValue()->getCastTo(StringRecTy::get());
+    auto IsStackBased =
+        StackString &&
+        reinterpret_cast<const StringInit *>(StackString)->getValue() == "true";
     if (IsStackBased && !CGIP.second) {
       // this picks the first of many typed variants, which is
       // currently the except_ref one, though this shouldn't matter for
diff --git a/utils/TableGen/WebAssemblyStackifierEmitter.cpp b/utils/TableGen/WebAssemblyStackifierEmitter.cpp
deleted file mode 100644
index 0b9741d22b81cee03cd3c51ebf5d87812a3b979d..0000000000000000000000000000000000000000
--- a/utils/TableGen/WebAssemblyStackifierEmitter.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===- WebAssemblyStackifierEmitter.cpp - Stackifier cases ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file emits the switch statement cases to translate WebAssembly
-// instructions to their stack forms.
-//
-//===----------------------------------------------------------------------===//
-
-#include "WebAssemblyDisassemblerEmitter.h"
-#include "llvm/TableGen/Record.h"
-
-namespace llvm {
-
-// Find all register WebAssembly instructions and their corresponding stack
-// instructions. For each pair, emit a switch case of the form
-//
-//   case WebAssembly::RegisterInstr: return WebAssembly::StackInstr;
-//
-// For example,
-//
-//   case WebAssembly::ADD_I32: return WebAssembly::ADD_I32_S;
-//
-// This is useful for converting instructions from their register form to their
-// equivalent stack form.
-void EmitWebAssemblyStackifier(RecordKeeper &RK, raw_ostream &OS) {
-  Record *InstrClass = RK.getClass("WebAssemblyInst");
-  for (auto &RecordPair : RK.getDefs()) {
-    if (!RecordPair.second->isSubClassOf(InstrClass))
-      continue;
-    bool IsStackBased = RecordPair.second->getValueAsBit("StackBased");
-    if (IsStackBased)
-      continue;
-    OS << "  case WebAssembly::" << RecordPair.first << ": return "
-       << "WebAssembly::" << RecordPair.first << "_S;\n";
-  }
-}
-
-} // namespace llvm
diff --git a/utils/UpdateTestChecks/asm.py b/utils/UpdateTestChecks/asm.py
index 726a653d1516427c7010ad66b0058f63fe552a4a..923efd5bbef4eee2f2495555f0a93354e87032da 100644
--- a/utils/UpdateTestChecks/asm.py
+++ b/utils/UpdateTestChecks/asm.py
@@ -52,6 +52,7 @@ ASM_FUNCTION_MIPS_RE = re.compile(
 
 ASM_FUNCTION_PPC_RE = re.compile(
     r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n'
+    r'.*?'
     r'\.Lfunc_begin[0-9]+:\n'
     r'(?:[ \t]+.cfi_startproc\n)?'
     r'(?:\.Lfunc_[gl]ep[0-9]+:\n(?:[ \t]+.*?\n)*)*'
diff --git a/utils/benchmark/CMakeLists.txt b/utils/benchmark/CMakeLists.txt
index 6522ecf9d6b533f06b3102d49b2fd5312f311c1d..686846bf1e052760236e5be022fa667a4aa01e59 100644
--- a/utils/benchmark/CMakeLists.txt
+++ b/utils/benchmark/CMakeLists.txt
@@ -99,6 +99,7 @@ if (MSVC)
   if (NOT BENCHMARK_ENABLE_EXCEPTIONS)
     add_cxx_compiler_flag(-EHs-)
     add_cxx_compiler_flag(-EHa-)
+    add_definitions(-D_HAS_EXCEPTIONS=0)
   endif()
   # Link time optimisation
   if (BENCHMARK_ENABLE_LTO)
diff --git a/utils/benchmark/README.LLVM b/utils/benchmark/README.LLVM
index 0121b145deddbd8c9f8c348db3b32a58f9ad5c10..5a20ec665ad070e48ddefe7d8c6cb63aeec9c6c0 100644
--- a/utils/benchmark/README.LLVM
+++ b/utils/benchmark/README.LLVM
@@ -19,3 +19,5 @@ Changes:
   is applied to fix cross compilation with MinGW headers
 * https://github.com/google/benchmark/commit/439d6b1c2a6da5cb6adc4c4dfc555af235722396
   is applied to fix building with MinGW headers for ARM
+* https://github.com/google/benchmark/commit/a9b31c51b1ee7ec7b31438c647123c2cbac5d956
+  is applied to disable exceptions in Microsoft STL when exceptions are disabled
diff --git a/utils/bisect-skip-count b/utils/bisect-skip-count
index b18b4f41481b6dcdcac6b5a522d601863669b759..f4f8ddcec797ad2ad5f109a8020c109bbaad4279 100755
--- a/utils/bisect-skip-count
+++ b/utils/bisect-skip-count
@@ -1,6 +1,25 @@
 #!/usr/bin/env python
 # This script is used to bisect skip and count arguments for --debug-counter.
 # It is similar to bisect, except it understands how to increase skip and decrease count
+#
+# Typical usage:
+#
+# bisect-skip-count bisect-command.sh "%(skip)d" "%(count)d" 2>&1 | tee bisect.out
+#
+# bisect-command.sh is something like this:
+# #! /bin/bash
+#
+# skip=$1
+# count=$2
+#
+# opt -debug-counter=my-counter-skip=${skip},my-counter-count=${count}
+# ... Test output of opt and exit zero for pass, non-zero for fail
+#
+# Examine bisect.out to look for "Last good skip" and "Last good
+# count" to find the values of the counter that produce a passing
+# result.  Incrementing the last good count by one or decrementing the
+# last good skip by one should produce a failure.
+#
 import os
 import sys
 import argparse
diff --git a/utils/collect_and_build_with_pgo.py b/utils/collect_and_build_with_pgo.py
new file mode 100755
index 0000000000000000000000000000000000000000..5a8686a88b4fd073b362463e9ba08a78436ed235
--- /dev/null
+++ b/utils/collect_and_build_with_pgo.py
@@ -0,0 +1,480 @@
+#!/usr/bin/env python3
+"""
+This script:
+- Builds clang with user-defined flags
+- Uses that clang to build an instrumented clang, which can be used to collect
+  PGO samples
+- Builds a user-defined set of sources (default: clang) to act as a
+  "benchmark" to generate a PGO profile
+- Builds clang once more with the PGO profile generated above
+
+This is a total of four clean builds of clang (by default). This may take a
+while. :)
+"""
+
+import argparse
+import collections
+import multiprocessing
+import os
+import shlex
+import shutil
+import subprocess
+import sys
+
+### User configuration
+
+
+# If you want to use a different 'benchmark' than building clang, make this
+# function do what you want. out_dir is the build directory for clang, so all
+# of the clang binaries will live under "${out_dir}/bin/". Using clang in
+# ${out_dir} will magically have the profiles go to the right place.
+#
+# You may assume that out_dir is a freshly-built directory that you can reach
+# in to build more things, if you'd like.
+def _run_benchmark(env, out_dir, include_debug_info):
+    """The 'benchmark' we run to generate profile data."""
+    target_dir = env.output_subdir('instrumentation_run')
+
+    # `check-llvm` and `check-clang` are cheap ways to increase coverage. The
+    # former lets us touch on the non-x86 backends a bit if configured, and the
+    # latter gives us more C to chew on (and will send us through diagnostic
+    # paths a fair amount, though the `if (stuff_is_broken) { diag() ... }`
+    # branches should still heavily be weighted in the not-taken direction,
+    # since we built all of LLVM/etc).
+    _build_things_in(env, out_dir, what=['check-llvm', 'check-clang'])
+
+    # Building tblgen gets us coverage; don't skip it. (out_dir may also not
+    # have them anyway, but that's less of an issue)
+    cmake = _get_cmake_invocation_for_bootstrap_from(
+        env, out_dir, skip_tablegens=False)
+
+    if include_debug_info:
+        cmake.add_flag('CMAKE_BUILD_TYPE', 'RelWithDebInfo')
+
+    _run_fresh_cmake(env, cmake, target_dir)
+
+    # Just build all the things. The more data we have, the better.
+    _build_things_in(env, target_dir, what=['all'])
+
+### Script
+
+
+class CmakeInvocation:
+    _cflags = ['CMAKE_C_FLAGS', 'CMAKE_CXX_FLAGS']
+    _ldflags = [
+        'CMAKE_EXE_LINKER_FLAGS',
+        'CMAKE_MODULE_LINKER_FLAGS',
+        'CMAKE_SHARED_LINKER_FLAGS',
+    ]
+
+    def __init__(self, cmake, maker, cmake_dir):
+        self._prefix = [cmake, '-G', maker, cmake_dir]
+
+        # Map of str -> (list|str).
+        self._flags = {}
+        for flag in CmakeInvocation._cflags + CmakeInvocation._ldflags:
+            self._flags[flag] = []
+
+    def add_new_flag(self, key, value):
+        self.add_flag(key, value, allow_overwrites=False)
+
+    def add_flag(self, key, value, allow_overwrites=True):
+        if key not in self._flags:
+            self._flags[key] = value
+            return
+
+        existing_value = self._flags[key]
+        if isinstance(existing_value, list):
+            existing_value.append(value)
+            return
+
+        if not allow_overwrites:
+            raise ValueError('Invalid overwrite of %s requested' % key)
+
+        self._flags[key] = value
+
+    def add_cflags(self, flags):
+        # No, I didn't intend to append ['-', 'O', '2'] to my flags, thanks :)
+        assert not isinstance(flags, str)
+        for f in CmakeInvocation._cflags:
+            self._flags[f].extend(flags)
+
+    def add_ldflags(self, flags):
+        assert not isinstance(flags, str)
+        for f in CmakeInvocation._ldflags:
+            self._flags[f].extend(flags)
+
+    def to_args(self):
+        args = self._prefix.copy()
+        for key, value in sorted(self._flags.items()):
+            if isinstance(value, list):
+                # We preload all of the list-y values (cflags, ...). If we've
+                # nothing to add, don't.
+                if not value:
+                    continue
+                value = ' '.join(value)
+
+            arg = '-D' + key
+            if value != '':
+                arg += '=' + value
+            args.append(arg)
+        return args
+
+
+class Env:
+    def __init__(self, llvm_dir, use_make, output_dir, default_cmake_args,
+                 dry_run):
+        self.llvm_dir = llvm_dir
+        self.use_make = use_make
+        self.output_dir = output_dir
+        self.default_cmake_args = default_cmake_args.copy()
+        self.dry_run = dry_run
+
+    def get_default_cmake_args_kv(self):
+        return self.default_cmake_args.items()
+
+    def get_cmake_maker(self):
+        return 'Ninja' if not self.use_make else 'Unix Makefiles'
+
+    def get_make_command(self):
+        if self.use_make:
+            return ['make', '-j{}'.format(multiprocessing.cpu_count())]
+        return ['ninja']
+
+    def output_subdir(self, name):
+        return os.path.join(self.output_dir, name)
+
+    def has_llvm_subproject(self, name):
+        if name == 'compiler-rt':
+            subdir = 'projects/compiler-rt'
+        elif name == 'clang':
+            subdir = 'tools/clang'
+        else:
+            raise ValueError('Unknown subproject: %s' % name)
+
+        return os.path.isdir(os.path.join(self.llvm_dir, subdir))
+
+    # Note that we don't allow capturing stdout/stderr. This works quite nicely
+    # with dry_run.
+    def run_command(self,
+                    cmd,
+                    cwd=None,
+                    check=False,
+                    silent_unless_error=False):
+        cmd_str = ' '.join(shlex.quote(s) for s in cmd)
+        print(
+            'Running `%s` in %s' % (cmd_str, shlex.quote(cwd or os.getcwd())))
+
+        if self.dry_run:
+            return
+
+        if silent_unless_error:
+            stdout, stderr = subprocess.PIPE, subprocess.STDOUT
+        else:
+            stdout, stderr = None, None
+
+        # Don't use subprocess.run because it's >= py3.5 only, and it's not too
+        # much extra effort to get what it gives us anyway.
+        popen = subprocess.Popen(
+            cmd,
+            stdin=subprocess.DEVNULL,
+            stdout=stdout,
+            stderr=stderr,
+            cwd=cwd)
+        stdout, _ = popen.communicate()
+        return_code = popen.wait(timeout=0)
+
+        if not return_code:
+            return
+
+        if silent_unless_error:
+            print(stdout.decode('utf-8', 'ignore'))
+
+        if check:
+            raise subprocess.CalledProcessError(
+                returncode=return_code, cmd=cmd, output=stdout, stderr=None)
+
+
+def _get_default_cmake_invocation(env):
+    inv = CmakeInvocation(
+        cmake='cmake', maker=env.get_cmake_maker(), cmake_dir=env.llvm_dir)
+    for key, value in env.get_default_cmake_args_kv():
+        inv.add_new_flag(key, value)
+    return inv
+
+
+def _get_cmake_invocation_for_bootstrap_from(env, out_dir,
+                                             skip_tablegens=True):
+    clang = os.path.join(out_dir, 'bin', 'clang')
+    cmake = _get_default_cmake_invocation(env)
+    cmake.add_new_flag('CMAKE_C_COMPILER', clang)
+    cmake.add_new_flag('CMAKE_CXX_COMPILER', clang + '++')
+
+    # We often get no value out of building new tblgens; the previous build
+    # should have them. It's still correct to build them, just slower.
+    def add_tablegen(key, binary):
+        path = os.path.join(out_dir, 'bin', binary)
+
+        # Check that this exists, since the user's allowed to specify their own
+        # stage1 directory (which is generally where we'll source everything
+        # from). Dry runs should hope for the best from our user, as well.
+        if env.dry_run or os.path.exists(path):
+            cmake.add_new_flag(key, path)
+
+    if skip_tablegens:
+        add_tablegen('LLVM_TABLEGEN', 'llvm-tblgen')
+        add_tablegen('CLANG_TABLEGEN', 'clang-tblgen')
+
+    return cmake
+
+
+def _build_things_in(env, target_dir, what):
+    cmd = env.get_make_command() + what
+    env.run_command(cmd, cwd=target_dir, check=True)
+
+
+def _run_fresh_cmake(env, cmake, target_dir):
+    if not env.dry_run:
+        try:
+            shutil.rmtree(target_dir)
+        except FileNotFoundError:
+            pass
+
+        os.makedirs(target_dir, mode=0o755)
+
+    cmake_args = cmake.to_args()
+    env.run_command(
+        cmake_args, cwd=target_dir, check=True, silent_unless_error=True)
+
+
+def _build_stage1_clang(env):
+    target_dir = env.output_subdir('stage1')
+    cmake = _get_default_cmake_invocation(env)
+    _run_fresh_cmake(env, cmake, target_dir)
+    _build_things_in(env, target_dir, what=['clang', 'llvm-profdata', 'profile'])
+    return target_dir
+
+
+def _generate_instrumented_clang_profile(env, stage1_dir, profile_dir,
+                                         output_file):
+    llvm_profdata = os.path.join(stage1_dir, 'bin', 'llvm-profdata')
+    if env.dry_run:
+        profiles = [os.path.join(profile_dir, '*.profraw')]
+    else:
+        profiles = [
+            os.path.join(profile_dir, f) for f in os.listdir(profile_dir)
+            if f.endswith('.profraw')
+        ]
+    cmd = [llvm_profdata, 'merge', '-output=' + output_file] + profiles
+    env.run_command(cmd, check=True)
+
+
+def _build_instrumented_clang(env, stage1_dir):
+    assert os.path.isabs(stage1_dir)
+
+    target_dir = os.path.join(env.output_dir, 'instrumented')
+    cmake = _get_cmake_invocation_for_bootstrap_from(env, stage1_dir)
+    cmake.add_new_flag('LLVM_BUILD_INSTRUMENTED', 'IR')
+
+    # libcxx's configure step messes with our link order: we'll link
+    # libclang_rt.profile after libgcc, and the former requires atexit from the
+    # latter. So, configure checks fail.
+    #
+    # Since we don't need libcxx or compiler-rt anyway, just disable them.
+    cmake.add_new_flag('LLVM_BUILD_RUNTIME', 'No')
+
+    _run_fresh_cmake(env, cmake, target_dir)
+    _build_things_in(env, target_dir, what=['clang', 'lld'])
+
+    profiles_dir = os.path.join(target_dir, 'profiles')
+    return target_dir, profiles_dir
+
+
+def _build_optimized_clang(env, stage1_dir, profdata_file):
+    if not env.dry_run and not os.path.exists(profdata_file):
+        raise ValueError('Looks like the profdata file at %s doesn\'t exist' %
+                         profdata_file)
+
+    target_dir = os.path.join(env.output_dir, 'optimized')
+    cmake = _get_cmake_invocation_for_bootstrap_from(env, stage1_dir)
+    cmake.add_new_flag('LLVM_PROFDATA_FILE', os.path.abspath(profdata_file))
+
+    # We'll get complaints about hash mismatches in `main` in tools/etc. Ignore
+    # it.
+    cmake.add_cflags(['-Wno-backend-plugin'])
+    _run_fresh_cmake(env, cmake, target_dir)
+    _build_things_in(env, target_dir, what=['clang'])
+    return target_dir
+
+
+Args = collections.namedtuple('Args', [
+    'do_optimized_build',
+    'include_debug_info',
+    'profile_location',
+    'stage1_dir',
+])
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser(
+        description='Builds LLVM and Clang with instrumentation, collects '
+        'instrumentation profiles for them, and (optionally) builds things'
+        'with these PGO profiles. By default, it\'s assumed that you\'re '
+        'running this from your LLVM root, and all build artifacts will be '
+        'saved to $PWD/out.')
+    parser.add_argument(
+        '--cmake-extra-arg',
+        action='append',
+        default=[],
+        help='an extra arg to pass to all cmake invocations. Note that this '
+        'is interpreted as a -D argument, e.g. --cmake-extra-arg FOO=BAR will '
+        'be passed as -DFOO=BAR. This may be specified multiple times.')
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='print commands instead of running them')
+    parser.add_argument(
+        '--llvm-dir',
+        default='.',
+        help='directory containing an LLVM checkout (default: $PWD)')
+    parser.add_argument(
+        '--no-optimized-build',
+        action='store_true',
+        help='disable the final, PGO-optimized build')
+    parser.add_argument(
+        '--out-dir',
+        help='directory to write artifacts to (default: $llvm_dir/out)')
+    parser.add_argument(
+        '--profile-output',
+        help='where to output the profile (default is $out/pgo_profile.prof)')
+    parser.add_argument(
+        '--stage1-dir',
+        help='instead of having an initial build of everything, use the given '
+        'directory. It is expected that this directory will have clang, '
+        'llvm-profdata, and the appropriate libclang_rt.profile already built')
+    parser.add_argument(
+        '--use-debug-info-in-benchmark',
+        action='store_true',
+        help='use a regular build instead of RelWithDebInfo in the benchmark. '
+        'This increases benchmark execution time and disk space requirements, '
+        'but gives more coverage over debuginfo bits in LLVM and clang.')
+    parser.add_argument(
+        '--use-make',
+        action='store_true',
+        default=shutil.which('ninja') is None,
+        help='use Makefiles instead of ninja')
+
+    args = parser.parse_args()
+
+    llvm_dir = os.path.abspath(args.llvm_dir)
+    if args.out_dir is None:
+        output_dir = os.path.join(llvm_dir, 'out')
+    else:
+        output_dir = os.path.abspath(args.out_dir)
+
+    extra_args = {'CMAKE_BUILD_TYPE': 'Release'}
+    for arg in args.cmake_extra_arg:
+        if arg.startswith('-D'):
+            arg = arg[2:]
+        elif arg.startswith('-'):
+            raise ValueError('Unknown not- -D arg encountered; you may need '
+                             'to tweak the source...')
+        split = arg.split('=', 1)
+        if len(split) == 1:
+            key, val = split[0], ''
+        else:
+            key, val = split
+        extra_args[key] = val
+
+    env = Env(
+        default_cmake_args=extra_args,
+        dry_run=args.dry_run,
+        llvm_dir=llvm_dir,
+        output_dir=output_dir,
+        use_make=args.use_make,
+    )
+
+    if args.profile_output is not None:
+        profile_location = args.profile_output
+    else:
+        profile_location = os.path.join(env.output_dir, 'pgo_profile.prof')
+
+    result_args = Args(
+        do_optimized_build=not args.no_optimized_build,
+        include_debug_info=args.use_debug_info_in_benchmark,
+        profile_location=profile_location,
+        stage1_dir=args.stage1_dir,
+    )
+
+    return env, result_args
+
+
+def _looks_like_llvm_dir(directory):
+    """Arbitrary set of heuristics to determine if `directory` is an llvm dir.
+
+    Errs on the side of false-positives."""
+
+    contents = set(os.listdir(directory))
+    expected_contents = [
+        'CODE_OWNERS.TXT',
+        'cmake',
+        'docs',
+        'include',
+        'utils',
+    ]
+
+    if not all(c in contents for c in expected_contents):
+        return False
+
+    try:
+        include_listing = os.listdir(os.path.join(directory, 'include'))
+    except NotADirectoryError:
+        return False
+
+    return 'llvm' in include_listing
+
+
+def _die(*args, **kwargs):
+    kwargs['file'] = sys.stderr
+    print(*args, **kwargs)
+    sys.exit(1)
+
+
+def _main():
+    env, args = _parse_args()
+
+    if not _looks_like_llvm_dir(env.llvm_dir):
+        _die('Looks like %s isn\'t an LLVM directory; please see --help' %
+             env.llvm_dir)
+    if not env.has_llvm_subproject('clang'):
+        _die('Need a clang checkout at tools/clang')
+    if not env.has_llvm_subproject('compiler-rt'):
+        _die('Need a compiler-rt checkout at projects/compiler-rt')
+
+    def status(*args):
+        print(*args, file=sys.stderr)
+
+    if args.stage1_dir is None:
+        status('*** Building stage1 clang...')
+        stage1_out = _build_stage1_clang(env)
+    else:
+        stage1_out = args.stage1_dir
+
+    status('*** Building instrumented clang...')
+    instrumented_out, profile_dir = _build_instrumented_clang(env, stage1_out)
+    status('*** Running profdata benchmarks...')
+    _run_benchmark(env, instrumented_out, args.include_debug_info)
+    status('*** Generating profile...')
+    _generate_instrumented_clang_profile(env, stage1_out, profile_dir,
+                                         args.profile_location)
+
+    print('Final profile:', args.profile_location)
+    if args.do_optimized_build:
+        status('*** Building PGO-optimized binaries...')
+        optimized_out = _build_optimized_clang(env, stage1_out,
+                                               args.profile_location)
+        print('Final build directory:', optimized_out)
+
+
+if __name__ == '__main__':
+    _main()
diff --git a/utils/extract_vplan.py b/utils/extract_vplan.py
new file mode 100755
index 0000000000000000000000000000000000000000..ac0055d2e798ad0e57b0298d8eb25c0206fc03f1
--- /dev/null
+++ b/utils/extract_vplan.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+# This script extracts the VPlan digraphs from the vectoriser debug messages
+# and saves them in individual dot files (one for each plan). Optionally, and
+# providing 'dot' is installed, it can also render the dot into a PNG file.
+
+import sys
+import re
+import argparse
+import shutil
+import subprocess
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--png', action='store_true')
+args = parser.parse_args()
+
+dot = shutil.which('dot')
+if args.png and not dot:
+    raise RuntimeError("Can't export to PNG without 'dot' in the system")
+
+pattern = re.compile(r"(digraph VPlan {.*?\n})",re.DOTALL)
+matches = re.findall(pattern, sys.stdin.read())
+
+for vplan in matches:
+    m = re.search("graph \[.+(VF=.+,UF.+), ", vplan)
+    if not m:
+        raise ValueError("Can't get the right VPlan name")
+    name = re.sub('[^a-zA-Z0-9]', '', m.group(1))
+
+    if args.png:
+        filename = 'VPlan' + name + '.png'
+        print("Exporting " + name + " to PNG via dot: " + filename)
+        p = subprocess.Popen([dot, '-Tpng', '-o', filename],
+                              encoding='utf-8',
+                              stdin=subprocess.PIPE,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+        out, err = p.communicate(input=vplan)
+        if err:
+            raise RuntimeError("Error running dot: " + err)
+
+    else:
+        filename = 'VPlan' + name + '.dot'
+        print("Exporting " + name + " to DOT: " + filename)
+        with open(filename, 'w') as out:
+            out.write(vplan)
diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index e8fb1533a86147f75806a419a2f83fda9bdb6668..97c091085816df2b941675f1221bad90b0cee6f7 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -120,6 +120,22 @@ class LitConfig(object):
         if self.bashPath is None:
             self.bashPath = ''
 
+        # Check whether the found version of bash is able to cope with paths in
+        # the host path format. If not, don't return it as it can't be used to
+        # run scripts. For example, WSL's bash.exe requires '/mnt/c/foo' rather
+        # than 'C:\\foo' or 'C:/foo'.
+        if self.isWindows and self.bashPath:
+            command = [self.bashPath, '-c',
+                       '[[ -f "%s" ]]' % self.bashPath.replace('\\', '\\\\')]
+            _, _, exitCode = lit.util.executeCommand(command)
+            if exitCode:
+                self.note('bash command failed: %s' % (
+                    ' '.join('"%s"' % c for c in command)))
+                self.bashPath = ''
+
+        if not self.bashPath:
+            self.warning('Unable to find a usable version of bash.')
+
         return self.bashPath
 
     def getToolsPath(self, dir, paths, tools):
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index e2ac73b0b426e350dda97d2e264896b304c5d435..d5adb535775d161db5352807c2386cc90f35fbf7 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -26,7 +26,7 @@ class TestingConfig:
                      'LSAN_OPTIONS', 'ADB', 'ANDROID_SERIAL',
                      'SANITIZER_IGNORE_CVE_2016_2143', 'TMPDIR', 'TMP', 'TEMP',
                      'TEMPDIR', 'AVRLIT_BOARD', 'AVRLIT_PORT',
-                     'FILECHECK_DUMP_INPUT_ON_FAILURE']
+                     'FILECHECK_DUMP_INPUT_ON_FAILURE', 'FILECHECK_OPTS']
         for var in pass_vars:
             val = os.environ.get(var, '')
             # Check for empty string as some variables such as LD_PRELOAD cannot be empty
diff --git a/utils/lit/lit/llvm/config.py b/utils/lit/lit/llvm/config.py
index 0e446da371090c731e3c0dd9d7224cec1a1746c1..6bb7135f65907ba7265fb43a548f3f2950241baf 100644
--- a/utils/lit/lit/llvm/config.py
+++ b/utils/lit/lit/llvm/config.py
@@ -55,6 +55,8 @@ class LLVMConfig(object):
             features.add('system-windows')
         elif platform.system() == "Linux":
             features.add('system-linux')
+        elif platform.system() in ['FreeBSD']:
+            config.available_features.add('system-freebsd')
 
         # Native compilation: host arch == default triple arch
         # Both of these values should probably be in every site config (e.g. as
diff --git a/utils/lldbDataFormatters.py b/utils/lldbDataFormatters.py
index db1e22af792e599bf22dfb8d650b96e8b6266357..fcb381cc54dec49dcb983f0affb8b03fd26bf337 100644
--- a/utils/lldbDataFormatters.py
+++ b/utils/lldbDataFormatters.py
@@ -26,9 +26,7 @@ class SmallVectorSynthProvider:
         self.update() # initialize this provider
 
     def num_children(self):
-        begin = self.begin.GetValueAsUnsigned(0)
-        end = self.end.GetValueAsUnsigned(0)
-        return (end - begin)/self.type_size
+        return self.size.GetValueAsUnsigned(0)
 
     def get_child_index(self, name):
         try:
@@ -49,7 +47,7 @@ class SmallVectorSynthProvider:
 
     def update(self):
         self.begin = self.valobj.GetChildMemberWithName('BeginX')
-        self.end = self.valobj.GetChildMemberWithName('EndX')
+        self.size = self.valobj.GetChildMemberWithName('Size')
         the_type = self.valobj.GetType()
         # If this is a reference type we have to dereference it to get to the
         # template parameter.
diff --git a/utils/prepare-code-coverage-artifact.py b/utils/prepare-code-coverage-artifact.py
index 883cdd78049bec04f604a0f44b0f94190d13bd35..5c4af242d0de56fab3f755b955b47adcada45170 100644
--- a/utils/prepare-code-coverage-artifact.py
+++ b/utils/prepare-code-coverage-artifact.py
@@ -51,7 +51,8 @@ def prepare_html_report(host_llvm_cov, profile, report_dir, binaries,
     subprocess.check_call(invocation)
     with open(os.path.join(report_dir, 'summary.txt'), 'wb') as Summary:
         subprocess.check_call([host_llvm_cov, 'report'] + objects +
-                               ['-instr-profile', profile], stdout=Summary)
+                               ['-instr-profile', profile] + restricted_dirs,
+                               stdout=Summary)
     print('Done!')
 
 def prepare_html_reports(host_llvm_cov, profdata_path, report_dir, binaries,
diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat
index 30767f6f3509b29a0288851aad8d5d0bfb3703d2..1dac7878ff3de8d9c1b356df6781b8d67811c9ce 100755
--- a/utils/release/build_llvm_package.bat
+++ b/utils/release/build_llvm_package.bat
@@ -44,8 +44,8 @@ svn.exe export -r %revision% http://llvm.org/svn/llvm-project/openmp/%branch% ll
 
 
 REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
-set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_USE_CRT_RELEASE=MT -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: "
-
+REM Excluding wasm target to work around PR39448.
+set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_USE_CRT_RELEASE=MT -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " -DLLVM_TARGETS_TO_BUILD="AArch64;AMDGPU;ARM;BPF;Hexagon;Lanai;Mips;MSP430;NVPTX;PowerPC;Sparc;SystemZ;X86;XCore"
 REM TODO: Run all tests, including lld and compiler-rt.
 
 set "VSCMD_START_DIR=%CD%"
diff --git a/utils/sanitizers/ubsan_blacklist.txt b/utils/sanitizers/ubsan_blacklist.txt
index 69230a3e46501f3cd4562a5e34e465f8a08488d7..b5bbfddceef6dcf6232a9d35204cd98c742c09f6 100644
--- a/utils/sanitizers/ubsan_blacklist.txt
+++ b/utils/sanitizers/ubsan_blacklist.txt
@@ -10,8 +10,3 @@ src:*bits/stl_tree.h
 # data() on an empty vector: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59829
 src:*bits/stl_iterator.h
 src:*bits/stl_vector.h
-
-# These auto-generated functions compile down to ~50k basic blocks with inlining
-# and UBSan enabled, causing long builds that lead to bot timeouts.
-# https://bugs.llvm.org/show_bug.cgi?id=37929
-fun:*AArch64*InstPrinter*printAliasInstr*
diff --git a/utils/update_mca_test_checks.py b/utils/update_mca_test_checks.py
index a83186cd3360127ad735808254b6673da7f72a65..54d1cb443c359986add2246e871d6231bc849457 100755
--- a/utils/update_mca_test_checks.py
+++ b/utils/update_mca_test_checks.py
@@ -267,10 +267,14 @@ def _align_matching_blocks(all_blocks, farthest_indexes):
         continue
 
       changed = False
-      while(index < farthest_indexes[block]):
-        blocks.insert(index, '')
-        index += 1
-        changed = True
+      # If the block has not already been subject to alignment (i.e. if the
+      # previous block is not empty) then insert empty blocks until the index
+      # matches the farthest index identified for that block.
+      if (index > 0) and blocks[index - 1]:
+        while(index < farthest_indexes[block]):
+          blocks.insert(index, '')
+          index += 1
+          changed = True
 
       if changed:
         # Bail out.  We'll need to re-do the farthest block analysis now that